// SPDX-License-Identifier: GPL-2.0
/*
 * bcachefs setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "async_objs.h"
#include "backpointers.h"
#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_write_buffer.h"
#include "buckets_waiting_for_journal.h"
#include "chardev.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "debug.h"
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
#include "enumerated_ref.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
#include "fs-io-buffered.h"
#include "fs-io-direct.h"
#include "fsck.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "move.h"
#include "migrate.h"
#include "movinggc.h"
#include "nocow_locking.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
#include "recovery_passes.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-counters.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
#include "thread_with_file.h"
#include "trace.h"

#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/idr.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/random.h>
#include <linux/sysfs.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem");

typedef DARRAY(struct bch_sb_handle) bch_sb_handles;

#define x(n)	#n,
const char * const bch2_fs_flag_strs[] = {
	BCH_FS_FLAGS()
	NULL
};

const char * const bch2_write_refs[] = {
	BCH_WRITE_REFS()
	NULL
};

const char * const bch2_dev_read_refs[] = {
	BCH_DEV_READ_REFS()
	NULL
};

const char * const bch2_dev_write_refs[] = {
	BCH_DEV_WRITE_REFS()
	NULL
};
#undef x

static void __bch2_print_str(struct bch_fs *c, const char *prefix,
			     const char *str)
{
#ifdef __KERNEL__
	struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);

	if (unlikely(stdio)) {
		bch2_stdio_redirect_printf(stdio, true, "%s", str);
		return;
	}
#endif
	bch2_print_string_as_lines(KERN_ERR, str);
}

void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str)
{
	__bch2_print_str(c, prefix, str);
}

__printf(2, 0)
static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
{
#ifdef __KERNEL__
	if (unlikely(stdio)) {
		if (fmt[0] == KERN_SOH[0])
			fmt += 2;

		bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
		return;
	}
#endif
	vprintk(fmt, args);
}
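/*
 * Printing wrappers: output normally goes to the kernel log, but when a stdio
 * redirect is attached to the filesystem or to the options struct (e.g. when
 * driven from userspace via the thread_with_file mechanism), messages are
 * routed there instead.
 */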
void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
{
	struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;

	va_list args;
	va_start(args, fmt);
	bch2_print_maybe_redirect(stdio, fmt, args);
	va_end(args);
}

void __bch2_print(struct bch_fs *c, const char *fmt, ...)
{
	struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);

	va_list args;
	va_start(args, fmt);
	bch2_print_maybe_redirect(stdio, fmt, args);
	va_end(args);
}

#define KTYPE(type)							\
static const struct attribute_group type ## _group = {			\
	.attrs = type ## _files						\
};									\
									\
static const struct attribute_group *type ## _groups[] = {		\
	&type ## _group,						\
	NULL								\
};									\
									\
static const struct kobj_type type ## _ktype = {			\
	.release	= type ## _release,				\
	.sysfs_ops	= &type ## _sysfs_ops,				\
	.default_groups = type ## _groups				\
}

static void bch2_fs_release(struct kobject *);
static void bch2_dev_release(struct kobject *);
static void bch2_fs_counters_release(struct kobject *k)
{
}

static void bch2_fs_internal_release(struct kobject *k)
{
}

static void bch2_fs_opts_dir_release(struct kobject *k)
{
}

static void bch2_fs_time_stats_release(struct kobject *k)
{
}

KTYPE(bch2_fs);
KTYPE(bch2_fs_counters);
KTYPE(bch2_fs_internal);
KTYPE(bch2_fs_opts_dir);
KTYPE(bch2_fs_time_stats);
KTYPE(bch2_dev);

static struct kset *bcachefs_kset;
static LIST_HEAD(bch_fs_list);
static DEFINE_MUTEX(bch_fs_list_lock);

DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);

static void bch2_dev_unlink(struct bch_dev *);
static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
static void bch2_dev_io_ref_stop(struct bch_dev *, int);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
static int bch2_fs_init_rw(struct bch_fs *);

struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
	guard(mutex)(&bch_fs_list_lock);
	guard(rcu)();

	struct bch_fs *c;
	list_for_each_entry(c, &bch_fs_list, list)
		for_each_member_device_rcu(c, ca, NULL)
			if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
				closure_get(&c->cl);
				return c;
			}
	return NULL;
}

static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
{
	struct bch_fs *c;

	lockdep_assert_held(&bch_fs_list_lock);

	list_for_each_entry(c, &bch_fs_list, list)
		if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
			return c;

	return NULL;
}

struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
{
	struct bch_fs *c;

	mutex_lock(&bch_fs_list_lock);
	c = __bch2_uuid_to_fs(uuid);
	if (c)
		closure_get(&c->cl);
	mutex_unlock(&bch_fs_list_lock);

	return c;
}

/* Filesystem RO/RW: */

/*
 * For startup/shutdown of RW stuff, the dependencies are:
 *
 * - foreground writes depend on copygc and rebalance (to free up space)
 *
 * - copygc and rebalance depend on mark and sweep gc (they actually probably
 *   don't because they either reserve ahead of time or don't block if
 *   allocations fail, but allocations can require mark and sweep gc to run
 *   because of generation number wraparound)
 *
 * - all of the above depends on the allocator threads
 *
 * - allocator depends on the journal (when it rewrites prios and gens)
 */
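/*
 * Flush everything that can dirty the journal or the btree, then shut the
 * journal down: each flush below can generate more work for the others, so we
 * loop until two consecutive passes find nothing left to flush and the
 * journal sequence number has stopped advancing.
 */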
static void __bch2_fs_read_only(struct bch_fs *c)
{
	unsigned clean_passes = 0;
	u64 seq = 0;

	bch2_fs_ec_stop(c);
	bch2_open_buckets_stop(c, NULL, true);
	bch2_rebalance_stop(c);
	bch2_copygc_stop(c);
	bch2_fs_ec_flush(c);

	bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
		    journal_cur_seq(&c->journal));

	do {
		clean_passes++;

		if (bch2_btree_interior_updates_flush(c) ||
		    bch2_btree_write_buffer_flush_going_ro(c) ||
		    bch2_journal_flush_all_pins(&c->journal) ||
		    bch2_btree_flush_all_writes(c) ||
		    seq != atomic64_read(&c->journal.seq)) {
			seq = atomic64_read(&c->journal.seq);
			clean_passes = 0;
		}
	} while (clean_passes < 2);

	bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
		    journal_cur_seq(&c->journal));

	if (test_bit(JOURNAL_replay_done, &c->journal.flags) &&
	    !test_bit(BCH_FS_emergency_ro, &c->flags))
		set_bit(BCH_FS_clean_shutdown, &c->flags);

	bch2_fs_journal_stop(&c->journal);

	bch_info(c, "%sclean shutdown complete, journal seq %llu",
		 test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
		 c->journal.seq_ondisk);

	/*
	 * After stopping journal:
	 */
	for_each_member_device(c, ca) {
		bch2_dev_io_ref_stop(ca, WRITE);
		bch2_dev_allocator_remove(c, ca);
	}
}

static void bch2_writes_disabled(struct enumerated_ref *writes)
{
	struct bch_fs *c = container_of(writes, struct bch_fs, writes);

	set_bit(BCH_FS_write_disable_complete, &c->flags);
	wake_up(&bch2_read_only_wait);
}
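/*
 * Transition to read-only: block new foreground writes (they'll fail with
 * -EROFS), wait for outstanding writes to drain unless this is an emergency
 * shutdown, then stop allocators and the journal and - if the shutdown was
 * clean - mark the superblock clean.
 */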
void bch2_fs_read_only(struct bch_fs *c)
{
	if (!test_bit(BCH_FS_rw, &c->flags)) {
		bch2_journal_reclaim_stop(&c->journal);
		return;
	}

	BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags));

	bch_verbose(c, "going read-only");

	/*
	 * Block new foreground-end write operations from starting - any new
	 * writes will return -EROFS:
	 */
	set_bit(BCH_FS_going_ro, &c->flags);
	enumerated_ref_stop_async(&c->writes);

	/*
	 * If we're not doing an emergency shutdown, we want to wait on
	 * outstanding writes to complete so they don't see spurious errors due
	 * to shutting down the allocator:
	 *
	 * If we are doing an emergency shutdown outstanding writes may
	 * hang until we shutdown the allocator so we don't want to wait
	 * on outstanding writes before shutting everything down - but
	 * we do need to wait on them before returning and signalling
	 * that going RO is complete:
	 */
	wait_event(bch2_read_only_wait,
		   test_bit(BCH_FS_write_disable_complete, &c->flags) ||
		   test_bit(BCH_FS_emergency_ro, &c->flags));

	bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags);
	if (writes_disabled)
		bch_verbose(c, "finished waiting for writes to stop");

	__bch2_fs_read_only(c);

	wait_event(bch2_read_only_wait,
		   test_bit(BCH_FS_write_disable_complete, &c->flags));

	if (!writes_disabled)
		bch_verbose(c, "finished waiting for writes to stop");

	clear_bit(BCH_FS_write_disable_complete, &c->flags);
	clear_bit(BCH_FS_going_ro, &c->flags);
	clear_bit(BCH_FS_rw, &c->flags);

	if (!bch2_journal_error(&c->journal) &&
	    !test_bit(BCH_FS_error, &c->flags) &&
	    !test_bit(BCH_FS_emergency_ro, &c->flags) &&
	    test_bit(BCH_FS_started, &c->flags) &&
	    test_bit(BCH_FS_clean_shutdown, &c->flags) &&
	    c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) {
		BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
		BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
		BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
		BUG_ON(c->btree_write_buffer.inc.keys.nr);
		BUG_ON(c->btree_write_buffer.flushing.keys.nr);
		bch2_verify_accounting_clean(c);

		bch_verbose(c, "marking filesystem clean");
		bch2_fs_mark_clean(c);
	} else {
		/* Make sure error counts/counters are persisted */
		mutex_lock(&c->sb_lock);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);

		bch_verbose(c, "done going read-only, filesystem not clean");
	}
}

static void bch2_fs_read_only_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, read_only_work);

	down_write(&c->state_lock);
	bch2_fs_read_only(c);
	up_write(&c->state_lock);
}

static void bch2_fs_read_only_async(struct bch_fs *c)
{
	queue_work(system_long_wq, &c->read_only_work);
}

bool bch2_fs_emergency_read_only(struct bch_fs *c)
{
	bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);

	bch2_journal_halt(&c->journal);
	bch2_fs_read_only_async(c);

	wake_up(&bch2_read_only_wait);
	return ret;
}

static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out,
					   bool locked)
{
	bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);

	if (!locked)
		bch2_journal_halt(&c->journal);
	else
		bch2_journal_halt_locked(&c->journal);
	bch2_fs_read_only_async(c);
	wake_up(&bch2_read_only_wait);

	if (ret)
		prt_printf(out, "emergency read only at seq %llu\n",
			   journal_cur_seq(&c->journal));

	return ret;
}

bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out)
{
	return __bch2_fs_emergency_read_only2(c, out, false);
}

bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
{
	bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);

	bch2_journal_halt_locked(&c->journal);
	bch2_fs_read_only_async(c);

	wake_up(&bch2_read_only_wait);
	return ret;
}
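/*
 * Transition to read-write: allocate the RW-only structures and workqueues if
 * this is the first time going rw, add rw member devices back to the
 * allocator, mark the superblock dirty, start journal reclaim, re-enable
 * writes, and finally start the copygc and rebalance threads.
 */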
static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
	int ret;

	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

	if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))
		return bch_err_throw(c, erofs_no_alloc_info);

	if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
		bch_err(c, "cannot go rw, unfixed btree errors");
		return bch_err_throw(c, erofs_unfixed_errors);
	}

	if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
		bch_err(c, "cannot go rw, filesystem is an unresized image file");
		return bch_err_throw(c, erofs_filesystem_full);
	}

	if (test_bit(BCH_FS_rw, &c->flags))
		return 0;

	bch_info(c, "going read-write");

	ret = bch2_fs_init_rw(c);
	if (ret)
		goto err;

	ret = bch2_sb_members_v2_init(c);
	if (ret)
		goto err;

	clear_bit(BCH_FS_clean_shutdown, &c->flags);

	scoped_guard(rcu)
		for_each_online_member_rcu(c, ca)
			if (ca->mi.state == BCH_MEMBER_STATE_rw) {
				bch2_dev_allocator_add(c, ca);
				enumerated_ref_start(&ca->io_ref[WRITE]);
			}

	bch2_recalc_capacity(c);

	/*
	 * First journal write must be a flush write: after a clean shutdown we
	 * don't read the journal, so the first journal write may end up
	 * overwriting whatever was there previously, and there must always be
	 * at least one non-flush write in the journal or recovery will fail:
	 */
	spin_lock(&c->journal.lock);
	set_bit(JOURNAL_need_flush_write, &c->journal.flags);
	set_bit(JOURNAL_running, &c->journal.flags);
	bch2_journal_space_available(&c->journal);
	spin_unlock(&c->journal.lock);

	ret = bch2_fs_mark_dirty(c);
	if (ret)
		goto err;

	ret = bch2_journal_reclaim_start(&c->journal);
	if (ret)
		goto err;

	set_bit(BCH_FS_rw, &c->flags);
	set_bit(BCH_FS_was_rw, &c->flags);

	enumerated_ref_start(&c->writes);

	ret = bch2_copygc_start(c);
	if (ret) {
		bch_err_msg(c, ret, "error starting copygc thread");
		goto err;
	}

	ret = bch2_rebalance_start(c);
	if (ret) {
		bch_err_msg(c, ret, "error starting rebalance thread");
		goto err;
	}

	bch2_do_discards(c);
	bch2_do_invalidates(c);
	bch2_do_stripe_deletes(c);
	bch2_do_pending_node_rewrites(c);
	return 0;
err:
	if (test_bit(BCH_FS_rw, &c->flags))
		bch2_fs_read_only(c);
	else
		__bch2_fs_read_only(c);
	return ret;
}

int bch2_fs_read_write(struct bch_fs *c)
{
	if (c->opts.recovery_pass_last &&
	    c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
		return bch_err_throw(c, erofs_norecovery);

	if (c->opts.nochanges)
		return bch_err_throw(c, erofs_nochanges);

	if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
		return bch_err_throw(c, erofs_no_alloc_info);

	return __bch2_fs_read_write(c, false);
}

int bch2_fs_read_write_early(struct bch_fs *c)
{
	down_write(&c->state_lock);
	int ret = __bch2_fs_read_write(c, true);
	up_write(&c->state_lock);

	return ret;
}

/* Filesystem startup/shutdown: */

static void __bch2_fs_free(struct bch_fs *c)
{
	for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
		bch2_time_stats_exit(&c->times[i]);

#ifdef CONFIG_UNICODE
	utf8_unload(c->cf_encoding);
#endif

	bch2_find_btree_nodes_exit(&c->found_btree_nodes);
	bch2_free_pending_node_rewrites(c);
	bch2_free_fsck_errs(c);
	bch2_fs_vfs_exit(c);
	bch2_fs_snapshots_exit(c);
	bch2_fs_sb_errors_exit(c);
	bch2_fs_replicas_exit(c);
	bch2_fs_rebalance_exit(c);
	bch2_fs_quota_exit(c);
	bch2_fs_nocow_locking_exit(c);
	bch2_fs_journal_exit(&c->journal);
	bch2_fs_fs_io_direct_exit(c);
	bch2_fs_fs_io_buffered_exit(c);
	bch2_fs_fsio_exit(c);
	bch2_fs_io_write_exit(c);
	bch2_fs_io_read_exit(c);
	bch2_fs_encryption_exit(c);
	bch2_fs_ec_exit(c);
	bch2_fs_counters_exit(c);
	bch2_fs_compress_exit(c);
	bch2_io_clock_exit(&c->io_clock[WRITE]);
	bch2_io_clock_exit(&c->io_clock[READ]);
	bch2_fs_buckets_waiting_for_journal_exit(c);
	bch2_fs_btree_write_buffer_exit(c);
	bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
	bch2_fs_btree_iter_exit(c);
	bch2_fs_btree_interior_update_exit(c);
	bch2_fs_btree_cache_exit(c);
	bch2_fs_accounting_exit(c);
	bch2_fs_async_obj_exit(c);
	bch2_journal_keys_put_initial(c);
	bch2_find_btree_nodes_exit(&c->found_btree_nodes);

	BUG_ON(atomic_read(&c->journal_keys.ref));
	percpu_free_rwsem(&c->mark_lock);
	if (c->online_reserved) {
		u64 v = percpu_u64_get(c->online_reserved);
		WARN(v, "online_reserved not 0 at shutdown: %lli", v);
		free_percpu(c->online_reserved);
	}

	darray_exit(&c->incompat_versions_requested);
	darray_exit(&c->btree_roots_extra);
	free_percpu(c->pcpu);
	free_percpu(c->usage);
	mempool_exit(&c->large_bkey_pool);
	mempool_exit(&c->btree_bounce_pool);
	bioset_exit(&c->btree_bio);
	mempool_exit(&c->fill_iter);
	enumerated_ref_exit(&c->writes);
	kfree(rcu_dereference_protected(c->disk_groups, 1));
	kfree(c->journal_seq_blacklist_table);

	if (c->write_ref_wq)
		destroy_workqueue(c->write_ref_wq);
	if (c->btree_write_submit_wq)
		destroy_workqueue(c->btree_write_submit_wq);
	if (c->btree_read_complete_wq)
		destroy_workqueue(c->btree_read_complete_wq);
	if (c->copygc_wq)
		destroy_workqueue(c->copygc_wq);
	if (c->btree_write_complete_wq)
		destroy_workqueue(c->btree_write_complete_wq);
	if (c->btree_update_wq)
		destroy_workqueue(c->btree_update_wq);

	bch2_free_super(&c->disk_sb);
	kvfree(c);
	module_put(THIS_MODULE);
}

static void bch2_fs_release(struct kobject *kobj)
{
	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);

	__bch2_fs_free(c);
}

void __bch2_fs_stop(struct bch_fs *c)
{
	bch_verbose(c, "shutting down");

	set_bit(BCH_FS_stopping, &c->flags);

	down_write(&c->state_lock);
	bch2_fs_read_only(c);
	up_write(&c->state_lock);

	for (unsigned i = 0; i < c->sb.nr_devices; i++) {
		struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
		if (ca)
			bch2_dev_io_ref_stop(ca, READ);
	}

	for_each_member_device(c, ca)
		bch2_dev_unlink(ca);

	if (c->kobj.state_in_sysfs)
		kobject_del(&c->kobj);

	bch2_fs_debug_exit(c);
	bch2_fs_chardev_exit(c);

	bch2_ro_ref_put(c);
	wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref));

	kobject_put(&c->counters_kobj);
	kobject_put(&c->time_stats);
	kobject_put(&c->opts_dir);
	kobject_put(&c->internal);

	/* btree prefetch might have kicked off reads in the background: */
	bch2_btree_flush_all_reads(c);

	for_each_member_device(c, ca)
		cancel_work_sync(&ca->io_error_work);

	cancel_work_sync(&c->read_only_work);
}

void bch2_fs_free(struct bch_fs *c)
{
	mutex_lock(&bch_fs_list_lock);
	list_del(&c->list);
	mutex_unlock(&bch_fs_list_lock);

	closure_sync(&c->cl);
	closure_debug_destroy(&c->cl);

	for (unsigned i = 0; i < c->sb.nr_devices; i++) {
		struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);

		if (ca) {
			EBUG_ON(atomic_long_read(&ca->ref) != 1);
			bch2_dev_io_ref_stop(ca, READ);
			bch2_free_super(&ca->disk_sb);
			bch2_dev_free(ca);
		}
	}

	bch_verbose(c, "shutdown complete");

	kobject_put(&c->kobj);
}

void bch2_fs_stop(struct bch_fs *c)
{
	__bch2_fs_stop(c);
	bch2_fs_free(c);
}

static int bch2_fs_online(struct bch_fs *c)
{
	int ret = 0;

	lockdep_assert_held(&bch_fs_list_lock);

	if (c->sb.multi_device &&
	    __bch2_uuid_to_fs(c->sb.uuid)) {
		bch_err(c, "filesystem UUID already open");
		return bch_err_throw(c, filesystem_uuid_already_open);
	}

	ret = bch2_fs_chardev_init(c);
	if (ret) {
		bch_err(c, "error creating character device");
		return ret;
	}

	bch2_fs_debug_init(c);
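	/*
	 * Register the sysfs hierarchy under the bcachefs kset (normally
	 * /sys/fs/bcachefs/): the top level object is named by UUID for
	 * multi device filesystems or by block device name otherwise, with
	 * internal/, options/, counters/ and (if latency accounting is
	 * enabled) time_stats/ below it. The ?: chaining stops at the first
	 * error.
	 */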
	ret = (c->sb.multi_device
	       ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b)
	       : kobject_add(&c->kobj, NULL, "%s", c->name)) ?:
	    kobject_add(&c->internal, &c->kobj, "internal") ?:
	    kobject_add(&c->opts_dir, &c->kobj, "options") ?:
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
	    kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
#endif
	    kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
	    bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS);
	if (ret) {
		bch_err(c, "error creating sysfs objects");
		return ret;
	}

	down_write(&c->state_lock);

	for_each_member_device(c, ca) {
		ret = bch2_dev_sysfs_online(c, ca);
		if (ret) {
			bch_err(c, "error creating sysfs objects");
			bch2_dev_put(ca);
			goto err;
		}
	}

	BUG_ON(!list_empty(&c->list));
	list_add(&c->list, &bch_fs_list);
err:
	up_write(&c->state_lock);
	return ret;
}

static int bch2_fs_init_rw(struct bch_fs *c)
{
	if (test_bit(BCH_FS_rw_init_done, &c->flags))
		return 0;

	if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
	    !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
	    !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
	    !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_submit",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
	    !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
				WQ_FREEZABLE, 0)))
		return bch_err_throw(c, ENOMEM_fs_other_alloc);

	int ret = bch2_fs_btree_interior_update_init(c) ?:
		  bch2_fs_btree_write_buffer_init(c) ?:
		  bch2_fs_fs_io_buffered_init(c) ?:
		  bch2_fs_io_write_init(c) ?:
		  bch2_fs_journal_init(&c->journal);
	if (ret)
		return ret;

	set_bit(BCH_FS_rw_init_done, &c->flags);
	return 0;
}

static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
				    bch_sb_handles *sbs)
{
	struct bch_fs *c;
	struct printbuf name = PRINTBUF;
	unsigned i, iter_size;
	int ret = 0;

	c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
	if (!c) {
		c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
		goto out;
	}

	c->stdio = (void *)(unsigned long) opts->stdio;

	__module_get(THIS_MODULE);

	closure_init(&c->cl, NULL);

	c->kobj.kset = bcachefs_kset;
	kobject_init(&c->kobj, &bch2_fs_ktype);
	kobject_init(&c->internal, &bch2_fs_internal_ktype);
	kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
	kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
	kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);

	c->minor		= -1;
	c->disk_sb.fs_sb	= true;

	init_rwsem(&c->state_lock);
	mutex_init(&c->sb_lock);
	mutex_init(&c->replicas_gc_lock);
	mutex_init(&c->btree_root_lock);
	INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);

	refcount_set(&c->ro_ref, 1);
	init_waitqueue_head(&c->ro_ref_wait);

	for (i = 0; i < BCH_TIME_STAT_NR; i++)
		bch2_time_stats_init(&c->times[i]);

	bch2_fs_allocator_background_init(c);
	bch2_fs_allocator_foreground_init(c);
	bch2_fs_btree_cache_init_early(&c->btree_cache);
	bch2_fs_btree_gc_init_early(c);
	bch2_fs_btree_interior_update_init_early(c);
	bch2_fs_btree_iter_init_early(c);
	bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
	bch2_fs_btree_write_buffer_init_early(c);
	bch2_fs_copygc_init(c);
	bch2_fs_ec_init_early(c);
	bch2_fs_journal_init_early(&c->journal);
	bch2_fs_journal_keys_init(c);
	bch2_fs_move_init(c);
	bch2_fs_nocow_locking_init_early(c);
	bch2_fs_quota_init(c);
	bch2_fs_recovery_passes_init(c);
	bch2_fs_sb_errors_init_early(c);
	bch2_fs_snapshots_init_early(c);
	bch2_fs_subvolumes_init_early(c);

	INIT_LIST_HEAD(&c->list);

	mutex_init(&c->bio_bounce_pages_lock);
	mutex_init(&c->snapshot_table_lock);
	init_rwsem(&c->snapshot_create_lock);

	spin_lock_init(&c->btree_write_error_lock);

	INIT_LIST_HEAD(&c->journal_iters);

	INIT_LIST_HEAD(&c->fsck_error_msgs);
	mutex_init(&c->fsck_error_msgs_lock);

	seqcount_init(&c->usage_lock);

	sema_init(&c->io_in_flight, 128);

	INIT_LIST_HEAD(&c->vfs_inodes_list);
	mutex_init(&c->vfs_inodes_lock);

	c->journal.flush_write_time	= &c->times[BCH_TIME_journal_flush_write];
	c->journal.noflush_write_time	= &c->times[BCH_TIME_journal_noflush_write];
	c->journal.flush_seq_time	= &c->times[BCH_TIME_journal_flush_seq];

	mutex_init(&c->sectors_available_lock);

	ret = percpu_init_rwsem(&c->mark_lock);
	if (ret)
		goto err;

	mutex_lock(&c->sb_lock);
	ret = bch2_sb_to_fs(c, sb);
	mutex_unlock(&c->sb_lock);

	if (ret)
		goto err;

	/* Compat: */
	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
	    !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
		SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);

	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
	    !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
		SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);

	c->opts = bch2_opts_default;
	ret = bch2_opts_from_sb(&c->opts, sb);
	if (ret)
		goto err;

	bch2_opts_apply(&c->opts, *opts);

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
	    c->opts.block_size > PAGE_SIZE) {
		bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE");
		ret = -EINVAL;
		goto err;
	}

	c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
	if (c->opts.inodes_use_key_cache)
		c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
	c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;

	c->block_bits		= ilog2(block_sectors(c));
	c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);

	if (bch2_fs_init_fault("fs_alloc")) {
		bch_err(c, "fs_alloc fault injected");
		ret = -EFAULT;
		goto err;
	}
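	/*
	 * Pick the filesystem name: the user visible UUID for multi device
	 * filesystems, the lone member's block device name otherwise:
	 */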
	if (c->sb.multi_device)
		pr_uuid(&name, c->sb.user_uuid.b);
	else
		prt_bdevname(&name, sbs->data[0].bdev);

	ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
	if (ret)
		goto err;

	strscpy(c->name, name.buf, sizeof(c->name));
	printbuf_exit(&name);

	iter_size = sizeof(struct sort_iter) +
		(btree_blocks(c) + 1) * 2 *
		sizeof(struct sort_iter_set);

	if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
	    enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR,
				bch2_writes_disabled) ||
	    mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
	    bioset_init(&c->btree_bio, 1,
			max(offsetof(struct btree_read_bio, bio),
			    offsetof(struct btree_write_bio, wbio.bio)),
			BIOSET_NEED_BVECS) ||
	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
	    !(c->usage = alloc_percpu(struct bch_fs_usage_base)) ||
	    !(c->online_reserved = alloc_percpu(u64)) ||
	    mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
				       c->opts.btree_node_size) ||
	    mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
		ret = bch_err_throw(c, ENOMEM_fs_other_alloc);
		goto err;
	}

	ret =
	    bch2_fs_async_obj_init(c) ?:
	    bch2_fs_btree_cache_init(c) ?:
	    bch2_fs_btree_iter_init(c) ?:
	    bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
	    bch2_fs_buckets_waiting_for_journal_init(c) ?:
	    bch2_io_clock_init(&c->io_clock[READ]) ?:
	    bch2_io_clock_init(&c->io_clock[WRITE]) ?:
	    bch2_fs_compress_init(c) ?:
	    bch2_fs_counters_init(c) ?:
	    bch2_fs_ec_init(c) ?:
	    bch2_fs_encryption_init(c) ?:
	    bch2_fs_fsio_init(c) ?:
	    bch2_fs_fs_io_direct_init(c) ?:
	    bch2_fs_io_read_init(c) ?:
	    bch2_fs_rebalance_init(c) ?:
	    bch2_fs_sb_errors_init(c) ?:
	    bch2_fs_vfs_init(c);
	if (ret)
		goto err;

#ifdef CONFIG_UNICODE
	/* Default encoding until we can potentially have more as an option. */
	c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
	if (IS_ERR(c->cf_encoding)) {
		printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
		       unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
		       unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
		       unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
		ret = -EINVAL;
		goto err;
	}
#else
	if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
		printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
		ret = -EINVAL;
		goto err;
	}
#endif

	for (i = 0; i < c->sb.nr_devices; i++) {
		if (!bch2_member_exists(c->disk_sb.sb, i))
			continue;
		ret = bch2_dev_alloc(c, i);
		if (ret)
			goto err;
	}

	bch2_journal_entry_res_resize(&c->journal,
			&c->btree_root_journal_res,
			BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
	bch2_journal_entry_res_resize(&c->journal,
			&c->clock_journal_res,
			(sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);

	mutex_lock(&bch_fs_list_lock);
	ret = bch2_fs_online(c);
	mutex_unlock(&bch_fs_list_lock);

	if (ret)
		goto err;
out:
	return c;
err:
	bch2_fs_free(c);
	c = ERR_PTR(ret);
	goto out;
}

noinline_for_stack
static void print_mount_opts(struct bch_fs *c)
{
	enum bch_opt_id i;
	CLASS(printbuf, p)();
	bch2_log_msg_start(c, &p);

	prt_str(&p, "starting version ");
	bch2_version_to_text(&p, c->sb.version);

	bool first = true;
	for (i = 0; i < bch2_opts_nr; i++) {
		const struct bch_option *opt = &bch2_opt_table[i];
		u64 v = bch2_opt_get_by_id(&c->opts, i);

		if (!(opt->flags & OPT_MOUNT))
			continue;

		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
			continue;

		prt_str(&p, first ? " opts=" : ",");
		first = false;
		bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
	}

	if (c->sb.version_incompat_allowed != c->sb.version) {
		prt_printf(&p, "\nallowing incompatible features above ");
		bch2_version_to_text(&p, c->sb.version_incompat_allowed);
	}

	if (c->opts.verbose) {
		prt_printf(&p, "\nfeatures: ");
		prt_bitflags(&p, bch2_sb_features, c->sb.features);
	}

	if (c->sb.multi_device) {
		prt_printf(&p, "\nwith devices");
		for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) {
			prt_char(&p, ' ');
			prt_str(&p, ca->name);
		}
	}

	bch2_print_str(c, KERN_INFO, p.buf);
}

static bool bch2_fs_may_start(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned flags = 0;

	switch (c->opts.degraded) {
	case BCH_DEGRADED_very:
		flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
		break;
	case BCH_DEGRADED_yes:
		flags |= BCH_FORCE_IF_DEGRADED;
		break;
	default:
		mutex_lock(&c->sb_lock);
		for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) {
			if (!bch2_member_exists(c->disk_sb.sb, i))
				continue;

			ca = bch2_dev_locked(c, i);

			if (!bch2_dev_is_online(ca) &&
			    (ca->mi.state == BCH_MEMBER_STATE_rw ||
			     ca->mi.state == BCH_MEMBER_STATE_ro)) {
				mutex_unlock(&c->sb_lock);
				return false;
			}
		}
		mutex_unlock(&c->sb_lock);
		break;
	}

	return bch2_have_enough_devs(c, c->online_devs, flags, true);
}

int bch2_fs_start(struct bch_fs *c)
{
	time64_t now = ktime_get_real_seconds();
	int ret = 0;

	print_mount_opts(c);

#ifdef CONFIG_UNICODE
	bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
		 unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
		 unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
		 unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
#endif

	if (!bch2_fs_may_start(c))
		return bch_err_throw(c, insufficient_devices_to_start);

	down_write(&c->state_lock);
	mutex_lock(&c->sb_lock);

	BUG_ON(test_bit(BCH_FS_started, &c->flags));

	if (!bch2_sb_field_get_minsize(&c->disk_sb, ext,
			sizeof(struct bch_sb_field_ext) / sizeof(u64))) {
		mutex_unlock(&c->sb_lock);
		up_write(&c->state_lock);
		ret = bch_err_throw(c, ENOSPC_sb);
		goto err;
	}

	ret = bch2_sb_members_v2_init(c);
	if (ret) {
		mutex_unlock(&c->sb_lock);
		up_write(&c->state_lock);
		goto err;
	}

	scoped_guard(rcu)
		for_each_online_member_rcu(c, ca)
			bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
				cpu_to_le64(now);

	/*
	 * Don't write superblock yet: recovery might have to downgrade
	 */
	mutex_unlock(&c->sb_lock);

	scoped_guard(rcu)
		for_each_online_member_rcu(c, ca)
			if (ca->mi.state == BCH_MEMBER_STATE_rw)
				bch2_dev_allocator_add(c, ca);
	bch2_recalc_capacity(c);
	up_write(&c->state_lock);

	c->recovery_task = current;
	ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
		? bch2_fs_recovery(c)
		: bch2_fs_initialize(c);
	c->recovery_task = NULL;

	if (ret)
		goto err;

	ret = bch2_opts_hooks_pre_set(c);
	if (ret)
		goto err;

	if (bch2_fs_init_fault("fs_start")) {
		ret = bch_err_throw(c, injected_fs_start);
		goto err;
	}

	set_bit(BCH_FS_started, &c->flags);
	wake_up(&c->ro_ref_wait);

	down_write(&c->state_lock);
	if (c->opts.read_only)
		bch2_fs_read_only(c);
	else if (!test_bit(BCH_FS_rw, &c->flags))
		ret = bch2_fs_read_write(c);
	up_write(&c->state_lock);

err:
	if (ret)
		bch_err_msg(c, ret, "starting filesystem");
	else
		bch_verbose(c, "done starting filesystem");
	return ret;
}

static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
{
	struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);

	if (le16_to_cpu(sb->block_size) != block_sectors(c))
		return bch_err_throw(c, mismatched_block_size);

	if (le16_to_cpu(m.bucket_size) <
	    BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
		return bch_err_throw(c, bucket_size_too_small);

	return 0;
}

static int bch2_dev_in_fs(struct bch_sb_handle *fs,
			  struct bch_sb_handle *sb,
			  struct bch_opts *opts)
{
	if (fs == sb)
		return 0;

	if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
		return -BCH_ERR_device_not_a_member_of_filesystem;

	if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))
		return -BCH_ERR_device_has_been_removed;

	if (fs->sb->block_size != sb->sb->block_size)
		return -BCH_ERR_mismatched_block_size;

	if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
	    le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
		return 0;

	if (fs->sb->seq == sb->sb->seq &&
	    fs->sb->write_time != sb->sb->write_time) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "Split brain detected between ");
		prt_bdevname(&buf, sb->bdev);
		prt_str(&buf, " and ");
		prt_bdevname(&buf, fs->bdev);
		prt_char(&buf, ':');
		prt_newline(&buf);
		prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));
		prt_newline(&buf);

		prt_bdevname(&buf, fs->bdev);
		prt_char(&buf, ' ');
		bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
		prt_newline(&buf);

		prt_bdevname(&buf, sb->bdev);
		prt_char(&buf, ' ');
		bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
		prt_newline(&buf);

		if (!opts->no_splitbrain_check)
			prt_printf(&buf, "Not using older sb");

		pr_err("%s", buf.buf);
		printbuf_exit(&buf);

		if (!opts->no_splitbrain_check)
			return -BCH_ERR_device_splitbrain;
	}

	struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
	u64 seq_from_fs		= le64_to_cpu(m.seq);
	u64 seq_from_member	= le64_to_cpu(sb->sb->seq);

	if (seq_from_fs && seq_from_fs < seq_from_member) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "Split brain detected between ");
		prt_bdevname(&buf, sb->bdev);
		prt_str(&buf, " and ");
		prt_bdevname(&buf, fs->bdev);
		prt_char(&buf, ':');
		prt_newline(&buf);

		prt_bdevname(&buf, fs->bdev);
		prt_str(&buf, " believes seq of ");
		prt_bdevname(&buf, sb->bdev);
		prt_printf(&buf, " to be %llu, but ", seq_from_fs);
		prt_bdevname(&buf, sb->bdev);
		prt_printf(&buf, " has %llu\n", seq_from_member);

		if (!opts->no_splitbrain_check) {
			prt_str(&buf, "Not using ");
			prt_bdevname(&buf, sb->bdev);
		}

		pr_err("%s", buf.buf);
		printbuf_exit(&buf);

		if (!opts->no_splitbrain_check)
			return -BCH_ERR_device_splitbrain;
	}

	return 0;
}

/* Device startup/shutdown: */

static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw)
{
	if (rw == READ)
		clear_bit(ca->dev_idx, ca->fs->online_devs.d);

	if (!enumerated_ref_is_zero(&ca->io_ref[rw]))
		enumerated_ref_stop(&ca->io_ref[rw],
				    rw == READ
				    ? bch2_dev_read_refs
				    : bch2_dev_write_refs);
}

static void bch2_dev_release(struct kobject *kobj)
{
	struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);

	kfree(ca);
}

static void bch2_dev_free(struct bch_dev *ca)
{
	WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
	WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));

	cancel_work_sync(&ca->io_error_work);

	bch2_dev_unlink(ca);

	if (ca->kobj.state_in_sysfs)
		kobject_del(&ca->kobj);

	bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
	bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);

	bch2_free_super(&ca->disk_sb);
	bch2_dev_allocator_background_exit(ca);
	bch2_dev_journal_exit(ca);

	free_percpu(ca->io_done);
	bch2_dev_buckets_free(ca);
	kfree(ca->sb_read_scratch);

	bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
	bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);

	enumerated_ref_exit(&ca->io_ref[WRITE]);
	enumerated_ref_exit(&ca->io_ref[READ]);
#ifndef CONFIG_BCACHEFS_DEBUG
	percpu_ref_exit(&ca->ref);
#endif
	kobject_put(&ca->kobj);
}

static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
{
	lockdep_assert_held(&c->state_lock);

	if (enumerated_ref_is_zero(&ca->io_ref[READ]))
		return;

	__bch2_dev_read_only(c, ca);

	bch2_dev_io_ref_stop(ca, READ);

	bch2_dev_unlink(ca);

	bch2_free_super(&ca->disk_sb);
	bch2_dev_journal_exit(ca);
}

#ifndef CONFIG_BCACHEFS_DEBUG
static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
	struct bch_dev *ca = container_of(ref, struct bch_dev, ref);

	complete(&ca->ref_completion);
}
#endif

static void bch2_dev_unlink(struct bch_dev *ca)
{
	struct kobject *b;

	/*
	 * This is racy w.r.t. the underlying block device being hot-removed,
	 * which removes it from sysfs.
	 *
	 * It'd be lovely if we had a way to handle this race, but the sysfs
	 * code doesn't appear to provide a good method and block/holder.c is
	 * susceptible as well:
	 */
	if (ca->kobj.state_in_sysfs &&
	    ca->disk_sb.bdev &&
	    (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
		sysfs_remove_link(b, "bcachefs");
		sysfs_remove_link(&ca->kobj, "block");
	}
}

static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
{
	int ret;

	if (!c->kobj.state_in_sysfs)
		return 0;

	if (!ca->kobj.state_in_sysfs) {
		ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?:
		      bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE);
		if (ret)
			return ret;
	}

	if (ca->disk_sb.bdev) {
		struct kobject *block = bdev_kobj(ca->disk_sb.bdev);

		ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
		if (ret)
			return ret;

		ret = sysfs_create_link(&ca->kobj, block, "block");
		if (ret)
			return ret;
	}

	return 0;
}

static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
					struct bch_member *member)
{
	struct bch_dev *ca;
	unsigned i;

	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	if (!ca)
		return NULL;

	kobject_init(&ca->kobj, &bch2_dev_ktype);
	init_completion(&ca->ref_completion);

	INIT_WORK(&ca->io_error_work, bch2_io_error_work);

	bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
	bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);

	ca->mi = bch2_mi_to_cpu(member);

	for (i = 0; i < ARRAY_SIZE(member->errors); i++)
		atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));

	ca->uuid = member->uuid;

	ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
				ca->mi.bucket_size / btree_sectors(c));

#ifndef CONFIG_BCACHEFS_DEBUG
	if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))
		goto err;
#else
	atomic_long_set(&ca->ref, 1);
#endif

	mutex_init(&ca->bucket_backpointer_mismatch.lock);
	mutex_init(&ca->bucket_backpointer_empty.lock);

	bch2_dev_allocator_background_init(ca);

	if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) ||
	    enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) ||
	    !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
	    bch2_dev_buckets_alloc(c, ca) ||
	    !(ca->io_done = alloc_percpu(*ca->io_done)))
		goto err;

	return ca;
err:
	bch2_dev_free(ca);
	return NULL;
}

static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
			    unsigned dev_idx)
{
	ca->dev_idx = dev_idx;
	__set_bit(ca->dev_idx, ca->self.d);

	if (!ca->name[0])
		scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);

	ca->fs = c;
	rcu_assign_pointer(c->devs[ca->dev_idx], ca);

	if (bch2_dev_sysfs_online(c, ca))
		pr_warn("error creating sysfs objects");
}

static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
	struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
	struct bch_dev *ca = NULL;

	if (bch2_fs_init_fault("dev_alloc"))
		goto err;

	ca = __bch2_dev_alloc(c, &member);
	if (!ca)
		goto err;

	ca->fs = c;

	bch2_dev_attach(c, ca, dev_idx);
	return 0;
err:
	return bch_err_throw(c, ENOMEM_dev_alloc);
}

static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
{
	unsigned ret;

	if (bch2_dev_is_online(ca)) {
		bch_err(ca, "already have device online in slot %u",
			sb->sb->dev_idx);
		return bch_err_throw(ca->fs, device_already_online);
	}

	if (get_capacity(sb->bdev->bd_disk) <
	    ca->mi.bucket_size * ca->mi.nbuckets) {
		bch_err(ca, "cannot online: device too small");
		return bch_err_throw(ca->fs, device_size_too_small);
	}

	BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
	BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));

	ret = bch2_dev_journal_init(ca, sb->sb);
	if (ret)
		return ret;

	struct printbuf name = PRINTBUF;
	prt_bdevname(&name, sb->bdev);
	strscpy(ca->name, name.buf, sizeof(ca->name));
	printbuf_exit(&name);

	/* Commit: */
	ca->disk_sb = *sb;
	memset(sb, 0, sizeof(*sb));

	/*
	 * Stash pointer to the filesystem for blk_holder_ops - note that once
	 * attached to a filesystem, we will always close the block device
	 * before tearing down the filesystem object.
	 */
	ca->disk_sb.holder->c = ca->fs;

	ca->dev = ca->disk_sb.bdev->bd_dev;

	enumerated_ref_start(&ca->io_ref[READ]);

	return 0;
}

static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
{
	struct bch_dev *ca;
	int ret;

	lockdep_assert_held(&c->state_lock);

	if (le64_to_cpu(sb->sb->seq) >
	    le64_to_cpu(c->disk_sb.sb->seq))
		bch2_sb_to_fs(c, sb->sb);

	BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));

	ca = bch2_dev_locked(c, sb->sb->dev_idx);

	ret = __bch2_dev_attach_bdev(ca, sb);
	if (ret)
		return ret;

	set_bit(ca->dev_idx, c->online_devs.d);

	bch2_dev_sysfs_online(c, ca);

	bch2_rebalance_wakeup(c);
	return 0;
}

/* Device management: */

/*
 * Note: this function is also used by the error paths - when a particular
 * device sees an error, we call it to determine whether we can just set the
 * device RO, or - if this function returns false - we'll set the whole
 * filesystem RO:
 *
 * XXX: maybe we should be more explicit about whether we're changing state
 * because we got an error or what have you?
 */
bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
			    enum bch_member_state new_state, int flags)
{
	struct bch_devs_mask new_online_devs;
	int nr_rw = 0, required;

	lockdep_assert_held(&c->state_lock);

	switch (new_state) {
	case BCH_MEMBER_STATE_rw:
		return true;
	case BCH_MEMBER_STATE_ro:
		if (ca->mi.state != BCH_MEMBER_STATE_rw)
			return true;

		/* do we have enough devices to write to? */
		for_each_member_device(c, ca2)
			if (ca2 != ca)
				nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;

		required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
			       ? c->opts.metadata_replicas
			       : metadata_replicas_required(c),
			       !(flags & BCH_FORCE_IF_DATA_DEGRADED)
			       ? c->opts.data_replicas
			       : data_replicas_required(c));

		return nr_rw >= required;
	case BCH_MEMBER_STATE_failed:
	case BCH_MEMBER_STATE_spare:
		if (ca->mi.state != BCH_MEMBER_STATE_rw &&
		    ca->mi.state != BCH_MEMBER_STATE_ro)
			return true;

		/* do we have enough devices to read from? */
		new_online_devs = c->online_devs;
		__clear_bit(ca->dev_idx, new_online_devs.d);

		return bch2_have_enough_devs(c, new_online_devs, flags, false);
	default:
		BUG();
	}
}

static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
	bch2_dev_io_ref_stop(ca, WRITE);

	/*
	 * The allocator thread itself allocates btree nodes, so stop it first:
	 */
	bch2_dev_allocator_remove(c, ca);
	bch2_recalc_capacity(c);
	bch2_dev_journal_stop(&c->journal, ca);
}

static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
	lockdep_assert_held(&c->state_lock);

	BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);

	bch2_dev_allocator_add(c, ca);
	bch2_recalc_capacity(c);

	if (enumerated_ref_is_zero(&ca->io_ref[WRITE]))
		enumerated_ref_start(&ca->io_ref[WRITE]);

	bch2_dev_do_discards(ca);
}

int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
			 enum bch_member_state new_state, int flags)
{
	struct bch_member *m;
	int ret = 0;

	if (ca->mi.state == new_state)
		return 0;

	if (!bch2_dev_state_allowed(c, ca, new_state, flags))
		return bch_err_throw(c, device_state_not_allowed);

	if (new_state != BCH_MEMBER_STATE_rw)
		__bch2_dev_read_only(c, ca);

	bch_notice(ca, "%s", bch2_member_states[new_state]);

	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
	SET_BCH_MEMBER_STATE(m, new_state);
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (new_state == BCH_MEMBER_STATE_rw)
		__bch2_dev_read_write(c, ca);

	bch2_rebalance_wakeup(c);

	return ret;
}

int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
		       enum bch_member_state new_state, int flags)
{
	int ret;

	down_write(&c->state_lock);
	ret = __bch2_dev_set_state(c, ca, new_state, flags);
	up_write(&c->state_lock);

	return ret;
}

/* Device add/removal: */

int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
	struct bch_member *m;
	unsigned dev_idx = ca->dev_idx, data;
	bool fast_device_removal = !bch2_request_incompat_feature(c,
					bcachefs_metadata_version_fast_device_removal);
	int ret;

	down_write(&c->state_lock);

	/*
	 * We consume a reference to ca->ref, regardless of whether we succeed
	 * or fail:
	 */
	bch2_dev_put(ca);

	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
		bch_err(ca, "Cannot remove without losing data");
		ret = bch_err_throw(c, device_state_not_allowed);
		goto err;
	}

	__bch2_dev_read_only(c, ca);
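	/*
	 * Drop data referencing this device: if the fast_device_removal
	 * incompat feature can be used we find it via the device's
	 * backpointers, otherwise we walk all extents and stripes:
	 */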
	ret = fast_device_removal
		? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags)
		: (bch2_dev_data_drop(c, ca->dev_idx, flags) ?:
		   bch2_dev_remove_stripes(c, ca->dev_idx, flags));
	if (ret)
		goto err;

	/* Check if device still has data before blowing away alloc info */
	struct bch_dev_usage usage = bch2_dev_usage_read(ca);
	for (unsigned i = 0; i < BCH_DATA_NR; i++)
		if (!data_type_is_empty(i) &&
		    !data_type_is_hidden(i) &&
		    usage.buckets[i]) {
			bch_err(ca, "Remove failed: still has data (%s, %llu buckets)",
				__bch2_data_types[i], usage.buckets[i]);
			ret = -EBUSY;
			goto err;
		}

	ret = bch2_dev_remove_alloc(c, ca);
	bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
	if (ret)
		goto err;

	/*
	 * We need to flush the entire journal to get rid of keys that reference
	 * the device being removed before removing the superblock entry
	 */
	bch2_journal_flush_all_pins(&c->journal);

	/*
	 * this is really just needed for the bch2_replicas_gc_(start|end)
	 * calls, and could be cleaned up:
	 */
	ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
	bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
	if (ret)
		goto err;

	ret = bch2_journal_flush(&c->journal);
	bch_err_msg(ca, ret, "bch2_journal_flush()");
	if (ret)
		goto err;

	ret = bch2_replicas_gc2(c);
	bch_err_msg(ca, ret, "bch2_replicas_gc2()");
	if (ret)
		goto err;

	data = bch2_dev_has_data(c, ca);
	if (data) {
		struct printbuf data_has = PRINTBUF;

		prt_bitflags(&data_has, __bch2_data_types, data);
		bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
		printbuf_exit(&data_has);
		ret = -EBUSY;
		goto err;
	}

	__bch2_dev_offline(c, ca);

	mutex_lock(&c->sb_lock);
	rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
	mutex_unlock(&c->sb_lock);

#ifndef CONFIG_BCACHEFS_DEBUG
	percpu_ref_kill(&ca->ref);
#else
	ca->dying = true;
	bch2_dev_put(ca);
#endif
	wait_for_completion(&ca->ref_completion);

	bch2_dev_free(ca);

	/*
	 * Free this device's slot in the bch_member array - all pointers to
	 * this device must be gone:
	 */
	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);

	if (fast_device_removal)
		m->uuid = BCH_SB_MEMBER_DELETED_UUID;
	else
		memset(&m->uuid, 0, sizeof(m->uuid));

	bch2_write_super(c);

	mutex_unlock(&c->sb_lock);
	up_write(&c->state_lock);
	return 0;
err:
	if (test_bit(BCH_FS_rw, &c->flags) &&
	    ca->mi.state == BCH_MEMBER_STATE_rw &&
	    !enumerated_ref_is_zero(&ca->io_ref[READ]))
		__bch2_dev_read_write(c, ca);
	up_write(&c->state_lock);
	return ret;
}

/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
	struct bch_opts opts = bch2_opts_empty();
	struct bch_sb_handle sb = {};
	struct bch_dev *ca = NULL;
	struct printbuf errbuf = PRINTBUF;
	struct printbuf label = PRINTBUF;
	int ret = 0;

	ret = bch2_read_super(path, &opts, &sb);
	bch_err_msg(c, ret, "reading super");
	if (ret)
		goto err;

	struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);

	if (BCH_MEMBER_GROUP(&dev_mi)) {
		bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
		if (label.allocation_failure) {
			ret = -ENOMEM;
			goto err;
		}
	}

	if (list_empty(&c->list)) {
		mutex_lock(&bch_fs_list_lock);
		if (__bch2_uuid_to_fs(c->sb.uuid))
			ret = bch_err_throw(c, filesystem_uuid_already_open);
		else
			list_add(&c->list, &bch_fs_list);
		mutex_unlock(&bch_fs_list_lock);

		if (ret) {
			bch_err(c, "filesystem UUID already open");
			goto err;
		}
	}

	ret = bch2_dev_may_add(sb.sb, c);
	if (ret)
		goto err;

	ca = __bch2_dev_alloc(c, &dev_mi);
	if (!ca) {
		ret = -ENOMEM;
		goto err;
	}

	ret = __bch2_dev_attach_bdev(ca, &sb);
	if (ret)
		goto err;

	down_write(&c->state_lock);
	mutex_lock(&c->sb_lock);
	SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true);

	ret = bch2_sb_from_fs(c, ca);
	bch_err_msg(c, ret, "setting up new superblock");
	if (ret)
		goto err_unlock;

	if (dynamic_fault("bcachefs:add:no_slot"))
		goto err_unlock;

	ret = bch2_sb_member_alloc(c);
	if (ret < 0) {
		bch_err_msg(c, ret, "setting up new superblock");
		goto err_unlock;
	}
	unsigned dev_idx = ret;
	ret = 0;

	/* success: */

	dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
	*bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;

	ca->disk_sb.sb->dev_idx	= dev_idx;
	bch2_dev_attach(c, ca, dev_idx);

	if (BCH_MEMBER_GROUP(&dev_mi)) {
		ret = __bch2_dev_group_set(c, ca, label.buf);
		bch_err_msg(c, ret, "creating new label");
		if (ret)
			goto err_unlock;
	}

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (test_bit(BCH_FS_started, &c->flags)) {
		ret = bch2_dev_usage_init(ca, false);
		if (ret)
			goto err_late;

		ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
		bch_err_msg(ca, ret, "marking new superblock");
		if (ret)
			goto err_late;

		ret = bch2_fs_freespace_init(c);
		bch_err_msg(ca, ret, "initializing free space");
		if (ret)
			goto err_late;

		if (ca->mi.state == BCH_MEMBER_STATE_rw)
			__bch2_dev_read_write(c, ca);

		ret = bch2_dev_journal_alloc(ca, false);
		bch_err_msg(c, ret, "allocating journal");
		if (ret)
			goto err_late;
	}

	/*
	 * We just changed the superblock UUID, invalidate cache and send a
	 * uevent to update /dev/disk/by-uuid
	 */
	invalidate_bdev(ca->disk_sb.bdev);

	char uuid_str[37];
	snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid);

	char *envp[] = {
		"CHANGE=uuid",
		uuid_str,
		NULL,
	};
	kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp);

	up_write(&c->state_lock);
out:
	printbuf_exit(&label);
	printbuf_exit(&errbuf);
	bch_err_fn(c, ret);
	return ret;

err_unlock:
	mutex_unlock(&c->sb_lock);
	up_write(&c->state_lock);
err:
	if (ca)
		bch2_dev_free(ca);
	bch2_free_super(&sb);
	goto out;
err_late:
	up_write(&c->state_lock);
	ca = NULL;
	goto err;
}

/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path)
{
	struct bch_opts opts = bch2_opts_empty();
	struct bch_sb_handle sb = { NULL };
	struct bch_dev *ca;
	unsigned dev_idx;
	int ret;

	down_write(&c->state_lock);

	ret = bch2_read_super(path, &opts, &sb);
int bch2_dev_online(struct bch_fs *c, const char *path)
{
	struct bch_opts opts = bch2_opts_empty();
	struct bch_sb_handle sb = { NULL };
	struct bch_dev *ca;
	unsigned dev_idx;
	int ret;

	down_write(&c->state_lock);

	ret = bch2_read_super(path, &opts, &sb);
	if (ret) {
		up_write(&c->state_lock);
		return ret;
	}

	dev_idx = sb.sb->dev_idx;

	ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
	bch_err_msg(c, ret, "bringing %s online", path);
	if (ret)
		goto err;

	ret = bch2_dev_attach_bdev(c, &sb);
	if (ret)
		goto err;

	ca = bch2_dev_locked(c, dev_idx);

	ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
	bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
	if (ret)
		goto err;

	if (ca->mi.state == BCH_MEMBER_STATE_rw)
		__bch2_dev_read_write(c, ca);

	if (!ca->mi.freespace_initialized) {
		ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
		bch_err_msg(ca, ret, "initializing free space");
		if (ret)
			goto err;
	}

	if (!ca->journal.nr) {
		ret = bch2_dev_journal_alloc(ca, false);
		bch_err_msg(ca, ret, "allocating journal");
		if (ret)
			goto err;
	}

	mutex_lock(&c->sb_lock);
	bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
		cpu_to_le64(ktime_get_real_seconds());
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	up_write(&c->state_lock);
	return 0;
err:
	up_write(&c->state_lock);
	bch2_free_super(&sb);
	return ret;
}

int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
{
	down_write(&c->state_lock);

	if (!bch2_dev_is_online(ca)) {
		bch_err(ca, "Already offline");
		up_write(&c->state_lock);
		return 0;
	}

	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
		bch_err(ca, "Cannot offline required disk");
		up_write(&c->state_lock);
		return bch_err_throw(c, device_state_not_allowed);
	}

	__bch2_dev_offline(c, ca);

	up_write(&c->state_lock);
	return 0;
}

static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets)
{
	struct bch_fs *c = ca->fs;
	u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 };

	return bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
			bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
					.dev = ca->dev_idx,
					.data_type = BCH_DATA_free)) ?:
		bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets);
}
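
/*
 * bch2_dev_resize() only grows a device; shrinking is not supported yet.
 * It resizes the in-memory bucket arrays, marks the device superblock,
 * updates the member's nbuckets in the on-disk superblock, and, if the
 * freespace btree was already initialized, extends the free-bucket
 * accounting and freespace btree over the new buckets via
 * __bch2_dev_resize_alloc().
 */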
int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
	struct bch_member *m;
	u64 old_nbuckets;
	int ret = 0;

	down_write(&c->state_lock);
	old_nbuckets = ca->mi.nbuckets;

	if (nbuckets < ca->mi.nbuckets) {
		bch_err(ca, "Cannot shrink yet");
		ret = -EINVAL;
		goto err;
	}

	if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
		bch_err(ca, "New device size too big (%llu greater than max %u)",
			nbuckets, BCH_MEMBER_NBUCKETS_MAX);
		ret = bch_err_throw(c, device_size_too_big);
		goto err;
	}

	if (bch2_dev_is_online(ca) &&
	    get_capacity(ca->disk_sb.bdev->bd_disk) <
	    ca->mi.bucket_size * nbuckets) {
		bch_err(ca, "New size larger than device");
		ret = bch_err_throw(c, device_size_too_small);
		goto err;
	}

	ret = bch2_dev_buckets_resize(c, ca, nbuckets);
	bch_err_msg(ca, ret, "resizing buckets");
	if (ret)
		goto err;

	ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
	if (ret)
		goto err;

	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
	m->nbuckets = cpu_to_le64(nbuckets);

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (ca->mi.freespace_initialized) {
		ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets);
		if (ret)
			goto err;
	}

	bch2_recalc_capacity(c);
err:
	up_write(&c->state_lock);
	return ret;
}

int bch2_fs_resize_on_mount(struct bch_fs *c)
{
	for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) {
		u64 old_nbuckets = ca->mi.nbuckets;
		u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk),
					     ca->mi.bucket_size);

		if (ca->mi.resize_on_mount &&
		    new_nbuckets > ca->mi.nbuckets) {
			bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size);
			int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets);
			bch_err_fn(ca, ret);
			if (ret) {
				enumerated_ref_put(&ca->io_ref[READ],
						   BCH_DEV_READ_REF_fs_resize_on_mount);
				up_write(&c->state_lock);
				return ret;
			}

			mutex_lock(&c->sb_lock);
			struct bch_member *m =
				bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
			m->nbuckets = cpu_to_le64(new_nbuckets);
			SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false);

			c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image));
			bch2_write_super(c);
			mutex_unlock(&c->sb_lock);

			if (ca->mi.freespace_initialized) {
				ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets);
				if (ret) {
					enumerated_ref_put(&ca->io_ref[READ],
							   BCH_DEV_READ_REF_fs_resize_on_mount);
					up_write(&c->state_lock);
					return ret;
				}
			}
		}
	}
	return 0;
}

/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
	if (!strncmp(name, "/dev/", strlen("/dev/")))
		name += strlen("/dev/");

	for_each_member_device(c, ca)
		if (!strcmp(name, ca->name))
			return ca;
	return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}

/* blk_holder_ops: */

static struct bch_fs *bdev_get_fs(struct block_device *bdev)
	__releases(&bdev->bd_holder_lock)
{
	struct bch_sb_handle_holder *holder = bdev->bd_holder;
	struct bch_fs *c = holder->c;

	if (c && !bch2_ro_ref_tryget(c))
		c = NULL;

	mutex_unlock(&bdev->bd_holder_lock);

	if (c)
		wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
	return c;
}

/* returns with ref on ca->ref */
static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
{
	for_each_member_device(c, ca)
		if (ca->disk_sb.bdev == bdev)
			return ca;
	return NULL;
}
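
/*
 * Called by the block layer (via blk_holder_ops.mark_dead) when a member
 * device is going away underneath us. If the filesystem cannot continue
 * degraded without this device, sync and shrink the VFS caches first;
 * then either offline just this device, or flush the journal and force
 * the whole filesystem emergency read-only.
 */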
static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
{
	struct bch_fs *c = bdev_get_fs(bdev);
	if (!c)
		return;

	struct super_block *sb = c->vfs_sb;
	if (sb) {
		/*
		 * Not necessary, c->ro_ref guards against the filesystem being
		 * unmounted - we only take this to avoid a warning in
		 * sync_filesystem:
		 */
		down_read(&sb->s_umount);
	}

	down_write(&c->state_lock);
	struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
	if (!ca)
		goto unlock;

	bool dev = bch2_dev_state_allowed(c, ca,
					  BCH_MEMBER_STATE_failed,
					  BCH_FORCE_IF_DEGRADED);

	if (!dev && sb) {
		if (!surprise)
			sync_filesystem(sb);
		shrink_dcache_sb(sb);
		evict_inodes(sb);
	}

	struct printbuf buf = PRINTBUF;
	__bch2_log_msg_start(ca->name, &buf);

	prt_printf(&buf, "offline from block layer");

	if (dev) {
		__bch2_dev_offline(c, ca);
	} else {
		bch2_journal_flush(&c->journal);
		bch2_fs_emergency_read_only2(c, &buf);
	}

	bch2_print_str(c, KERN_ERR, buf.buf);
	printbuf_exit(&buf);

	bch2_dev_put(ca);
unlock:
	if (sb)
		up_read(&sb->s_umount);
	up_write(&c->state_lock);
	bch2_ro_ref_put(c);
}

static void bch2_fs_bdev_sync(struct block_device *bdev)
{
	struct bch_fs *c = bdev_get_fs(bdev);
	if (!c)
		return;

	struct super_block *sb = c->vfs_sb;
	if (sb) {
		/*
		 * Not necessary, c->ro_ref guards against the filesystem being
		 * unmounted - we only take this to avoid a warning in
		 * sync_filesystem:
		 */
		down_read(&sb->s_umount);
		sync_filesystem(sb);
		up_read(&sb->s_umount);
	}

	bch2_ro_ref_put(c);
}

const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
	.mark_dead		= bch2_fs_bdev_mark_dead,
	.sync			= bch2_fs_bdev_sync,
};

/* Filesystem open: */

static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
{
	return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
		cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
}
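
/*
 * bch2_fs_open() opens a filesystem from a set of member device paths:
 * every superblock is read, the one with the highest seq (per sb_cmp())
 * is taken as authoritative, members that it reports as removed or
 * split-brain are dropped, and the remaining devices are attached before
 * the filesystem is started (unless the nostart option is set).
 *
 * Rough usage sketch (illustrative only, not taken from this file):
 *
 *	darray_const_str devs = {};
 *	int ret = darray_push(&devs, "/dev/vda") ?:
 *		  darray_push(&devs, "/dev/vdb");
 *	if (!ret) {
 *		struct bch_opts opts = bch2_opts_empty();
 *		struct bch_fs *c = bch2_fs_open(&devs, &opts);
 *		if (!IS_ERR(c))
 *			bch2_fs_stop(c);
 *	}
 *	darray_exit(&devs);
 */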
struct bch_fs *bch2_fs_open(darray_const_str *devices,
			    struct bch_opts *opts)
{
	bch_sb_handles sbs = {};
	struct bch_fs *c = NULL;
	struct bch_sb_handle *best = NULL;
	struct printbuf errbuf = PRINTBUF;
	int ret = 0;

	if (!try_module_get(THIS_MODULE))
		return ERR_PTR(-ENODEV);

	if (!devices->nr) {
		ret = -EINVAL;
		goto err;
	}

	ret = darray_make_room(&sbs, devices->nr);
	if (ret)
		goto err;

	darray_for_each(*devices, i) {
		struct bch_sb_handle sb = { NULL };

		ret = bch2_read_super(*i, opts, &sb);
		if (ret)
			goto err;

		BUG_ON(darray_push(&sbs, sb));
	}

	if (opts->nochanges && !opts->read_only) {
		ret = bch_err_throw(c, erofs_nochanges);
		goto err_print;
	}

	darray_for_each(sbs, sb)
		if (!best || sb_cmp(sb->sb, best->sb) > 0)
			best = sb;

	darray_for_each_reverse(sbs, sb) {
		ret = bch2_dev_in_fs(best, sb, opts);

		if (ret == -BCH_ERR_device_has_been_removed ||
		    ret == -BCH_ERR_device_splitbrain) {
			bch2_free_super(sb);
			darray_remove_item(&sbs, sb);
			best -= best > sb;
			ret = 0;
			continue;
		}

		if (ret)
			goto err_print;
	}

	c = bch2_fs_alloc(best->sb, opts, &sbs);
	ret = PTR_ERR_OR_ZERO(c);
	if (ret)
		goto err;

	down_write(&c->state_lock);
	darray_for_each(sbs, sb) {
		ret = bch2_dev_attach_bdev(c, sb);
		if (ret) {
			up_write(&c->state_lock);
			goto err;
		}
	}
	up_write(&c->state_lock);

	if (!c->opts.nostart) {
		ret = bch2_fs_start(c);
		if (ret)
			goto err;
	}
out:
	darray_for_each(sbs, sb)
		bch2_free_super(sb);
	darray_exit(&sbs);
	printbuf_exit(&errbuf);
	module_put(THIS_MODULE);
	return c;
err_print:
	pr_err("bch_fs_open err opening %s: %s",
	       devices->data[0], bch2_err_str(ret));
err:
	if (!IS_ERR_OR_NULL(c))
		bch2_fs_stop(c);
	c = ERR_PTR(ret);
	goto out;
}

/* Global interfaces/init */

static void bcachefs_exit(void)
{
	bch2_debug_exit();
	bch2_vfs_exit();
	bch2_chardev_exit();
	bch2_btree_key_cache_exit();
	if (bcachefs_kset)
		kset_unregister(bcachefs_kset);
}

static int __init bcachefs_init(void)
{
	bch2_bkey_pack_test();

	if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
	    bch2_btree_key_cache_init() ||
	    bch2_chardev_init() ||
	    bch2_vfs_init() ||
	    bch2_debug_init())
		goto err;

	return 0;
err:
	bcachefs_exit();
	return -ENOMEM;
}

#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name);
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM

/*
 * Module parameter ops that expose a static key as a bool: writes go
 * through param_set_bool() and flip the key, reads report whether the
 * key is currently enabled.
 */
static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp)
{
	/* Match bool exactly, by re-using it. */
	struct static_key *key = kp->arg;
	struct kernel_param boolkp = *kp;
	bool v;
	int ret;

	boolkp.arg = &v;

	ret = param_set_bool(val, &boolkp);
	if (ret)
		return ret;
	if (v)
		static_key_enable(key);
	else
		static_key_disable(key);
	return 0;
}

static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp)
{
	struct static_key *key = kp->arg;
	return sprintf(buffer, "%c\n", static_key_enabled(key) ? 'Y' : 'N');
}

static const struct kernel_param_ops bch2_param_ops_static_key_t = {
	.flags = KERNEL_PARAM_OPS_FL_NOARG,
	.set = bch2_param_set_static_key_t,
	.get = bch2_param_get_static_key_t,
};

#define BCH_DEBUG_PARAM(name, description)				\
	module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\
	__MODULE_PARM_TYPE(name, "static_key_t");			\
	MODULE_PARM_DESC(name, description);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM

__maybe_unused
static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
module_param_named(version, bch2_metadata_version, uint, 0444);

module_exit(bcachefs_exit);
module_init(bcachefs_init);