// SPDX-License-Identifier: GPL-2.0
/*
 * bcachefs setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "async_objs.h"
#include "backpointers.h"
#include "bkey_sort.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_journal_iter.h"
#include "btree_key_cache.h"
#include "btree_node_scan.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_write_buffer.h"
#include "buckets_waiting_for_journal.h"
#include "chardev.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "debug.h"
#include "disk_accounting.h"
#include "disk_groups.h"
#include "ec.h"
#include "enumerated_ref.h"
#include "errcode.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
#include "fs-io-buffered.h"
#include "fs-io-direct.h"
#include "fsck.h"
#include "inode.h"
#include "io_read.h"
#include "io_write.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "move.h"
#include "migrate.h"
#include "movinggc.h"
#include "nocow_locking.h"
#include "quota.h"
#include "rebalance.h"
#include "recovery.h"
#include "recovery_passes.h"
#include "replicas.h"
#include "sb-clean.h"
#include "sb-counters.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
#include "thread_with_file.h"
#include "trace.h"

#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/idr.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/random.h>
#include <linux/sysfs.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_DESCRIPTION("bcachefs filesystem");

typedef DARRAY(struct bch_sb_handle) bch_sb_handles;

#define x(n)	#n,
const char * const bch2_fs_flag_strs[] = {
	BCH_FS_FLAGS()
	NULL
};

const char * const bch2_write_refs[] = {
	BCH_WRITE_REFS()
	NULL
};

const char * const bch2_dev_read_refs[] = {
	BCH_DEV_READ_REFS()
	NULL
};

const char * const bch2_dev_write_refs[] = {
	BCH_DEV_WRITE_REFS()
	NULL
};
#undef x

static void __bch2_print_str(struct bch_fs *c, const char *prefix,
			     const char *str)
{
#ifdef __KERNEL__
	struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);

	if (unlikely(stdio)) {
		bch2_stdio_redirect_printf(stdio, true, "%s", str);
		return;
	}
#endif
	bch2_print_string_as_lines(KERN_ERR, str);
}

void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str)
{
	__bch2_print_str(c, prefix, str);
}

__printf(2, 0)
static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
{
#ifdef __KERNEL__
	if (unlikely(stdio)) {
		if (fmt[0] == KERN_SOH[0])
			fmt += 2;

		bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
		return;
	}
#endif
	vprintk(fmt, args);
}
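/*
 * Print helpers: when a stdio_redirect is attached (i.e. a userspace process
 * is driving this filesystem and wants our messages), output is routed there;
 * otherwise it goes to the kernel log. The KERN_SOH check above strips the
 * log level prefix before redirecting.
 */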
void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
{
	struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;

	va_list args;
	va_start(args, fmt);
	bch2_print_maybe_redirect(stdio, fmt, args);
	va_end(args);
}

void __bch2_print(struct bch_fs *c, const char *fmt, ...)
{
	struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);

	va_list args;
	va_start(args, fmt);
	bch2_print_maybe_redirect(stdio, fmt, args);
	va_end(args);
}

#define KTYPE(type)							\
static const struct attribute_group type ## _group = {			\
	.attrs = type ## _files						\
};									\
									\
static const struct attribute_group *type ## _groups[] = {		\
	&type ## _group,						\
	NULL								\
};									\
									\
static const struct kobj_type type ## _ktype = {			\
	.release	= type ## _release,				\
	.sysfs_ops	= &type ## _sysfs_ops,				\
	.default_groups	= type ## _groups				\
}

static void bch2_fs_release(struct kobject *);
static void bch2_dev_release(struct kobject *);
static void bch2_fs_counters_release(struct kobject *k)
{
}

static void bch2_fs_internal_release(struct kobject *k)
{
}

static void bch2_fs_opts_dir_release(struct kobject *k)
{
}

static void bch2_fs_time_stats_release(struct kobject *k)
{
}

KTYPE(bch2_fs);
KTYPE(bch2_fs_counters);
KTYPE(bch2_fs_internal);
KTYPE(bch2_fs_opts_dir);
KTYPE(bch2_fs_time_stats);
KTYPE(bch2_dev);

static struct kset *bcachefs_kset;
static LIST_HEAD(bch_fs_list);
static DEFINE_MUTEX(bch_fs_list_lock);

DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);

static void bch2_dev_unlink(struct bch_dev *);
static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
static void bch2_dev_io_ref_stop(struct bch_dev *, int);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);

struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
	guard(mutex)(&bch_fs_list_lock);
	guard(rcu)();

	struct bch_fs *c;
	list_for_each_entry(c, &bch_fs_list, list)
		for_each_member_device_rcu(c, ca, NULL)
			if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
				closure_get(&c->cl);
				return c;
			}
	return NULL;
}

static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid)
{
	struct bch_fs *c;

	lockdep_assert_held(&bch_fs_list_lock);

	list_for_each_entry(c, &bch_fs_list, list)
		if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid)))
			return c;

	return NULL;
}

struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
{
	struct bch_fs *c;

	mutex_lock(&bch_fs_list_lock);
	c = __bch2_uuid_to_fs(uuid);
	if (c)
		closure_get(&c->cl);
	mutex_unlock(&bch_fs_list_lock);

	return c;
}

/* Filesystem RO/RW: */

/*
 * For startup/shutdown of RW stuff, the dependencies are:
 *
 * - foreground writes depend on copygc and rebalance (to free up space)
 *
 * - copygc and rebalance depend on mark and sweep gc (they actually probably
 *   don't because they either reserve ahead of time or don't block if
 *   allocations fail, but allocations can require mark and sweep gc to run
 *   because of generation number wraparound)
 *
 * - all of the above depends on the allocator threads
 *
 * - allocator depends on the journal (when it rewrites prios and gens)
 */
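/*
 * Consequently, teardown in __bch2_fs_read_only() below runs roughly in
 * reverse dependency order: stop erasure coding, open buckets, rebalance and
 * copygc first, then flush btree and journal work until nothing generates any
 * more, then stop the journal, and finally pull devices out of the allocator.
 */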
static void __bch2_fs_read_only(struct bch_fs *c)
{
	unsigned clean_passes = 0;
	u64 seq = 0;

	bch2_fs_ec_stop(c);
	bch2_open_buckets_stop(c, NULL, true);
	bch2_rebalance_stop(c);
	bch2_copygc_stop(c);
	bch2_fs_ec_flush(c);

	bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu",
		    journal_cur_seq(&c->journal));

	do {
		clean_passes++;

		if (bch2_btree_interior_updates_flush(c) ||
		    bch2_btree_write_buffer_flush_going_ro(c) ||
		    bch2_journal_flush_all_pins(&c->journal) ||
		    bch2_btree_flush_all_writes(c) ||
		    seq != atomic64_read(&c->journal.seq)) {
			seq = atomic64_read(&c->journal.seq);
			clean_passes = 0;
		}
	} while (clean_passes < 2);

	bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu",
		    journal_cur_seq(&c->journal));

	if (test_bit(JOURNAL_replay_done, &c->journal.flags) &&
	    !test_bit(BCH_FS_emergency_ro, &c->flags))
		set_bit(BCH_FS_clean_shutdown, &c->flags);

	bch2_fs_journal_stop(&c->journal);

	bch_info(c, "%sclean shutdown complete, journal seq %llu",
		 test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
		 c->journal.seq_ondisk);

	/*
	 * After stopping journal:
	 */
	for_each_member_device(c, ca) {
		bch2_dev_io_ref_stop(ca, WRITE);
		bch2_dev_allocator_remove(c, ca);
	}
}

static void bch2_writes_disabled(struct enumerated_ref *writes)
{
	struct bch_fs *c = container_of(writes, struct bch_fs, writes);

	set_bit(BCH_FS_write_disable_complete, &c->flags);
	wake_up(&bch2_read_only_wait);
}

void bch2_fs_read_only(struct bch_fs *c)
{
	if (!test_bit(BCH_FS_rw, &c->flags)) {
		bch2_journal_reclaim_stop(&c->journal);
		return;
	}

	BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags));

	bch_verbose(c, "going read-only");

	/*
	 * Block new foreground-end write operations from starting - any new
	 * writes will return -EROFS:
	 */
	set_bit(BCH_FS_going_ro, &c->flags);
	enumerated_ref_stop_async(&c->writes);

	/*
	 * If we're not doing an emergency shutdown, we want to wait on
	 * outstanding writes to complete so they don't see spurious errors due
	 * to shutting down the allocator:
	 *
	 * If we are doing an emergency shutdown outstanding writes may
	 * hang until we shutdown the allocator so we don't want to wait
	 * on outstanding writes before shutting everything down - but
	 * we do need to wait on them before returning and signalling
	 * that going RO is complete:
	 */
	wait_event(bch2_read_only_wait,
		   test_bit(BCH_FS_write_disable_complete, &c->flags) ||
		   test_bit(BCH_FS_emergency_ro, &c->flags));

	bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags);
	if (writes_disabled)
		bch_verbose(c, "finished waiting for writes to stop");

	__bch2_fs_read_only(c);

	wait_event(bch2_read_only_wait,
		   test_bit(BCH_FS_write_disable_complete, &c->flags));

	if (!writes_disabled)
		bch_verbose(c, "finished waiting for writes to stop");

	clear_bit(BCH_FS_write_disable_complete, &c->flags);
	clear_bit(BCH_FS_going_ro, &c->flags);
	clear_bit(BCH_FS_rw, &c->flags);
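	/*
	 * Only write a clean superblock if nothing went wrong (no journal or
	 * btree errors, no emergency shutdown) and recovery got at least
	 * through journal replay; otherwise just persist the error counts and
	 * counters.
	 */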
	if (!bch2_journal_error(&c->journal) &&
	    !test_bit(BCH_FS_error, &c->flags) &&
	    !test_bit(BCH_FS_emergency_ro, &c->flags) &&
	    test_bit(BCH_FS_started, &c->flags) &&
	    test_bit(BCH_FS_clean_shutdown, &c->flags) &&
	    c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) {
		BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
		BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty));
		BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
		BUG_ON(c->btree_write_buffer.inc.keys.nr);
		BUG_ON(c->btree_write_buffer.flushing.keys.nr);
		bch2_verify_accounting_clean(c);

		bch_verbose(c, "marking filesystem clean");
		bch2_fs_mark_clean(c);
	} else {
		/* Make sure error counts/counters are persisted */
		mutex_lock(&c->sb_lock);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);

		bch_verbose(c, "done going read-only, filesystem not clean");
	}
}

static void bch2_fs_read_only_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, read_only_work);

	down_write(&c->state_lock);
	bch2_fs_read_only(c);
	up_write(&c->state_lock);
}

static void bch2_fs_read_only_async(struct bch_fs *c)
{
	queue_work(system_long_wq, &c->read_only_work);
}

bool bch2_fs_emergency_read_only(struct bch_fs *c)
{
	bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);

	bch2_journal_halt(&c->journal);
	bch2_fs_read_only_async(c);

	wake_up(&bch2_read_only_wait);
	return ret;
}

static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out,
					   bool locked)
{
	bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);

	if (!locked)
		bch2_journal_halt(&c->journal);
	else
		bch2_journal_halt_locked(&c->journal);
	bch2_fs_read_only_async(c);
	wake_up(&bch2_read_only_wait);

	if (ret)
		prt_printf(out, "emergency read only at seq %llu\n",
			   journal_cur_seq(&c->journal));

	return ret;
}

bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out)
{
	return __bch2_fs_emergency_read_only2(c, out, false);
}

bool bch2_fs_emergency_read_only_locked(struct bch_fs *c)
{
	bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);

	bch2_journal_halt_locked(&c->journal);
	bch2_fs_read_only_async(c);

	wake_up(&bch2_read_only_wait);
	return ret;
}

static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
	int ret;

	BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags));

	if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))
		return bch_err_throw(c, erofs_no_alloc_info);

	if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
		bch_err(c, "cannot go rw, unfixed btree errors");
		return bch_err_throw(c, erofs_unfixed_errors);
	}

	if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
		bch_err(c, "cannot go rw, filesystem is an unresized image file");
		return bch_err_throw(c, erofs_filesystem_full);
	}

	if (test_bit(BCH_FS_rw, &c->flags))
		return 0;

	bch_info(c, "going read-write");

	ret = bch2_fs_init_rw(c);
	if (ret)
		goto err;

	ret = bch2_sb_members_v2_init(c);
	if (ret)
		goto err;

	clear_bit(BCH_FS_clean_shutdown, &c->flags);

	scoped_guard(rcu)
		for_each_online_member_rcu(c, ca)
			if (ca->mi.state == BCH_MEMBER_STATE_rw) {
				bch2_dev_allocator_add(c, ca);
				enumerated_ref_start(&ca->io_ref[WRITE]);
			}

	bch2_recalc_capacity(c);

	/*
	 * First journal write must be a flush write: after a clean shutdown we
	 * don't read the journal, so the first journal write may end up
	 * overwriting whatever was there previously, and there must always be
	 * at least one non-flush write in the journal or recovery will fail:
	 */
	spin_lock(&c->journal.lock);
	set_bit(JOURNAL_need_flush_write, &c->journal.flags);
	set_bit(JOURNAL_running, &c->journal.flags);
	bch2_journal_space_available(&c->journal);
	spin_unlock(&c->journal.lock);

	ret = bch2_fs_mark_dirty(c);
	if (ret)
		goto err;

	ret = bch2_journal_reclaim_start(&c->journal);
	if (ret)
		goto err;

	set_bit(BCH_FS_rw, &c->flags);
	set_bit(BCH_FS_was_rw, &c->flags);

	enumerated_ref_start(&c->writes);

	ret = bch2_copygc_start(c);
	if (ret) {
		bch_err_msg(c, ret, "error starting copygc thread");
		goto err;
	}

	ret = bch2_rebalance_start(c);
	if (ret) {
		bch_err_msg(c, ret, "error starting rebalance thread");
		goto err;
	}

	bch2_do_discards(c);
	bch2_do_invalidates(c);
	bch2_do_stripe_deletes(c);
	bch2_do_pending_node_rewrites(c);
	return 0;
err:
	if (test_bit(BCH_FS_rw, &c->flags))
		bch2_fs_read_only(c);
	else
		__bch2_fs_read_only(c);
	return ret;
}

int bch2_fs_read_write(struct bch_fs *c)
{
	if (c->opts.recovery_pass_last &&
	    c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
		return bch_err_throw(c, erofs_norecovery);

	if (c->opts.nochanges)
		return bch_err_throw(c, erofs_nochanges);

	if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
		return bch_err_throw(c, erofs_no_alloc_info);

	return __bch2_fs_read_write(c, false);
}

int bch2_fs_read_write_early(struct bch_fs *c)
{
	down_write(&c->state_lock);
	int ret = __bch2_fs_read_write(c, true);
	up_write(&c->state_lock);

	return ret;
}

/* Filesystem startup/shutdown: */

static void __bch2_fs_free(struct bch_fs *c)
{
	for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
		bch2_time_stats_exit(&c->times[i]);

#ifdef CONFIG_UNICODE
	utf8_unload(c->cf_encoding);
#endif

	bch2_find_btree_nodes_exit(&c->found_btree_nodes);
	bch2_free_pending_node_rewrites(c);
	bch2_free_fsck_errs(c);
	bch2_fs_vfs_exit(c);
	bch2_fs_snapshots_exit(c);
	bch2_fs_sb_errors_exit(c);
	bch2_fs_replicas_exit(c);
	bch2_fs_rebalance_exit(c);
	bch2_fs_quota_exit(c);
	bch2_fs_nocow_locking_exit(c);
	bch2_fs_journal_exit(&c->journal);
	bch2_fs_fs_io_direct_exit(c);
	bch2_fs_fs_io_buffered_exit(c);
	bch2_fs_fsio_exit(c);
	bch2_fs_io_write_exit(c);
	bch2_fs_io_read_exit(c);
	bch2_fs_encryption_exit(c);
	bch2_fs_ec_exit(c);
	bch2_fs_counters_exit(c);
	bch2_fs_compress_exit(c);
	bch2_io_clock_exit(&c->io_clock[WRITE]);
	bch2_io_clock_exit(&c->io_clock[READ]);
	bch2_fs_buckets_waiting_for_journal_exit(c);
	bch2_fs_btree_write_buffer_exit(c);
	bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
	bch2_fs_btree_iter_exit(c);
	bch2_fs_btree_interior_update_exit(c);
	bch2_fs_btree_cache_exit(c);
	bch2_fs_accounting_exit(c);
	bch2_fs_async_obj_exit(c);
	bch2_journal_keys_put_initial(c);
	bch2_find_btree_nodes_exit(&c->found_btree_nodes);

	BUG_ON(atomic_read(&c->journal_keys.ref));
	percpu_free_rwsem(&c->mark_lock);
	if (c->online_reserved) {
		u64 v = percpu_u64_get(c->online_reserved);
		WARN(v, "online_reserved not 0 at shutdown: %lli", v);
		free_percpu(c->online_reserved);
	}

	darray_exit(&c->incompat_versions_requested);
	darray_exit(&c->btree_roots_extra);
	free_percpu(c->pcpu);
	free_percpu(c->usage);
	mempool_exit(&c->large_bkey_pool);
	mempool_exit(&c->btree_bounce_pool);
	bioset_exit(&c->btree_bio);
	mempool_exit(&c->fill_iter);
	enumerated_ref_exit(&c->writes);
	kfree(rcu_dereference_protected(c->disk_groups, 1));
	kfree(c->journal_seq_blacklist_table);

	if (c->write_ref_wq)
		destroy_workqueue(c->write_ref_wq);
	if (c->btree_write_submit_wq)
		destroy_workqueue(c->btree_write_submit_wq);
	if (c->btree_read_complete_wq)
		destroy_workqueue(c->btree_read_complete_wq);
	if (c->copygc_wq)
		destroy_workqueue(c->copygc_wq);
	if (c->btree_write_complete_wq)
		destroy_workqueue(c->btree_write_complete_wq);
	if (c->btree_update_wq)
		destroy_workqueue(c->btree_update_wq);

	bch2_free_super(&c->disk_sb);
	kvfree(c);
	module_put(THIS_MODULE);
}

static void bch2_fs_release(struct kobject *kobj)
{
	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);

	__bch2_fs_free(c);
}

void __bch2_fs_stop(struct bch_fs *c)
{
	bch_verbose(c, "shutting down");

	set_bit(BCH_FS_stopping, &c->flags);

	down_write(&c->state_lock);
	bch2_fs_read_only(c);
	up_write(&c->state_lock);

	for (unsigned i = 0; i < c->sb.nr_devices; i++) {
		struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);
		if (ca)
			bch2_dev_io_ref_stop(ca, READ);
	}

	for_each_member_device(c, ca)
		bch2_dev_unlink(ca);

	if (c->kobj.state_in_sysfs)
		kobject_del(&c->kobj);

	bch2_fs_debug_exit(c);
	bch2_fs_chardev_exit(c);

	bch2_ro_ref_put(c);
	wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref));

	kobject_put(&c->counters_kobj);
	kobject_put(&c->time_stats);
	kobject_put(&c->opts_dir);
	kobject_put(&c->internal);

	/* btree prefetch might have kicked off reads in the background: */
	bch2_btree_flush_all_reads(c);

	for_each_member_device(c, ca)
		cancel_work_sync(&ca->io_error_work);

	cancel_work_sync(&c->read_only_work);
}

void bch2_fs_free(struct bch_fs *c)
{
	mutex_lock(&bch_fs_list_lock);
	list_del(&c->list);
	mutex_unlock(&bch_fs_list_lock);

	closure_sync(&c->cl);
	closure_debug_destroy(&c->cl);

	for (unsigned i = 0; i < c->sb.nr_devices; i++) {
		struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true);

		if (ca) {
			EBUG_ON(atomic_long_read(&ca->ref) != 1);
			bch2_dev_io_ref_stop(ca, READ);
			bch2_free_super(&ca->disk_sb);
			bch2_dev_free(ca);
		}
	}

	bch_verbose(c, "shutdown complete");

	kobject_put(&c->kobj);
}

void bch2_fs_stop(struct bch_fs *c)
{
	__bch2_fs_stop(c);
	bch2_fs_free(c);
}

static int bch2_fs_online(struct bch_fs *c)
{
	int ret = 0;

	lockdep_assert_held(&bch_fs_list_lock);

	if (c->sb.multi_device &&
	    __bch2_uuid_to_fs(c->sb.uuid)) {
		bch_err(c, "filesystem UUID already open");
		return bch_err_throw(c, filesystem_uuid_already_open);
	}

	ret = bch2_fs_chardev_init(c);
	if (ret) {
		bch_err(c, "error creating character device");
		return ret;
	}

	bch2_fs_debug_init(c);
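	/*
	 * sysfs naming: a multi-device filesystem is registered under its
	 * user facing UUID, a single-device filesystem under the block device
	 * name (c->name, set in bch2_fs_alloc()).
	 */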
	ret = (c->sb.multi_device
	       ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b)
	       : kobject_add(&c->kobj, NULL, "%s", c->name)) ?:
	    kobject_add(&c->internal, &c->kobj, "internal") ?:
	    kobject_add(&c->opts_dir, &c->kobj, "options") ?:
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
	    kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
#endif
	    kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
	    bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS);
	if (ret) {
		bch_err(c, "error creating sysfs objects");
		return ret;
	}

	down_write(&c->state_lock);

	for_each_member_device(c, ca) {
		ret = bch2_dev_sysfs_online(c, ca);
		if (ret) {
			bch_err(c, "error creating sysfs objects");
			bch2_dev_put(ca);
			goto err;
		}
	}

	BUG_ON(!list_empty(&c->list));
	list_add(&c->list, &bch_fs_list);
err:
	up_write(&c->state_lock);
	return ret;
}

int bch2_fs_init_rw(struct bch_fs *c)
{
	if (test_bit(BCH_FS_rw_init_done, &c->flags))
		return 0;

	if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
	    !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
	    !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
	    !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_submit",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
	    !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
				WQ_FREEZABLE, 0)))
		return bch_err_throw(c, ENOMEM_fs_other_alloc);

	int ret = bch2_fs_btree_interior_update_init(c) ?:
	    bch2_fs_btree_write_buffer_init(c) ?:
	    bch2_fs_fs_io_buffered_init(c) ?:
	    bch2_fs_io_write_init(c) ?:
	    bch2_fs_journal_init(&c->journal);
	if (ret)
		return ret;

	set_bit(BCH_FS_rw_init_done, &c->flags);
	return 0;
}

static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts,
				    bch_sb_handles *sbs)
{
	struct bch_fs *c;
	struct printbuf name = PRINTBUF;
	unsigned i, iter_size;
	int ret = 0;

	c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
	if (!c) {
		c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
		goto out;
	}

	c->stdio = (void *)(unsigned long) opts->stdio;

	__module_get(THIS_MODULE);

	closure_init(&c->cl, NULL);

	c->kobj.kset = bcachefs_kset;
	kobject_init(&c->kobj, &bch2_fs_ktype);
	kobject_init(&c->internal, &bch2_fs_internal_ktype);
	kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
	kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
	kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype);

	c->minor = -1;
	c->disk_sb.fs_sb = true;

	init_rwsem(&c->state_lock);
	mutex_init(&c->sb_lock);
	mutex_init(&c->replicas_gc_lock);
	mutex_init(&c->btree_root_lock);
	INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);

	refcount_set(&c->ro_ref, 1);
	init_waitqueue_head(&c->ro_ref_wait);

	for (i = 0; i < BCH_TIME_STAT_NR; i++)
		bch2_time_stats_init(&c->times[i]);
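	/*
	 * Early constructors: these just initialize locks, lists and default
	 * state and have no failure paths; everything that can fail (percpu
	 * counters, workqueues, mempools, reading the superblock into fs
	 * state) comes later and unwinds through the err: path below.
	 */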
	bch2_fs_allocator_background_init(c);
	bch2_fs_allocator_foreground_init(c);
	bch2_fs_btree_cache_init_early(&c->btree_cache);
	bch2_fs_btree_gc_init_early(c);
	bch2_fs_btree_interior_update_init_early(c);
	bch2_fs_btree_iter_init_early(c);
	bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
	bch2_fs_btree_write_buffer_init_early(c);
	bch2_fs_copygc_init(c);
	bch2_fs_ec_init_early(c);
	bch2_fs_journal_init_early(&c->journal);
	bch2_fs_journal_keys_init(c);
	bch2_fs_move_init(c);
	bch2_fs_nocow_locking_init_early(c);
	bch2_fs_quota_init(c);
	bch2_fs_recovery_passes_init(c);
	bch2_fs_sb_errors_init_early(c);
	bch2_fs_snapshots_init_early(c);
	bch2_fs_subvolumes_init_early(c);

	INIT_LIST_HEAD(&c->list);

	mutex_init(&c->bio_bounce_pages_lock);
	mutex_init(&c->snapshot_table_lock);
	init_rwsem(&c->snapshot_create_lock);

	spin_lock_init(&c->btree_write_error_lock);

	INIT_LIST_HEAD(&c->journal_iters);

	INIT_LIST_HEAD(&c->fsck_error_msgs);
	mutex_init(&c->fsck_error_msgs_lock);

	seqcount_init(&c->usage_lock);

	sema_init(&c->io_in_flight, 128);

	INIT_LIST_HEAD(&c->vfs_inodes_list);
	mutex_init(&c->vfs_inodes_lock);

	c->journal.flush_write_time	= &c->times[BCH_TIME_journal_flush_write];
	c->journal.noflush_write_time	= &c->times[BCH_TIME_journal_noflush_write];
	c->journal.flush_seq_time	= &c->times[BCH_TIME_journal_flush_seq];

	mutex_init(&c->sectors_available_lock);

	ret = percpu_init_rwsem(&c->mark_lock);
	if (ret)
		goto err;

	mutex_lock(&c->sb_lock);
	ret = bch2_sb_to_fs(c, sb);
	mutex_unlock(&c->sb_lock);

	if (ret)
		goto err;

	/* Compat: */
	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
	    !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
		SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000);

	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
	    !BCH_SB_JOURNAL_RECLAIM_DELAY(sb))
		SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100);

	c->opts = bch2_opts_default;
	ret = bch2_opts_from_sb(&c->opts, sb);
	if (ret)
		goto err;

	bch2_opts_apply(&c->opts, *opts);

	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
	    c->opts.block_size > PAGE_SIZE) {
		bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE");
		ret = -EINVAL;
		goto err;
	}

	c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
	if (c->opts.inodes_use_key_cache)
		c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
	c->btree_key_cache_btrees |= 1U << BTREE_ID_logged_ops;

	c->block_bits = ilog2(block_sectors(c));
	c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);

	if (bch2_fs_init_fault("fs_alloc")) {
		bch_err(c, "fs_alloc fault injected");
		ret = -EFAULT;
		goto err;
	}

	if (c->sb.multi_device)
		pr_uuid(&name, c->sb.user_uuid.b);
	else
		prt_bdevname(&name, sbs->data[0].bdev);

	ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
	if (ret)
		goto err;

	strscpy(c->name, name.buf, sizeof(c->name));
	printbuf_exit(&name);

	iter_size = sizeof(struct sort_iter) +
		(btree_blocks(c) + 1) * 2 *
		sizeof(struct sort_iter_set);

	if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete",
				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
	    enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR,
				bch2_writes_disabled) ||
	    mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
	    bioset_init(&c->btree_bio, 1,
			max(offsetof(struct btree_read_bio, bio),
			    offsetof(struct btree_write_bio, wbio.bio)),
			BIOSET_NEED_BVECS) ||
	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
	    !(c->usage = alloc_percpu(struct bch_fs_usage_base)) ||
	    !(c->online_reserved = alloc_percpu(u64)) ||
	    mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
				       c->opts.btree_node_size) ||
	    mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) {
		ret = bch_err_throw(c, ENOMEM_fs_other_alloc);
		goto err;
	}

	ret =
	    bch2_fs_async_obj_init(c) ?:
	    bch2_fs_btree_cache_init(c) ?:
	    bch2_fs_btree_iter_init(c) ?:
	    bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?:
	    bch2_fs_buckets_waiting_for_journal_init(c) ?:
	    bch2_io_clock_init(&c->io_clock[READ]) ?:
	    bch2_io_clock_init(&c->io_clock[WRITE]) ?:
	    bch2_fs_compress_init(c) ?:
	    bch2_fs_counters_init(c) ?:
	    bch2_fs_ec_init(c) ?:
	    bch2_fs_encryption_init(c) ?:
	    bch2_fs_fsio_init(c) ?:
	    bch2_fs_fs_io_direct_init(c) ?:
	    bch2_fs_io_read_init(c) ?:
	    bch2_fs_rebalance_init(c) ?:
	    bch2_fs_sb_errors_init(c) ?:
	    bch2_fs_vfs_init(c);
	if (ret)
		goto err;

	if (go_rw_in_recovery(c)) {
		/*
		 * start workqueues/kworkers early - kthread creation checks for
		 * pending signals, which is _very_ annoying
		 */
		ret = bch2_fs_init_rw(c);
		if (ret)
			goto err;
	}

#ifdef CONFIG_UNICODE
	/* Default encoding until we can potentially have more as an option. */
	c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
	if (IS_ERR(c->cf_encoding)) {
		printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
		       unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
		       unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
		       unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
		ret = -EINVAL;
		goto err;
	}
#else
	if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
		printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
		ret = -EINVAL;
		goto err;
	}
#endif

	for (i = 0; i < c->sb.nr_devices; i++) {
		if (!bch2_member_exists(c->disk_sb.sb, i))
			continue;
		ret = bch2_dev_alloc(c, i);
		if (ret)
			goto err;
	}

	bch2_journal_entry_res_resize(&c->journal,
			&c->btree_root_journal_res,
			BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
	bch2_journal_entry_res_resize(&c->journal,
			&c->clock_journal_res,
			(sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);

	mutex_lock(&bch_fs_list_lock);
	ret = bch2_fs_online(c);
	mutex_unlock(&bch_fs_list_lock);

	if (ret)
		goto err;
out:
	return c;
err:
	bch2_fs_free(c);
	c = ERR_PTR(ret);
	goto out;
}

noinline_for_stack
static void print_mount_opts(struct bch_fs *c)
{
	enum bch_opt_id i;
	CLASS(printbuf, p)();
	bch2_log_msg_start(c, &p);

	prt_str(&p, "starting version ");
	bch2_version_to_text(&p, c->sb.version);

	bool first = true;
	for (i = 0; i < bch2_opts_nr; i++) {
		const struct bch_option *opt = &bch2_opt_table[i];
		u64 v = bch2_opt_get_by_id(&c->opts, i);

		if (!(opt->flags & OPT_MOUNT))
			continue;

		if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
			continue;

		prt_str(&p, first ? " opts=" : ",");
		first = false;
		bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE);
	}

	if (c->sb.version_incompat_allowed != c->sb.version) {
		prt_printf(&p, "\nallowing incompatible features above ");
		bch2_version_to_text(&p, c->sb.version_incompat_allowed);
	}

	if (c->opts.verbose) {
		prt_printf(&p, "\nfeatures: ");
		prt_bitflags(&p, bch2_sb_features, c->sb.features);
	}

	if (c->sb.multi_device) {
		prt_printf(&p, "\nwith devices");
		for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) {
			prt_char(&p, ' ');
			prt_str(&p, ca->name);
		}
	}

	bch2_print_str(c, KERN_INFO, p.buf);
}

static bool bch2_fs_may_start(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned flags = 0;

	switch (c->opts.degraded) {
	case BCH_DEGRADED_very:
		flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
		break;
	case BCH_DEGRADED_yes:
		flags |= BCH_FORCE_IF_DEGRADED;
		break;
	default:
		mutex_lock(&c->sb_lock);
		for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) {
			if (!bch2_member_exists(c->disk_sb.sb, i))
				continue;

			ca = bch2_dev_locked(c, i);

			if (!bch2_dev_is_online(ca) &&
			    (ca->mi.state == BCH_MEMBER_STATE_rw ||
			     ca->mi.state == BCH_MEMBER_STATE_ro)) {
				mutex_unlock(&c->sb_lock);
				return false;
			}
		}
		mutex_unlock(&c->sb_lock);
		break;
	}

	return bch2_have_enough_devs(c, c->online_devs, flags, true);
}
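/*
 * Startup, at a glance: check we have enough devices to start
 * (bch2_fs_may_start() above, honouring the degraded mount option), stamp
 * last_mount in the member records, run recovery or initialize a fresh
 * filesystem, then go read-write unless the read_only option was set.
 */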
int bch2_fs_start(struct bch_fs *c)
{
	time64_t now = ktime_get_real_seconds();
	int ret = 0;

	print_mount_opts(c);

#ifdef CONFIG_UNICODE
	bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
		 unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
		 unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
		 unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
#endif

	if (!bch2_fs_may_start(c))
		return bch_err_throw(c, insufficient_devices_to_start);

	down_write(&c->state_lock);
	mutex_lock(&c->sb_lock);

	BUG_ON(test_bit(BCH_FS_started, &c->flags));

	if (!bch2_sb_field_get_minsize(&c->disk_sb, ext,
			sizeof(struct bch_sb_field_ext) / sizeof(u64))) {
		mutex_unlock(&c->sb_lock);
		up_write(&c->state_lock);
		ret = bch_err_throw(c, ENOSPC_sb);
		goto err;
	}

	ret = bch2_sb_members_v2_init(c);
	if (ret) {
		mutex_unlock(&c->sb_lock);
		up_write(&c->state_lock);
		goto err;
	}

	scoped_guard(rcu)
		for_each_online_member_rcu(c, ca)
			bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
				cpu_to_le64(now);

	/*
	 * Don't write the superblock yet: recovery might have to downgrade
	 */
	mutex_unlock(&c->sb_lock);

	scoped_guard(rcu)
		for_each_online_member_rcu(c, ca)
			if (ca->mi.state == BCH_MEMBER_STATE_rw)
				bch2_dev_allocator_add(c, ca);
	bch2_recalc_capacity(c);
	up_write(&c->state_lock);

	c->recovery_task = current;
	ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
		? bch2_fs_recovery(c)
		: bch2_fs_initialize(c);
	c->recovery_task = NULL;

	if (ret)
		goto err;

	ret = bch2_opts_hooks_pre_set(c);
	if (ret)
		goto err;

	if (bch2_fs_init_fault("fs_start")) {
		ret = bch_err_throw(c, injected_fs_start);
		goto err;
	}

	set_bit(BCH_FS_started, &c->flags);
	wake_up(&c->ro_ref_wait);

	down_write(&c->state_lock);
	if (c->opts.read_only)
		bch2_fs_read_only(c);
	else if (!test_bit(BCH_FS_rw, &c->flags))
		ret = bch2_fs_read_write(c);
	up_write(&c->state_lock);

err:
	if (ret)
		bch_err_msg(c, ret, "starting filesystem");
	else
		bch_verbose(c, "done starting filesystem");
	return ret;
}

static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
{
	struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx);

	if (le16_to_cpu(sb->block_size) != block_sectors(c))
		return bch_err_throw(c, mismatched_block_size);

	if (le16_to_cpu(m.bucket_size) <
	    BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
		return bch_err_throw(c, bucket_size_too_small);

	return 0;
}

static int bch2_dev_in_fs(struct bch_sb_handle *fs,
			  struct bch_sb_handle *sb,
			  struct bch_opts *opts)
{
	if (fs == sb)
		return 0;

	if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
		return -BCH_ERR_device_not_a_member_of_filesystem;

	if (!bch2_member_exists(fs->sb, sb->sb->dev_idx))
		return -BCH_ERR_device_has_been_removed;

	if (fs->sb->block_size != sb->sb->block_size)
		return -BCH_ERR_mismatched_block_size;
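	/*
	 * Split brain detection below needs member_seq, i.e. both superblocks
	 * new enough to track per-member sequence numbers. Two cases are
	 * caught:
	 *
	 * - identical superblock seq but different write_time: both devices
	 *   were written to independently while separated
	 * - the member record in the filesystem superblock remembers a newer
	 *   seq for this device than the device's own superblock has: the
	 *   device missed superblock writes
	 */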
	if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
	    le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
		return 0;

	if (fs->sb->seq == sb->sb->seq &&
	    fs->sb->write_time != sb->sb->write_time) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "Split brain detected between ");
		prt_bdevname(&buf, sb->bdev);
		prt_str(&buf, " and ");
		prt_bdevname(&buf, fs->bdev);
		prt_char(&buf, ':');
		prt_newline(&buf);
		prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));
		prt_newline(&buf);

		prt_bdevname(&buf, fs->bdev);
		prt_char(&buf, ' ');
		bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
		prt_newline(&buf);

		prt_bdevname(&buf, sb->bdev);
		prt_char(&buf, ' ');
		bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
		prt_newline(&buf);

		if (!opts->no_splitbrain_check)
			prt_printf(&buf, "Not using older sb");

		pr_err("%s", buf.buf);
		printbuf_exit(&buf);

		if (!opts->no_splitbrain_check)
			return -BCH_ERR_device_splitbrain;
	}

	struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
	u64 seq_from_fs		= le64_to_cpu(m.seq);
	u64 seq_from_member	= le64_to_cpu(sb->sb->seq);

	if (seq_from_fs && seq_from_fs < seq_from_member) {
		struct printbuf buf = PRINTBUF;

		prt_str(&buf, "Split brain detected between ");
		prt_bdevname(&buf, sb->bdev);
		prt_str(&buf, " and ");
		prt_bdevname(&buf, fs->bdev);
		prt_char(&buf, ':');
		prt_newline(&buf);

		prt_bdevname(&buf, fs->bdev);
		prt_str(&buf, " believes seq of ");
		prt_bdevname(&buf, sb->bdev);
		prt_printf(&buf, " to be %llu, but ", seq_from_fs);
		prt_bdevname(&buf, sb->bdev);
		prt_printf(&buf, " has %llu\n", seq_from_member);

		if (!opts->no_splitbrain_check) {
			prt_str(&buf, "Not using ");
			prt_bdevname(&buf, sb->bdev);
		}

		pr_err("%s", buf.buf);
		printbuf_exit(&buf);

		if (!opts->no_splitbrain_check)
			return -BCH_ERR_device_splitbrain;
	}

	return 0;
}

/* Device startup/shutdown: */

static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw)
{
	if (rw == READ)
		clear_bit(ca->dev_idx, ca->fs->online_devs.d);

	if (!enumerated_ref_is_zero(&ca->io_ref[rw]))
		enumerated_ref_stop(&ca->io_ref[rw],
				    rw == READ
				    ? bch2_dev_read_refs
				    : bch2_dev_write_refs);
}

static void bch2_dev_release(struct kobject *kobj)
{
	struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);

	kfree(ca);
}

static void bch2_dev_free(struct bch_dev *ca)
{
	WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));
	WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));

	cancel_work_sync(&ca->io_error_work);

	bch2_dev_unlink(ca);

	if (ca->kobj.state_in_sysfs)
		kobject_del(&ca->kobj);

	bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch);
	bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty);

	bch2_free_super(&ca->disk_sb);
	bch2_dev_allocator_background_exit(ca);
	bch2_dev_journal_exit(ca);

	free_percpu(ca->io_done);
	bch2_dev_buckets_free(ca);
	kfree(ca->sb_read_scratch);

	bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
	bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);

	enumerated_ref_exit(&ca->io_ref[WRITE]);
	enumerated_ref_exit(&ca->io_ref[READ]);
#ifndef CONFIG_BCACHEFS_DEBUG
	percpu_ref_exit(&ca->ref);
#endif
	kobject_put(&ca->kobj);
}

static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
{

	lockdep_assert_held(&c->state_lock);

	if (enumerated_ref_is_zero(&ca->io_ref[READ]))
		return;

	__bch2_dev_read_only(c, ca);

	bch2_dev_io_ref_stop(ca, READ);

	bch2_dev_unlink(ca);

	bch2_free_super(&ca->disk_sb);
	bch2_dev_journal_exit(ca);
}

#ifndef CONFIG_BCACHEFS_DEBUG
static void bch2_dev_ref_complete(struct percpu_ref *ref)
{
	struct bch_dev *ca = container_of(ref, struct bch_dev, ref);

	complete(&ca->ref_completion);
}
#endif

static void bch2_dev_unlink(struct bch_dev *ca)
{
	struct kobject *b;

	/*
	 * This is racy w.r.t. the underlying block device being hot-removed,
	 * which removes it from sysfs.
	 *
	 * It'd be lovely if we had a way to handle this race, but the sysfs
	 * code doesn't appear to provide a good method and block/holder.c is
	 * susceptible as well:
	 */
	if (ca->kobj.state_in_sysfs &&
	    ca->disk_sb.bdev &&
	    (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
		sysfs_remove_link(b, "bcachefs");
		sysfs_remove_link(&ca->kobj, "block");
	}
}
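/*
 * Resulting per-device sysfs layout, assuming the filesystem kobject lives
 * under /sys/fs/bcachefs (sketch, not exhaustive):
 *
 *   /sys/fs/bcachefs/<fs uuid>/dev-<idx>/	ca->kobj plus OPT_DEVICE files
 *   /sys/fs/bcachefs/<fs uuid>/dev-<idx>/block	-> the member block device
 *   /sys/block/<bdev>/bcachefs			-> back to dev-<idx>
 */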
static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
{
	int ret;

	if (!c->kobj.state_in_sysfs)
		return 0;

	if (!ca->kobj.state_in_sysfs) {
		ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?:
		    bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE);
		if (ret)
			return ret;
	}

	if (ca->disk_sb.bdev) {
		struct kobject *block = bdev_kobj(ca->disk_sb.bdev);

		ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
		if (ret)
			return ret;

		ret = sysfs_create_link(&ca->kobj, block, "block");
		if (ret)
			return ret;
	}

	return 0;
}

static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
					struct bch_member *member)
{
	struct bch_dev *ca;
	unsigned i;

	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	if (!ca)
		return NULL;

	kobject_init(&ca->kobj, &bch2_dev_ktype);
	init_completion(&ca->ref_completion);

	INIT_WORK(&ca->io_error_work, bch2_io_error_work);

	bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
	bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);

	ca->mi = bch2_mi_to_cpu(member);

	for (i = 0; i < ARRAY_SIZE(member->errors); i++)
		atomic64_set(&ca->errors[i], le64_to_cpu(member->errors[i]));

	ca->uuid = member->uuid;

	ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
			     ca->mi.bucket_size / btree_sectors(c));

#ifndef CONFIG_BCACHEFS_DEBUG
	if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, 0, GFP_KERNEL))
		goto err;
#else
	atomic_long_set(&ca->ref, 1);
#endif

	mutex_init(&ca->bucket_backpointer_mismatch.lock);
	mutex_init(&ca->bucket_backpointer_empty.lock);

	bch2_dev_allocator_background_init(ca);

	if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) ||
	    enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) ||
	    !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) ||
	    bch2_dev_buckets_alloc(c, ca) ||
	    !(ca->io_done = alloc_percpu(*ca->io_done)))
		goto err;

	return ca;
err:
	bch2_dev_free(ca);
	return NULL;
}

static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
			    unsigned dev_idx)
{
	ca->dev_idx = dev_idx;
	__set_bit(ca->dev_idx, ca->self.d);

	if (!ca->name[0])
		scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);

	ca->fs = c;
	rcu_assign_pointer(c->devs[ca->dev_idx], ca);

	if (bch2_dev_sysfs_online(c, ca))
		pr_warn("error creating sysfs objects");
}

static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
	struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
	struct bch_dev *ca = NULL;

	if (bch2_fs_init_fault("dev_alloc"))
		goto err;

	ca = __bch2_dev_alloc(c, &member);
	if (!ca)
		goto err;

	ca->fs = c;

	bch2_dev_attach(c, ca, dev_idx);
	return 0;
err:
	return bch_err_throw(c, ENOMEM_dev_alloc);
}

static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
{
	unsigned ret;

	if (bch2_dev_is_online(ca)) {
		bch_err(ca, "already have device online in slot %u",
			sb->sb->dev_idx);
		return bch_err_throw(ca->fs, device_already_online);
	}

	if (get_capacity(sb->bdev->bd_disk) <
	    ca->mi.bucket_size * ca->mi.nbuckets) {
		bch_err(ca, "cannot online: device too small");
		return bch_err_throw(ca->fs, device_size_too_small);
	}

	BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ]));
	BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE]));

	ret = bch2_dev_journal_init(ca, sb->sb);
	if (ret)
		return ret;

	struct printbuf name = PRINTBUF;
	prt_bdevname(&name, sb->bdev);
	strscpy(ca->name, name.buf, sizeof(ca->name));
	printbuf_exit(&name);

	/* Commit: */
	ca->disk_sb = *sb;
	memset(sb, 0, sizeof(*sb));

	/*
	 * Stash pointer to the filesystem for blk_holder_ops - note that once
	 * attached to a filesystem, we will always close the block device
	 * before tearing down the filesystem object.
	 */
	ca->disk_sb.holder->c = ca->fs;

	ca->dev = ca->disk_sb.bdev->bd_dev;

	enumerated_ref_start(&ca->io_ref[READ]);

	return 0;
}

static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
{
	struct bch_dev *ca;
	int ret;

	lockdep_assert_held(&c->state_lock);

	if (le64_to_cpu(sb->sb->seq) >
	    le64_to_cpu(c->disk_sb.sb->seq))
		bch2_sb_to_fs(c, sb->sb);

	BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));

	ca = bch2_dev_locked(c, sb->sb->dev_idx);

	ret = __bch2_dev_attach_bdev(ca, sb);
	if (ret)
		return ret;

	set_bit(ca->dev_idx, c->online_devs.d);

	bch2_dev_sysfs_online(c, ca);

	bch2_rebalance_wakeup(c);
	return 0;
}

/* Device management: */

/*
 * Note: this function is also used by the error paths - when a particular
 * device sees an error, we call it to determine whether we can just set the
 * device RO, or - if this function returns false - we'll set the whole
 * filesystem RO:
 *
 * XXX: maybe we should be more explicit about whether we're changing state
 * because we got an error or what have you?
 */
bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
			    enum bch_member_state new_state, int flags)
{
	struct bch_devs_mask new_online_devs;
	int nr_rw = 0, required;

	lockdep_assert_held(&c->state_lock);

	switch (new_state) {
	case BCH_MEMBER_STATE_rw:
		return true;
	case BCH_MEMBER_STATE_ro:
		if (ca->mi.state != BCH_MEMBER_STATE_rw)
			return true;

		/* do we have enough devices to write to? */
		for_each_member_device(c, ca2)
			if (ca2 != ca)
				nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;

		required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
			       ? c->opts.metadata_replicas
			       : metadata_replicas_required(c),
			       !(flags & BCH_FORCE_IF_DATA_DEGRADED)
			       ? c->opts.data_replicas
			       : data_replicas_required(c));

		return nr_rw >= required;
	case BCH_MEMBER_STATE_failed:
	case BCH_MEMBER_STATE_spare:
		if (ca->mi.state != BCH_MEMBER_STATE_rw &&
		    ca->mi.state != BCH_MEMBER_STATE_ro)
			return true;

		/* do we have enough devices to read from? */
		new_online_devs = c->online_devs;
		__clear_bit(ca->dev_idx, new_online_devs.d);

		return bch2_have_enough_devs(c, new_online_devs, flags, false);
	default:
		BUG();
	}
}

static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
	bch2_dev_io_ref_stop(ca, WRITE);

	/*
	 * The allocator thread itself allocates btree nodes, so stop it first:
	 */
	bch2_dev_allocator_remove(c, ca);
	bch2_recalc_capacity(c);
	bch2_dev_journal_stop(&c->journal, ca);
}

static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
	lockdep_assert_held(&c->state_lock);

	BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);

	bch2_dev_allocator_add(c, ca);
	bch2_recalc_capacity(c);

	if (enumerated_ref_is_zero(&ca->io_ref[WRITE]))
		enumerated_ref_start(&ca->io_ref[WRITE]);

	bch2_dev_do_discards(ca);
}

int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
			 enum bch_member_state new_state, int flags)
{
	struct bch_member *m;
	int ret = 0;

	if (ca->mi.state == new_state)
		return 0;

	if (!bch2_dev_state_allowed(c, ca, new_state, flags))
		return bch_err_throw(c, device_state_not_allowed);

	if (new_state != BCH_MEMBER_STATE_rw)
		__bch2_dev_read_only(c, ca);

	bch_notice(ca, "%s", bch2_member_states[new_state]);

	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
	SET_BCH_MEMBER_STATE(m, new_state);
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (new_state == BCH_MEMBER_STATE_rw)
		__bch2_dev_read_write(c, ca);

	bch2_rebalance_wakeup(c);

	return ret;
}

int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
		       enum bch_member_state new_state, int flags)
{
	int ret;

	down_write(&c->state_lock);
	ret = __bch2_dev_set_state(c, ca, new_state, flags);
	up_write(&c->state_lock);

	return ret;
}

/* Device add/removal: */

int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
	struct bch_member *m;
	unsigned dev_idx = ca->dev_idx, data;
	bool fast_device_removal = !bch2_request_incompat_feature(c,
			bcachefs_metadata_version_fast_device_removal);
	int ret;

	down_write(&c->state_lock);

	/*
	 * We consume a reference to ca->ref, regardless of whether we succeed
	 * or fail:
	 */
	bch2_dev_put(ca);

	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
		bch_err(ca, "Cannot remove without losing data");
		ret = bch_err_throw(c, device_state_not_allowed);
		goto err;
	}

	__bch2_dev_read_only(c, ca);
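	/*
	 * If the fast_device_removal incompat feature may be used, we only
	 * need to walk this device's backpointers to drop its data; otherwise
	 * fall back to scanning all data and stripes for pointers to it.
	 */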
	ret = fast_device_removal
		? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags)
		: (bch2_dev_data_drop(c, ca->dev_idx, flags) ?:
		   bch2_dev_remove_stripes(c, ca->dev_idx, flags));
	if (ret)
		goto err;

	/* Check if device still has data before blowing away alloc info */
	struct bch_dev_usage usage = bch2_dev_usage_read(ca);
	for (unsigned i = 0; i < BCH_DATA_NR; i++)
		if (!data_type_is_empty(i) &&
		    !data_type_is_hidden(i) &&
		    usage.buckets[i]) {
			bch_err(ca, "Remove failed: still has data (%s, %llu buckets)",
				__bch2_data_types[i], usage.buckets[i]);
			ret = -EBUSY;
			goto err;
		}

	ret = bch2_dev_remove_alloc(c, ca);
	bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
	if (ret)
		goto err;

	/*
	 * We need to flush the entire journal to get rid of keys that reference
	 * the device being removed before removing the superblock entry
	 */
	bch2_journal_flush_all_pins(&c->journal);

	/*
	 * this is really just needed for the bch2_replicas_gc_(start|end)
	 * calls, and could be cleaned up:
	 */
	ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
	bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
	if (ret)
		goto err;

	ret = bch2_journal_flush(&c->journal);
	bch_err_msg(ca, ret, "bch2_journal_flush()");
	if (ret)
		goto err;

	ret = bch2_replicas_gc2(c);
	bch_err_msg(ca, ret, "bch2_replicas_gc2()");
	if (ret)
		goto err;

	data = bch2_dev_has_data(c, ca);
	if (data) {
		struct printbuf data_has = PRINTBUF;

		prt_bitflags(&data_has, __bch2_data_types, data);
		bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
		printbuf_exit(&data_has);
		ret = -EBUSY;
		goto err;
	}

	__bch2_dev_offline(c, ca);

	mutex_lock(&c->sb_lock);
	rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
	mutex_unlock(&c->sb_lock);

#ifndef CONFIG_BCACHEFS_DEBUG
	percpu_ref_kill(&ca->ref);
#else
	ca->dying = true;
	bch2_dev_put(ca);
#endif
	wait_for_completion(&ca->ref_completion);

	bch2_dev_free(ca);

	/*
	 * Free this device's slot in the bch_member array - all pointers to
	 * this device must be gone:
	 */
	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);

	if (fast_device_removal)
		m->uuid = BCH_SB_MEMBER_DELETED_UUID;
	else
		memset(&m->uuid, 0, sizeof(m->uuid));

	bch2_write_super(c);

	mutex_unlock(&c->sb_lock);
	up_write(&c->state_lock);
	return 0;
err:
	if (test_bit(BCH_FS_rw, &c->flags) &&
	    ca->mi.state == BCH_MEMBER_STATE_rw &&
	    !enumerated_ref_is_zero(&ca->io_ref[READ]))
		__bch2_dev_read_write(c, ca);
	up_write(&c->state_lock);
	return ret;
}
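/*
 * Adding a brand new device, roughly: read its superblock, allocate a bch_dev
 * and attach the block device, allocate a slot in the member array of the
 * filesystem superblock and write it out; then, if the filesystem is already
 * started, initialize usage/freespace accounting and a journal for the new
 * device and bring it read-write.
 */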
/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
	struct bch_opts opts = bch2_opts_empty();
	struct bch_sb_handle sb = {};
	struct bch_dev *ca = NULL;
	struct printbuf errbuf = PRINTBUF;
	struct printbuf label = PRINTBUF;
	int ret = 0;

	ret = bch2_read_super(path, &opts, &sb);
	bch_err_msg(c, ret, "reading super");
	if (ret)
		goto err;

	struct bch_member dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);

	if (BCH_MEMBER_GROUP(&dev_mi)) {
		bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
		if (label.allocation_failure) {
			ret = -ENOMEM;
			goto err;
		}
	}

	if (list_empty(&c->list)) {
		mutex_lock(&bch_fs_list_lock);
		if (__bch2_uuid_to_fs(c->sb.uuid))
			ret = bch_err_throw(c, filesystem_uuid_already_open);
		else
			list_add(&c->list, &bch_fs_list);
		mutex_unlock(&bch_fs_list_lock);

		if (ret) {
			bch_err(c, "filesystem UUID already open");
			goto err;
		}
	}

	ret = bch2_dev_may_add(sb.sb, c);
	if (ret)
		goto err;

	ca = __bch2_dev_alloc(c, &dev_mi);
	if (!ca) {
		ret = -ENOMEM;
		goto err;
	}

	ret = __bch2_dev_attach_bdev(ca, &sb);
	if (ret)
		goto err;

	down_write(&c->state_lock);
	mutex_lock(&c->sb_lock);
	SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true);

	ret = bch2_sb_from_fs(c, ca);
	bch_err_msg(c, ret, "setting up new superblock");
	if (ret)
		goto err_unlock;

	if (dynamic_fault("bcachefs:add:no_slot"))
		goto err_unlock;

	ret = bch2_sb_member_alloc(c);
	if (ret < 0) {
		bch_err_msg(c, ret, "setting up new superblock");
		goto err_unlock;
	}
	unsigned dev_idx = ret;
	ret = 0;

	/* success: */

	dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds());
	*bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi;

	ca->disk_sb.sb->dev_idx = dev_idx;
	bch2_dev_attach(c, ca, dev_idx);

	if (BCH_MEMBER_GROUP(&dev_mi)) {
		ret = __bch2_dev_group_set(c, ca, label.buf);
		bch_err_msg(c, ret, "creating new label");
		if (ret)
			goto err_unlock;
	}

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (test_bit(BCH_FS_started, &c->flags)) {
		ret = bch2_dev_usage_init(ca, false);
		if (ret)
			goto err_late;

		ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
		bch_err_msg(ca, ret, "marking new superblock");
		if (ret)
			goto err_late;

		ret = bch2_fs_freespace_init(c);
		bch_err_msg(ca, ret, "initializing free space");
		if (ret)
			goto err_late;

		if (ca->mi.state == BCH_MEMBER_STATE_rw)
			__bch2_dev_read_write(c, ca);

		ret = bch2_dev_journal_alloc(ca, false);
		bch_err_msg(c, ret, "allocating journal");
		if (ret)
			goto err_late;
	}

	/*
	 * We just changed the superblock UUID, invalidate cache and send a
	 * uevent to update /dev/disk/by-uuid
	 */
	invalidate_bdev(ca->disk_sb.bdev);

	char uuid_str[37];
	snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid);

	char *envp[] = {
		"CHANGE=uuid",
		uuid_str,
		NULL,
	};
	kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp);

	up_write(&c->state_lock);
out:
	printbuf_exit(&label);
	printbuf_exit(&errbuf);
	bch_err_fn(c, ret);
	return ret;

err_unlock:
	mutex_unlock(&c->sb_lock);
	up_write(&c->state_lock);
err:
	if (ca)
		bch2_dev_free(ca);
	bch2_free_super(&sb);
	goto out;
err_late:
	up_write(&c->state_lock);
	ca = NULL;
	goto err;
}

/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path)
{
	struct bch_opts opts = bch2_opts_empty();
	struct bch_sb_handle sb = { NULL };
	struct bch_dev *ca;
	unsigned dev_idx;
	int ret;

	down_write(&c->state_lock);

	ret = bch2_read_super(path, &opts, &sb);

/* Hot add existing device to running filesystem: */
int bch2_dev_online(struct bch_fs *c, const char *path)
{
	struct bch_opts opts = bch2_opts_empty();
	struct bch_sb_handle sb = { NULL };
	struct bch_dev *ca;
	unsigned dev_idx;
	int ret;

	down_write(&c->state_lock);

	ret = bch2_read_super(path, &opts, &sb);
	if (ret) {
		up_write(&c->state_lock);
		return ret;
	}

	dev_idx = sb.sb->dev_idx;

	ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
	bch_err_msg(c, ret, "bringing %s online", path);
	if (ret)
		goto err;

	ret = bch2_dev_attach_bdev(c, &sb);
	if (ret)
		goto err;

	ca = bch2_dev_locked(c, dev_idx);

	ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
	bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
	if (ret)
		goto err;

	if (ca->mi.state == BCH_MEMBER_STATE_rw)
		__bch2_dev_read_write(c, ca);

	if (!ca->mi.freespace_initialized) {
		ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
		bch_err_msg(ca, ret, "initializing free space");
		if (ret)
			goto err;
	}

	if (!ca->journal.nr) {
		ret = bch2_dev_journal_alloc(ca, false);
		bch_err_msg(ca, ret, "allocating journal");
		if (ret)
			goto err;
	}

	mutex_lock(&c->sb_lock);
	bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
		cpu_to_le64(ktime_get_real_seconds());
	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	up_write(&c->state_lock);
	return 0;
err:
	up_write(&c->state_lock);
	bch2_free_super(&sb);
	return ret;
}

int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
{
	down_write(&c->state_lock);

	if (!bch2_dev_is_online(ca)) {
		bch_err(ca, "Already offline");
		up_write(&c->state_lock);
		return 0;
	}

	if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
		bch_err(ca, "Cannot offline required disk");
		up_write(&c->state_lock);
		return bch_err_throw(c, device_state_not_allowed);
	}

	__bch2_dev_offline(c, ca);

	up_write(&c->state_lock);
	return 0;
}
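
/*
 * Account the buckets added by a resize as free and initialize the freespace
 * btree for the new bucket range:
 */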
static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets)
{
	struct bch_fs *c = ca->fs;
	u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 };

	return bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
			bch2_disk_accounting_mod2(trans, false, v, dev_data_type,
					.dev = ca->dev_idx,
					.data_type = BCH_DATA_free)) ?:
		bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets);
}

int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
	struct bch_member *m;
	u64 old_nbuckets;
	int ret = 0;

	down_write(&c->state_lock);
	old_nbuckets = ca->mi.nbuckets;

	if (nbuckets < ca->mi.nbuckets) {
		bch_err(ca, "Cannot shrink yet");
		ret = -EINVAL;
		goto err;
	}

	if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
		bch_err(ca, "New device size too big (%llu greater than max %u)",
			nbuckets, BCH_MEMBER_NBUCKETS_MAX);
		ret = bch_err_throw(c, device_size_too_big);
		goto err;
	}

	if (bch2_dev_is_online(ca) &&
	    get_capacity(ca->disk_sb.bdev->bd_disk) <
	    ca->mi.bucket_size * nbuckets) {
		bch_err(ca, "New size larger than device");
		ret = bch_err_throw(c, device_size_too_small);
		goto err;
	}

	ret = bch2_dev_buckets_resize(c, ca, nbuckets);
	bch_err_msg(ca, ret, "resizing buckets");
	if (ret)
		goto err;

	ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional);
	if (ret)
		goto err;

	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
	m->nbuckets = cpu_to_le64(nbuckets);

	bch2_write_super(c);
	mutex_unlock(&c->sb_lock);

	if (ca->mi.freespace_initialized) {
		ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets);
		if (ret)
			goto err;
	}

	bch2_recalc_capacity(c);
err:
	up_write(&c->state_lock);
	return ret;
}

int bch2_fs_resize_on_mount(struct bch_fs *c)
{
	for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) {
		u64 old_nbuckets = ca->mi.nbuckets;
		u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk),
					     ca->mi.bucket_size);

		if (ca->mi.resize_on_mount &&
		    new_nbuckets > ca->mi.nbuckets) {
			bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size);
			int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets);
			bch_err_fn(ca, ret);
			if (ret) {
				enumerated_ref_put(&ca->io_ref[READ],
						   BCH_DEV_READ_REF_fs_resize_on_mount);
				up_write(&c->state_lock);
				return ret;
			}

			mutex_lock(&c->sb_lock);
			struct bch_member *m =
				bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
			m->nbuckets = cpu_to_le64(new_nbuckets);
			SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false);

			c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image));
			bch2_write_super(c);
			mutex_unlock(&c->sb_lock);

			if (ca->mi.freespace_initialized) {
				ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets);
				if (ret) {
					enumerated_ref_put(&ca->io_ref[READ],
							   BCH_DEV_READ_REF_fs_resize_on_mount);
					up_write(&c->state_lock);
					return ret;
				}
			}
		}
	}
	return 0;
}

/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
	if (!strncmp(name, "/dev/", strlen("/dev/")))
		name += strlen("/dev/");

	for_each_member_device(c, ca)
		if (!strcmp(name, ca->name))
			return ca;
	return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
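
/*
 * Example (sketch, illustrative only): a caller resolving a device by name or
 * path must drop the ref taken by bch2_dev_lookup() with bch2_dev_put():
 *
 *	struct bch_dev *ca = bch2_dev_lookup(c, "/dev/sdb");
 *	if (IS_ERR(ca))
 *		return PTR_ERR(ca);
 *	...
 *	bch2_dev_put(ca);
 */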

/* blk_holder_ops: */

static struct bch_fs *bdev_get_fs(struct block_device *bdev)
	__releases(&bdev->bd_holder_lock)
{
	struct bch_sb_handle_holder *holder = bdev->bd_holder;
	struct bch_fs *c = holder->c;

	if (c && !bch2_ro_ref_tryget(c))
		c = NULL;

	mutex_unlock(&bdev->bd_holder_lock);

	if (c)
		wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags));
	return c;
}

/* returns with ref on ca->ref */
static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev)
{
	for_each_member_device(c, ca)
		if (ca->disk_sb.bdev == bdev)
			return ca;
	return NULL;
}

static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
{
	struct bch_fs *c = bdev_get_fs(bdev);
	if (!c)
		return;

	struct super_block *sb = c->vfs_sb;
	if (sb) {
		/*
		 * Not necessary, c->ro_ref guards against the filesystem being
		 * unmounted - we only take this to avoid a warning in
		 * sync_filesystem:
		 */
		down_read(&sb->s_umount);
	}

	down_write(&c->state_lock);
	struct bch_dev *ca = bdev_to_bch_dev(c, bdev);
	if (!ca)
		goto unlock;

	bool dev = bch2_dev_state_allowed(c, ca,
					  BCH_MEMBER_STATE_failed,
					  BCH_FORCE_IF_DEGRADED);

	if (!dev && sb) {
		if (!surprise)
			sync_filesystem(sb);
		shrink_dcache_sb(sb);
		evict_inodes(sb);
	}

	struct printbuf buf = PRINTBUF;
	__bch2_log_msg_start(ca->name, &buf);

	prt_printf(&buf, "offline from block layer");

	if (dev) {
		__bch2_dev_offline(c, ca);
	} else {
		bch2_journal_flush(&c->journal);
		bch2_fs_emergency_read_only2(c, &buf);
	}

	bch2_print_str(c, KERN_ERR, buf.buf);
	printbuf_exit(&buf);

	bch2_dev_put(ca);
unlock:
	if (sb)
		up_read(&sb->s_umount);
	up_write(&c->state_lock);
	bch2_ro_ref_put(c);
}

static void bch2_fs_bdev_sync(struct block_device *bdev)
{
	struct bch_fs *c = bdev_get_fs(bdev);
	if (!c)
		return;

	struct super_block *sb = c->vfs_sb;
	if (sb) {
		/*
		 * Not necessary, c->ro_ref guards against the filesystem being
		 * unmounted - we only take this to avoid a warning in
		 * sync_filesystem:
		 */
		down_read(&sb->s_umount);
		sync_filesystem(sb);
		up_read(&sb->s_umount);
	}

	bch2_ro_ref_put(c);
}

const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
	.mark_dead = bch2_fs_bdev_mark_dead,
	.sync = bch2_fs_bdev_sync,
};

/* Filesystem open: */

static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
{
	return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
		cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
}

struct bch_fs *bch2_fs_open(darray_const_str *devices,
			    struct bch_opts *opts)
{
	bch_sb_handles sbs = {};
	struct bch_fs *c = NULL;
	struct bch_sb_handle *best = NULL;
	struct printbuf errbuf = PRINTBUF;
	int ret = 0;

	if (!try_module_get(THIS_MODULE))
		return ERR_PTR(-ENODEV);

	if (!devices->nr) {
		ret = -EINVAL;
		goto err;
	}

	ret = darray_make_room(&sbs, devices->nr);
	if (ret)
		goto err;

	darray_for_each(*devices, i) {
		struct bch_sb_handle sb = { NULL };

		ret = bch2_read_super(*i, opts, &sb);
		if (ret)
			goto err;

		BUG_ON(darray_push(&sbs, sb));
	}

	if (opts->nochanges && !opts->read_only) {
		ret = bch_err_throw(c, erofs_nochanges);
		goto err_print;
	}

	darray_for_each(sbs, sb)
		if (!best || sb_cmp(sb->sb, best->sb) > 0)
			best = sb;

	darray_for_each_reverse(sbs, sb) {
		ret = bch2_dev_in_fs(best, sb, opts);

		if (ret == -BCH_ERR_device_has_been_removed ||
		    ret == -BCH_ERR_device_splitbrain) {
			bch2_free_super(sb);
			darray_remove_item(&sbs, sb);
			best -= best > sb;
			ret = 0;
			continue;
		}

		if (ret)
			goto err_print;
	}

	c = bch2_fs_alloc(best->sb, opts, &sbs);
	ret = PTR_ERR_OR_ZERO(c);
	if (ret)
		goto err;

	down_write(&c->state_lock);
	darray_for_each(sbs, sb) {
		ret = bch2_dev_attach_bdev(c, sb);
		if (ret) {
			up_write(&c->state_lock);
			goto err;
		}
	}
	up_write(&c->state_lock);

	if (!c->opts.nostart) {
		ret = bch2_fs_start(c);
		if (ret)
			goto err;
	}
out:
	darray_for_each(sbs, sb)
		bch2_free_super(sb);
	darray_exit(&sbs);
	printbuf_exit(&errbuf);
	module_put(THIS_MODULE);
	return c;
err_print:
	pr_err("bch_fs_open err opening %s: %s",
	       devices->data[0], bch2_err_str(ret));
err:
	if (!IS_ERR_OR_NULL(c))
		bch2_fs_stop(c);
	c = ERR_PTR(ret);
	goto out;
}
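
/*
 * Summary of bch2_fs_open() above: read the superblock of every device given,
 * take the newest one (highest seq, then write_time) as authoritative, drop
 * members that it says were removed or that are split-brain, then allocate the
 * bch_fs, attach the block devices, and start the filesystem unless
 * opts.nostart is set.
 */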

/* Global interfaces/init */

static void bcachefs_exit(void)
{
	bch2_debug_exit();
	bch2_vfs_exit();
	bch2_chardev_exit();
	bch2_btree_key_cache_exit();
	if (bcachefs_kset)
		kset_unregister(bcachefs_kset);
}

static int __init bcachefs_init(void)
{
	bch2_bkey_pack_test();

	if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
	    bch2_btree_key_cache_init() ||
	    bch2_chardev_init() ||
	    bch2_vfs_init() ||
	    bch2_debug_init())
		goto err;

	return 0;
err:
	bcachefs_exit();
	return -ENOMEM;
}

#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name);
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM

static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp)
{
	/* Match bool exactly, by re-using it. */
	struct static_key *key = kp->arg;
	struct kernel_param boolkp = *kp;
	bool v;
	int ret;

	boolkp.arg = &v;

	ret = param_set_bool(val, &boolkp);
	if (ret)
		return ret;
	if (v)
		static_key_enable(key);
	else
		static_key_disable(key);
	return 0;
}

static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp)
{
	struct static_key *key = kp->arg;
	return sprintf(buffer, "%c\n", static_key_enabled(key) ? 'Y' : 'N');
}

static const struct kernel_param_ops bch2_param_ops_static_key_t = {
	.flags = KERNEL_PARAM_OPS_FL_NOARG,
	.set = bch2_param_set_static_key_t,
	.get = bch2_param_get_static_key_t,
};

#define BCH_DEBUG_PARAM(name, description)				\
	module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\
	__MODULE_PARM_TYPE(name, "static_key_t");			\
	MODULE_PARM_DESC(name, description);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM

__maybe_unused
static unsigned bch2_metadata_version = bcachefs_metadata_version_current;
module_param_named(version, bch2_metadata_version, uint, 0444);

module_exit(bcachefs_exit);
module_init(bcachefs_init);