1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "checksum.h" 5 #include "counters.h" 6 #include "disk_groups.h" 7 #include "ec.h" 8 #include "error.h" 9 #include "journal.h" 10 #include "journal_sb.h" 11 #include "journal_seq_blacklist.h" 12 #include "recovery.h" 13 #include "replicas.h" 14 #include "quota.h" 15 #include "sb-clean.h" 16 #include "sb-errors.h" 17 #include "sb-members.h" 18 #include "super-io.h" 19 #include "super.h" 20 #include "trace.h" 21 #include "vstructs.h" 22 23 #include <linux/backing-dev.h> 24 #include <linux/sort.h> 25 26 static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { 27 }; 28 29 struct bch2_metadata_version { 30 u16 version; 31 const char *name; 32 u64 recovery_passes; 33 }; 34 35 static const struct bch2_metadata_version bch2_metadata_versions[] = { 36 #define x(n, v, _recovery_passes) { \ 37 .version = v, \ 38 .name = #n, \ 39 .recovery_passes = _recovery_passes, \ 40 }, 41 BCH_METADATA_VERSIONS() 42 #undef x 43 }; 44 45 void bch2_version_to_text(struct printbuf *out, unsigned v) 46 { 47 const char *str = "(unknown version)"; 48 49 for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) 50 if (bch2_metadata_versions[i].version == v) { 51 str = bch2_metadata_versions[i].name; 52 break; 53 } 54 55 prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); 56 } 57 58 unsigned bch2_latest_compatible_version(unsigned v) 59 { 60 if (!BCH_VERSION_MAJOR(v)) 61 return v; 62 63 for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) 64 if (bch2_metadata_versions[i].version > v && 65 BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) == 66 BCH_VERSION_MAJOR(v)) 67 v = bch2_metadata_versions[i].version; 68 69 return v; 70 } 71 72 u64 bch2_upgrade_recovery_passes(struct bch_fs *c, 73 unsigned old_version, 74 unsigned new_version) 75 { 76 u64 ret = 0; 77 78 for (const struct bch2_metadata_version *i = bch2_metadata_versions; 79 i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions); 80 i++) 81 if (i->version > old_version && i->version <= new_version) { 82 if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK) 83 ret |= bch2_fsck_recovery_passes(); 84 ret |= i->recovery_passes; 85 } 86 87 return ret &= ~RECOVERY_PASS_ALL_FSCK; 88 } 89 90 const char * const bch2_sb_fields[] = { 91 #define x(name, nr) #name, 92 BCH_SB_FIELDS() 93 #undef x 94 NULL 95 }; 96 97 static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, 98 struct printbuf *); 99 100 struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, 101 enum bch_sb_field_type type) 102 { 103 struct bch_sb_field *f; 104 105 /* XXX: need locking around superblock to access optional fields */ 106 107 vstruct_for_each(sb, f) 108 if (le32_to_cpu(f->type) == type) 109 return f; 110 return NULL; 111 } 112 113 static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, 114 struct bch_sb_field *f, 115 unsigned u64s) 116 { 117 unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; 118 unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; 119 120 BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size); 121 122 if (!f && !u64s) { 123 /* nothing to do: */ 124 } else if (!f) { 125 f = vstruct_last(sb->sb); 126 memset(f, 0, sizeof(u64) * u64s); 127 f->u64s = cpu_to_le32(u64s); 128 f->type = 0; 129 } else { 130 void *src, *dst; 131 132 src = vstruct_end(f); 133 134 if (u64s) { 135 f->u64s = cpu_to_le32(u64s); 136 dst = vstruct_end(f); 137 } else { 138 dst = f; 139 } 140 141 memmove(dst, src, vstruct_end(sb->sb) - src); 142 143 if (dst > src) 144 memset(src, 0, dst - src); 145 } 146 147 sb->sb->u64s = cpu_to_le32(sb_u64s); 148 149 return u64s ? f : NULL; 150 } 151 152 void bch2_sb_field_delete(struct bch_sb_handle *sb, 153 enum bch_sb_field_type type) 154 { 155 struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); 156 157 if (f) 158 __bch2_sb_field_resize(sb, f, 0); 159 } 160 161 /* Superblock realloc/free: */ 162 163 void bch2_free_super(struct bch_sb_handle *sb) 164 { 165 kfree(sb->bio); 166 if (!IS_ERR_OR_NULL(sb->bdev)) 167 blkdev_put(sb->bdev, sb->holder); 168 kfree(sb->holder); 169 170 kfree(sb->sb); 171 memset(sb, 0, sizeof(*sb)); 172 } 173 174 int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) 175 { 176 size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); 177 size_t new_buffer_size; 178 struct bch_sb *new_sb; 179 struct bio *bio; 180 181 if (sb->bdev) 182 new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev)); 183 184 new_buffer_size = roundup_pow_of_two(new_bytes); 185 186 if (sb->sb && sb->buffer_size >= new_buffer_size) 187 return 0; 188 189 if (sb->sb && sb->have_layout) { 190 u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; 191 192 if (new_bytes > max_bytes) { 193 pr_err("%pg: superblock too big: want %zu but have %llu", 194 sb->bdev, new_bytes, max_bytes); 195 return -BCH_ERR_ENOSPC_sb; 196 } 197 } 198 199 if (sb->buffer_size >= new_buffer_size && sb->sb) 200 return 0; 201 202 if (dynamic_fault("bcachefs:add:super_realloc")) 203 return -BCH_ERR_ENOMEM_sb_realloc_injected; 204 205 new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); 206 if (!new_sb) 207 return -BCH_ERR_ENOMEM_sb_buf_realloc; 208 209 sb->sb = new_sb; 210 211 if (sb->have_bio) { 212 unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size); 213 214 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 215 if (!bio) 216 return -BCH_ERR_ENOMEM_sb_bio_realloc; 217 218 bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); 219 220 kfree(sb->bio); 221 sb->bio = bio; 222 } 223 224 sb->buffer_size = new_buffer_size; 225 226 return 0; 227 } 228 229 struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, 230 enum bch_sb_field_type type, 231 unsigned u64s) 232 { 233 struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); 234 ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; 235 ssize_t d = -old_u64s + u64s; 236 237 if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) 238 return NULL; 239 240 if (sb->fs_sb) { 241 struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); 242 struct bch_dev *ca; 243 unsigned i; 244 245 lockdep_assert_held(&c->sb_lock); 246 247 /* XXX: we're not checking that offline device have enough space */ 248 249 for_each_online_member(ca, c, i) { 250 struct bch_sb_handle *dev_sb = &ca->disk_sb; 251 252 if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { 253 percpu_ref_put(&ca->ref); 254 return NULL; 255 } 256 } 257 } 258 259 f = bch2_sb_field_get_id(sb->sb, type); 260 f = __bch2_sb_field_resize(sb, f, u64s); 261 if (f) 262 f->type = cpu_to_le32(type); 263 return f; 264 } 265 266 /* Superblock validate: */ 267 268 static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) 269 { 270 u64 offset, prev_offset, max_sectors; 271 unsigned i; 272 273 BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); 274 275 if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && 276 !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { 277 prt_printf(out, "Not a bcachefs superblock layout"); 278 return -BCH_ERR_invalid_sb_layout; 279 } 280 281 if (layout->layout_type != 0) { 282 prt_printf(out, "Invalid superblock layout type %u", 283 layout->layout_type); 284 return -BCH_ERR_invalid_sb_layout_type; 285 } 286 287 if (!layout->nr_superblocks) { 288 prt_printf(out, "Invalid superblock layout: no superblocks"); 289 return -BCH_ERR_invalid_sb_layout_nr_superblocks; 290 } 291 292 if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { 293 prt_printf(out, "Invalid superblock layout: too many superblocks"); 294 return -BCH_ERR_invalid_sb_layout_nr_superblocks; 295 } 296 297 max_sectors = 1 << layout->sb_max_size_bits; 298 299 prev_offset = le64_to_cpu(layout->sb_offset[0]); 300 301 for (i = 1; i < layout->nr_superblocks; i++) { 302 offset = le64_to_cpu(layout->sb_offset[i]); 303 304 if (offset < prev_offset + max_sectors) { 305 prt_printf(out, "Invalid superblock layout: superblocks overlap\n" 306 " (sb %u ends at %llu next starts at %llu", 307 i - 1, prev_offset + max_sectors, offset); 308 return -BCH_ERR_invalid_sb_layout_superblocks_overlap; 309 } 310 prev_offset = offset; 311 } 312 313 return 0; 314 } 315 316 static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) 317 { 318 u16 version = le16_to_cpu(sb->version); 319 u16 version_min = le16_to_cpu(sb->version_min); 320 321 if (!bch2_version_compatible(version)) { 322 prt_str(out, "Unsupported superblock version "); 323 bch2_version_to_text(out, version); 324 prt_str(out, " (min "); 325 bch2_version_to_text(out, bcachefs_metadata_version_min); 326 prt_str(out, ", max "); 327 bch2_version_to_text(out, bcachefs_metadata_version_current); 328 prt_str(out, ")"); 329 return -BCH_ERR_invalid_sb_version; 330 } 331 332 if (!bch2_version_compatible(version_min)) { 333 prt_str(out, "Unsupported superblock version_min "); 334 bch2_version_to_text(out, version_min); 335 prt_str(out, " (min "); 336 bch2_version_to_text(out, bcachefs_metadata_version_min); 337 prt_str(out, ", max "); 338 bch2_version_to_text(out, bcachefs_metadata_version_current); 339 prt_str(out, ")"); 340 return -BCH_ERR_invalid_sb_version; 341 } 342 343 if (version_min > version) { 344 prt_str(out, "Bad minimum version "); 345 bch2_version_to_text(out, version_min); 346 prt_str(out, ", greater than version field "); 347 bch2_version_to_text(out, version); 348 return -BCH_ERR_invalid_sb_version; 349 } 350 351 return 0; 352 } 353 354 static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, 355 int rw) 356 { 357 struct bch_sb *sb = disk_sb->sb; 358 struct bch_sb_field *f; 359 struct bch_sb_field_members_v1 *mi; 360 enum bch_opt_id opt_id; 361 u16 block_size; 362 int ret; 363 364 ret = bch2_sb_compatible(sb, out); 365 if (ret) 366 return ret; 367 368 if (sb->features[1] || 369 (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { 370 prt_printf(out, "Filesystem has incompatible features"); 371 return -BCH_ERR_invalid_sb_features; 372 } 373 374 block_size = le16_to_cpu(sb->block_size); 375 376 if (block_size > PAGE_SECTORS) { 377 prt_printf(out, "Block size too big (got %u, max %u)", 378 block_size, PAGE_SECTORS); 379 return -BCH_ERR_invalid_sb_block_size; 380 } 381 382 if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { 383 prt_printf(out, "Bad user UUID (got zeroes)"); 384 return -BCH_ERR_invalid_sb_uuid; 385 } 386 387 if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) { 388 prt_printf(out, "Bad internal UUID (got zeroes)"); 389 return -BCH_ERR_invalid_sb_uuid; 390 } 391 392 if (!sb->nr_devices || 393 sb->nr_devices > BCH_SB_MEMBERS_MAX) { 394 prt_printf(out, "Bad number of member devices %u (max %u)", 395 sb->nr_devices, BCH_SB_MEMBERS_MAX); 396 return -BCH_ERR_invalid_sb_too_many_members; 397 } 398 399 if (sb->dev_idx >= sb->nr_devices) { 400 prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)", 401 sb->dev_idx, sb->nr_devices); 402 return -BCH_ERR_invalid_sb_dev_idx; 403 } 404 405 if (!sb->time_precision || 406 le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { 407 prt_printf(out, "Invalid time precision: %u (min 1, max %lu)", 408 le32_to_cpu(sb->time_precision), NSEC_PER_SEC); 409 return -BCH_ERR_invalid_sb_time_precision; 410 } 411 412 if (rw == READ) { 413 /* 414 * Been seeing a bug where these are getting inexplicably 415 * zeroed, so we're now validating them, but we have to be 416 * careful not to preven people's filesystems from mounting: 417 */ 418 if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) 419 SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); 420 if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) 421 SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); 422 423 if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) 424 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); 425 } 426 427 for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { 428 const struct bch_option *opt = bch2_opt_table + opt_id; 429 430 if (opt->get_sb != BCH2_NO_SB_OPT) { 431 u64 v = bch2_opt_from_sb(sb, opt_id); 432 433 prt_printf(out, "Invalid option "); 434 ret = bch2_opt_validate(opt, v, out); 435 if (ret) 436 return ret; 437 438 printbuf_reset(out); 439 } 440 } 441 442 /* validate layout */ 443 ret = validate_sb_layout(&sb->layout, out); 444 if (ret) 445 return ret; 446 447 vstruct_for_each(sb, f) { 448 if (!f->u64s) { 449 prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)", 450 le32_to_cpu(f->type)); 451 return -BCH_ERR_invalid_sb_field_size; 452 } 453 454 if (vstruct_next(f) > vstruct_last(sb)) { 455 prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", 456 le32_to_cpu(f->type)); 457 return -BCH_ERR_invalid_sb_field_size; 458 } 459 } 460 461 /* members must be validated first: */ 462 mi = bch2_sb_field_get(sb, members_v1); 463 if (!mi) { 464 prt_printf(out, "Invalid superblock: member info area missing"); 465 return -BCH_ERR_invalid_sb_members_missing; 466 } 467 468 ret = bch2_sb_field_validate(sb, &mi->field, out); 469 if (ret) 470 return ret; 471 472 vstruct_for_each(sb, f) { 473 if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1) 474 continue; 475 476 ret = bch2_sb_field_validate(sb, f, out); 477 if (ret) 478 return ret; 479 } 480 481 return 0; 482 } 483 484 /* device open: */ 485 486 static void bch2_sb_update(struct bch_fs *c) 487 { 488 struct bch_sb *src = c->disk_sb.sb; 489 struct bch_dev *ca; 490 unsigned i; 491 492 lockdep_assert_held(&c->sb_lock); 493 494 c->sb.uuid = src->uuid; 495 c->sb.user_uuid = src->user_uuid; 496 c->sb.version = le16_to_cpu(src->version); 497 c->sb.version_min = le16_to_cpu(src->version_min); 498 c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); 499 c->sb.nr_devices = src->nr_devices; 500 c->sb.clean = BCH_SB_CLEAN(src); 501 c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); 502 503 c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); 504 c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; 505 506 /* XXX this is wrong, we need a 96 or 128 bit integer type */ 507 c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo), 508 c->sb.nsec_per_time_unit); 509 c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); 510 511 c->sb.features = le64_to_cpu(src->features[0]); 512 c->sb.compat = le64_to_cpu(src->compat[0]); 513 514 for_each_member_device(ca, c, i) { 515 struct bch_member m = bch2_sb_member_get(src, i); 516 ca->mi = bch2_mi_to_cpu(&m); 517 } 518 } 519 520 static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) 521 { 522 struct bch_sb_field *src_f, *dst_f; 523 struct bch_sb *dst = dst_handle->sb; 524 unsigned i; 525 526 dst->version = src->version; 527 dst->version_min = src->version_min; 528 dst->seq = src->seq; 529 dst->uuid = src->uuid; 530 dst->user_uuid = src->user_uuid; 531 memcpy(dst->label, src->label, sizeof(dst->label)); 532 533 dst->block_size = src->block_size; 534 dst->nr_devices = src->nr_devices; 535 536 dst->time_base_lo = src->time_base_lo; 537 dst->time_base_hi = src->time_base_hi; 538 dst->time_precision = src->time_precision; 539 540 memcpy(dst->flags, src->flags, sizeof(dst->flags)); 541 memcpy(dst->features, src->features, sizeof(dst->features)); 542 memcpy(dst->compat, src->compat, sizeof(dst->compat)); 543 544 for (i = 0; i < BCH_SB_FIELD_NR; i++) { 545 int d; 546 547 if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) 548 continue; 549 550 src_f = bch2_sb_field_get_id(src, i); 551 dst_f = bch2_sb_field_get_id(dst, i); 552 553 d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - 554 (dst_f ? le32_to_cpu(dst_f->u64s) : 0); 555 if (d > 0) { 556 int ret = bch2_sb_realloc(dst_handle, 557 le32_to_cpu(dst_handle->sb->u64s) + d); 558 559 if (ret) 560 return ret; 561 562 dst = dst_handle->sb; 563 dst_f = bch2_sb_field_get_id(dst, i); 564 } 565 566 dst_f = __bch2_sb_field_resize(dst_handle, dst_f, 567 src_f ? le32_to_cpu(src_f->u64s) : 0); 568 569 if (src_f) 570 memcpy(dst_f, src_f, vstruct_bytes(src_f)); 571 } 572 573 return 0; 574 } 575 576 int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) 577 { 578 int ret; 579 580 lockdep_assert_held(&c->sb_lock); 581 582 ret = bch2_sb_realloc(&c->disk_sb, 0) ?: 583 __copy_super(&c->disk_sb, src) ?: 584 bch2_sb_replicas_to_cpu_replicas(c) ?: 585 bch2_sb_disk_groups_to_cpu(c); 586 if (ret) 587 return ret; 588 589 bch2_sb_update(c); 590 return 0; 591 } 592 593 int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) 594 { 595 return __copy_super(&ca->disk_sb, c->disk_sb.sb); 596 } 597 598 /* read superblock: */ 599 600 static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) 601 { 602 struct bch_csum csum; 603 size_t bytes; 604 int ret; 605 reread: 606 bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); 607 sb->bio->bi_iter.bi_sector = offset; 608 bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); 609 610 ret = submit_bio_wait(sb->bio); 611 if (ret) { 612 prt_printf(err, "IO error: %i", ret); 613 return ret; 614 } 615 616 if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && 617 !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { 618 prt_printf(err, "Not a bcachefs superblock"); 619 return -BCH_ERR_invalid_sb_magic; 620 } 621 622 ret = bch2_sb_compatible(sb->sb, err); 623 if (ret) 624 return ret; 625 626 bytes = vstruct_bytes(sb->sb); 627 628 if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { 629 prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", 630 bytes, 512UL << sb->sb->layout.sb_max_size_bits); 631 return -BCH_ERR_invalid_sb_too_big; 632 } 633 634 if (bytes > sb->buffer_size) { 635 ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)); 636 if (ret) 637 return ret; 638 goto reread; 639 } 640 641 if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { 642 prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); 643 return -BCH_ERR_invalid_sb_csum_type; 644 } 645 646 /* XXX: verify MACs */ 647 csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), 648 null_nonce(), sb->sb); 649 650 if (bch2_crc_cmp(csum, sb->sb->csum)) { 651 prt_printf(err, "bad checksum"); 652 return -BCH_ERR_invalid_sb_csum; 653 } 654 655 sb->seq = le64_to_cpu(sb->sb->seq); 656 657 return 0; 658 } 659 660 int bch2_read_super(const char *path, struct bch_opts *opts, 661 struct bch_sb_handle *sb) 662 { 663 u64 offset = opt_get(*opts, sb); 664 struct bch_sb_layout layout; 665 struct printbuf err = PRINTBUF; 666 __le64 *i; 667 int ret; 668 #ifndef __KERNEL__ 669 retry: 670 #endif 671 memset(sb, 0, sizeof(*sb)); 672 sb->mode = BLK_OPEN_READ; 673 sb->have_bio = true; 674 sb->holder = kmalloc(1, GFP_KERNEL); 675 if (!sb->holder) 676 return -ENOMEM; 677 678 #ifndef __KERNEL__ 679 if (opt_get(*opts, direct_io) == false) 680 sb->mode |= BLK_OPEN_BUFFERED; 681 #endif 682 683 if (!opt_get(*opts, noexcl)) 684 sb->mode |= BLK_OPEN_EXCL; 685 686 if (!opt_get(*opts, nochanges)) 687 sb->mode |= BLK_OPEN_WRITE; 688 689 sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); 690 if (IS_ERR(sb->bdev) && 691 PTR_ERR(sb->bdev) == -EACCES && 692 opt_get(*opts, read_only)) { 693 sb->mode &= ~BLK_OPEN_WRITE; 694 695 sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); 696 if (!IS_ERR(sb->bdev)) 697 opt_set(*opts, nochanges, true); 698 } 699 700 if (IS_ERR(sb->bdev)) { 701 ret = PTR_ERR(sb->bdev); 702 goto out; 703 } 704 705 ret = bch2_sb_realloc(sb, 0); 706 if (ret) { 707 prt_printf(&err, "error allocating memory for superblock"); 708 goto err; 709 } 710 711 if (bch2_fs_init_fault("read_super")) { 712 prt_printf(&err, "dynamic fault"); 713 ret = -EFAULT; 714 goto err; 715 } 716 717 ret = read_one_super(sb, offset, &err); 718 if (!ret) 719 goto got_super; 720 721 if (opt_defined(*opts, sb)) 722 goto err; 723 724 printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n", 725 path, err.buf); 726 printbuf_reset(&err); 727 728 /* 729 * Error reading primary superblock - read location of backup 730 * superblocks: 731 */ 732 bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); 733 sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; 734 /* 735 * use sb buffer to read layout, since sb buffer is page aligned but 736 * layout won't be: 737 */ 738 bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); 739 740 ret = submit_bio_wait(sb->bio); 741 if (ret) { 742 prt_printf(&err, "IO error: %i", ret); 743 goto err; 744 } 745 746 memcpy(&layout, sb->sb, sizeof(layout)); 747 ret = validate_sb_layout(&layout, &err); 748 if (ret) 749 goto err; 750 751 for (i = layout.sb_offset; 752 i < layout.sb_offset + layout.nr_superblocks; i++) { 753 offset = le64_to_cpu(*i); 754 755 if (offset == opt_get(*opts, sb)) 756 continue; 757 758 ret = read_one_super(sb, offset, &err); 759 if (!ret) 760 goto got_super; 761 } 762 763 goto err; 764 765 got_super: 766 if (le16_to_cpu(sb->sb->block_size) << 9 < 767 bdev_logical_block_size(sb->bdev) && 768 opt_get(*opts, direct_io)) { 769 #ifndef __KERNEL__ 770 opt_set(*opts, direct_io, false); 771 bch2_free_super(sb); 772 goto retry; 773 #endif 774 prt_printf(&err, "block size (%u) smaller than device block size (%u)", 775 le16_to_cpu(sb->sb->block_size) << 9, 776 bdev_logical_block_size(sb->bdev)); 777 ret = -BCH_ERR_block_size_too_small; 778 goto err; 779 } 780 781 ret = 0; 782 sb->have_layout = true; 783 784 ret = bch2_sb_validate(sb, &err, READ); 785 if (ret) { 786 printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n", 787 path, err.buf); 788 goto err_no_print; 789 } 790 out: 791 printbuf_exit(&err); 792 return ret; 793 err: 794 printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n", 795 path, err.buf); 796 err_no_print: 797 bch2_free_super(sb); 798 goto out; 799 } 800 801 /* write superblock: */ 802 803 static void write_super_endio(struct bio *bio) 804 { 805 struct bch_dev *ca = bio->bi_private; 806 807 /* XXX: return errors directly */ 808 809 if (bch2_dev_io_err_on(bio->bi_status, ca, 810 bio_data_dir(bio) 811 ? BCH_MEMBER_ERROR_write 812 : BCH_MEMBER_ERROR_read, 813 "superblock %s error: %s", 814 bio_data_dir(bio) ? "write" : "read", 815 bch2_blk_status_to_str(bio->bi_status))) 816 ca->sb_write_error = 1; 817 818 closure_put(&ca->fs->sb_write); 819 percpu_ref_put(&ca->io_ref); 820 } 821 822 static void read_back_super(struct bch_fs *c, struct bch_dev *ca) 823 { 824 struct bch_sb *sb = ca->disk_sb.sb; 825 struct bio *bio = ca->disk_sb.bio; 826 827 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); 828 bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); 829 bio->bi_end_io = write_super_endio; 830 bio->bi_private = ca; 831 bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); 832 833 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], 834 bio_sectors(bio)); 835 836 percpu_ref_get(&ca->io_ref); 837 closure_bio_submit(bio, &c->sb_write); 838 } 839 840 static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) 841 { 842 struct bch_sb *sb = ca->disk_sb.sb; 843 struct bio *bio = ca->disk_sb.bio; 844 845 sb->offset = sb->layout.sb_offset[idx]; 846 847 SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false)); 848 sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), 849 null_nonce(), sb); 850 851 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 852 bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); 853 bio->bi_end_io = write_super_endio; 854 bio->bi_private = ca; 855 bch2_bio_map(bio, sb, 856 roundup((size_t) vstruct_bytes(sb), 857 bdev_logical_block_size(ca->disk_sb.bdev))); 858 859 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], 860 bio_sectors(bio)); 861 862 percpu_ref_get(&ca->io_ref); 863 closure_bio_submit(bio, &c->sb_write); 864 } 865 866 int bch2_write_super(struct bch_fs *c) 867 { 868 struct closure *cl = &c->sb_write; 869 struct bch_dev *ca; 870 struct printbuf err = PRINTBUF; 871 unsigned i, sb = 0, nr_wrote; 872 struct bch_devs_mask sb_written; 873 bool wrote, can_mount_without_written, can_mount_with_written; 874 unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; 875 int ret = 0; 876 877 trace_and_count(c, write_super, c, _RET_IP_); 878 879 if (c->opts.very_degraded) 880 degraded_flags |= BCH_FORCE_IF_LOST; 881 882 lockdep_assert_held(&c->sb_lock); 883 884 closure_init_stack(cl); 885 memset(&sb_written, 0, sizeof(sb_written)); 886 887 /* Make sure we're using the new magic numbers: */ 888 c->disk_sb.sb->magic = BCHFS_MAGIC; 889 c->disk_sb.sb->layout.magic = BCHFS_MAGIC; 890 891 le64_add_cpu(&c->disk_sb.sb->seq, 1); 892 893 if (test_bit(BCH_FS_ERROR, &c->flags)) 894 SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); 895 if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags)) 896 SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); 897 898 SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); 899 900 bch2_sb_counters_from_cpu(c); 901 bch2_sb_members_from_cpu(c); 902 bch2_sb_members_cpy_v2_v1(&c->disk_sb); 903 bch2_sb_errors_from_cpu(c); 904 905 for_each_online_member(ca, c, i) 906 bch2_sb_from_fs(c, ca); 907 908 for_each_online_member(ca, c, i) { 909 printbuf_reset(&err); 910 911 ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); 912 if (ret) { 913 bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); 914 percpu_ref_put(&ca->io_ref); 915 goto out; 916 } 917 } 918 919 if (c->opts.nochanges) 920 goto out; 921 922 /* 923 * Defer writing the superblock until filesystem initialization is 924 * complete - don't write out a partly initialized superblock: 925 */ 926 if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) 927 goto out; 928 929 for_each_online_member(ca, c, i) { 930 __set_bit(ca->dev_idx, sb_written.d); 931 ca->sb_write_error = 0; 932 } 933 934 for_each_online_member(ca, c, i) 935 read_back_super(c, ca); 936 closure_sync(cl); 937 938 for_each_online_member(ca, c, i) { 939 if (ca->sb_write_error) 940 continue; 941 942 if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { 943 bch2_fs_fatal_error(c, 944 "Superblock write was silently dropped! (seq %llu expected %llu)", 945 le64_to_cpu(ca->sb_read_scratch->seq), 946 ca->disk_sb.seq); 947 percpu_ref_put(&ca->io_ref); 948 ret = -BCH_ERR_erofs_sb_err; 949 goto out; 950 } 951 952 if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { 953 bch2_fs_fatal_error(c, 954 "Superblock modified by another process (seq %llu expected %llu)", 955 le64_to_cpu(ca->sb_read_scratch->seq), 956 ca->disk_sb.seq); 957 percpu_ref_put(&ca->io_ref); 958 ret = -BCH_ERR_erofs_sb_err; 959 goto out; 960 } 961 } 962 963 do { 964 wrote = false; 965 for_each_online_member(ca, c, i) 966 if (!ca->sb_write_error && 967 sb < ca->disk_sb.sb->layout.nr_superblocks) { 968 write_one_super(c, ca, sb); 969 wrote = true; 970 } 971 closure_sync(cl); 972 sb++; 973 } while (wrote); 974 975 for_each_online_member(ca, c, i) { 976 if (ca->sb_write_error) 977 __clear_bit(ca->dev_idx, sb_written.d); 978 else 979 ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); 980 } 981 982 nr_wrote = dev_mask_nr(&sb_written); 983 984 can_mount_with_written = 985 bch2_have_enough_devs(c, sb_written, degraded_flags, false); 986 987 for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) 988 sb_written.d[i] = ~sb_written.d[i]; 989 990 can_mount_without_written = 991 bch2_have_enough_devs(c, sb_written, degraded_flags, false); 992 993 /* 994 * If we would be able to mount _without_ the devices we successfully 995 * wrote superblocks to, we weren't able to write to enough devices: 996 * 997 * Exception: if we can mount without the successes because we haven't 998 * written anything (new filesystem), we continue if we'd be able to 999 * mount with the devices we did successfully write to: 1000 */ 1001 if (bch2_fs_fatal_err_on(!nr_wrote || 1002 !can_mount_with_written || 1003 (can_mount_without_written && 1004 !can_mount_with_written), c, 1005 "Unable to write superblock to sufficient devices (from %ps)", 1006 (void *) _RET_IP_)) 1007 ret = -1; 1008 out: 1009 /* Make new options visible after they're persistent: */ 1010 bch2_sb_update(c); 1011 printbuf_exit(&err); 1012 return ret; 1013 } 1014 1015 void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) 1016 { 1017 mutex_lock(&c->sb_lock); 1018 if (!(c->sb.features & (1ULL << feat))) { 1019 c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); 1020 1021 bch2_write_super(c); 1022 } 1023 mutex_unlock(&c->sb_lock); 1024 } 1025 1026 /* Downgrade if superblock is at a higher version than currently supported: */ 1027 void bch2_sb_maybe_downgrade(struct bch_fs *c) 1028 { 1029 lockdep_assert_held(&c->sb_lock); 1030 1031 /* 1032 * Downgrade, if superblock is at a higher version than currently 1033 * supported: 1034 */ 1035 if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) 1036 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); 1037 if (c->sb.version > bcachefs_metadata_version_current) 1038 c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); 1039 if (c->sb.version_min > bcachefs_metadata_version_current) 1040 c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); 1041 c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); 1042 } 1043 1044 void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) 1045 { 1046 lockdep_assert_held(&c->sb_lock); 1047 1048 c->disk_sb.sb->version = cpu_to_le16(new_version); 1049 c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); 1050 } 1051 1052 static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { 1053 #define x(f, nr) \ 1054 [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, 1055 BCH_SB_FIELDS() 1056 #undef x 1057 }; 1058 1059 static const struct bch_sb_field_ops bch2_sb_field_null_ops; 1060 1061 static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) 1062 { 1063 return likely(type < ARRAY_SIZE(bch2_sb_field_ops)) 1064 ? bch2_sb_field_ops[type] 1065 : &bch2_sb_field_null_ops; 1066 } 1067 1068 static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, 1069 struct printbuf *err) 1070 { 1071 unsigned type = le32_to_cpu(f->type); 1072 struct printbuf field_err = PRINTBUF; 1073 const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); 1074 int ret; 1075 1076 ret = ops->validate ? ops->validate(sb, f, &field_err) : 0; 1077 if (ret) { 1078 prt_printf(err, "Invalid superblock section %s: %s", 1079 bch2_sb_fields[type], field_err.buf); 1080 prt_newline(err); 1081 bch2_sb_field_to_text(err, sb, f); 1082 } 1083 1084 printbuf_exit(&field_err); 1085 return ret; 1086 } 1087 1088 void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, 1089 struct bch_sb_field *f) 1090 { 1091 unsigned type = le32_to_cpu(f->type); 1092 const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); 1093 1094 if (!out->nr_tabstops) 1095 printbuf_tabstop_push(out, 32); 1096 1097 if (type < BCH_SB_FIELD_NR) 1098 prt_printf(out, "%s", bch2_sb_fields[type]); 1099 else 1100 prt_printf(out, "(unknown field %u)", type); 1101 1102 prt_printf(out, " (size %zu):", vstruct_bytes(f)); 1103 prt_newline(out); 1104 1105 if (ops->to_text) { 1106 printbuf_indent_add(out, 2); 1107 ops->to_text(out, sb, f); 1108 printbuf_indent_sub(out, 2); 1109 } 1110 } 1111 1112 void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) 1113 { 1114 unsigned i; 1115 1116 prt_printf(out, "Type: %u", l->layout_type); 1117 prt_newline(out); 1118 1119 prt_str(out, "Superblock max size: "); 1120 prt_units_u64(out, 512 << l->sb_max_size_bits); 1121 prt_newline(out); 1122 1123 prt_printf(out, "Nr superblocks: %u", l->nr_superblocks); 1124 prt_newline(out); 1125 1126 prt_str(out, "Offsets: "); 1127 for (i = 0; i < l->nr_superblocks; i++) { 1128 if (i) 1129 prt_str(out, ", "); 1130 prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i])); 1131 } 1132 prt_newline(out); 1133 } 1134 1135 void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, 1136 bool print_layout, unsigned fields) 1137 { 1138 struct bch_sb_field *f; 1139 u64 fields_have = 0; 1140 unsigned nr_devices = 0; 1141 1142 if (!out->nr_tabstops) 1143 printbuf_tabstop_push(out, 44); 1144 1145 for (int i = 0; i < sb->nr_devices; i++) 1146 nr_devices += bch2_dev_exists(sb, i); 1147 1148 prt_printf(out, "External UUID:"); 1149 prt_tab(out); 1150 pr_uuid(out, sb->user_uuid.b); 1151 prt_newline(out); 1152 1153 prt_printf(out, "Internal UUID:"); 1154 prt_tab(out); 1155 pr_uuid(out, sb->uuid.b); 1156 prt_newline(out); 1157 1158 prt_str(out, "Device index:"); 1159 prt_tab(out); 1160 prt_printf(out, "%u", sb->dev_idx); 1161 prt_newline(out); 1162 1163 prt_str(out, "Label:"); 1164 prt_tab(out); 1165 prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); 1166 prt_newline(out); 1167 1168 prt_str(out, "Version:"); 1169 prt_tab(out); 1170 bch2_version_to_text(out, le16_to_cpu(sb->version)); 1171 prt_newline(out); 1172 1173 prt_str(out, "Version upgrade complete:"); 1174 prt_tab(out); 1175 bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); 1176 prt_newline(out); 1177 1178 prt_printf(out, "Oldest version on disk:"); 1179 prt_tab(out); 1180 bch2_version_to_text(out, le16_to_cpu(sb->version_min)); 1181 prt_newline(out); 1182 1183 prt_printf(out, "Created:"); 1184 prt_tab(out); 1185 if (sb->time_base_lo) 1186 bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); 1187 else 1188 prt_printf(out, "(not set)"); 1189 prt_newline(out); 1190 1191 prt_printf(out, "Sequence number:"); 1192 prt_tab(out); 1193 prt_printf(out, "%llu", le64_to_cpu(sb->seq)); 1194 prt_newline(out); 1195 1196 prt_printf(out, "Superblock size:"); 1197 prt_tab(out); 1198 prt_printf(out, "%zu", vstruct_bytes(sb)); 1199 prt_newline(out); 1200 1201 prt_printf(out, "Clean:"); 1202 prt_tab(out); 1203 prt_printf(out, "%llu", BCH_SB_CLEAN(sb)); 1204 prt_newline(out); 1205 1206 prt_printf(out, "Devices:"); 1207 prt_tab(out); 1208 prt_printf(out, "%u", nr_devices); 1209 prt_newline(out); 1210 1211 prt_printf(out, "Sections:"); 1212 vstruct_for_each(sb, f) 1213 fields_have |= 1 << le32_to_cpu(f->type); 1214 prt_tab(out); 1215 prt_bitflags(out, bch2_sb_fields, fields_have); 1216 prt_newline(out); 1217 1218 prt_printf(out, "Features:"); 1219 prt_tab(out); 1220 prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); 1221 prt_newline(out); 1222 1223 prt_printf(out, "Compat features:"); 1224 prt_tab(out); 1225 prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); 1226 prt_newline(out); 1227 1228 prt_newline(out); 1229 prt_printf(out, "Options:"); 1230 prt_newline(out); 1231 printbuf_indent_add(out, 2); 1232 { 1233 enum bch_opt_id id; 1234 1235 for (id = 0; id < bch2_opts_nr; id++) { 1236 const struct bch_option *opt = bch2_opt_table + id; 1237 1238 if (opt->get_sb != BCH2_NO_SB_OPT) { 1239 u64 v = bch2_opt_from_sb(sb, id); 1240 1241 prt_printf(out, "%s:", opt->attr.name); 1242 prt_tab(out); 1243 bch2_opt_to_text(out, NULL, sb, opt, v, 1244 OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); 1245 prt_newline(out); 1246 } 1247 } 1248 } 1249 1250 printbuf_indent_sub(out, 2); 1251 1252 if (print_layout) { 1253 prt_newline(out); 1254 prt_printf(out, "layout:"); 1255 prt_newline(out); 1256 printbuf_indent_add(out, 2); 1257 bch2_sb_layout_to_text(out, &sb->layout); 1258 printbuf_indent_sub(out, 2); 1259 } 1260 1261 vstruct_for_each(sb, f) 1262 if (fields & (1 << le32_to_cpu(f->type))) { 1263 prt_newline(out); 1264 bch2_sb_field_to_text(out, sb, f); 1265 } 1266 } 1267