1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "checksum.h" 5 #include "counters.h" 6 #include "disk_groups.h" 7 #include "ec.h" 8 #include "error.h" 9 #include "journal.h" 10 #include "journal_sb.h" 11 #include "journal_seq_blacklist.h" 12 #include "recovery.h" 13 #include "replicas.h" 14 #include "quota.h" 15 #include "sb-clean.h" 16 #include "sb-errors.h" 17 #include "sb-members.h" 18 #include "super-io.h" 19 #include "super.h" 20 #include "trace.h" 21 #include "vstructs.h" 22 23 #include <linux/backing-dev.h> 24 #include <linux/sort.h> 25 26 static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { 27 }; 28 29 struct bch2_metadata_version { 30 u16 version; 31 const char *name; 32 u64 recovery_passes; 33 }; 34 35 static const struct bch2_metadata_version bch2_metadata_versions[] = { 36 #define x(n, v, _recovery_passes) { \ 37 .version = v, \ 38 .name = #n, \ 39 .recovery_passes = _recovery_passes, \ 40 }, 41 BCH_METADATA_VERSIONS() 42 #undef x 43 }; 44 45 void bch2_version_to_text(struct printbuf *out, unsigned v) 46 { 47 const char *str = "(unknown version)"; 48 49 for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) 50 if (bch2_metadata_versions[i].version == v) { 51 str = bch2_metadata_versions[i].name; 52 break; 53 } 54 55 prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); 56 } 57 58 unsigned bch2_latest_compatible_version(unsigned v) 59 { 60 if (!BCH_VERSION_MAJOR(v)) 61 return v; 62 63 for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) 64 if (bch2_metadata_versions[i].version > v && 65 BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) == 66 BCH_VERSION_MAJOR(v)) 67 v = bch2_metadata_versions[i].version; 68 69 return v; 70 } 71 72 u64 bch2_upgrade_recovery_passes(struct bch_fs *c, 73 unsigned old_version, 74 unsigned new_version) 75 { 76 u64 ret = 0; 77 78 for (const struct bch2_metadata_version *i = bch2_metadata_versions; 79 i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions); 80 i++) 81 if (i->version > old_version && i->version <= new_version) { 82 if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK) 83 ret |= bch2_fsck_recovery_passes(); 84 ret |= i->recovery_passes; 85 } 86 87 return ret &= ~RECOVERY_PASS_ALL_FSCK; 88 } 89 90 const char * const bch2_sb_fields[] = { 91 #define x(name, nr) #name, 92 BCH_SB_FIELDS() 93 #undef x 94 NULL 95 }; 96 97 static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, 98 struct printbuf *); 99 100 struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, 101 enum bch_sb_field_type type) 102 { 103 struct bch_sb_field *f; 104 105 /* XXX: need locking around superblock to access optional fields */ 106 107 vstruct_for_each(sb, f) 108 if (le32_to_cpu(f->type) == type) 109 return f; 110 return NULL; 111 } 112 113 static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, 114 struct bch_sb_field *f, 115 unsigned u64s) 116 { 117 unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; 118 unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; 119 120 BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size); 121 122 if (!f && !u64s) { 123 /* nothing to do: */ 124 } else if (!f) { 125 f = vstruct_last(sb->sb); 126 memset(f, 0, sizeof(u64) * u64s); 127 f->u64s = cpu_to_le32(u64s); 128 f->type = 0; 129 } else { 130 void *src, *dst; 131 132 src = vstruct_end(f); 133 134 if (u64s) { 135 f->u64s = cpu_to_le32(u64s); 136 dst = vstruct_end(f); 137 } else { 138 dst = f; 139 } 140 141 memmove(dst, src, vstruct_end(sb->sb) - src); 142 143 if (dst > src) 144 memset(src, 0, dst - src); 145 } 146 147 sb->sb->u64s = cpu_to_le32(sb_u64s); 148 149 return u64s ? f : NULL; 150 } 151 152 void bch2_sb_field_delete(struct bch_sb_handle *sb, 153 enum bch_sb_field_type type) 154 { 155 struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); 156 157 if (f) 158 __bch2_sb_field_resize(sb, f, 0); 159 } 160 161 /* Superblock realloc/free: */ 162 163 void bch2_free_super(struct bch_sb_handle *sb) 164 { 165 kfree(sb->bio); 166 if (!IS_ERR_OR_NULL(sb->bdev)) 167 blkdev_put(sb->bdev, sb->holder); 168 kfree(sb->holder); 169 kfree(sb->sb_name); 170 171 kfree(sb->sb); 172 memset(sb, 0, sizeof(*sb)); 173 } 174 175 int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) 176 { 177 size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); 178 size_t new_buffer_size; 179 struct bch_sb *new_sb; 180 struct bio *bio; 181 182 if (sb->bdev) 183 new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev)); 184 185 new_buffer_size = roundup_pow_of_two(new_bytes); 186 187 if (sb->sb && sb->buffer_size >= new_buffer_size) 188 return 0; 189 190 if (sb->sb && sb->have_layout) { 191 u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; 192 193 if (new_bytes > max_bytes) { 194 pr_err("%pg: superblock too big: want %zu but have %llu", 195 sb->bdev, new_bytes, max_bytes); 196 return -BCH_ERR_ENOSPC_sb; 197 } 198 } 199 200 if (sb->buffer_size >= new_buffer_size && sb->sb) 201 return 0; 202 203 if (dynamic_fault("bcachefs:add:super_realloc")) 204 return -BCH_ERR_ENOMEM_sb_realloc_injected; 205 206 new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); 207 if (!new_sb) 208 return -BCH_ERR_ENOMEM_sb_buf_realloc; 209 210 sb->sb = new_sb; 211 212 if (sb->have_bio) { 213 unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size); 214 215 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 216 if (!bio) 217 return -BCH_ERR_ENOMEM_sb_bio_realloc; 218 219 bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); 220 221 kfree(sb->bio); 222 sb->bio = bio; 223 } 224 225 sb->buffer_size = new_buffer_size; 226 227 return 0; 228 } 229 230 struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, 231 enum bch_sb_field_type type, 232 unsigned u64s) 233 { 234 struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); 235 ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; 236 ssize_t d = -old_u64s + u64s; 237 238 if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) 239 return NULL; 240 241 if (sb->fs_sb) { 242 struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); 243 struct bch_dev *ca; 244 unsigned i; 245 246 lockdep_assert_held(&c->sb_lock); 247 248 /* XXX: we're not checking that offline device have enough space */ 249 250 for_each_online_member(ca, c, i) { 251 struct bch_sb_handle *dev_sb = &ca->disk_sb; 252 253 if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { 254 percpu_ref_put(&ca->ref); 255 return NULL; 256 } 257 } 258 } 259 260 f = bch2_sb_field_get_id(sb->sb, type); 261 f = __bch2_sb_field_resize(sb, f, u64s); 262 if (f) 263 f->type = cpu_to_le32(type); 264 return f; 265 } 266 267 /* Superblock validate: */ 268 269 static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) 270 { 271 u64 offset, prev_offset, max_sectors; 272 unsigned i; 273 274 BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); 275 276 if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && 277 !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { 278 prt_printf(out, "Not a bcachefs superblock layout"); 279 return -BCH_ERR_invalid_sb_layout; 280 } 281 282 if (layout->layout_type != 0) { 283 prt_printf(out, "Invalid superblock layout type %u", 284 layout->layout_type); 285 return -BCH_ERR_invalid_sb_layout_type; 286 } 287 288 if (!layout->nr_superblocks) { 289 prt_printf(out, "Invalid superblock layout: no superblocks"); 290 return -BCH_ERR_invalid_sb_layout_nr_superblocks; 291 } 292 293 if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { 294 prt_printf(out, "Invalid superblock layout: too many superblocks"); 295 return -BCH_ERR_invalid_sb_layout_nr_superblocks; 296 } 297 298 max_sectors = 1 << layout->sb_max_size_bits; 299 300 prev_offset = le64_to_cpu(layout->sb_offset[0]); 301 302 for (i = 1; i < layout->nr_superblocks; i++) { 303 offset = le64_to_cpu(layout->sb_offset[i]); 304 305 if (offset < prev_offset + max_sectors) { 306 prt_printf(out, "Invalid superblock layout: superblocks overlap\n" 307 " (sb %u ends at %llu next starts at %llu", 308 i - 1, prev_offset + max_sectors, offset); 309 return -BCH_ERR_invalid_sb_layout_superblocks_overlap; 310 } 311 prev_offset = offset; 312 } 313 314 return 0; 315 } 316 317 static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) 318 { 319 u16 version = le16_to_cpu(sb->version); 320 u16 version_min = le16_to_cpu(sb->version_min); 321 322 if (!bch2_version_compatible(version)) { 323 prt_str(out, "Unsupported superblock version "); 324 bch2_version_to_text(out, version); 325 prt_str(out, " (min "); 326 bch2_version_to_text(out, bcachefs_metadata_version_min); 327 prt_str(out, ", max "); 328 bch2_version_to_text(out, bcachefs_metadata_version_current); 329 prt_str(out, ")"); 330 return -BCH_ERR_invalid_sb_version; 331 } 332 333 if (!bch2_version_compatible(version_min)) { 334 prt_str(out, "Unsupported superblock version_min "); 335 bch2_version_to_text(out, version_min); 336 prt_str(out, " (min "); 337 bch2_version_to_text(out, bcachefs_metadata_version_min); 338 prt_str(out, ", max "); 339 bch2_version_to_text(out, bcachefs_metadata_version_current); 340 prt_str(out, ")"); 341 return -BCH_ERR_invalid_sb_version; 342 } 343 344 if (version_min > version) { 345 prt_str(out, "Bad minimum version "); 346 bch2_version_to_text(out, version_min); 347 prt_str(out, ", greater than version field "); 348 bch2_version_to_text(out, version); 349 return -BCH_ERR_invalid_sb_version; 350 } 351 352 return 0; 353 } 354 355 static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, 356 int rw) 357 { 358 struct bch_sb *sb = disk_sb->sb; 359 struct bch_sb_field *f; 360 struct bch_sb_field_members_v1 *mi; 361 enum bch_opt_id opt_id; 362 u16 block_size; 363 int ret; 364 365 ret = bch2_sb_compatible(sb, out); 366 if (ret) 367 return ret; 368 369 if (sb->features[1] || 370 (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { 371 prt_printf(out, "Filesystem has incompatible features"); 372 return -BCH_ERR_invalid_sb_features; 373 } 374 375 block_size = le16_to_cpu(sb->block_size); 376 377 if (block_size > PAGE_SECTORS) { 378 prt_printf(out, "Block size too big (got %u, max %u)", 379 block_size, PAGE_SECTORS); 380 return -BCH_ERR_invalid_sb_block_size; 381 } 382 383 if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { 384 prt_printf(out, "Bad user UUID (got zeroes)"); 385 return -BCH_ERR_invalid_sb_uuid; 386 } 387 388 if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) { 389 prt_printf(out, "Bad internal UUID (got zeroes)"); 390 return -BCH_ERR_invalid_sb_uuid; 391 } 392 393 if (!sb->nr_devices || 394 sb->nr_devices > BCH_SB_MEMBERS_MAX) { 395 prt_printf(out, "Bad number of member devices %u (max %u)", 396 sb->nr_devices, BCH_SB_MEMBERS_MAX); 397 return -BCH_ERR_invalid_sb_too_many_members; 398 } 399 400 if (sb->dev_idx >= sb->nr_devices) { 401 prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)", 402 sb->dev_idx, sb->nr_devices); 403 return -BCH_ERR_invalid_sb_dev_idx; 404 } 405 406 if (!sb->time_precision || 407 le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { 408 prt_printf(out, "Invalid time precision: %u (min 1, max %lu)", 409 le32_to_cpu(sb->time_precision), NSEC_PER_SEC); 410 return -BCH_ERR_invalid_sb_time_precision; 411 } 412 413 if (rw == READ) { 414 /* 415 * Been seeing a bug where these are getting inexplicably 416 * zeroed, so we're now validating them, but we have to be 417 * careful not to preven people's filesystems from mounting: 418 */ 419 if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) 420 SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); 421 if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) 422 SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); 423 424 if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) 425 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); 426 } 427 428 for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { 429 const struct bch_option *opt = bch2_opt_table + opt_id; 430 431 if (opt->get_sb != BCH2_NO_SB_OPT) { 432 u64 v = bch2_opt_from_sb(sb, opt_id); 433 434 prt_printf(out, "Invalid option "); 435 ret = bch2_opt_validate(opt, v, out); 436 if (ret) 437 return ret; 438 439 printbuf_reset(out); 440 } 441 } 442 443 /* validate layout */ 444 ret = validate_sb_layout(&sb->layout, out); 445 if (ret) 446 return ret; 447 448 vstruct_for_each(sb, f) { 449 if (!f->u64s) { 450 prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)", 451 le32_to_cpu(f->type)); 452 return -BCH_ERR_invalid_sb_field_size; 453 } 454 455 if (vstruct_next(f) > vstruct_last(sb)) { 456 prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", 457 le32_to_cpu(f->type)); 458 return -BCH_ERR_invalid_sb_field_size; 459 } 460 } 461 462 /* members must be validated first: */ 463 mi = bch2_sb_field_get(sb, members_v1); 464 if (!mi) { 465 prt_printf(out, "Invalid superblock: member info area missing"); 466 return -BCH_ERR_invalid_sb_members_missing; 467 } 468 469 ret = bch2_sb_field_validate(sb, &mi->field, out); 470 if (ret) 471 return ret; 472 473 vstruct_for_each(sb, f) { 474 if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1) 475 continue; 476 477 ret = bch2_sb_field_validate(sb, f, out); 478 if (ret) 479 return ret; 480 } 481 482 return 0; 483 } 484 485 /* device open: */ 486 487 static void bch2_sb_update(struct bch_fs *c) 488 { 489 struct bch_sb *src = c->disk_sb.sb; 490 struct bch_dev *ca; 491 unsigned i; 492 493 lockdep_assert_held(&c->sb_lock); 494 495 c->sb.uuid = src->uuid; 496 c->sb.user_uuid = src->user_uuid; 497 c->sb.version = le16_to_cpu(src->version); 498 c->sb.version_min = le16_to_cpu(src->version_min); 499 c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); 500 c->sb.nr_devices = src->nr_devices; 501 c->sb.clean = BCH_SB_CLEAN(src); 502 c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); 503 504 c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); 505 c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; 506 507 /* XXX this is wrong, we need a 96 or 128 bit integer type */ 508 c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo), 509 c->sb.nsec_per_time_unit); 510 c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); 511 512 c->sb.features = le64_to_cpu(src->features[0]); 513 c->sb.compat = le64_to_cpu(src->compat[0]); 514 515 for_each_member_device(ca, c, i) { 516 struct bch_member m = bch2_sb_member_get(src, i); 517 ca->mi = bch2_mi_to_cpu(&m); 518 } 519 } 520 521 static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) 522 { 523 struct bch_sb_field *src_f, *dst_f; 524 struct bch_sb *dst = dst_handle->sb; 525 unsigned i; 526 527 dst->version = src->version; 528 dst->version_min = src->version_min; 529 dst->seq = src->seq; 530 dst->uuid = src->uuid; 531 dst->user_uuid = src->user_uuid; 532 memcpy(dst->label, src->label, sizeof(dst->label)); 533 534 dst->block_size = src->block_size; 535 dst->nr_devices = src->nr_devices; 536 537 dst->time_base_lo = src->time_base_lo; 538 dst->time_base_hi = src->time_base_hi; 539 dst->time_precision = src->time_precision; 540 541 memcpy(dst->flags, src->flags, sizeof(dst->flags)); 542 memcpy(dst->features, src->features, sizeof(dst->features)); 543 memcpy(dst->compat, src->compat, sizeof(dst->compat)); 544 545 for (i = 0; i < BCH_SB_FIELD_NR; i++) { 546 int d; 547 548 if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) 549 continue; 550 551 src_f = bch2_sb_field_get_id(src, i); 552 dst_f = bch2_sb_field_get_id(dst, i); 553 554 d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - 555 (dst_f ? le32_to_cpu(dst_f->u64s) : 0); 556 if (d > 0) { 557 int ret = bch2_sb_realloc(dst_handle, 558 le32_to_cpu(dst_handle->sb->u64s) + d); 559 560 if (ret) 561 return ret; 562 563 dst = dst_handle->sb; 564 dst_f = bch2_sb_field_get_id(dst, i); 565 } 566 567 dst_f = __bch2_sb_field_resize(dst_handle, dst_f, 568 src_f ? le32_to_cpu(src_f->u64s) : 0); 569 570 if (src_f) 571 memcpy(dst_f, src_f, vstruct_bytes(src_f)); 572 } 573 574 return 0; 575 } 576 577 int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) 578 { 579 int ret; 580 581 lockdep_assert_held(&c->sb_lock); 582 583 ret = bch2_sb_realloc(&c->disk_sb, 0) ?: 584 __copy_super(&c->disk_sb, src) ?: 585 bch2_sb_replicas_to_cpu_replicas(c) ?: 586 bch2_sb_disk_groups_to_cpu(c); 587 if (ret) 588 return ret; 589 590 bch2_sb_update(c); 591 return 0; 592 } 593 594 int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) 595 { 596 return __copy_super(&ca->disk_sb, c->disk_sb.sb); 597 } 598 599 /* read superblock: */ 600 601 static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) 602 { 603 struct bch_csum csum; 604 size_t bytes; 605 int ret; 606 reread: 607 bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); 608 sb->bio->bi_iter.bi_sector = offset; 609 bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); 610 611 ret = submit_bio_wait(sb->bio); 612 if (ret) { 613 prt_printf(err, "IO error: %i", ret); 614 return ret; 615 } 616 617 if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && 618 !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { 619 prt_printf(err, "Not a bcachefs superblock"); 620 return -BCH_ERR_invalid_sb_magic; 621 } 622 623 ret = bch2_sb_compatible(sb->sb, err); 624 if (ret) 625 return ret; 626 627 bytes = vstruct_bytes(sb->sb); 628 629 if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { 630 prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", 631 bytes, 512UL << sb->sb->layout.sb_max_size_bits); 632 return -BCH_ERR_invalid_sb_too_big; 633 } 634 635 if (bytes > sb->buffer_size) { 636 ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)); 637 if (ret) 638 return ret; 639 goto reread; 640 } 641 642 if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { 643 prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); 644 return -BCH_ERR_invalid_sb_csum_type; 645 } 646 647 /* XXX: verify MACs */ 648 csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), 649 null_nonce(), sb->sb); 650 651 if (bch2_crc_cmp(csum, sb->sb->csum)) { 652 prt_printf(err, "bad checksum"); 653 return -BCH_ERR_invalid_sb_csum; 654 } 655 656 sb->seq = le64_to_cpu(sb->sb->seq); 657 658 return 0; 659 } 660 661 int bch2_read_super(const char *path, struct bch_opts *opts, 662 struct bch_sb_handle *sb) 663 { 664 u64 offset = opt_get(*opts, sb); 665 struct bch_sb_layout layout; 666 struct printbuf err = PRINTBUF; 667 __le64 *i; 668 int ret; 669 #ifndef __KERNEL__ 670 retry: 671 #endif 672 memset(sb, 0, sizeof(*sb)); 673 sb->mode = BLK_OPEN_READ; 674 sb->have_bio = true; 675 sb->holder = kmalloc(1, GFP_KERNEL); 676 if (!sb->holder) 677 return -ENOMEM; 678 679 sb->sb_name = kstrdup(path, GFP_KERNEL); 680 if (!sb->sb_name) 681 return -ENOMEM; 682 683 #ifndef __KERNEL__ 684 if (opt_get(*opts, direct_io) == false) 685 sb->mode |= BLK_OPEN_BUFFERED; 686 #endif 687 688 if (!opt_get(*opts, noexcl)) 689 sb->mode |= BLK_OPEN_EXCL; 690 691 if (!opt_get(*opts, nochanges)) 692 sb->mode |= BLK_OPEN_WRITE; 693 694 sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); 695 if (IS_ERR(sb->bdev) && 696 PTR_ERR(sb->bdev) == -EACCES && 697 opt_get(*opts, read_only)) { 698 sb->mode &= ~BLK_OPEN_WRITE; 699 700 sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); 701 if (!IS_ERR(sb->bdev)) 702 opt_set(*opts, nochanges, true); 703 } 704 705 if (IS_ERR(sb->bdev)) { 706 ret = PTR_ERR(sb->bdev); 707 goto out; 708 } 709 710 ret = bch2_sb_realloc(sb, 0); 711 if (ret) { 712 prt_printf(&err, "error allocating memory for superblock"); 713 goto err; 714 } 715 716 if (bch2_fs_init_fault("read_super")) { 717 prt_printf(&err, "dynamic fault"); 718 ret = -EFAULT; 719 goto err; 720 } 721 722 ret = read_one_super(sb, offset, &err); 723 if (!ret) 724 goto got_super; 725 726 if (opt_defined(*opts, sb)) 727 goto err; 728 729 printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n", 730 path, err.buf); 731 printbuf_reset(&err); 732 733 /* 734 * Error reading primary superblock - read location of backup 735 * superblocks: 736 */ 737 bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); 738 sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; 739 /* 740 * use sb buffer to read layout, since sb buffer is page aligned but 741 * layout won't be: 742 */ 743 bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); 744 745 ret = submit_bio_wait(sb->bio); 746 if (ret) { 747 prt_printf(&err, "IO error: %i", ret); 748 goto err; 749 } 750 751 memcpy(&layout, sb->sb, sizeof(layout)); 752 ret = validate_sb_layout(&layout, &err); 753 if (ret) 754 goto err; 755 756 for (i = layout.sb_offset; 757 i < layout.sb_offset + layout.nr_superblocks; i++) { 758 offset = le64_to_cpu(*i); 759 760 if (offset == opt_get(*opts, sb)) 761 continue; 762 763 ret = read_one_super(sb, offset, &err); 764 if (!ret) 765 goto got_super; 766 } 767 768 goto err; 769 770 got_super: 771 if (le16_to_cpu(sb->sb->block_size) << 9 < 772 bdev_logical_block_size(sb->bdev) && 773 opt_get(*opts, direct_io)) { 774 #ifndef __KERNEL__ 775 opt_set(*opts, direct_io, false); 776 bch2_free_super(sb); 777 goto retry; 778 #endif 779 prt_printf(&err, "block size (%u) smaller than device block size (%u)", 780 le16_to_cpu(sb->sb->block_size) << 9, 781 bdev_logical_block_size(sb->bdev)); 782 ret = -BCH_ERR_block_size_too_small; 783 goto err; 784 } 785 786 ret = 0; 787 sb->have_layout = true; 788 789 ret = bch2_sb_validate(sb, &err, READ); 790 if (ret) { 791 printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n", 792 path, err.buf); 793 goto err_no_print; 794 } 795 out: 796 printbuf_exit(&err); 797 return ret; 798 err: 799 printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n", 800 path, err.buf); 801 err_no_print: 802 bch2_free_super(sb); 803 goto out; 804 } 805 806 /* write superblock: */ 807 808 static void write_super_endio(struct bio *bio) 809 { 810 struct bch_dev *ca = bio->bi_private; 811 812 /* XXX: return errors directly */ 813 814 if (bch2_dev_io_err_on(bio->bi_status, ca, 815 bio_data_dir(bio) 816 ? BCH_MEMBER_ERROR_write 817 : BCH_MEMBER_ERROR_read, 818 "superblock %s error: %s", 819 bio_data_dir(bio) ? "write" : "read", 820 bch2_blk_status_to_str(bio->bi_status))) 821 ca->sb_write_error = 1; 822 823 closure_put(&ca->fs->sb_write); 824 percpu_ref_put(&ca->io_ref); 825 } 826 827 static void read_back_super(struct bch_fs *c, struct bch_dev *ca) 828 { 829 struct bch_sb *sb = ca->disk_sb.sb; 830 struct bio *bio = ca->disk_sb.bio; 831 832 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); 833 bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); 834 bio->bi_end_io = write_super_endio; 835 bio->bi_private = ca; 836 bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); 837 838 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], 839 bio_sectors(bio)); 840 841 percpu_ref_get(&ca->io_ref); 842 closure_bio_submit(bio, &c->sb_write); 843 } 844 845 static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) 846 { 847 struct bch_sb *sb = ca->disk_sb.sb; 848 struct bio *bio = ca->disk_sb.bio; 849 850 sb->offset = sb->layout.sb_offset[idx]; 851 852 SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false)); 853 sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), 854 null_nonce(), sb); 855 856 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 857 bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); 858 bio->bi_end_io = write_super_endio; 859 bio->bi_private = ca; 860 bch2_bio_map(bio, sb, 861 roundup((size_t) vstruct_bytes(sb), 862 bdev_logical_block_size(ca->disk_sb.bdev))); 863 864 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], 865 bio_sectors(bio)); 866 867 percpu_ref_get(&ca->io_ref); 868 closure_bio_submit(bio, &c->sb_write); 869 } 870 871 int bch2_write_super(struct bch_fs *c) 872 { 873 struct closure *cl = &c->sb_write; 874 struct bch_dev *ca; 875 struct printbuf err = PRINTBUF; 876 unsigned i, sb = 0, nr_wrote; 877 struct bch_devs_mask sb_written; 878 bool wrote, can_mount_without_written, can_mount_with_written; 879 unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; 880 int ret = 0; 881 882 trace_and_count(c, write_super, c, _RET_IP_); 883 884 if (c->opts.very_degraded) 885 degraded_flags |= BCH_FORCE_IF_LOST; 886 887 lockdep_assert_held(&c->sb_lock); 888 889 closure_init_stack(cl); 890 memset(&sb_written, 0, sizeof(sb_written)); 891 892 /* Make sure we're using the new magic numbers: */ 893 c->disk_sb.sb->magic = BCHFS_MAGIC; 894 c->disk_sb.sb->layout.magic = BCHFS_MAGIC; 895 896 le64_add_cpu(&c->disk_sb.sb->seq, 1); 897 898 if (test_bit(BCH_FS_ERROR, &c->flags)) 899 SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); 900 if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags)) 901 SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); 902 903 SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); 904 905 bch2_sb_counters_from_cpu(c); 906 bch2_sb_members_from_cpu(c); 907 bch2_sb_members_cpy_v2_v1(&c->disk_sb); 908 bch2_sb_errors_from_cpu(c); 909 910 for_each_online_member(ca, c, i) 911 bch2_sb_from_fs(c, ca); 912 913 for_each_online_member(ca, c, i) { 914 printbuf_reset(&err); 915 916 ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); 917 if (ret) { 918 bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); 919 percpu_ref_put(&ca->io_ref); 920 goto out; 921 } 922 } 923 924 if (c->opts.nochanges) 925 goto out; 926 927 /* 928 * Defer writing the superblock until filesystem initialization is 929 * complete - don't write out a partly initialized superblock: 930 */ 931 if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) 932 goto out; 933 934 for_each_online_member(ca, c, i) { 935 __set_bit(ca->dev_idx, sb_written.d); 936 ca->sb_write_error = 0; 937 } 938 939 for_each_online_member(ca, c, i) 940 read_back_super(c, ca); 941 closure_sync(cl); 942 943 for_each_online_member(ca, c, i) { 944 if (ca->sb_write_error) 945 continue; 946 947 if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { 948 bch2_fs_fatal_error(c, 949 "Superblock write was silently dropped! (seq %llu expected %llu)", 950 le64_to_cpu(ca->sb_read_scratch->seq), 951 ca->disk_sb.seq); 952 percpu_ref_put(&ca->io_ref); 953 ret = -BCH_ERR_erofs_sb_err; 954 goto out; 955 } 956 957 if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { 958 bch2_fs_fatal_error(c, 959 "Superblock modified by another process (seq %llu expected %llu)", 960 le64_to_cpu(ca->sb_read_scratch->seq), 961 ca->disk_sb.seq); 962 percpu_ref_put(&ca->io_ref); 963 ret = -BCH_ERR_erofs_sb_err; 964 goto out; 965 } 966 } 967 968 do { 969 wrote = false; 970 for_each_online_member(ca, c, i) 971 if (!ca->sb_write_error && 972 sb < ca->disk_sb.sb->layout.nr_superblocks) { 973 write_one_super(c, ca, sb); 974 wrote = true; 975 } 976 closure_sync(cl); 977 sb++; 978 } while (wrote); 979 980 for_each_online_member(ca, c, i) { 981 if (ca->sb_write_error) 982 __clear_bit(ca->dev_idx, sb_written.d); 983 else 984 ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); 985 } 986 987 nr_wrote = dev_mask_nr(&sb_written); 988 989 can_mount_with_written = 990 bch2_have_enough_devs(c, sb_written, degraded_flags, false); 991 992 for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) 993 sb_written.d[i] = ~sb_written.d[i]; 994 995 can_mount_without_written = 996 bch2_have_enough_devs(c, sb_written, degraded_flags, false); 997 998 /* 999 * If we would be able to mount _without_ the devices we successfully 1000 * wrote superblocks to, we weren't able to write to enough devices: 1001 * 1002 * Exception: if we can mount without the successes because we haven't 1003 * written anything (new filesystem), we continue if we'd be able to 1004 * mount with the devices we did successfully write to: 1005 */ 1006 if (bch2_fs_fatal_err_on(!nr_wrote || 1007 !can_mount_with_written || 1008 (can_mount_without_written && 1009 !can_mount_with_written), c, 1010 "Unable to write superblock to sufficient devices (from %ps)", 1011 (void *) _RET_IP_)) 1012 ret = -1; 1013 out: 1014 /* Make new options visible after they're persistent: */ 1015 bch2_sb_update(c); 1016 printbuf_exit(&err); 1017 return ret; 1018 } 1019 1020 void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) 1021 { 1022 mutex_lock(&c->sb_lock); 1023 if (!(c->sb.features & (1ULL << feat))) { 1024 c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); 1025 1026 bch2_write_super(c); 1027 } 1028 mutex_unlock(&c->sb_lock); 1029 } 1030 1031 /* Downgrade if superblock is at a higher version than currently supported: */ 1032 void bch2_sb_maybe_downgrade(struct bch_fs *c) 1033 { 1034 lockdep_assert_held(&c->sb_lock); 1035 1036 /* 1037 * Downgrade, if superblock is at a higher version than currently 1038 * supported: 1039 */ 1040 if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) 1041 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); 1042 if (c->sb.version > bcachefs_metadata_version_current) 1043 c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); 1044 if (c->sb.version_min > bcachefs_metadata_version_current) 1045 c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); 1046 c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); 1047 } 1048 1049 void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) 1050 { 1051 lockdep_assert_held(&c->sb_lock); 1052 1053 c->disk_sb.sb->version = cpu_to_le16(new_version); 1054 c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); 1055 } 1056 1057 static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { 1058 #define x(f, nr) \ 1059 [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, 1060 BCH_SB_FIELDS() 1061 #undef x 1062 }; 1063 1064 static const struct bch_sb_field_ops bch2_sb_field_null_ops; 1065 1066 static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) 1067 { 1068 return likely(type < ARRAY_SIZE(bch2_sb_field_ops)) 1069 ? bch2_sb_field_ops[type] 1070 : &bch2_sb_field_null_ops; 1071 } 1072 1073 static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, 1074 struct printbuf *err) 1075 { 1076 unsigned type = le32_to_cpu(f->type); 1077 struct printbuf field_err = PRINTBUF; 1078 const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); 1079 int ret; 1080 1081 ret = ops->validate ? ops->validate(sb, f, &field_err) : 0; 1082 if (ret) { 1083 prt_printf(err, "Invalid superblock section %s: %s", 1084 bch2_sb_fields[type], field_err.buf); 1085 prt_newline(err); 1086 bch2_sb_field_to_text(err, sb, f); 1087 } 1088 1089 printbuf_exit(&field_err); 1090 return ret; 1091 } 1092 1093 void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, 1094 struct bch_sb_field *f) 1095 { 1096 unsigned type = le32_to_cpu(f->type); 1097 const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); 1098 1099 if (!out->nr_tabstops) 1100 printbuf_tabstop_push(out, 32); 1101 1102 if (type < BCH_SB_FIELD_NR) 1103 prt_printf(out, "%s", bch2_sb_fields[type]); 1104 else 1105 prt_printf(out, "(unknown field %u)", type); 1106 1107 prt_printf(out, " (size %zu):", vstruct_bytes(f)); 1108 prt_newline(out); 1109 1110 if (ops->to_text) { 1111 printbuf_indent_add(out, 2); 1112 ops->to_text(out, sb, f); 1113 printbuf_indent_sub(out, 2); 1114 } 1115 } 1116 1117 void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) 1118 { 1119 unsigned i; 1120 1121 prt_printf(out, "Type: %u", l->layout_type); 1122 prt_newline(out); 1123 1124 prt_str(out, "Superblock max size: "); 1125 prt_units_u64(out, 512 << l->sb_max_size_bits); 1126 prt_newline(out); 1127 1128 prt_printf(out, "Nr superblocks: %u", l->nr_superblocks); 1129 prt_newline(out); 1130 1131 prt_str(out, "Offsets: "); 1132 for (i = 0; i < l->nr_superblocks; i++) { 1133 if (i) 1134 prt_str(out, ", "); 1135 prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i])); 1136 } 1137 prt_newline(out); 1138 } 1139 1140 void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, 1141 bool print_layout, unsigned fields) 1142 { 1143 struct bch_sb_field *f; 1144 u64 fields_have = 0; 1145 unsigned nr_devices = 0; 1146 1147 if (!out->nr_tabstops) 1148 printbuf_tabstop_push(out, 44); 1149 1150 for (int i = 0; i < sb->nr_devices; i++) 1151 nr_devices += bch2_dev_exists(sb, i); 1152 1153 prt_printf(out, "External UUID:"); 1154 prt_tab(out); 1155 pr_uuid(out, sb->user_uuid.b); 1156 prt_newline(out); 1157 1158 prt_printf(out, "Internal UUID:"); 1159 prt_tab(out); 1160 pr_uuid(out, sb->uuid.b); 1161 prt_newline(out); 1162 1163 prt_str(out, "Device index:"); 1164 prt_tab(out); 1165 prt_printf(out, "%u", sb->dev_idx); 1166 prt_newline(out); 1167 1168 prt_str(out, "Label:"); 1169 prt_tab(out); 1170 prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); 1171 prt_newline(out); 1172 1173 prt_str(out, "Version:"); 1174 prt_tab(out); 1175 bch2_version_to_text(out, le16_to_cpu(sb->version)); 1176 prt_newline(out); 1177 1178 prt_str(out, "Version upgrade complete:"); 1179 prt_tab(out); 1180 bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); 1181 prt_newline(out); 1182 1183 prt_printf(out, "Oldest version on disk:"); 1184 prt_tab(out); 1185 bch2_version_to_text(out, le16_to_cpu(sb->version_min)); 1186 prt_newline(out); 1187 1188 prt_printf(out, "Created:"); 1189 prt_tab(out); 1190 if (sb->time_base_lo) 1191 bch2_prt_datetime(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); 1192 else 1193 prt_printf(out, "(not set)"); 1194 prt_newline(out); 1195 1196 prt_printf(out, "Sequence number:"); 1197 prt_tab(out); 1198 prt_printf(out, "%llu", le64_to_cpu(sb->seq)); 1199 prt_newline(out); 1200 1201 prt_printf(out, "Superblock size:"); 1202 prt_tab(out); 1203 prt_printf(out, "%zu", vstruct_bytes(sb)); 1204 prt_newline(out); 1205 1206 prt_printf(out, "Clean:"); 1207 prt_tab(out); 1208 prt_printf(out, "%llu", BCH_SB_CLEAN(sb)); 1209 prt_newline(out); 1210 1211 prt_printf(out, "Devices:"); 1212 prt_tab(out); 1213 prt_printf(out, "%u", nr_devices); 1214 prt_newline(out); 1215 1216 prt_printf(out, "Sections:"); 1217 vstruct_for_each(sb, f) 1218 fields_have |= 1 << le32_to_cpu(f->type); 1219 prt_tab(out); 1220 prt_bitflags(out, bch2_sb_fields, fields_have); 1221 prt_newline(out); 1222 1223 prt_printf(out, "Features:"); 1224 prt_tab(out); 1225 prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); 1226 prt_newline(out); 1227 1228 prt_printf(out, "Compat features:"); 1229 prt_tab(out); 1230 prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); 1231 prt_newline(out); 1232 1233 prt_newline(out); 1234 prt_printf(out, "Options:"); 1235 prt_newline(out); 1236 printbuf_indent_add(out, 2); 1237 { 1238 enum bch_opt_id id; 1239 1240 for (id = 0; id < bch2_opts_nr; id++) { 1241 const struct bch_option *opt = bch2_opt_table + id; 1242 1243 if (opt->get_sb != BCH2_NO_SB_OPT) { 1244 u64 v = bch2_opt_from_sb(sb, id); 1245 1246 prt_printf(out, "%s:", opt->attr.name); 1247 prt_tab(out); 1248 bch2_opt_to_text(out, NULL, sb, opt, v, 1249 OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); 1250 prt_newline(out); 1251 } 1252 } 1253 } 1254 1255 printbuf_indent_sub(out, 2); 1256 1257 if (print_layout) { 1258 prt_newline(out); 1259 prt_printf(out, "layout:"); 1260 prt_newline(out); 1261 printbuf_indent_add(out, 2); 1262 bch2_sb_layout_to_text(out, &sb->layout); 1263 printbuf_indent_sub(out, 2); 1264 } 1265 1266 vstruct_for_each(sb, f) 1267 if (fields & (1 << le32_to_cpu(f->type))) { 1268 prt_newline(out); 1269 bch2_sb_field_to_text(out, sb, f); 1270 } 1271 } 1272