1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "bcachefs.h" 4 #include "checksum.h" 5 #include "counters.h" 6 #include "disk_groups.h" 7 #include "ec.h" 8 #include "error.h" 9 #include "journal.h" 10 #include "journal_sb.h" 11 #include "journal_seq_blacklist.h" 12 #include "recovery.h" 13 #include "replicas.h" 14 #include "quota.h" 15 #include "sb-clean.h" 16 #include "sb-members.h" 17 #include "super-io.h" 18 #include "super.h" 19 #include "trace.h" 20 #include "vstructs.h" 21 22 #include <linux/backing-dev.h> 23 #include <linux/sort.h> 24 25 static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { 26 }; 27 28 struct bch2_metadata_version { 29 u16 version; 30 const char *name; 31 u64 recovery_passes; 32 }; 33 34 static const struct bch2_metadata_version bch2_metadata_versions[] = { 35 #define x(n, v, _recovery_passes) { \ 36 .version = v, \ 37 .name = #n, \ 38 .recovery_passes = _recovery_passes, \ 39 }, 40 BCH_METADATA_VERSIONS() 41 #undef x 42 }; 43 44 void bch2_version_to_text(struct printbuf *out, unsigned v) 45 { 46 const char *str = "(unknown version)"; 47 48 for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) 49 if (bch2_metadata_versions[i].version == v) { 50 str = bch2_metadata_versions[i].name; 51 break; 52 } 53 54 prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); 55 } 56 57 unsigned bch2_latest_compatible_version(unsigned v) 58 { 59 if (!BCH_VERSION_MAJOR(v)) 60 return v; 61 62 for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) 63 if (bch2_metadata_versions[i].version > v && 64 BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) == 65 BCH_VERSION_MAJOR(v)) 66 v = bch2_metadata_versions[i].version; 67 68 return v; 69 } 70 71 u64 bch2_upgrade_recovery_passes(struct bch_fs *c, 72 unsigned old_version, 73 unsigned new_version) 74 { 75 u64 ret = 0; 76 77 for (const struct bch2_metadata_version *i = bch2_metadata_versions; 78 i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions); 79 i++) 80 if (i->version > old_version && i->version <= new_version) { 81 if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK) 82 ret |= bch2_fsck_recovery_passes(); 83 ret |= i->recovery_passes; 84 } 85 86 return ret &= ~RECOVERY_PASS_ALL_FSCK; 87 } 88 89 const char * const bch2_sb_fields[] = { 90 #define x(name, nr) #name, 91 BCH_SB_FIELDS() 92 #undef x 93 NULL 94 }; 95 96 static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, 97 struct printbuf *); 98 99 struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb, 100 enum bch_sb_field_type type) 101 { 102 struct bch_sb_field *f; 103 104 /* XXX: need locking around superblock to access optional fields */ 105 106 vstruct_for_each(sb, f) 107 if (le32_to_cpu(f->type) == type) 108 return f; 109 return NULL; 110 } 111 112 static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, 113 struct bch_sb_field *f, 114 unsigned u64s) 115 { 116 unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; 117 unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; 118 119 BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size); 120 121 if (!f && !u64s) { 122 /* nothing to do: */ 123 } else if (!f) { 124 f = vstruct_last(sb->sb); 125 memset(f, 0, sizeof(u64) * u64s); 126 f->u64s = cpu_to_le32(u64s); 127 f->type = 0; 128 } else { 129 void *src, *dst; 130 131 src = vstruct_end(f); 132 133 if (u64s) { 134 f->u64s = cpu_to_le32(u64s); 135 dst = vstruct_end(f); 136 } else { 137 dst = f; 138 } 139 140 memmove(dst, src, vstruct_end(sb->sb) - src); 141 142 if (dst > src) 143 memset(src, 0, dst - src); 144 } 145 146 sb->sb->u64s = cpu_to_le32(sb_u64s); 147 148 return u64s ? f : NULL; 149 } 150 151 void bch2_sb_field_delete(struct bch_sb_handle *sb, 152 enum bch_sb_field_type type) 153 { 154 struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); 155 156 if (f) 157 __bch2_sb_field_resize(sb, f, 0); 158 } 159 160 /* Superblock realloc/free: */ 161 162 void bch2_free_super(struct bch_sb_handle *sb) 163 { 164 kfree(sb->bio); 165 if (!IS_ERR_OR_NULL(sb->bdev)) 166 blkdev_put(sb->bdev, sb->holder); 167 kfree(sb->holder); 168 169 kfree(sb->sb); 170 memset(sb, 0, sizeof(*sb)); 171 } 172 173 int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) 174 { 175 size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); 176 size_t new_buffer_size; 177 struct bch_sb *new_sb; 178 struct bio *bio; 179 180 if (sb->bdev) 181 new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev)); 182 183 new_buffer_size = roundup_pow_of_two(new_bytes); 184 185 if (sb->sb && sb->buffer_size >= new_buffer_size) 186 return 0; 187 188 if (sb->sb && sb->have_layout) { 189 u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; 190 191 if (new_bytes > max_bytes) { 192 pr_err("%pg: superblock too big: want %zu but have %llu", 193 sb->bdev, new_bytes, max_bytes); 194 return -BCH_ERR_ENOSPC_sb; 195 } 196 } 197 198 if (sb->buffer_size >= new_buffer_size && sb->sb) 199 return 0; 200 201 if (dynamic_fault("bcachefs:add:super_realloc")) 202 return -BCH_ERR_ENOMEM_sb_realloc_injected; 203 204 new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); 205 if (!new_sb) 206 return -BCH_ERR_ENOMEM_sb_buf_realloc; 207 208 sb->sb = new_sb; 209 210 if (sb->have_bio) { 211 unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size); 212 213 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 214 if (!bio) 215 return -BCH_ERR_ENOMEM_sb_bio_realloc; 216 217 bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); 218 219 kfree(sb->bio); 220 sb->bio = bio; 221 } 222 223 sb->buffer_size = new_buffer_size; 224 225 return 0; 226 } 227 228 struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, 229 enum bch_sb_field_type type, 230 unsigned u64s) 231 { 232 struct bch_sb_field *f = bch2_sb_field_get_id(sb->sb, type); 233 ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0; 234 ssize_t d = -old_u64s + u64s; 235 236 if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) 237 return NULL; 238 239 if (sb->fs_sb) { 240 struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); 241 struct bch_dev *ca; 242 unsigned i; 243 244 lockdep_assert_held(&c->sb_lock); 245 246 /* XXX: we're not checking that offline device have enough space */ 247 248 for_each_online_member(ca, c, i) { 249 struct bch_sb_handle *dev_sb = &ca->disk_sb; 250 251 if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { 252 percpu_ref_put(&ca->ref); 253 return NULL; 254 } 255 } 256 } 257 258 f = bch2_sb_field_get_id(sb->sb, type); 259 f = __bch2_sb_field_resize(sb, f, u64s); 260 if (f) 261 f->type = cpu_to_le32(type); 262 return f; 263 } 264 265 /* Superblock validate: */ 266 267 static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) 268 { 269 u64 offset, prev_offset, max_sectors; 270 unsigned i; 271 272 BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); 273 274 if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && 275 !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { 276 prt_printf(out, "Not a bcachefs superblock layout"); 277 return -BCH_ERR_invalid_sb_layout; 278 } 279 280 if (layout->layout_type != 0) { 281 prt_printf(out, "Invalid superblock layout type %u", 282 layout->layout_type); 283 return -BCH_ERR_invalid_sb_layout_type; 284 } 285 286 if (!layout->nr_superblocks) { 287 prt_printf(out, "Invalid superblock layout: no superblocks"); 288 return -BCH_ERR_invalid_sb_layout_nr_superblocks; 289 } 290 291 if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { 292 prt_printf(out, "Invalid superblock layout: too many superblocks"); 293 return -BCH_ERR_invalid_sb_layout_nr_superblocks; 294 } 295 296 max_sectors = 1 << layout->sb_max_size_bits; 297 298 prev_offset = le64_to_cpu(layout->sb_offset[0]); 299 300 for (i = 1; i < layout->nr_superblocks; i++) { 301 offset = le64_to_cpu(layout->sb_offset[i]); 302 303 if (offset < prev_offset + max_sectors) { 304 prt_printf(out, "Invalid superblock layout: superblocks overlap\n" 305 " (sb %u ends at %llu next starts at %llu", 306 i - 1, prev_offset + max_sectors, offset); 307 return -BCH_ERR_invalid_sb_layout_superblocks_overlap; 308 } 309 prev_offset = offset; 310 } 311 312 return 0; 313 } 314 315 static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) 316 { 317 u16 version = le16_to_cpu(sb->version); 318 u16 version_min = le16_to_cpu(sb->version_min); 319 320 if (!bch2_version_compatible(version)) { 321 prt_str(out, "Unsupported superblock version "); 322 bch2_version_to_text(out, version); 323 prt_str(out, " (min "); 324 bch2_version_to_text(out, bcachefs_metadata_version_min); 325 prt_str(out, ", max "); 326 bch2_version_to_text(out, bcachefs_metadata_version_current); 327 prt_str(out, ")"); 328 return -BCH_ERR_invalid_sb_version; 329 } 330 331 if (!bch2_version_compatible(version_min)) { 332 prt_str(out, "Unsupported superblock version_min "); 333 bch2_version_to_text(out, version_min); 334 prt_str(out, " (min "); 335 bch2_version_to_text(out, bcachefs_metadata_version_min); 336 prt_str(out, ", max "); 337 bch2_version_to_text(out, bcachefs_metadata_version_current); 338 prt_str(out, ")"); 339 return -BCH_ERR_invalid_sb_version; 340 } 341 342 if (version_min > version) { 343 prt_str(out, "Bad minimum version "); 344 bch2_version_to_text(out, version_min); 345 prt_str(out, ", greater than version field "); 346 bch2_version_to_text(out, version); 347 return -BCH_ERR_invalid_sb_version; 348 } 349 350 return 0; 351 } 352 353 static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, 354 int rw) 355 { 356 struct bch_sb *sb = disk_sb->sb; 357 struct bch_sb_field *f; 358 struct bch_sb_field_members_v1 *mi; 359 enum bch_opt_id opt_id; 360 u16 block_size; 361 int ret; 362 363 ret = bch2_sb_compatible(sb, out); 364 if (ret) 365 return ret; 366 367 if (sb->features[1] || 368 (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { 369 prt_printf(out, "Filesystem has incompatible features"); 370 return -BCH_ERR_invalid_sb_features; 371 } 372 373 block_size = le16_to_cpu(sb->block_size); 374 375 if (block_size > PAGE_SECTORS) { 376 prt_printf(out, "Block size too big (got %u, max %u)", 377 block_size, PAGE_SECTORS); 378 return -BCH_ERR_invalid_sb_block_size; 379 } 380 381 if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { 382 prt_printf(out, "Bad user UUID (got zeroes)"); 383 return -BCH_ERR_invalid_sb_uuid; 384 } 385 386 if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) { 387 prt_printf(out, "Bad internal UUID (got zeroes)"); 388 return -BCH_ERR_invalid_sb_uuid; 389 } 390 391 if (!sb->nr_devices || 392 sb->nr_devices > BCH_SB_MEMBERS_MAX) { 393 prt_printf(out, "Bad number of member devices %u (max %u)", 394 sb->nr_devices, BCH_SB_MEMBERS_MAX); 395 return -BCH_ERR_invalid_sb_too_many_members; 396 } 397 398 if (sb->dev_idx >= sb->nr_devices) { 399 prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)", 400 sb->dev_idx, sb->nr_devices); 401 return -BCH_ERR_invalid_sb_dev_idx; 402 } 403 404 if (!sb->time_precision || 405 le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { 406 prt_printf(out, "Invalid time precision: %u (min 1, max %lu)", 407 le32_to_cpu(sb->time_precision), NSEC_PER_SEC); 408 return -BCH_ERR_invalid_sb_time_precision; 409 } 410 411 if (rw == READ) { 412 /* 413 * Been seeing a bug where these are getting inexplicably 414 * zeroed, so we're now validating them, but we have to be 415 * careful not to preven people's filesystems from mounting: 416 */ 417 if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) 418 SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); 419 if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) 420 SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); 421 422 if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) 423 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); 424 } 425 426 for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { 427 const struct bch_option *opt = bch2_opt_table + opt_id; 428 429 if (opt->get_sb != BCH2_NO_SB_OPT) { 430 u64 v = bch2_opt_from_sb(sb, opt_id); 431 432 prt_printf(out, "Invalid option "); 433 ret = bch2_opt_validate(opt, v, out); 434 if (ret) 435 return ret; 436 437 printbuf_reset(out); 438 } 439 } 440 441 /* validate layout */ 442 ret = validate_sb_layout(&sb->layout, out); 443 if (ret) 444 return ret; 445 446 vstruct_for_each(sb, f) { 447 if (!f->u64s) { 448 prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)", 449 le32_to_cpu(f->type)); 450 return -BCH_ERR_invalid_sb_field_size; 451 } 452 453 if (vstruct_next(f) > vstruct_last(sb)) { 454 prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", 455 le32_to_cpu(f->type)); 456 return -BCH_ERR_invalid_sb_field_size; 457 } 458 } 459 460 /* members must be validated first: */ 461 mi = bch2_sb_field_get(sb, members_v1); 462 if (!mi) { 463 prt_printf(out, "Invalid superblock: member info area missing"); 464 return -BCH_ERR_invalid_sb_members_missing; 465 } 466 467 ret = bch2_sb_field_validate(sb, &mi->field, out); 468 if (ret) 469 return ret; 470 471 vstruct_for_each(sb, f) { 472 if (le32_to_cpu(f->type) == BCH_SB_FIELD_members_v1) 473 continue; 474 475 ret = bch2_sb_field_validate(sb, f, out); 476 if (ret) 477 return ret; 478 } 479 480 return 0; 481 } 482 483 /* device open: */ 484 485 static void bch2_sb_update(struct bch_fs *c) 486 { 487 struct bch_sb *src = c->disk_sb.sb; 488 struct bch_dev *ca; 489 unsigned i; 490 491 lockdep_assert_held(&c->sb_lock); 492 493 c->sb.uuid = src->uuid; 494 c->sb.user_uuid = src->user_uuid; 495 c->sb.version = le16_to_cpu(src->version); 496 c->sb.version_min = le16_to_cpu(src->version_min); 497 c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); 498 c->sb.nr_devices = src->nr_devices; 499 c->sb.clean = BCH_SB_CLEAN(src); 500 c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); 501 502 c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); 503 c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; 504 505 /* XXX this is wrong, we need a 96 or 128 bit integer type */ 506 c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo), 507 c->sb.nsec_per_time_unit); 508 c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); 509 510 c->sb.features = le64_to_cpu(src->features[0]); 511 c->sb.compat = le64_to_cpu(src->compat[0]); 512 513 for_each_member_device(ca, c, i) { 514 struct bch_member m = bch2_sb_member_get(src, i); 515 ca->mi = bch2_mi_to_cpu(&m); 516 } 517 } 518 519 static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) 520 { 521 struct bch_sb_field *src_f, *dst_f; 522 struct bch_sb *dst = dst_handle->sb; 523 unsigned i; 524 525 dst->version = src->version; 526 dst->version_min = src->version_min; 527 dst->seq = src->seq; 528 dst->uuid = src->uuid; 529 dst->user_uuid = src->user_uuid; 530 memcpy(dst->label, src->label, sizeof(dst->label)); 531 532 dst->block_size = src->block_size; 533 dst->nr_devices = src->nr_devices; 534 535 dst->time_base_lo = src->time_base_lo; 536 dst->time_base_hi = src->time_base_hi; 537 dst->time_precision = src->time_precision; 538 539 memcpy(dst->flags, src->flags, sizeof(dst->flags)); 540 memcpy(dst->features, src->features, sizeof(dst->features)); 541 memcpy(dst->compat, src->compat, sizeof(dst->compat)); 542 543 for (i = 0; i < BCH_SB_FIELD_NR; i++) { 544 int d; 545 546 if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) 547 continue; 548 549 src_f = bch2_sb_field_get_id(src, i); 550 dst_f = bch2_sb_field_get_id(dst, i); 551 552 d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - 553 (dst_f ? le32_to_cpu(dst_f->u64s) : 0); 554 if (d > 0) { 555 int ret = bch2_sb_realloc(dst_handle, 556 le32_to_cpu(dst_handle->sb->u64s) + d); 557 558 if (ret) 559 return ret; 560 561 dst = dst_handle->sb; 562 dst_f = bch2_sb_field_get_id(dst, i); 563 } 564 565 dst_f = __bch2_sb_field_resize(dst_handle, dst_f, 566 src_f ? le32_to_cpu(src_f->u64s) : 0); 567 568 if (src_f) 569 memcpy(dst_f, src_f, vstruct_bytes(src_f)); 570 } 571 572 return 0; 573 } 574 575 int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) 576 { 577 int ret; 578 579 lockdep_assert_held(&c->sb_lock); 580 581 ret = bch2_sb_realloc(&c->disk_sb, 0) ?: 582 __copy_super(&c->disk_sb, src) ?: 583 bch2_sb_replicas_to_cpu_replicas(c) ?: 584 bch2_sb_disk_groups_to_cpu(c); 585 if (ret) 586 return ret; 587 588 bch2_sb_update(c); 589 return 0; 590 } 591 592 int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) 593 { 594 return __copy_super(&ca->disk_sb, c->disk_sb.sb); 595 } 596 597 /* read superblock: */ 598 599 static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) 600 { 601 struct bch_csum csum; 602 size_t bytes; 603 int ret; 604 reread: 605 bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); 606 sb->bio->bi_iter.bi_sector = offset; 607 bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); 608 609 ret = submit_bio_wait(sb->bio); 610 if (ret) { 611 prt_printf(err, "IO error: %i", ret); 612 return ret; 613 } 614 615 if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && 616 !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { 617 prt_printf(err, "Not a bcachefs superblock"); 618 return -BCH_ERR_invalid_sb_magic; 619 } 620 621 ret = bch2_sb_compatible(sb->sb, err); 622 if (ret) 623 return ret; 624 625 bytes = vstruct_bytes(sb->sb); 626 627 if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { 628 prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", 629 bytes, 512UL << sb->sb->layout.sb_max_size_bits); 630 return -BCH_ERR_invalid_sb_too_big; 631 } 632 633 if (bytes > sb->buffer_size) { 634 ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)); 635 if (ret) 636 return ret; 637 goto reread; 638 } 639 640 if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { 641 prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); 642 return -BCH_ERR_invalid_sb_csum_type; 643 } 644 645 /* XXX: verify MACs */ 646 csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), 647 null_nonce(), sb->sb); 648 649 if (bch2_crc_cmp(csum, sb->sb->csum)) { 650 prt_printf(err, "bad checksum"); 651 return -BCH_ERR_invalid_sb_csum; 652 } 653 654 sb->seq = le64_to_cpu(sb->sb->seq); 655 656 return 0; 657 } 658 659 int bch2_read_super(const char *path, struct bch_opts *opts, 660 struct bch_sb_handle *sb) 661 { 662 u64 offset = opt_get(*opts, sb); 663 struct bch_sb_layout layout; 664 struct printbuf err = PRINTBUF; 665 __le64 *i; 666 int ret; 667 #ifndef __KERNEL__ 668 retry: 669 #endif 670 memset(sb, 0, sizeof(*sb)); 671 sb->mode = BLK_OPEN_READ; 672 sb->have_bio = true; 673 sb->holder = kmalloc(1, GFP_KERNEL); 674 if (!sb->holder) 675 return -ENOMEM; 676 677 #ifndef __KERNEL__ 678 if (opt_get(*opts, direct_io) == false) 679 sb->mode |= BLK_OPEN_BUFFERED; 680 #endif 681 682 if (!opt_get(*opts, noexcl)) 683 sb->mode |= BLK_OPEN_EXCL; 684 685 if (!opt_get(*opts, nochanges)) 686 sb->mode |= BLK_OPEN_WRITE; 687 688 sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); 689 if (IS_ERR(sb->bdev) && 690 PTR_ERR(sb->bdev) == -EACCES && 691 opt_get(*opts, read_only)) { 692 sb->mode &= ~BLK_OPEN_WRITE; 693 694 sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); 695 if (!IS_ERR(sb->bdev)) 696 opt_set(*opts, nochanges, true); 697 } 698 699 if (IS_ERR(sb->bdev)) { 700 ret = PTR_ERR(sb->bdev); 701 goto out; 702 } 703 704 ret = bch2_sb_realloc(sb, 0); 705 if (ret) { 706 prt_printf(&err, "error allocating memory for superblock"); 707 goto err; 708 } 709 710 if (bch2_fs_init_fault("read_super")) { 711 prt_printf(&err, "dynamic fault"); 712 ret = -EFAULT; 713 goto err; 714 } 715 716 ret = read_one_super(sb, offset, &err); 717 if (!ret) 718 goto got_super; 719 720 if (opt_defined(*opts, sb)) 721 goto err; 722 723 printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", 724 path, err.buf); 725 printbuf_reset(&err); 726 727 /* 728 * Error reading primary superblock - read location of backup 729 * superblocks: 730 */ 731 bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); 732 sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; 733 /* 734 * use sb buffer to read layout, since sb buffer is page aligned but 735 * layout won't be: 736 */ 737 bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); 738 739 ret = submit_bio_wait(sb->bio); 740 if (ret) { 741 prt_printf(&err, "IO error: %i", ret); 742 goto err; 743 } 744 745 memcpy(&layout, sb->sb, sizeof(layout)); 746 ret = validate_sb_layout(&layout, &err); 747 if (ret) 748 goto err; 749 750 for (i = layout.sb_offset; 751 i < layout.sb_offset + layout.nr_superblocks; i++) { 752 offset = le64_to_cpu(*i); 753 754 if (offset == opt_get(*opts, sb)) 755 continue; 756 757 ret = read_one_super(sb, offset, &err); 758 if (!ret) 759 goto got_super; 760 } 761 762 goto err; 763 764 got_super: 765 if (le16_to_cpu(sb->sb->block_size) << 9 < 766 bdev_logical_block_size(sb->bdev) && 767 opt_get(*opts, direct_io)) { 768 #ifndef __KERNEL__ 769 opt_set(*opts, direct_io, false); 770 bch2_free_super(sb); 771 goto retry; 772 #endif 773 prt_printf(&err, "block size (%u) smaller than device block size (%u)", 774 le16_to_cpu(sb->sb->block_size) << 9, 775 bdev_logical_block_size(sb->bdev)); 776 ret = -BCH_ERR_block_size_too_small; 777 goto err; 778 } 779 780 ret = 0; 781 sb->have_layout = true; 782 783 ret = bch2_sb_validate(sb, &err, READ); 784 if (ret) { 785 printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", 786 path, err.buf); 787 goto err_no_print; 788 } 789 out: 790 printbuf_exit(&err); 791 return ret; 792 err: 793 printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", 794 path, err.buf); 795 err_no_print: 796 bch2_free_super(sb); 797 goto out; 798 } 799 800 /* write superblock: */ 801 802 static void write_super_endio(struct bio *bio) 803 { 804 struct bch_dev *ca = bio->bi_private; 805 806 /* XXX: return errors directly */ 807 808 if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s", 809 bch2_blk_status_to_str(bio->bi_status))) 810 ca->sb_write_error = 1; 811 812 closure_put(&ca->fs->sb_write); 813 percpu_ref_put(&ca->io_ref); 814 } 815 816 static void read_back_super(struct bch_fs *c, struct bch_dev *ca) 817 { 818 struct bch_sb *sb = ca->disk_sb.sb; 819 struct bio *bio = ca->disk_sb.bio; 820 821 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); 822 bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); 823 bio->bi_end_io = write_super_endio; 824 bio->bi_private = ca; 825 bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); 826 827 this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], 828 bio_sectors(bio)); 829 830 percpu_ref_get(&ca->io_ref); 831 closure_bio_submit(bio, &c->sb_write); 832 } 833 834 static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) 835 { 836 struct bch_sb *sb = ca->disk_sb.sb; 837 struct bio *bio = ca->disk_sb.bio; 838 839 sb->offset = sb->layout.sb_offset[idx]; 840 841 SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false)); 842 sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), 843 null_nonce(), sb); 844 845 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 846 bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); 847 bio->bi_end_io = write_super_endio; 848 bio->bi_private = ca; 849 bch2_bio_map(bio, sb, 850 roundup((size_t) vstruct_bytes(sb), 851 bdev_logical_block_size(ca->disk_sb.bdev))); 852 853 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], 854 bio_sectors(bio)); 855 856 percpu_ref_get(&ca->io_ref); 857 closure_bio_submit(bio, &c->sb_write); 858 } 859 860 int bch2_write_super(struct bch_fs *c) 861 { 862 struct closure *cl = &c->sb_write; 863 struct bch_dev *ca; 864 struct printbuf err = PRINTBUF; 865 unsigned i, sb = 0, nr_wrote; 866 struct bch_devs_mask sb_written; 867 bool wrote, can_mount_without_written, can_mount_with_written; 868 unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; 869 int ret = 0; 870 871 trace_and_count(c, write_super, c, _RET_IP_); 872 873 if (c->opts.very_degraded) 874 degraded_flags |= BCH_FORCE_IF_LOST; 875 876 lockdep_assert_held(&c->sb_lock); 877 878 closure_init_stack(cl); 879 memset(&sb_written, 0, sizeof(sb_written)); 880 881 /* Make sure we're using the new magic numbers: */ 882 c->disk_sb.sb->magic = BCHFS_MAGIC; 883 c->disk_sb.sb->layout.magic = BCHFS_MAGIC; 884 885 le64_add_cpu(&c->disk_sb.sb->seq, 1); 886 887 if (test_bit(BCH_FS_ERROR, &c->flags)) 888 SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); 889 if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags)) 890 SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); 891 892 SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); 893 894 bch2_sb_counters_from_cpu(c); 895 bch_members_cpy_v2_v1(&c->disk_sb); 896 897 for_each_online_member(ca, c, i) 898 bch2_sb_from_fs(c, ca); 899 900 for_each_online_member(ca, c, i) { 901 printbuf_reset(&err); 902 903 ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); 904 if (ret) { 905 bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); 906 percpu_ref_put(&ca->io_ref); 907 goto out; 908 } 909 } 910 911 if (c->opts.nochanges) 912 goto out; 913 914 /* 915 * Defer writing the superblock until filesystem initialization is 916 * complete - don't write out a partly initialized superblock: 917 */ 918 if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) 919 goto out; 920 921 for_each_online_member(ca, c, i) { 922 __set_bit(ca->dev_idx, sb_written.d); 923 ca->sb_write_error = 0; 924 } 925 926 for_each_online_member(ca, c, i) 927 read_back_super(c, ca); 928 closure_sync(cl); 929 930 for_each_online_member(ca, c, i) { 931 if (ca->sb_write_error) 932 continue; 933 934 if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { 935 bch2_fs_fatal_error(c, 936 "Superblock write was silently dropped! (seq %llu expected %llu)", 937 le64_to_cpu(ca->sb_read_scratch->seq), 938 ca->disk_sb.seq); 939 percpu_ref_put(&ca->io_ref); 940 ret = -BCH_ERR_erofs_sb_err; 941 goto out; 942 } 943 944 if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { 945 bch2_fs_fatal_error(c, 946 "Superblock modified by another process (seq %llu expected %llu)", 947 le64_to_cpu(ca->sb_read_scratch->seq), 948 ca->disk_sb.seq); 949 percpu_ref_put(&ca->io_ref); 950 ret = -BCH_ERR_erofs_sb_err; 951 goto out; 952 } 953 } 954 955 do { 956 wrote = false; 957 for_each_online_member(ca, c, i) 958 if (!ca->sb_write_error && 959 sb < ca->disk_sb.sb->layout.nr_superblocks) { 960 write_one_super(c, ca, sb); 961 wrote = true; 962 } 963 closure_sync(cl); 964 sb++; 965 } while (wrote); 966 967 for_each_online_member(ca, c, i) { 968 if (ca->sb_write_error) 969 __clear_bit(ca->dev_idx, sb_written.d); 970 else 971 ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); 972 } 973 974 nr_wrote = dev_mask_nr(&sb_written); 975 976 can_mount_with_written = 977 bch2_have_enough_devs(c, sb_written, degraded_flags, false); 978 979 for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) 980 sb_written.d[i] = ~sb_written.d[i]; 981 982 can_mount_without_written = 983 bch2_have_enough_devs(c, sb_written, degraded_flags, false); 984 985 /* 986 * If we would be able to mount _without_ the devices we successfully 987 * wrote superblocks to, we weren't able to write to enough devices: 988 * 989 * Exception: if we can mount without the successes because we haven't 990 * written anything (new filesystem), we continue if we'd be able to 991 * mount with the devices we did successfully write to: 992 */ 993 if (bch2_fs_fatal_err_on(!nr_wrote || 994 !can_mount_with_written || 995 (can_mount_without_written && 996 !can_mount_with_written), c, 997 "Unable to write superblock to sufficient devices (from %ps)", 998 (void *) _RET_IP_)) 999 ret = -1; 1000 out: 1001 /* Make new options visible after they're persistent: */ 1002 bch2_sb_update(c); 1003 printbuf_exit(&err); 1004 return ret; 1005 } 1006 1007 void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) 1008 { 1009 mutex_lock(&c->sb_lock); 1010 if (!(c->sb.features & (1ULL << feat))) { 1011 c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); 1012 1013 bch2_write_super(c); 1014 } 1015 mutex_unlock(&c->sb_lock); 1016 } 1017 1018 /* Downgrade if superblock is at a higher version than currently supported: */ 1019 void bch2_sb_maybe_downgrade(struct bch_fs *c) 1020 { 1021 lockdep_assert_held(&c->sb_lock); 1022 1023 /* 1024 * Downgrade, if superblock is at a higher version than currently 1025 * supported: 1026 */ 1027 if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) 1028 SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); 1029 if (c->sb.version > bcachefs_metadata_version_current) 1030 c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); 1031 if (c->sb.version_min > bcachefs_metadata_version_current) 1032 c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); 1033 c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); 1034 } 1035 1036 void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) 1037 { 1038 lockdep_assert_held(&c->sb_lock); 1039 1040 c->disk_sb.sb->version = cpu_to_le16(new_version); 1041 c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); 1042 } 1043 1044 static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { 1045 #define x(f, nr) \ 1046 [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, 1047 BCH_SB_FIELDS() 1048 #undef x 1049 }; 1050 1051 static const struct bch_sb_field_ops bch2_sb_field_null_ops; 1052 1053 static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) 1054 { 1055 return likely(type < ARRAY_SIZE(bch2_sb_field_ops)) 1056 ? bch2_sb_field_ops[type] 1057 : &bch2_sb_field_null_ops; 1058 } 1059 1060 static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, 1061 struct printbuf *err) 1062 { 1063 unsigned type = le32_to_cpu(f->type); 1064 struct printbuf field_err = PRINTBUF; 1065 const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); 1066 int ret; 1067 1068 ret = ops->validate ? ops->validate(sb, f, &field_err) : 0; 1069 if (ret) { 1070 prt_printf(err, "Invalid superblock section %s: %s", 1071 bch2_sb_fields[type], field_err.buf); 1072 prt_newline(err); 1073 bch2_sb_field_to_text(err, sb, f); 1074 } 1075 1076 printbuf_exit(&field_err); 1077 return ret; 1078 } 1079 1080 void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, 1081 struct bch_sb_field *f) 1082 { 1083 unsigned type = le32_to_cpu(f->type); 1084 const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); 1085 1086 if (!out->nr_tabstops) 1087 printbuf_tabstop_push(out, 32); 1088 1089 if (type < BCH_SB_FIELD_NR) 1090 prt_printf(out, "%s", bch2_sb_fields[type]); 1091 else 1092 prt_printf(out, "(unknown field %u)", type); 1093 1094 prt_printf(out, " (size %zu):", vstruct_bytes(f)); 1095 prt_newline(out); 1096 1097 if (ops->to_text) { 1098 printbuf_indent_add(out, 2); 1099 ops->to_text(out, sb, f); 1100 printbuf_indent_sub(out, 2); 1101 } 1102 } 1103 1104 void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) 1105 { 1106 unsigned i; 1107 1108 prt_printf(out, "Type: %u", l->layout_type); 1109 prt_newline(out); 1110 1111 prt_str(out, "Superblock max size: "); 1112 prt_units_u64(out, 512 << l->sb_max_size_bits); 1113 prt_newline(out); 1114 1115 prt_printf(out, "Nr superblocks: %u", l->nr_superblocks); 1116 prt_newline(out); 1117 1118 prt_str(out, "Offsets: "); 1119 for (i = 0; i < l->nr_superblocks; i++) { 1120 if (i) 1121 prt_str(out, ", "); 1122 prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i])); 1123 } 1124 prt_newline(out); 1125 } 1126 1127 void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, 1128 bool print_layout, unsigned fields) 1129 { 1130 struct bch_sb_field *f; 1131 u64 fields_have = 0; 1132 unsigned nr_devices = 0; 1133 1134 if (!out->nr_tabstops) 1135 printbuf_tabstop_push(out, 44); 1136 1137 for (int i = 0; i < sb->nr_devices; i++) 1138 nr_devices += bch2_dev_exists(sb, i); 1139 1140 prt_printf(out, "External UUID:"); 1141 prt_tab(out); 1142 pr_uuid(out, sb->user_uuid.b); 1143 prt_newline(out); 1144 1145 prt_printf(out, "Internal UUID:"); 1146 prt_tab(out); 1147 pr_uuid(out, sb->uuid.b); 1148 prt_newline(out); 1149 1150 prt_str(out, "Device index:"); 1151 prt_tab(out); 1152 prt_printf(out, "%u", sb->dev_idx); 1153 prt_newline(out); 1154 1155 prt_str(out, "Label:"); 1156 prt_tab(out); 1157 prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); 1158 prt_newline(out); 1159 1160 prt_str(out, "Version:"); 1161 prt_tab(out); 1162 bch2_version_to_text(out, le16_to_cpu(sb->version)); 1163 prt_newline(out); 1164 1165 prt_str(out, "Version upgrade complete:"); 1166 prt_tab(out); 1167 bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); 1168 prt_newline(out); 1169 1170 prt_printf(out, "Oldest version on disk:"); 1171 prt_tab(out); 1172 bch2_version_to_text(out, le16_to_cpu(sb->version_min)); 1173 prt_newline(out); 1174 1175 prt_printf(out, "Created:"); 1176 prt_tab(out); 1177 if (sb->time_base_lo) 1178 pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); 1179 else 1180 prt_printf(out, "(not set)"); 1181 prt_newline(out); 1182 1183 prt_printf(out, "Sequence number:"); 1184 prt_tab(out); 1185 prt_printf(out, "%llu", le64_to_cpu(sb->seq)); 1186 prt_newline(out); 1187 1188 prt_printf(out, "Superblock size:"); 1189 prt_tab(out); 1190 prt_printf(out, "%zu", vstruct_bytes(sb)); 1191 prt_newline(out); 1192 1193 prt_printf(out, "Clean:"); 1194 prt_tab(out); 1195 prt_printf(out, "%llu", BCH_SB_CLEAN(sb)); 1196 prt_newline(out); 1197 1198 prt_printf(out, "Devices:"); 1199 prt_tab(out); 1200 prt_printf(out, "%u", nr_devices); 1201 prt_newline(out); 1202 1203 prt_printf(out, "Sections:"); 1204 vstruct_for_each(sb, f) 1205 fields_have |= 1 << le32_to_cpu(f->type); 1206 prt_tab(out); 1207 prt_bitflags(out, bch2_sb_fields, fields_have); 1208 prt_newline(out); 1209 1210 prt_printf(out, "Features:"); 1211 prt_tab(out); 1212 prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); 1213 prt_newline(out); 1214 1215 prt_printf(out, "Compat features:"); 1216 prt_tab(out); 1217 prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); 1218 prt_newline(out); 1219 1220 prt_newline(out); 1221 prt_printf(out, "Options:"); 1222 prt_newline(out); 1223 printbuf_indent_add(out, 2); 1224 { 1225 enum bch_opt_id id; 1226 1227 for (id = 0; id < bch2_opts_nr; id++) { 1228 const struct bch_option *opt = bch2_opt_table + id; 1229 1230 if (opt->get_sb != BCH2_NO_SB_OPT) { 1231 u64 v = bch2_opt_from_sb(sb, id); 1232 1233 prt_printf(out, "%s:", opt->attr.name); 1234 prt_tab(out); 1235 bch2_opt_to_text(out, NULL, sb, opt, v, 1236 OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); 1237 prt_newline(out); 1238 } 1239 } 1240 } 1241 1242 printbuf_indent_sub(out, 2); 1243 1244 if (print_layout) { 1245 prt_newline(out); 1246 prt_printf(out, "layout:"); 1247 prt_newline(out); 1248 printbuf_indent_add(out, 2); 1249 bch2_sb_layout_to_text(out, &sb->layout); 1250 printbuf_indent_sub(out, 2); 1251 } 1252 1253 vstruct_for_each(sb, f) 1254 if (fields & (1 << le32_to_cpu(f->type))) { 1255 prt_newline(out); 1256 bch2_sb_field_to_text(out, sb, f); 1257 } 1258 } 1259