// SPDX-License-Identifier: GPL-2.0

/*
 * bcachefs journal read path: read journal entries from every member
 * device at recovery time, validate them, and collate them (deduplicated,
 * indexed by sequence number) into c->journal_entries for replay.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "sb-clean.h"
#include "trace.h"

/* Nonce for checksumming/encrypting a jset, derived from its sequence number. */
static struct nonce journal_nonce(const struct jset *jset)
{
	return (struct nonce) {{
		[0] = 0,
		[1] = ((__le32 *) &jset->seq)[0],
		[2] = ((__le32 *) &jset->seq)[1],
		[3] = BCH_NONCE_JOURNAL,
	}};
}

/* True if @j has a checksum type we understand and the checksum verifies. */
static bool jset_csum_good(struct bch_fs *c, struct jset *j)
{
	return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) &&
		!bch2_crc_cmp(j->csum,
			      csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j));
}

/*
 * Map a journal sequence number to an index in the c->journal_entries
 * genradix: stored relative to journal_entries_base_seq, masked to 31 bits
 * (genradixes are indexed by ulong, not u64).
 */
static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
{
	return (seq - c->journal_entries_base_seq) & (~0U >> 1);
}

/* Remove @i from c->journal_entries and free it. */
static void __journal_replay_free(struct bch_fs *c,
				  struct journal_replay *i)
{
	struct journal_replay **p =
		genradix_ptr(&c->journal_entries,
			     journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));

	BUG_ON(*p != i);
	*p = NULL;
	kvpfree(i, offsetof(struct journal_replay, j) +
		vstruct_bytes(&i->j));
}

/*
 * Drop a journal entry we no longer need: with read_entire_journal set we
 * keep the entry around (marked ignored) instead of freeing it.
 */
static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
{
	i->ignore = true;

	if (!c->opts.read_entire_journal)
		__journal_replay_free(c, i);
}

/* Shared state for the per-device journal read closures. */
struct journal_list {
	struct closure		cl;	/* parent closure all device reads join */
	u64			last_seq; /* highest last_seq seen so far */
	struct mutex		lock;	/* protects last_seq, ret, journal_entries */
	int			ret;	/* first error from any device read */
};

#define JOURNAL_ENTRY_ADD_OK		0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
			     struct journal_ptr entry_ptr,
			     struct journal_list *jlist, struct jset *j)
{
	struct genradix_iter iter;
	struct journal_replay **_i, *i, *dup;
	struct journal_ptr *ptr;
	size_t bytes = vstruct_bytes(j);
	u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
	int ret = JOURNAL_ENTRY_ADD_OK;

	/* Is this entry older than the range we need? */
	if (!c->opts.read_entire_journal &&
	    le64_to_cpu(j->seq) < jlist->last_seq)
		return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;

	/*
	 * genradixes are indexed by a ulong, not a u64, so we can't index them
	 * by sequence number directly: Assume instead that they will all fall
	 * within the range of +-2billion of the first one we find.
	 */
	if (!c->journal_entries_base_seq)
		c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);

	/* Drop entries we don't need anymore */
	if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
		genradix_for_each_from(&c->journal_entries, iter, _i,
				       journal_entry_radix_idx(c, jlist->last_seq)) {
			i = *_i;

			if (!i || i->ignore)
				continue;

			if (le64_to_cpu(i->j.seq) >= last_seq)
				break;
			journal_replay_free(c, i);
		}
	}

	jlist->last_seq = max(jlist->last_seq, last_seq);

	_i = genradix_ptr_alloc(&c->journal_entries,
				journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
				GFP_KERNEL);
	if (!_i)
		return -BCH_ERR_ENOMEM_journal_entry_add;

	/*
	 * Duplicate journal entries? If so we want the one that didn't have a
	 * checksum error:
	 */
	dup = *_i;
	if (dup) {
		/* Byte-identical copy: just record the extra pointer */
		if (bytes == vstruct_bytes(&dup->j) &&
		    !memcmp(j, &dup->j, bytes)) {
			i = dup;
			goto found;
		}

		/* New copy failed checksum: keep the existing one */
		if (!entry_ptr.csum_good) {
			i = dup;
			goto found;
		}

		/* Existing copy failed checksum: replace it with this one */
		if (!dup->csum_good)
			goto replace;

		/* Both checksums good but contents differ — report and keep dup */
		fsck_err(c, journal_entry_replicas_data_mismatch,
			 "found duplicate but non identical journal entries (seq %llu)",
			 le64_to_cpu(j->seq));
		i = dup;
		goto found;
	}
replace:
	i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
	if (!i)
		return -BCH_ERR_ENOMEM_journal_entry_add;

	i->nr_ptrs	= 0;
	i->csum_good	= entry_ptr.csum_good;
	i->ignore	= false;
	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
	i->ptrs[i->nr_ptrs++] = entry_ptr;

	if (dup) {
		/* Clamp so the memcpy below fits in i->ptrs (i already holds one) */
		if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
			bch_err(c, "found too many copies of journal entry %llu",
				le64_to_cpu(i->j.seq));
			dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
		}

		/* The first ptr should represent the jset we kept: */
		memcpy(i->ptrs + i->nr_ptrs,
		       dup->ptrs,
		       sizeof(dup->ptrs[0]) * dup->nr_ptrs);
		i->nr_ptrs += dup->nr_ptrs;
		__journal_replay_free(c, dup);
	}

	*_i = i;
	return 0;
found:
	for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
		if (ptr->dev == ca->dev_idx) {
			bch_err(c, "duplicate journal entry %llu on same device",
				le64_to_cpu(i->j.seq));
			goto out;
		}
	}

	if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
		bch_err(c, "found too many copies of journal entry %llu",
			le64_to_cpu(i->j.seq));
		goto out;
	}

	i->ptrs[i->nr_ptrs++] = entry_ptr;
out:
fsck_err:
	return ret;
}

/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
	struct jset_entry *entry;

	for (entry = start; entry != end; entry = vstruct_next(entry))
		memset(entry, 0, sizeof(*entry));
}

#define JOURNAL_ENTRY_REREAD	5
#define JOURNAL_ENTRY_NONE	6
#define JOURNAL_ENTRY_BAD	7

/*
 * Prefix an error message with version/type/seq/offset context.
 * @jset == NULL means the entry came from the superblock (sb clean section).
 */
static void journal_entry_err_msg(struct printbuf *out,
				  u32 version,
				  struct jset *jset,
				  struct jset_entry *entry)
{
	prt_str(out, "invalid journal entry, version=");
	bch2_version_to_text(out, version);

	if (entry) {
		prt_str(out, " type=");
		prt_str(out, bch2_jset_entry_types[entry->type]);
	}

	if (!jset) {
		prt_printf(out, " in superblock");
	} else {

		prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));

		if (entry)
			prt_printf(out, " offset=%zi/%u",
				   (u64 *) entry - jset->_data,
				   le32_to_cpu(jset->u64s));
	}

	prt_str(out, ": ");
}

/*
 * Report a journal-entry validation error.  On the read side this is a
 * fixable fsck error; on the write side it marks the fs inconsistent.
 *
 * NOTE: implicitly uses `flags`, `ret` and the `fsck_err:` label from the
 * enclosing function's scope (as does mustfix_fsck_err()).
 */
#define journal_entry_err(c, version, jset, entry, _err, msg, ...)	\
({									\
	struct printbuf _buf = PRINTBUF;				\
									\
	journal_entry_err_msg(&_buf, version, jset, entry);		\
	prt_printf(&_buf, msg, ##__VA_ARGS__);				\
									\
	switch (flags & BKEY_INVALID_WRITE) {				\
	case READ:							\
		mustfix_fsck_err(c, _err, "%s", _buf.buf);		\
		break;							\
	case WRITE:							\
		bch2_sb_error_count(c, BCH_FSCK_ERR_##_err);		\
		bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
		if (bch2_fs_inconsistent(c)) {				\
			ret = -BCH_ERR_fsck_errors_not_fixed;		\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
									\
	printbuf_exit(&_buf);						\
	true;								\
})

#define journal_entry_err_on(cond, ...)					\
	((cond) ? journal_entry_err(__VA_ARGS__) : false)

#define FSCK_DELETED_KEY	5

/*
 * Validate one bkey inside a journal entry.  Bad keys are deleted in place
 * (entry truncated or key memmove'd out, tail nulled) and FSCK_DELETED_KEY
 * is returned so the caller re-examines the same position.
 */
static int journal_validate_key(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned level, enum btree_id btree_id,
				struct bkey_i *k,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	int write = flags & BKEY_INVALID_WRITE;
	void *next = vstruct_next(entry);
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	if (journal_entry_err_on(!k->k.u64s,
				 c, version, jset, entry,
				 journal_entry_bkey_u64s_0,
				 "k->u64s 0")) {
		/* Truncate the entry at this key and null the remainder */
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on((void *) bkey_next(k) >
				 (void *) vstruct_next(entry),
				 c, version, jset, entry,
				 journal_entry_bkey_past_end,
				 "extends past end of journal entry")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
				 c, version, jset, entry,
				 journal_entry_bkey_bad_format,
				 "bad format %u", k->k.format)) {
		/* Delete just this key: shift the rest of the entry down */
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	/* On read, convert to current on-disk format before validating */
	if (!write)
		bch2_bkey_compat(level, btree_id, version, big_endian,
				 write, NULL, bkey_to_packed(k));

	if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
			      __btree_node_type(level, btree_id), write, &buf)) {
		/* Rebuild the message with full context plus the offending key */
		printbuf_reset(&buf);
		journal_entry_err_msg(&buf, version, jset, entry);
		prt_newline(&buf);
		printbuf_indent_add(&buf, 2);

		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
		prt_newline(&buf);
		bch2_bkey_invalid(c, bkey_i_to_s_c(k),
				  __btree_node_type(level, btree_id), write, &buf);

		mustfix_fsck_err(c, journal_entry_bkey_invalid,
				 "%s", buf.buf);

		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);

		printbuf_exit(&buf);
		return FSCK_DELETED_KEY;
	}

	/* On write, convert back to the on-disk format after validating */
	if (write)
		bch2_bkey_compat(level, btree_id, version, big_endian,
				 write, NULL, bkey_to_packed(k));
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/* Validate every key in a btree_keys entry; deleted keys are skipped in place. */
static int journal_entry_btree_keys_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian,
					     enum bkey_invalid_flags flags)
{
	struct bkey_i *k = entry->start;

	while (k != vstruct_last(entry)) {
		int ret = journal_validate_key(c, jset, entry,
					       entry->level,
					       entry->btree_id,
					       k, version, big_endian,
					       flags|BKEY_INVALID_JOURNAL);
		/* FSCK_DELETED_KEY: the key was removed, re-check same position */
		if (ret == FSCK_DELETED_KEY)
			continue;
		/* note: other non-zero returns are not propagated here */

		k = bkey_next(k);
	}

	return 0;
}

static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	struct bkey_i *k;
	bool first = true;

	jset_entry_for_each_key(entry, k) {
		if (!first) {
			prt_newline(out);
			prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
		}
		prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
		bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
		first = false;
	}
}

/* A btree_root entry must contain exactly one key. */
static int journal_entry_btree_root_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian,
					     enum bkey_invalid_flags flags)
{
	struct bkey_i *k = entry->start;
	int ret = 0;

	if (journal_entry_err_on(!entry->u64s ||
				 le16_to_cpu(entry->u64s) != k->k.u64s,
				 c, version, jset, entry,
				 journal_entry_btree_root_bad_size,
				 "invalid btree root journal entry: wrong number of keys")) {
		void *next = vstruct_next(entry);
		/*
		 * we don't want to null out this jset_entry,
		 * just the contents, so that later we can tell
		 * we were _supposed_ to have a btree root
		 */
		entry->u64s = 0;
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
				    version, big_endian, flags);
fsck_err:
	return ret;
}

static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    enum bkey_invalid_flags flags)
{
	/* obsolete, don't care: */
	return 0;
}

static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
}

/* Old-style (v1) blacklist entry: a single seq, so u64s must be 1. */
static int journal_entry_blacklist_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    enum bkey_invalid_flags flags)
{
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
				 c, version, jset, entry,
				 journal_entry_blacklist_bad_size,
				 "invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
fsck_err:
	return ret;
}

static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_blacklist *bl =
		container_of(entry, struct jset_entry_blacklist, entry);

	prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
}

/* v2 blacklist entry: a [start, end] seq range, u64s must be 2 and start <= end. */
static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
					       struct jset *jset,
					       struct jset_entry *entry,
					       unsigned version, int big_endian,
					       enum bkey_invalid_flags flags)
{
	struct jset_entry_blacklist_v2 *bl_entry;
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_bad_size,
				 "invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
				 le64_to_cpu(bl_entry->end),
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_start_past_end,
				 "invalid journal seq blacklist entry: start > end")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
out:
fsck_err:
	return ret;
}

static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
					       struct jset_entry *entry)
{
	struct jset_entry_blacklist_v2 *bl =
		container_of(entry, struct jset_entry_blacklist_v2, entry);

	prt_printf(out, "start=%llu end=%llu",
		   le64_to_cpu(bl->start),
		   le64_to_cpu(bl->end));
}

/* fs usage entry: only a minimum-size check. */
static int journal_entry_usage_validate(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					unsigned version, int big_endian,
					enum bkey_invalid_flags flags)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u),
				 c, version, jset, entry,
				 journal_entry_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);

	/* btree_id is overloaded as the fs usage type for this entry type */
	prt_printf(out, "type=%s v=%llu",
		   bch2_fs_usage_types[u->entry.btree_id],
		   le64_to_cpu(u->v));
}

/* data usage entry: size check (including variable-length replicas list). */
static int journal_entry_data_usage_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian,
					     enum bkey_invalid_flags flags)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	struct printbuf err = PRINTBUF;
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u) ||
				 bytes < sizeof(*u) + u->r.nr_devs,
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err),
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: %s", err.buf)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}
out:
fsck_err:
	printbuf_exit(&err);
	return ret;
}

static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);

	bch2_replicas_entry_to_text(out, &u->r);
	prt_printf(out, "=%llu", le64_to_cpu(u->v));
}

/* clock entry: fixed size, rw must be 0 (read) or 1 (write). */
static int journal_entry_clock_validate(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					unsigned version, int big_endian,
					enum bkey_invalid_flags flags)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes != sizeof(*clock),
				 c, version, jset, entry,
				 journal_entry_clock_bad_size,
				 "bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(clock->rw > 1,
				 c, version, jset, entry,
				 journal_entry_clock_bad_rw,
				 "bad rw")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);

	prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
}

/* dev usage entry: minimum size, valid device index, zero padding. */
static int journal_entry_dev_usage_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    enum bkey_invalid_flags flags)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	unsigned expected = sizeof(*u);
	unsigned dev;
	int ret = 0;

	if (journal_entry_err_on(bytes < expected,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_size,
				 "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	dev = le32_to_cpu(u->dev);

	if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_dev,
				 "bad dev")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(u->pad,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_pad,
				 "bad pad")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);

	prt_printf(out, "dev=%u", le32_to_cpu(u->dev));

	for (i = 0; i < nr_types; i++) {
		if (i < BCH_DATA_NR)
			prt_printf(out, " %s", bch2_data_types[i]);
		else
			prt_printf(out, " (unknown data type %u)", i);
		prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
			   le64_to_cpu(u->d[i].buckets),
			   le64_to_cpu(u->d[i].sectors),
			   le64_to_cpu(u->d[i].fragmented));
	}

	prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
}

/* log entries carry freeform text; nothing to validate. */
static int journal_entry_log_validate(struct bch_fs *c,
				      struct jset *jset,
				      struct jset_entry *entry,
				      unsigned version, int big_endian,
				      enum bkey_invalid_flags flags)
{
	return 0;
}

static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
				      struct jset_entry *entry)
{
	struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
	unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);

	prt_printf(out, "%.*s", bytes, l->d);
}

/* overwrite entries hold bkeys, validated like btree_keys (always READ mode). */
static int journal_entry_overwrite_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    enum bkey_invalid_flags flags)
{
	return journal_entry_btree_keys_validate(c, jset, entry,
						 version, big_endian, READ);
}

static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

/* Per-entry-type vtable, indexed by BCH_JSET_ENTRY_* below. */
struct jset_entry_ops {
	int (*validate)(struct bch_fs *, struct jset *,
			struct jset_entry *, unsigned, int,
			enum bkey_invalid_flags);
	void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};

static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)						\
	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
		.validate	= journal_entry_##f##_validate,	\
		.to_text	= journal_entry_##f##_to_text,	\
	},
	BCH_JSET_ENTRY_TYPES()
#undef x
};

/* Dispatch validation by entry type; unknown types are silently accepted. */
int bch2_journal_entry_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bkey_invalid_flags flags)
{
	return entry->type < BCH_JSET_ENTRY_NR
		? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
							    version, big_endian, flags)
		: 0;
}

void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
				struct jset_entry *entry)
{
	if (entry->type < BCH_JSET_ENTRY_NR) {
		prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
		bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
	} else {
		prt_printf(out, "(unknown type %u)", entry->type);
	}
}

/* Validate every entry in a jset; truncates the jset at a malformed entry. */
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
				 enum bkey_invalid_flags flags)
{
	struct jset_entry *entry;
	unsigned version = le32_to_cpu(jset->version);
	int ret = 0;

	vstruct_for_each(jset, entry) {
		if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
					 c, version, jset, entry,
					 journal_entry_past_jset_end,
					 "journal entry extends past end of jset")) {
			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
			break;
		}

		ret = bch2_journal_entry_validate(c, jset, entry,
						  version, JSET_BIG_ENDIAN(jset), flags);
		if (ret)
			break;
	}
fsck_err:
	return ret;
}

/* Full validation of a jset (after checksum/decrypt); see also jset_validate_early(). */
static int jset_validate(struct bch_fs *c,
			 struct bch_dev *ca,
			 struct jset *jset, u64 sector,
			 enum bkey_invalid_flags flags)
{
	unsigned version;
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version),
				 c, version, jset, NULL,
				 jset_unsupported_version,
				 "%s sector %llu seq %llu: incompatible journal entry version %u.%u",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq),
				 BCH_VERSION_MAJOR(version),
				 BCH_VERSION_MINOR(version))) {
		/* don't try to continue: */
		return -EINVAL;
	}

	if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
				 c, version, jset, NULL,
				 jset_unknown_csum,
				 "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq),
				 JSET_CSUM_TYPE(jset)))
		ret = JOURNAL_ENTRY_BAD;

	/* last_seq is ignored when JSET_NO_FLUSH is true */
	if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
				 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
				 c, version, jset, NULL,
				 jset_last_seq_newer_than_seq,
				 "invalid journal entry: last_seq > seq (%llu > %llu)",
				 le64_to_cpu(jset->last_seq),
				 le64_to_cpu(jset->seq))) {
		jset->last_seq = jset->seq;
		return JOURNAL_ENTRY_BAD;
	}

	/* note: this overwrites a JOURNAL_ENTRY_BAD set by the csum check above */
	ret = jset_validate_entries(c, jset, flags);
fsck_err:
	return ret;
}

/*
 * Cheap validation done while scanning a bucket, possibly on a partial read:
 * checks magic/version and whether the full jset fits in what we've read so
 * far (JOURNAL_ENTRY_REREAD means: read more and retry).
 */
static int jset_validate_early(struct bch_fs *c,
			       struct bch_dev *ca,
			       struct jset *jset, u64 sector,
			       unsigned bucket_sectors_left,
			       unsigned sectors_read)
{
	size_t bytes = vstruct_bytes(jset);
	unsigned version;
	/* referenced implicitly by the journal_entry_err_on() macro below */
	enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version),
				 c, version, jset, NULL,
				 jset_unsupported_version,
				 "%s sector %llu seq %llu: unknown journal entry version %u.%u",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq),
				 BCH_VERSION_MAJOR(version),
				 BCH_VERSION_MINOR(version))) {
		/* don't try to continue: */
		return -EINVAL;
	}

	if (bytes > (sectors_read << 9) &&
	    sectors_read < bucket_sectors_left)
		return JOURNAL_ENTRY_REREAD;

	if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
				 c, version, jset, NULL,
				 jset_past_bucket_end,
				 "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq), bytes))
		/* Clip u64s so the jset fits within the bucket */
		le32_add_cpu(&jset->u64s,
			     -((bytes - (bucket_sectors_left << 9)) / 8));
fsck_err:
	return ret;
}

/* Reusable, growable buffer for reading journal entries from disk. */
struct journal_read_buf {
	void		*data;
	size_t		size;
};

/* Grow @b to at least @new_size (rounded up to a power of two). */
static int journal_read_buf_realloc(struct journal_read_buf *b,
				    size_t new_size)
{
	void *n;

	/* the bios are sized for this many pages, max: */
	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;

	new_size = roundup_pow_of_two(new_size);
	n = kvpmalloc(new_size, GFP_KERNEL);
	if (!n)
		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;

	kvpfree(b->data, b->size);
	b->data = n;
	b->size = new_size;
	return 0;
}

/*
 * Read and parse all journal entries in one bucket of @ca, adding each valid
 * entry to @jlist.  Reads are done in buf-sized chunks and re-done larger
 * when an entry doesn't fit (JOURNAL_ENTRY_REREAD).
 */
static int journal_read_bucket(struct bch_dev *ca,
			       struct journal_read_buf *buf,
			       struct journal_list *jlist,
			       unsigned bucket)
{
	struct bch_fs *c = ca->fs;
	struct journal_device *ja = &ca->journal;
	struct jset *j = NULL;
	unsigned sectors, sectors_read = 0;
	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
	    end = offset + ca->mi.bucket_size;
	bool saw_bad = false, csum_good;
	int ret = 0;

	pr_debug("reading %u", bucket);

	while (offset < end) {
		if (!sectors_read) {
			struct bio *bio;
			unsigned nr_bvecs;
reread:
			sectors_read = min_t(unsigned,
					     end - offset, buf->size >> 9);
			nr_bvecs = buf_pages(buf->data, sectors_read << 9);

			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);

			bio->bi_iter.bi_sector = offset;
			bch2_bio_map(bio, buf->data, sectors_read << 9);

			ret = submit_bio_wait(bio);
			kfree(bio);

			if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
					       "journal read error: sector %llu",
					       offset) ||
			    bch2_meta_read_fault("journal")) {
				/*
				 * We don't error out of the recovery process
				 * here, since the relevant journal entry may be
				 * found on a different device, and missing or
				 * no journal entries will be handled later
				 */
				return 0;
			}

			j = buf->data;
		}

		ret = jset_validate_early(c, ca, j, offset,
					  end - offset, sectors_read);
		switch (ret) {
		case 0:
			sectors = vstruct_sectors(j, c->block_bits);
			break;
		case JOURNAL_ENTRY_REREAD:
			/* entry larger than buffer: grow it and re-read */
			if (vstruct_bytes(j) > buf->size) {
				ret = journal_read_buf_realloc(buf,
							       vstruct_bytes(j));
				if (ret)
					return ret;
			}
			goto reread;
		case JOURNAL_ENTRY_NONE:
			if (!saw_bad)
				return 0;
			/*
			 * On checksum error we don't really trust the size
			 * field of the journal entry we read, so try reading
			 * again at next block boundary:
			 */
			sectors = block_sectors(c);
			goto next_block;
		default:
			return ret;
		}

		/*
		 * This happens sometimes if we don't have discards on -
		 * when we've partially overwritten a bucket with new
		 * journal entries. We don't need the rest of the
		 * bucket:
		 */
		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
			return 0;

		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

		csum_good = jset_csum_good(c, j);
		if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
				       "journal checksum error"))
			saw_bad = true;

		/* Decrypt in place (bch2_encrypt is symmetric) before use */
		ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
				   j->encrypted_start,
				   vstruct_end(j) - (void *) j->encrypted_start);
		bch2_fs_fatal_err_on(ret, c,
				     "error decrypting journal entry: %i", ret);

		mutex_lock(&jlist->lock);
		ret = journal_entry_add(c, ca, (struct journal_ptr) {
					.csum_good	= csum_good,
					.dev		= ca->dev_idx,
					.bucket		= bucket,
					.bucket_offset	= offset -
						bucket_to_sector(ca, ja->buckets[bucket]),
					.sector		= offset,
					}, jlist, j);
		mutex_unlock(&jlist->lock);

		switch (ret) {
		case JOURNAL_ENTRY_ADD_OK:
			break;
		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
			break;
		default:
			return ret;
		}
next_block:
		pr_debug("next");
		offset		+= sectors;
		sectors_read	-= sectors;
		j = ((void *) j) + (sectors << 9);
	}

	return 0;
}

/*
 * Closure callback: read this device's entire journal, then locate where
 * writing stopped (cur_idx/sectors_free) from the most recent entry found
 * on this device.
 */
static CLOSURE_CALLBACK(bch2_journal_read_device)
{
	closure_type(ja, struct journal_device, read);
	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
	struct bch_fs *c = ca->fs;
	struct journal_list *jlist =
		container_of(cl->parent, struct journal_list, cl);
	struct journal_replay *r, **_r;
	struct genradix_iter iter;
	struct journal_read_buf buf = { NULL, 0 };
	unsigned i;
	int ret = 0;

	if (!ja->nr)
		goto out;

	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
	if (ret)
		goto err;

	pr_debug("%u journal buckets", ja->nr);

	for (i = 0; i < ja->nr; i++) {
		ret = journal_read_bucket(ca, &buf, jlist, i);
		if (ret)
			goto err;
	}

	ja->sectors_free = ca->mi.bucket_size;

	/* Walk newest to oldest, find the last entry written to this device: */
	mutex_lock(&jlist->lock);
	genradix_for_each_reverse(&c->journal_entries, iter, _r) {
		r = *_r;

		if (!r)
			continue;

		for (i = 0; i < r->nr_ptrs; i++) {
			if (r->ptrs[i].dev == ca->dev_idx) {
				unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
					vstruct_sectors(&r->j, c->block_bits);

				ja->cur_idx = r->ptrs[i].bucket;
				ja->sectors_free = ca->mi.bucket_size - wrote;
				goto found;
			}
		}
	}
found:
	mutex_unlock(&jlist->lock);

	if (ja->bucket_seq[ja->cur_idx] &&
	    ja->sectors_free == ca->mi.bucket_size) {
#if 0
		/*
		 * Debug code for ZNS support, where we (probably) want to be
		 * correlated where we stopped in the journal to the zone write
		 * points:
		 */
		bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
		bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
		for (i = 0; i < 3; i++) {
			unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;

			bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
		}
#endif
		ja->sectors_free = 0;
	}

	/*
	 * Set dirty_idx to indicate the entire journal is full and needs to be
	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
	 * pinned when it first runs:
	 */
	ja->discard_idx = ja->dirty_idx_ondisk =
		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
	kvpfree(buf.data, buf.size);
	percpu_ref_put(&ca->io_ref);
	closure_return(cl);
	return;
err:
	mutex_lock(&jlist->lock);
	jlist->ret = ret;
	mutex_unlock(&jlist->lock);
	goto out;
}

void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
			       struct journal_replay *j)
{
	unsigned i;

	for (i = 0; i < j->nr_ptrs; i++) {
		struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
		u64 offset;

		/* NOTE(review): offset is computed but not printed below */
		div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);

		if (i)
			prt_printf(out, " ");
		prt_printf(out, "%u:%u:%u (sector %llu)",
			   j->ptrs[i].dev,
			   j->ptrs[i].bucket,
			   j->ptrs[i].bucket_offset,
			   j->ptrs[i].sector);
	}
}

/*
 * Top-level journal read: kick off a read closure per member device, then
 * determine the replay range [*last_seq, *blacklist_seq) and *start_seq,
 * dropping torn/unflushed/blacklisted entries.
 */
int bch2_journal_read(struct bch_fs *c,
		      u64 *last_seq,
		      u64 *blacklist_seq,
		      u64 *start_seq)
{
	struct journal_list jlist;
	struct journal_replay *i, **_i, *prev = NULL;
	struct genradix_iter radix_iter;
	struct bch_dev *ca;
	unsigned iter;
	struct printbuf buf = PRINTBUF;
	bool degraded = false, last_write_torn = false;
	u64 seq;
	int ret = 0;

	closure_init_stack(&jlist.cl);
	mutex_init(&jlist.lock);
	jlist.last_seq = 0;
	jlist.ret = 0;

	for_each_member_device(ca, c, iter) {
		if (!c->opts.fsck &&
		    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
			continue;

		if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
		     ca->mi.state == BCH_MEMBER_STATE_ro) &&
		    percpu_ref_tryget(&ca->io_ref))
			closure_call(&ca->journal.read,
				     bch2_journal_read_device,
				     system_unbound_wq,
				     &jlist.cl);
		else
			degraded = true;
	}

	closure_sync(&jlist.cl);

	if (jlist.ret)
		return jlist.ret;

	*last_seq	= 0;
	*start_seq	= 0;
	*blacklist_seq	= 0;

	/*
	 * Find most recent flush entry, and ignore newer non flush entries -
	 * those entries will be blacklisted:
	 */
	genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
		/* referenced implicitly by the journal_entry_err_on() macro below */
		enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL;

		i = *_i;

		if (!i || i->ignore)
			continue;

		if (!*start_seq)
			*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;

		if (JSET_NO_FLUSH(&i->j)) {
			i->ignore = true;
			continue;
		}

		/* Tolerate a single torn (bad-checksum) entry at the tail */
		if (!last_write_torn && !i->csum_good) {
			last_write_torn = true;
			i->ignore = true;
			continue;
		}

		if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
					 c, le32_to_cpu(i->j.version), &i->j, NULL,
					 jset_last_seq_newer_than_seq,
					 "invalid journal entry: last_seq > seq (%llu > %llu)",
					 le64_to_cpu(i->j.last_seq),
					 le64_to_cpu(i->j.seq)))
			i->j.last_seq = i->j.seq;

		*last_seq	= le64_to_cpu(i->j.last_seq);
		*blacklist_seq	= le64_to_cpu(i->j.seq) + 1;
		break;
	}

	if (!*start_seq) {
		bch_info(c, "journal read done, but no entries found");
		return 0;
	}

	if (!*last_seq) {
		fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
			 "journal read done, but no entries found after dropping non-flushes");
		return 0;
	}

	bch_info(c, "journal read done, replaying entries %llu-%llu",
		 *last_seq, *blacklist_seq - 1);

	if (*start_seq != *blacklist_seq)
		bch_info(c, "dropped unflushed entries %llu-%llu",
			 *blacklist_seq, *start_seq - 1);

	/* Drop blacklisted entries and entries older than last_seq: */
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		seq = le64_to_cpu(i->j.seq);
		if (seq < *last_seq) {
			journal_replay_free(c, i);
			continue;
		}

		if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
			fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
				    jset_seq_blacklisted,
				    "found blacklisted journal entry %llu", seq);
			i->ignore = true;
		}
	}

	/* Check for missing entries: */
	seq = *last_seq;
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (!i || i->ignore)
			continue;

		BUG_ON(seq > le64_to_cpu(i->j.seq));

		while (seq < le64_to_cpu(i->j.seq)) {
			u64 missing_start, missing_end;
			struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

			/* Skip over blacklisted seqs - those aren't missing: */
			while (seq < le64_to_cpu(i->j.seq) &&
			       bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (seq == le64_to_cpu(i->j.seq))
				break;

			missing_start = seq;

			/* Find the extent of the actually-missing range: */
			while (seq < le64_to_cpu(i->j.seq) &&
			       !bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (prev) {
				bch2_journal_ptrs_to_text(&buf1, c, prev);
				prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
			} else
				prt_printf(&buf1, "(none)");
			bch2_journal_ptrs_to_text(&buf2, c, i);

			missing_end = seq - 1;
			fsck_err(c, journal_entries_missing,
				 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
				 " prev at %s\n"
				 " next at %s",
				 missing_start, missing_end,
				 *last_seq, *blacklist_seq - 1,
				 buf1.buf, buf2.buf);

			printbuf_exit(&buf1);
			printbuf_exit(&buf2);
		}

		prev = i;
		seq++;
	}

	/*
	 * Validate surviving entries and make sure the superblock's replicas
	 * section records where the journal lives:
	 */
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		struct bch_replicas_padded replicas = {
			.e.data_type = BCH_DATA_journal,
			.e.nr_required = 1,
		};
		unsigned ptr;

		i = *_i;
		if (!i || i->ignore)
			continue;

		for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
			ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);

			if (!i->ptrs[ptr].csum_good)
				bch_err_dev_offset(ca, i->ptrs[ptr].sector,
						   "invalid journal checksum, seq %llu%s",
						   le64_to_cpu(i->j.seq),
						   i->csum_good ? " (had good copy on another device)" : "");
		}

		ret = jset_validate(c,
				    bch_dev_bkey_exists(c, i->ptrs[0].dev),
				    &i->j,
				    i->ptrs[0].sector,
				    READ);
		if (ret)
			goto err;

		for (ptr = 0; ptr < i->nr_ptrs; ptr++)
			replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;

		bch2_replicas_entry_sort(&replicas.e);

		printbuf_reset(&buf);
		bch2_replicas_entry_to_text(&buf, &replicas.e);

		/*
		 * Don't complain about missing replica entries if we're
		 * degraded - some devices weren't read:
		 */
		if (!degraded &&
		    !bch2_replicas_marked(c, &replicas.e) &&
		    (le64_to_cpu(i->j.seq) == *last_seq ||
		     fsck_err(c, journal_entry_replicas_not_marked,
			      "superblock not marked as containing replicas for journal entry %llu\n %s",
			      le64_to_cpu(i->j.seq), buf.buf))) {
			ret = bch2_mark_replicas(c, &replicas.e);
			if (ret)
				goto err;
		}
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/* journal write: */

/*
 * Append pointers to @w->key for up to @replicas_want replicas, picking
 * devices from @devs_sorted that have room in their current journal bucket.
 * Caller holds rcu_read_lock() (we rcu_dereference c->devs).
 */
static void __journal_write_alloc(struct journal *j,
				  struct journal_buf *w,
				  struct dev_alloc_list *devs_sorted,
				  unsigned sectors,
				  unsigned *replicas,
				  unsigned replicas_want)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_device *ja;
	struct bch_dev *ca;
	unsigned i;

	if (*replicas >= replicas_want)
		return;

	for (i = 0; i < devs_sorted->nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		/*
		 * Check that we can use this device, and aren't already using
		 * it:
		 */
		if (!ca->mi.durability ||
		    ca->mi.state != BCH_MEMBER_STATE_rw ||
		    !ja->nr ||
		    bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
		    sectors > ja->sectors_free)
			continue;

		bch2_dev_stripe_increment(ca, &j->wp.stripe);

		/* Write goes at the current fill point of the current bucket: */
		bch2_bkey_append_ptr(&w->key,
			(struct bch_extent_ptr) {
				  .offset = bucket_to_sector(ca,
					ja->buckets[ja->cur_idx]) +
					ca->mi.bucket_size -
					ja->sectors_free,
				  .dev = ca->dev_idx,
		});

		ja->sectors_free -= sectors;
		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);

		*replicas += ca->mi.durability;

		if (*replicas >= replicas_want)
			break;
	}
}

/**
 * journal_write_alloc - decide where to write next journal entry
 *
 * @j: journal object
 * @w: journal buf (entry to be written)
 *
 * Returns: 0 on success, or -EROFS on failure
 */
static int journal_write_alloc(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_devs_mask devs;
	struct journal_device *ja;
	struct bch_dev *ca;
	struct dev_alloc_list devs_sorted;
	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
	unsigned target = c->opts.metadata_target ?:
		c->opts.foreground_target;
	unsigned i, replicas = 0, replicas_want =
		READ_ONCE(c->opts.metadata_replicas);

	rcu_read_lock();
retry:
	devs = target_rw_devs(c, BCH_DATA_journal, target);

	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);

	/* First pass: use devices whose current bucket has room: */
	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);

	if (replicas >= replicas_want)
		goto done;

	/*
	 * Not enough replicas: advance devices that are out of room to their
	 * next discarded bucket, then try again:
	 */
	for (i = 0; i < devs_sorted.nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		if (sectors > ja->sectors_free &&
		    sectors <= ca->mi.bucket_size &&
		    bch2_journal_dev_buckets_available(j, ja,
					journal_space_discarded)) {
			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
			ja->sectors_free = ca->mi.bucket_size;

			/*
			 * ja->bucket_seq[ja->cur_idx] must always have
			 * something sensible:
			 */
			ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
		}
	}

	/* Second pass, after advancing to fresh buckets: */
	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);

	if (replicas < replicas_want && target) {
		/* Retry from all devices: */
		target = 0;
		goto retry;
	}
done:
	rcu_read_unlock();

	BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);

	return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}

/*
 * Grow the journal buffer to j->buf_size_want if needed; best-effort, silently
 * keeps the old buffer on allocation failure. Swap is done under j->lock.
 */
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
	/* we aren't holding j->lock: */
	unsigned new_size = READ_ONCE(j->buf_size_want);
	void *new_buf;

	if (buf->buf_size >= new_size)
		return;

	new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
	if (!new_buf)
		return;

	memcpy(new_buf, buf->data, buf->buf_size);

	spin_lock(&j->lock);
	swap(buf->data,		new_buf);
	swap(buf->buf_size,	new_size);
	spin_unlock(&j->lock);

	/* new_buf now points at the old buffer: */
	kvpfree(new_buf, new_size);
}

/* The buffer for the oldest journal entry not yet fully written out: */
static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
{
	return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
}

/*
 * Journal write completion: update on-disk sequence state, advance
 * unwritten_idx, and kick off the next write or schedule it. Note the
 * ordering comments below - several steps must precede signalling completion.
 */
static CLOSURE_CALLBACK(journal_write_done)
{
	closure_type(j, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct journal_buf *w = journal_last_unwritten_buf(j);
	struct bch_replicas_padded replicas;
	union journal_res_state old, new;
	u64 v, seq;
	int err = 0;

	bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
			       ? j->flush_write_time
			       : j->noflush_write_time, j->write_start_time);

	if (!w->devs_written.nr) {
		bch_err(c, "unable to write journal to sufficient devices");
		err = -EIO;
	} else {
		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
					 w->devs_written);
		if (bch2_mark_replicas(c, &replicas.e))
			err = -EIO;
	}

	if (err)
		bch2_fatal_error(c);

	spin_lock(&j->lock);
	seq = le64_to_cpu(w->data->seq);

	if (seq >= j->pin.front)
		journal_seq_pin(j, seq)->devs = w->devs_written;

	if (!err) {
		if (!JSET_NO_FLUSH(w->data)) {
			/* A flush write makes everything up to seq durable: */
			j->flushed_seq_ondisk = seq;
			j->last_seq_ondisk = w->last_seq;

			bch2_do_discards(c);
			closure_wake_up(&c->freelist_wait);

			bch2_reset_alloc_cursors(c);
		}
	} else if (!j->err_seq || seq < j->err_seq)
		j->err_seq	= seq;

	j->seq_ondisk = seq;

	/*
	 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
	 * more buckets:
	 *
	 * Must come before signaling write completion, for
	 * bch2_fs_journal_stop():
	 */
	if (j->watermark != BCH_WATERMARK_stripe)
		journal_reclaim_kick(&c->journal);

	/* also must come before signalling write completion: */
	closure_debug_destroy(cl);

	/* Atomically advance unwritten_idx past the completed buffer: */
	v = atomic64_read(&j->reservations.counter);
	do {
		old.v = new.v = v;
		BUG_ON(journal_state_count(new, new.unwritten_idx));

		new.unwritten_idx++;
	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
				       old.v, new.v)) != old.v);

	bch2_journal_reclaim_fast(j);
	bch2_journal_space_available(j);

	closure_wake_up(&w->wait);
	journal_wake(j);

	if (!journal_state_count(new, new.unwritten_idx) &&
	    journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
		/* Next buffer is already closed - write it immediately: */
		spin_unlock(&j->lock);
		closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
	} else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
		   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
		struct journal_buf *buf = journal_cur_buf(j);
		long delta = buf->expires - jiffies;

		/*
		 * We don't close a journal entry to write it while there's
		 * previous entries still in flight - the current journal entry
		 * might want to be written now:
		 */

		spin_unlock(&j->lock);
		mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
	} else {
		spin_unlock(&j->lock);
	}
}

/*
 * Per-device bio completion: on I/O error, drop this device from the set of
 * devices the entry was successfully written to.
 */
static void journal_write_endio(struct bio *bio)
{
	struct bch_dev *ca = bio->bi_private;
	struct journal *j = &ca->fs->journal;
	struct journal_buf *w = journal_last_unwritten_buf(j);
	unsigned long flags;

	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
			       "error writing journal entry %llu: %s",
			       le64_to_cpu(w->data->seq),
			       bch2_blk_status_to_str(bio->bi_status)) ||
	    bch2_meta_write_fault("journal")) {
		spin_lock_irqsave(&j->err_lock, flags);
		bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
		spin_unlock_irqrestore(&j->err_lock, flags);
	}

	closure_put(&j->io);
	percpu_ref_put(&ca->io_ref);
}

/*
 * Submit the journal write to every device the entry was allocated to
 * (one pointer per replica in w->key), then continue to journal_write_done.
 */
static CLOSURE_CALLBACK(do_journal_write)
{
	closure_type(j, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	struct journal_buf *w = journal_last_unwritten_buf(j);
	struct bch_extent_ptr *ptr;
	struct bio *bio;
	unsigned sectors = vstruct_sectors(w->data, c->block_bits);

	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
		ca = bch_dev_bkey_exists(c, ptr->dev);
		if (!percpu_ref_tryget(&ca->io_ref)) {
			/* XXX: fix this */
			bch_err(c, "missing device for journal write\n");
			continue;
		}

		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
			     sectors);

		bio = ca->journal.bio;
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1674 bio->bi_iter.bi_sector = ptr->offset; 1675 bio->bi_end_io = journal_write_endio; 1676 bio->bi_private = ca; 1677 1678 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1679 ca->prev_journal_sector = bio->bi_iter.bi_sector; 1680 1681 if (!JSET_NO_FLUSH(w->data)) 1682 bio->bi_opf |= REQ_FUA; 1683 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) 1684 bio->bi_opf |= REQ_PREFLUSH; 1685 1686 bch2_bio_map(bio, w->data, sectors << 9); 1687 1688 trace_and_count(c, journal_write, bio); 1689 closure_bio_submit(bio, cl); 1690 1691 ca->journal.bucket_seq[ca->journal.cur_idx] = 1692 le64_to_cpu(w->data->seq); 1693 } 1694 1695 continue_at(cl, journal_write_done, c->io_complete_wq); 1696 } 1697 1698 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) 1699 { 1700 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1701 struct jset_entry *start, *end, *i, *next, *prev = NULL; 1702 struct jset *jset = w->data; 1703 unsigned sectors, bytes, u64s; 1704 bool validate_before_checksum = false; 1705 unsigned long btree_roots_have = 0; 1706 int ret; 1707 1708 /* 1709 * Simple compaction, dropping empty jset_entries (from journal 1710 * reservations that weren't fully used) and merging jset_entries that 1711 * can be. 
1712 * 1713 * If we wanted to be really fancy here, we could sort all the keys in 1714 * the jset and drop keys that were overwritten - probably not worth it: 1715 */ 1716 vstruct_for_each_safe(jset, i, next) { 1717 unsigned u64s = le16_to_cpu(i->u64s); 1718 1719 /* Empty entry: */ 1720 if (!u64s) 1721 continue; 1722 1723 /* 1724 * New btree roots are set by journalling them; when the journal 1725 * entry gets written we have to propagate them to 1726 * c->btree_roots 1727 * 1728 * But, every journal entry we write has to contain all the 1729 * btree roots (at least for now); so after we copy btree roots 1730 * to c->btree_roots we have to get any missing btree roots and 1731 * add them to this journal entry: 1732 */ 1733 if (i->type == BCH_JSET_ENTRY_btree_root) { 1734 bch2_journal_entry_to_btree_root(c, i); 1735 __set_bit(i->btree_id, &btree_roots_have); 1736 } 1737 1738 /* Can we merge with previous entry? */ 1739 if (prev && 1740 i->btree_id == prev->btree_id && 1741 i->level == prev->level && 1742 i->type == prev->type && 1743 i->type == BCH_JSET_ENTRY_btree_keys && 1744 le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { 1745 memmove_u64s_down(vstruct_next(prev), 1746 i->_data, 1747 u64s); 1748 le16_add_cpu(&prev->u64s, u64s); 1749 continue; 1750 } 1751 1752 /* Couldn't merge, move i into new position (after prev): */ 1753 prev = prev ? vstruct_next(prev) : jset->start; 1754 if (i != prev) 1755 memmove_u64s_down(prev, i, jset_u64s(u64s)); 1756 } 1757 1758 prev = prev ? 
vstruct_next(prev) : jset->start; 1759 jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); 1760 1761 start = end = vstruct_last(jset); 1762 1763 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1764 1765 bch2_journal_super_entries_add_common(c, &end, 1766 le64_to_cpu(jset->seq)); 1767 u64s = (u64 *) end - (u64 *) start; 1768 BUG_ON(u64s > j->entry_u64s_reserved); 1769 1770 le32_add_cpu(&jset->u64s, u64s); 1771 1772 sectors = vstruct_sectors(jset, c->block_bits); 1773 bytes = vstruct_bytes(jset); 1774 1775 if (sectors > w->sectors) { 1776 bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 1777 vstruct_bytes(jset), w->sectors << 9, 1778 u64s, w->u64s_reserved, j->entry_u64s_reserved); 1779 return -EINVAL; 1780 } 1781 1782 jset->magic = cpu_to_le64(jset_magic(c)); 1783 jset->version = cpu_to_le32(c->sb.version); 1784 1785 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 1786 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1787 1788 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1789 j->last_empty_seq = le64_to_cpu(jset->seq); 1790 1791 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1792 validate_before_checksum = true; 1793 1794 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 1795 validate_before_checksum = true; 1796 1797 if (validate_before_checksum && 1798 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1799 return ret; 1800 1801 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), 1802 jset->encrypted_start, 1803 vstruct_end(jset) - (void *) jset->encrypted_start); 1804 if (bch2_fs_fatal_err_on(ret, c, 1805 "error decrypting journal entry: %i", ret)) 1806 return ret; 1807 1808 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), 1809 journal_nonce(jset), jset); 1810 1811 if (!validate_before_checksum && 1812 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1813 return ret; 1814 1815 memset((void *) jset + bytes, 0, (sectors << 9) - bytes); 
1816 return 0; 1817 } 1818 1819 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) 1820 { 1821 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1822 int error = bch2_journal_error(j); 1823 1824 /* 1825 * If the journal is in an error state - we did an emergency shutdown - 1826 * we prefer to continue doing journal writes. We just mark them as 1827 * noflush so they'll never be used, but they'll still be visible by the 1828 * list_journal tool - this helps in debugging. 1829 * 1830 * There's a caveat: the first journal write after marking the 1831 * superblock dirty must always be a flush write, because on startup 1832 * from a clean shutdown we didn't necessarily read the journal and the 1833 * new journal write might overwrite whatever was in the journal 1834 * previously - we can't leave the journal without any flush writes in 1835 * it. 1836 * 1837 * So if we're in an error state, and we're still starting up, we don't 1838 * write anything at all. 
1839 */ 1840 if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags)) 1841 return -EIO; 1842 1843 if (error || 1844 w->noflush || 1845 (!w->must_flush && 1846 (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && 1847 test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { 1848 w->noflush = true; 1849 SET_JSET_NO_FLUSH(w->data, true); 1850 w->data->last_seq = 0; 1851 w->last_seq = 0; 1852 1853 j->nr_noflush_writes++; 1854 } else { 1855 j->last_flush_write = jiffies; 1856 j->nr_flush_writes++; 1857 clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); 1858 } 1859 1860 return 0; 1861 } 1862 1863 CLOSURE_CALLBACK(bch2_journal_write) 1864 { 1865 closure_type(j, struct journal, io); 1866 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1867 struct bch_dev *ca; 1868 struct journal_buf *w = journal_last_unwritten_buf(j); 1869 struct bch_replicas_padded replicas; 1870 struct bio *bio; 1871 struct printbuf journal_debug_buf = PRINTBUF; 1872 unsigned i, nr_rw_members = 0; 1873 int ret; 1874 1875 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 1876 1877 j->write_start_time = local_clock(); 1878 1879 spin_lock(&j->lock); 1880 ret = bch2_journal_write_pick_flush(j, w); 1881 spin_unlock(&j->lock); 1882 if (ret) 1883 goto err; 1884 1885 journal_buf_realloc(j, w); 1886 1887 ret = bch2_journal_write_prep(j, w); 1888 if (ret) 1889 goto err; 1890 1891 while (1) { 1892 spin_lock(&j->lock); 1893 ret = journal_write_alloc(j, w); 1894 if (!ret || !j->can_discard) 1895 break; 1896 1897 spin_unlock(&j->lock); 1898 bch2_journal_do_discards(j); 1899 } 1900 1901 if (ret) { 1902 __bch2_journal_debug_to_text(&journal_debug_buf, j); 1903 spin_unlock(&j->lock); 1904 bch_err(c, "Unable to allocate journal write:\n%s", 1905 journal_debug_buf.buf); 1906 printbuf_exit(&journal_debug_buf); 1907 goto err; 1908 } 1909 1910 /* 1911 * write is allocated, no longer need to account for it in 1912 * bch2_journal_space_available(): 1913 */ 1914 w->sectors = 0; 1915 1916 /* 1917 * journal 
entry has been compacted and allocated, recalculate space 1918 * available: 1919 */ 1920 bch2_journal_space_available(j); 1921 spin_unlock(&j->lock); 1922 1923 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); 1924 1925 if (c->opts.nochanges) 1926 goto no_io; 1927 1928 for_each_rw_member(ca, c, i) 1929 nr_rw_members++; 1930 1931 if (nr_rw_members > 1) 1932 w->separate_flush = true; 1933 1934 /* 1935 * Mark journal replicas before we submit the write to guarantee 1936 * recovery will find the journal entries after a crash. 1937 */ 1938 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1939 w->devs_written); 1940 ret = bch2_mark_replicas(c, &replicas.e); 1941 if (ret) 1942 goto err; 1943 1944 if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { 1945 for_each_rw_member(ca, c, i) { 1946 percpu_ref_get(&ca->io_ref); 1947 1948 bio = ca->journal.bio; 1949 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); 1950 bio->bi_end_io = journal_write_endio; 1951 bio->bi_private = ca; 1952 closure_bio_submit(bio, cl); 1953 } 1954 } 1955 1956 continue_at(cl, do_journal_write, c->io_complete_wq); 1957 return; 1958 no_io: 1959 continue_at(cl, journal_write_done, c->io_complete_wq); 1960 return; 1961 err: 1962 bch2_fatal_error(c); 1963 continue_at(cl, journal_write_done, c->io_complete_wq); 1964 } 1965