1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "btree_io.h" 6 #include "btree_update_interior.h" 7 #include "buckets.h" 8 #include "checksum.h" 9 #include "disk_groups.h" 10 #include "error.h" 11 #include "journal.h" 12 #include "journal_io.h" 13 #include "journal_reclaim.h" 14 #include "journal_seq_blacklist.h" 15 #include "replicas.h" 16 #include "sb-clean.h" 17 #include "trace.h" 18 19 static struct nonce journal_nonce(const struct jset *jset) 20 { 21 return (struct nonce) {{ 22 [0] = 0, 23 [1] = ((__le32 *) &jset->seq)[0], 24 [2] = ((__le32 *) &jset->seq)[1], 25 [3] = BCH_NONCE_JOURNAL, 26 }}; 27 } 28 29 static bool jset_csum_good(struct bch_fs *c, struct jset *j) 30 { 31 return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && 32 !bch2_crc_cmp(j->csum, 33 csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); 34 } 35 36 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) 37 { 38 return (seq - c->journal_entries_base_seq) & (~0U >> 1); 39 } 40 41 static void __journal_replay_free(struct bch_fs *c, 42 struct journal_replay *i) 43 { 44 struct journal_replay **p = 45 genradix_ptr(&c->journal_entries, 46 journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); 47 48 BUG_ON(*p != i); 49 *p = NULL; 50 kvpfree(i, offsetof(struct journal_replay, j) + 51 vstruct_bytes(&i->j)); 52 } 53 54 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) 55 { 56 i->ignore = true; 57 58 if (!c->opts.read_entire_journal) 59 __journal_replay_free(c, i); 60 } 61 62 struct journal_list { 63 struct closure cl; 64 u64 last_seq; 65 struct mutex lock; 66 int ret; 67 }; 68 69 #define JOURNAL_ENTRY_ADD_OK 0 70 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 71 72 /* 73 * Given a journal entry we just read, add it to the list of journal entries to 74 * be replayed: 75 */ 76 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, 77 struct journal_ptr entry_ptr, 78 
struct journal_list *jlist, struct jset *j) 79 { 80 struct genradix_iter iter; 81 struct journal_replay **_i, *i, *dup; 82 struct journal_ptr *ptr; 83 size_t bytes = vstruct_bytes(j); 84 u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; 85 int ret = JOURNAL_ENTRY_ADD_OK; 86 87 /* Is this entry older than the range we need? */ 88 if (!c->opts.read_entire_journal && 89 le64_to_cpu(j->seq) < jlist->last_seq) 90 return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; 91 92 /* 93 * genradixes are indexed by a ulong, not a u64, so we can't index them 94 * by sequence number directly: Assume instead that they will all fall 95 * within the range of +-2billion of the filrst one we find. 96 */ 97 if (!c->journal_entries_base_seq) 98 c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); 99 100 /* Drop entries we don't need anymore */ 101 if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { 102 genradix_for_each_from(&c->journal_entries, iter, _i, 103 journal_entry_radix_idx(c, jlist->last_seq)) { 104 i = *_i; 105 106 if (!i || i->ignore) 107 continue; 108 109 if (le64_to_cpu(i->j.seq) >= last_seq) 110 break; 111 journal_replay_free(c, i); 112 } 113 } 114 115 jlist->last_seq = max(jlist->last_seq, last_seq); 116 117 _i = genradix_ptr_alloc(&c->journal_entries, 118 journal_entry_radix_idx(c, le64_to_cpu(j->seq)), 119 GFP_KERNEL); 120 if (!_i) 121 return -BCH_ERR_ENOMEM_journal_entry_add; 122 123 /* 124 * Duplicate journal entries? 
If so we want the one that didn't have a 125 * checksum error: 126 */ 127 dup = *_i; 128 if (dup) { 129 if (bytes == vstruct_bytes(&dup->j) && 130 !memcmp(j, &dup->j, bytes)) { 131 i = dup; 132 goto found; 133 } 134 135 if (!entry_ptr.csum_good) { 136 i = dup; 137 goto found; 138 } 139 140 if (!dup->csum_good) 141 goto replace; 142 143 fsck_err(c, journal_entry_replicas_data_mismatch, 144 "found duplicate but non identical journal entries (seq %llu)", 145 le64_to_cpu(j->seq)); 146 i = dup; 147 goto found; 148 } 149 replace: 150 i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 151 if (!i) 152 return -BCH_ERR_ENOMEM_journal_entry_add; 153 154 i->nr_ptrs = 0; 155 i->csum_good = entry_ptr.csum_good; 156 i->ignore = false; 157 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 158 i->ptrs[i->nr_ptrs++] = entry_ptr; 159 160 if (dup) { 161 if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { 162 bch_err(c, "found too many copies of journal entry %llu", 163 le64_to_cpu(i->j.seq)); 164 dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; 165 } 166 167 /* The first ptr should represent the jset we kept: */ 168 memcpy(i->ptrs + i->nr_ptrs, 169 dup->ptrs, 170 sizeof(dup->ptrs[0]) * dup->nr_ptrs); 171 i->nr_ptrs += dup->nr_ptrs; 172 __journal_replay_free(c, dup); 173 } 174 175 *_i = i; 176 return 0; 177 found: 178 for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { 179 if (ptr->dev == ca->dev_idx) { 180 bch_err(c, "duplicate journal entry %llu on same device", 181 le64_to_cpu(i->j.seq)); 182 goto out; 183 } 184 } 185 186 if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { 187 bch_err(c, "found too many copies of journal entry %llu", 188 le64_to_cpu(i->j.seq)); 189 goto out; 190 } 191 192 i->ptrs[i->nr_ptrs++] = entry_ptr; 193 out: 194 fsck_err: 195 return ret; 196 } 197 198 /* this fills in a range with empty jset_entries: */ 199 static void journal_entry_null_range(void *start, void *end) 200 { 201 struct jset_entry *entry; 202 203 for (entry = start; entry != 
end; entry = vstruct_next(entry)) 204 memset(entry, 0, sizeof(*entry)); 205 } 206 207 #define JOURNAL_ENTRY_REREAD 5 208 #define JOURNAL_ENTRY_NONE 6 209 #define JOURNAL_ENTRY_BAD 7 210 211 static void journal_entry_err_msg(struct printbuf *out, 212 u32 version, 213 struct jset *jset, 214 struct jset_entry *entry) 215 { 216 prt_str(out, "invalid journal entry, version="); 217 bch2_version_to_text(out, version); 218 219 if (entry) { 220 prt_str(out, " type="); 221 prt_str(out, bch2_jset_entry_types[entry->type]); 222 } 223 224 if (!jset) { 225 prt_printf(out, " in superblock"); 226 } else { 227 228 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); 229 230 if (entry) 231 prt_printf(out, " offset=%zi/%u", 232 (u64 *) entry - jset->_data, 233 le32_to_cpu(jset->u64s)); 234 } 235 236 prt_str(out, ": "); 237 } 238 239 #define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ 240 ({ \ 241 struct printbuf _buf = PRINTBUF; \ 242 \ 243 journal_entry_err_msg(&_buf, version, jset, entry); \ 244 prt_printf(&_buf, msg, ##__VA_ARGS__); \ 245 \ 246 switch (flags & BKEY_INVALID_WRITE) { \ 247 case READ: \ 248 mustfix_fsck_err(c, _err, "%s", _buf.buf); \ 249 break; \ 250 case WRITE: \ 251 bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ 252 bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ 253 if (bch2_fs_inconsistent(c)) { \ 254 ret = -BCH_ERR_fsck_errors_not_fixed; \ 255 goto fsck_err; \ 256 } \ 257 break; \ 258 } \ 259 \ 260 printbuf_exit(&_buf); \ 261 true; \ 262 }) 263 264 #define journal_entry_err_on(cond, ...) \ 265 ((cond) ? 
journal_entry_err(__VA_ARGS__) : false) 266 267 #define FSCK_DELETED_KEY 5 268 269 static int journal_validate_key(struct bch_fs *c, 270 struct jset *jset, 271 struct jset_entry *entry, 272 unsigned level, enum btree_id btree_id, 273 struct bkey_i *k, 274 unsigned version, int big_endian, 275 enum bkey_invalid_flags flags) 276 { 277 int write = flags & BKEY_INVALID_WRITE; 278 void *next = vstruct_next(entry); 279 struct printbuf buf = PRINTBUF; 280 int ret = 0; 281 282 if (journal_entry_err_on(!k->k.u64s, 283 c, version, jset, entry, 284 journal_entry_bkey_u64s_0, 285 "k->u64s 0")) { 286 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 287 journal_entry_null_range(vstruct_next(entry), next); 288 return FSCK_DELETED_KEY; 289 } 290 291 if (journal_entry_err_on((void *) bkey_next(k) > 292 (void *) vstruct_next(entry), 293 c, version, jset, entry, 294 journal_entry_bkey_past_end, 295 "extends past end of journal entry")) { 296 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 297 journal_entry_null_range(vstruct_next(entry), next); 298 return FSCK_DELETED_KEY; 299 } 300 301 if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, 302 c, version, jset, entry, 303 journal_entry_bkey_bad_format, 304 "bad format %u", k->k.format)) { 305 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 306 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 307 journal_entry_null_range(vstruct_next(entry), next); 308 return FSCK_DELETED_KEY; 309 } 310 311 if (!write) 312 bch2_bkey_compat(level, btree_id, version, big_endian, 313 write, NULL, bkey_to_packed(k)); 314 315 if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), 316 __btree_node_type(level, btree_id), write, &buf)) { 317 printbuf_reset(&buf); 318 journal_entry_err_msg(&buf, version, jset, entry); 319 prt_newline(&buf); 320 printbuf_indent_add(&buf, 2); 321 322 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); 323 prt_newline(&buf); 324 bch2_bkey_invalid(c, bkey_i_to_s_c(k), 325 __btree_node_type(level, btree_id), write, &buf); 
326 327 mustfix_fsck_err(c, journal_entry_bkey_invalid, 328 "%s", buf.buf); 329 330 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 331 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 332 journal_entry_null_range(vstruct_next(entry), next); 333 334 printbuf_exit(&buf); 335 return FSCK_DELETED_KEY; 336 } 337 338 if (write) 339 bch2_bkey_compat(level, btree_id, version, big_endian, 340 write, NULL, bkey_to_packed(k)); 341 fsck_err: 342 printbuf_exit(&buf); 343 return ret; 344 } 345 346 static int journal_entry_btree_keys_validate(struct bch_fs *c, 347 struct jset *jset, 348 struct jset_entry *entry, 349 unsigned version, int big_endian, 350 enum bkey_invalid_flags flags) 351 { 352 struct bkey_i *k = entry->start; 353 354 while (k != vstruct_last(entry)) { 355 int ret = journal_validate_key(c, jset, entry, 356 entry->level, 357 entry->btree_id, 358 k, version, big_endian, 359 flags|BKEY_INVALID_JOURNAL); 360 if (ret == FSCK_DELETED_KEY) 361 continue; 362 363 k = bkey_next(k); 364 } 365 366 return 0; 367 } 368 369 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 370 struct jset_entry *entry) 371 { 372 struct bkey_i *k; 373 bool first = true; 374 375 jset_entry_for_each_key(entry, k) { 376 if (!first) { 377 prt_newline(out); 378 prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); 379 } 380 prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); 381 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); 382 first = false; 383 } 384 } 385 386 static int journal_entry_btree_root_validate(struct bch_fs *c, 387 struct jset *jset, 388 struct jset_entry *entry, 389 unsigned version, int big_endian, 390 enum bkey_invalid_flags flags) 391 { 392 struct bkey_i *k = entry->start; 393 int ret = 0; 394 395 if (journal_entry_err_on(!entry->u64s || 396 le16_to_cpu(entry->u64s) != k->k.u64s, 397 c, version, jset, entry, 398 journal_entry_btree_root_bad_size, 399 "invalid btree root journal entry: wrong number of 
keys")) { 400 void *next = vstruct_next(entry); 401 /* 402 * we don't want to null out this jset_entry, 403 * just the contents, so that later we can tell 404 * we were _supposed_ to have a btree root 405 */ 406 entry->u64s = 0; 407 journal_entry_null_range(vstruct_next(entry), next); 408 return 0; 409 } 410 411 ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k, 412 version, big_endian, flags); 413 if (ret == FSCK_DELETED_KEY) 414 ret = 0; 415 fsck_err: 416 return ret; 417 } 418 419 static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, 420 struct jset_entry *entry) 421 { 422 journal_entry_btree_keys_to_text(out, c, entry); 423 } 424 425 static int journal_entry_prio_ptrs_validate(struct bch_fs *c, 426 struct jset *jset, 427 struct jset_entry *entry, 428 unsigned version, int big_endian, 429 enum bkey_invalid_flags flags) 430 { 431 /* obsolete, don't care: */ 432 return 0; 433 } 434 435 static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 436 struct jset_entry *entry) 437 { 438 } 439 440 static int journal_entry_blacklist_validate(struct bch_fs *c, 441 struct jset *jset, 442 struct jset_entry *entry, 443 unsigned version, int big_endian, 444 enum bkey_invalid_flags flags) 445 { 446 int ret = 0; 447 448 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, 449 c, version, jset, entry, 450 journal_entry_blacklist_bad_size, 451 "invalid journal seq blacklist entry: bad size")) { 452 journal_entry_null_range(entry, vstruct_next(entry)); 453 } 454 fsck_err: 455 return ret; 456 } 457 458 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, 459 struct jset_entry *entry) 460 { 461 struct jset_entry_blacklist *bl = 462 container_of(entry, struct jset_entry_blacklist, entry); 463 464 prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); 465 } 466 467 static int journal_entry_blacklist_v2_validate(struct bch_fs *c, 468 struct jset *jset, 469 struct jset_entry *entry, 470 
unsigned version, int big_endian, 471 enum bkey_invalid_flags flags) 472 { 473 struct jset_entry_blacklist_v2 *bl_entry; 474 int ret = 0; 475 476 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, 477 c, version, jset, entry, 478 journal_entry_blacklist_v2_bad_size, 479 "invalid journal seq blacklist entry: bad size")) { 480 journal_entry_null_range(entry, vstruct_next(entry)); 481 goto out; 482 } 483 484 bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); 485 486 if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > 487 le64_to_cpu(bl_entry->end), 488 c, version, jset, entry, 489 journal_entry_blacklist_v2_start_past_end, 490 "invalid journal seq blacklist entry: start > end")) { 491 journal_entry_null_range(entry, vstruct_next(entry)); 492 } 493 out: 494 fsck_err: 495 return ret; 496 } 497 498 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, 499 struct jset_entry *entry) 500 { 501 struct jset_entry_blacklist_v2 *bl = 502 container_of(entry, struct jset_entry_blacklist_v2, entry); 503 504 prt_printf(out, "start=%llu end=%llu", 505 le64_to_cpu(bl->start), 506 le64_to_cpu(bl->end)); 507 } 508 509 static int journal_entry_usage_validate(struct bch_fs *c, 510 struct jset *jset, 511 struct jset_entry *entry, 512 unsigned version, int big_endian, 513 enum bkey_invalid_flags flags) 514 { 515 struct jset_entry_usage *u = 516 container_of(entry, struct jset_entry_usage, entry); 517 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 518 int ret = 0; 519 520 if (journal_entry_err_on(bytes < sizeof(*u), 521 c, version, jset, entry, 522 journal_entry_usage_bad_size, 523 "invalid journal entry usage: bad size")) { 524 journal_entry_null_range(entry, vstruct_next(entry)); 525 return ret; 526 } 527 528 fsck_err: 529 return ret; 530 } 531 532 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, 533 struct jset_entry *entry) 534 { 535 struct jset_entry_usage *u = 536 
container_of(entry, struct jset_entry_usage, entry); 537 538 prt_printf(out, "type=%s v=%llu", 539 bch2_fs_usage_types[u->entry.btree_id], 540 le64_to_cpu(u->v)); 541 } 542 543 static int journal_entry_data_usage_validate(struct bch_fs *c, 544 struct jset *jset, 545 struct jset_entry *entry, 546 unsigned version, int big_endian, 547 enum bkey_invalid_flags flags) 548 { 549 struct jset_entry_data_usage *u = 550 container_of(entry, struct jset_entry_data_usage, entry); 551 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 552 struct printbuf err = PRINTBUF; 553 int ret = 0; 554 555 if (journal_entry_err_on(bytes < sizeof(*u) || 556 bytes < sizeof(*u) + u->r.nr_devs, 557 c, version, jset, entry, 558 journal_entry_data_usage_bad_size, 559 "invalid journal entry usage: bad size")) { 560 journal_entry_null_range(entry, vstruct_next(entry)); 561 goto out; 562 } 563 564 if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err), 565 c, version, jset, entry, 566 journal_entry_data_usage_bad_size, 567 "invalid journal entry usage: %s", err.buf)) { 568 journal_entry_null_range(entry, vstruct_next(entry)); 569 goto out; 570 } 571 out: 572 fsck_err: 573 printbuf_exit(&err); 574 return ret; 575 } 576 577 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, 578 struct jset_entry *entry) 579 { 580 struct jset_entry_data_usage *u = 581 container_of(entry, struct jset_entry_data_usage, entry); 582 583 bch2_replicas_entry_to_text(out, &u->r); 584 prt_printf(out, "=%llu", le64_to_cpu(u->v)); 585 } 586 587 static int journal_entry_clock_validate(struct bch_fs *c, 588 struct jset *jset, 589 struct jset_entry *entry, 590 unsigned version, int big_endian, 591 enum bkey_invalid_flags flags) 592 { 593 struct jset_entry_clock *clock = 594 container_of(entry, struct jset_entry_clock, entry); 595 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 596 int ret = 0; 597 598 if (journal_entry_err_on(bytes != 
sizeof(*clock), 599 c, version, jset, entry, 600 journal_entry_clock_bad_size, 601 "bad size")) { 602 journal_entry_null_range(entry, vstruct_next(entry)); 603 return ret; 604 } 605 606 if (journal_entry_err_on(clock->rw > 1, 607 c, version, jset, entry, 608 journal_entry_clock_bad_rw, 609 "bad rw")) { 610 journal_entry_null_range(entry, vstruct_next(entry)); 611 return ret; 612 } 613 614 fsck_err: 615 return ret; 616 } 617 618 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, 619 struct jset_entry *entry) 620 { 621 struct jset_entry_clock *clock = 622 container_of(entry, struct jset_entry_clock, entry); 623 624 prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); 625 } 626 627 static int journal_entry_dev_usage_validate(struct bch_fs *c, 628 struct jset *jset, 629 struct jset_entry *entry, 630 unsigned version, int big_endian, 631 enum bkey_invalid_flags flags) 632 { 633 struct jset_entry_dev_usage *u = 634 container_of(entry, struct jset_entry_dev_usage, entry); 635 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 636 unsigned expected = sizeof(*u); 637 unsigned dev; 638 int ret = 0; 639 640 if (journal_entry_err_on(bytes < expected, 641 c, version, jset, entry, 642 journal_entry_dev_usage_bad_size, 643 "bad size (%u < %u)", 644 bytes, expected)) { 645 journal_entry_null_range(entry, vstruct_next(entry)); 646 return ret; 647 } 648 649 dev = le32_to_cpu(u->dev); 650 651 if (journal_entry_err_on(!bch2_dev_exists2(c, dev), 652 c, version, jset, entry, 653 journal_entry_dev_usage_bad_dev, 654 "bad dev")) { 655 journal_entry_null_range(entry, vstruct_next(entry)); 656 return ret; 657 } 658 659 if (journal_entry_err_on(u->pad, 660 c, version, jset, entry, 661 journal_entry_dev_usage_bad_pad, 662 "bad pad")) { 663 journal_entry_null_range(entry, vstruct_next(entry)); 664 return ret; 665 } 666 667 fsck_err: 668 return ret; 669 } 670 671 static void journal_entry_dev_usage_to_text(struct 
printbuf *out, struct bch_fs *c, 672 struct jset_entry *entry) 673 { 674 struct jset_entry_dev_usage *u = 675 container_of(entry, struct jset_entry_dev_usage, entry); 676 unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); 677 678 prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); 679 680 for (i = 0; i < nr_types; i++) { 681 if (i < BCH_DATA_NR) 682 prt_printf(out, " %s", bch2_data_types[i]); 683 else 684 prt_printf(out, " (unknown data type %u)", i); 685 prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", 686 le64_to_cpu(u->d[i].buckets), 687 le64_to_cpu(u->d[i].sectors), 688 le64_to_cpu(u->d[i].fragmented)); 689 } 690 691 prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); 692 } 693 694 static int journal_entry_log_validate(struct bch_fs *c, 695 struct jset *jset, 696 struct jset_entry *entry, 697 unsigned version, int big_endian, 698 enum bkey_invalid_flags flags) 699 { 700 return 0; 701 } 702 703 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 704 struct jset_entry *entry) 705 { 706 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 707 unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); 708 709 prt_printf(out, "%.*s", bytes, l->d); 710 } 711 712 static int journal_entry_overwrite_validate(struct bch_fs *c, 713 struct jset *jset, 714 struct jset_entry *entry, 715 unsigned version, int big_endian, 716 enum bkey_invalid_flags flags) 717 { 718 return journal_entry_btree_keys_validate(c, jset, entry, 719 version, big_endian, READ); 720 } 721 722 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 723 struct jset_entry *entry) 724 { 725 journal_entry_btree_keys_to_text(out, c, entry); 726 } 727 728 struct jset_entry_ops { 729 int (*validate)(struct bch_fs *, struct jset *, 730 struct jset_entry *, unsigned, int, 731 enum bkey_invalid_flags); 732 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 733 }; 734 
735 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 736 #define x(f, nr) \ 737 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 738 .validate = journal_entry_##f##_validate, \ 739 .to_text = journal_entry_##f##_to_text, \ 740 }, 741 BCH_JSET_ENTRY_TYPES() 742 #undef x 743 }; 744 745 int bch2_journal_entry_validate(struct bch_fs *c, 746 struct jset *jset, 747 struct jset_entry *entry, 748 unsigned version, int big_endian, 749 enum bkey_invalid_flags flags) 750 { 751 return entry->type < BCH_JSET_ENTRY_NR 752 ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, 753 version, big_endian, flags) 754 : 0; 755 } 756 757 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, 758 struct jset_entry *entry) 759 { 760 if (entry->type < BCH_JSET_ENTRY_NR) { 761 prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); 762 bch2_jset_entry_ops[entry->type].to_text(out, c, entry); 763 } else { 764 prt_printf(out, "(unknown type %u)", entry->type); 765 } 766 } 767 768 static int jset_validate_entries(struct bch_fs *c, struct jset *jset, 769 enum bkey_invalid_flags flags) 770 { 771 struct jset_entry *entry; 772 unsigned version = le32_to_cpu(jset->version); 773 int ret = 0; 774 775 vstruct_for_each(jset, entry) { 776 if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), 777 c, version, jset, entry, 778 journal_entry_past_jset_end, 779 "journal entry extends past end of jset")) { 780 jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); 781 break; 782 } 783 784 ret = bch2_journal_entry_validate(c, jset, entry, 785 version, JSET_BIG_ENDIAN(jset), flags); 786 if (ret) 787 break; 788 } 789 fsck_err: 790 return ret; 791 } 792 793 static int jset_validate(struct bch_fs *c, 794 struct bch_dev *ca, 795 struct jset *jset, u64 sector, 796 enum bkey_invalid_flags flags) 797 { 798 unsigned version; 799 int ret = 0; 800 801 if (le64_to_cpu(jset->magic) != jset_magic(c)) 802 return JOURNAL_ENTRY_NONE; 803 804 version = 
le32_to_cpu(jset->version); 805 if (journal_entry_err_on(!bch2_version_compatible(version), 806 c, version, jset, NULL, 807 jset_unsupported_version, 808 "%s sector %llu seq %llu: incompatible journal entry version %u.%u", 809 ca ? ca->name : c->name, 810 sector, le64_to_cpu(jset->seq), 811 BCH_VERSION_MAJOR(version), 812 BCH_VERSION_MINOR(version))) { 813 /* don't try to continue: */ 814 return -EINVAL; 815 } 816 817 if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), 818 c, version, jset, NULL, 819 jset_unknown_csum, 820 "%s sector %llu seq %llu: journal entry with unknown csum type %llu", 821 ca ? ca->name : c->name, 822 sector, le64_to_cpu(jset->seq), 823 JSET_CSUM_TYPE(jset))) 824 ret = JOURNAL_ENTRY_BAD; 825 826 /* last_seq is ignored when JSET_NO_FLUSH is true */ 827 if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && 828 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), 829 c, version, jset, NULL, 830 jset_last_seq_newer_than_seq, 831 "invalid journal entry: last_seq > seq (%llu > %llu)", 832 le64_to_cpu(jset->last_seq), 833 le64_to_cpu(jset->seq))) { 834 jset->last_seq = jset->seq; 835 return JOURNAL_ENTRY_BAD; 836 } 837 838 ret = jset_validate_entries(c, jset, flags); 839 fsck_err: 840 return ret; 841 } 842 843 static int jset_validate_early(struct bch_fs *c, 844 struct bch_dev *ca, 845 struct jset *jset, u64 sector, 846 unsigned bucket_sectors_left, 847 unsigned sectors_read) 848 { 849 size_t bytes = vstruct_bytes(jset); 850 unsigned version; 851 enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; 852 int ret = 0; 853 854 if (le64_to_cpu(jset->magic) != jset_magic(c)) 855 return JOURNAL_ENTRY_NONE; 856 857 version = le32_to_cpu(jset->version); 858 if (journal_entry_err_on(!bch2_version_compatible(version), 859 c, version, jset, NULL, 860 jset_unsupported_version, 861 "%s sector %llu seq %llu: unknown journal entry version %u.%u", 862 ca ? 
ca->name : c->name, 863 sector, le64_to_cpu(jset->seq), 864 BCH_VERSION_MAJOR(version), 865 BCH_VERSION_MINOR(version))) { 866 /* don't try to continue: */ 867 return -EINVAL; 868 } 869 870 if (bytes > (sectors_read << 9) && 871 sectors_read < bucket_sectors_left) 872 return JOURNAL_ENTRY_REREAD; 873 874 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 875 c, version, jset, NULL, 876 jset_past_bucket_end, 877 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 878 ca ? ca->name : c->name, 879 sector, le64_to_cpu(jset->seq), bytes)) 880 le32_add_cpu(&jset->u64s, 881 -((bytes - (bucket_sectors_left << 9)) / 8)); 882 fsck_err: 883 return ret; 884 } 885 886 struct journal_read_buf { 887 void *data; 888 size_t size; 889 }; 890 891 static int journal_read_buf_realloc(struct journal_read_buf *b, 892 size_t new_size) 893 { 894 void *n; 895 896 /* the bios are sized for this many pages, max: */ 897 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 898 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 899 900 new_size = roundup_pow_of_two(new_size); 901 n = kvpmalloc(new_size, GFP_KERNEL); 902 if (!n) 903 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 904 905 kvpfree(b->data, b->size); 906 b->data = n; 907 b->size = new_size; 908 return 0; 909 } 910 911 static int journal_read_bucket(struct bch_dev *ca, 912 struct journal_read_buf *buf, 913 struct journal_list *jlist, 914 unsigned bucket) 915 { 916 struct bch_fs *c = ca->fs; 917 struct journal_device *ja = &ca->journal; 918 struct jset *j = NULL; 919 unsigned sectors, sectors_read = 0; 920 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 921 end = offset + ca->mi.bucket_size; 922 bool saw_bad = false, csum_good; 923 int ret = 0; 924 925 pr_debug("reading %u", bucket); 926 927 while (offset < end) { 928 if (!sectors_read) { 929 struct bio *bio; 930 unsigned nr_bvecs; 931 reread: 932 sectors_read = min_t(unsigned, 933 end - offset, buf->size >> 9); 934 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 935 
936 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 937 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 938 939 bio->bi_iter.bi_sector = offset; 940 bch2_bio_map(bio, buf->data, sectors_read << 9); 941 942 ret = submit_bio_wait(bio); 943 kfree(bio); 944 945 if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, 946 "journal read error: sector %llu", 947 offset) || 948 bch2_meta_read_fault("journal")) { 949 /* 950 * We don't error out of the recovery process 951 * here, since the relevant journal entry may be 952 * found on a different device, and missing or 953 * no journal entries will be handled later 954 */ 955 return 0; 956 } 957 958 j = buf->data; 959 } 960 961 ret = jset_validate_early(c, ca, j, offset, 962 end - offset, sectors_read); 963 switch (ret) { 964 case 0: 965 sectors = vstruct_sectors(j, c->block_bits); 966 break; 967 case JOURNAL_ENTRY_REREAD: 968 if (vstruct_bytes(j) > buf->size) { 969 ret = journal_read_buf_realloc(buf, 970 vstruct_bytes(j)); 971 if (ret) 972 return ret; 973 } 974 goto reread; 975 case JOURNAL_ENTRY_NONE: 976 if (!saw_bad) 977 return 0; 978 /* 979 * On checksum error we don't really trust the size 980 * field of the journal entry we read, so try reading 981 * again at next block boundary: 982 */ 983 sectors = block_sectors(c); 984 goto next_block; 985 default: 986 return ret; 987 } 988 989 /* 990 * This happens sometimes if we don't have discards on - 991 * when we've partially overwritten a bucket with new 992 * journal entries. 
We don't need the rest of the 993 * bucket: 994 */ 995 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 996 return 0; 997 998 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 999 1000 csum_good = jset_csum_good(c, j); 1001 if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, 1002 "journal checksum error")) 1003 saw_bad = true; 1004 1005 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 1006 j->encrypted_start, 1007 vstruct_end(j) - (void *) j->encrypted_start); 1008 bch2_fs_fatal_err_on(ret, c, 1009 "error decrypting journal entry: %i", ret); 1010 1011 mutex_lock(&jlist->lock); 1012 ret = journal_entry_add(c, ca, (struct journal_ptr) { 1013 .csum_good = csum_good, 1014 .dev = ca->dev_idx, 1015 .bucket = bucket, 1016 .bucket_offset = offset - 1017 bucket_to_sector(ca, ja->buckets[bucket]), 1018 .sector = offset, 1019 }, jlist, j); 1020 mutex_unlock(&jlist->lock); 1021 1022 switch (ret) { 1023 case JOURNAL_ENTRY_ADD_OK: 1024 break; 1025 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 1026 break; 1027 default: 1028 return ret; 1029 } 1030 next_block: 1031 pr_debug("next"); 1032 offset += sectors; 1033 sectors_read -= sectors; 1034 j = ((void *) j) + (sectors << 9); 1035 } 1036 1037 return 0; 1038 } 1039 1040 static CLOSURE_CALLBACK(bch2_journal_read_device) 1041 { 1042 closure_type(ja, struct journal_device, read); 1043 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1044 struct bch_fs *c = ca->fs; 1045 struct journal_list *jlist = 1046 container_of(cl->parent, struct journal_list, cl); 1047 struct journal_replay *r, **_r; 1048 struct genradix_iter iter; 1049 struct journal_read_buf buf = { NULL, 0 }; 1050 unsigned i; 1051 int ret = 0; 1052 1053 if (!ja->nr) 1054 goto out; 1055 1056 ret = journal_read_buf_realloc(&buf, PAGE_SIZE); 1057 if (ret) 1058 goto err; 1059 1060 pr_debug("%u journal buckets", ja->nr); 1061 1062 for (i = 0; i < ja->nr; i++) { 1063 ret = journal_read_bucket(ca, &buf, jlist, i); 1064 if (ret) 1065 goto err; 1066 } 1067 
1068 ja->sectors_free = ca->mi.bucket_size; 1069 1070 mutex_lock(&jlist->lock); 1071 genradix_for_each_reverse(&c->journal_entries, iter, _r) { 1072 r = *_r; 1073 1074 if (!r) 1075 continue; 1076 1077 for (i = 0; i < r->nr_ptrs; i++) { 1078 if (r->ptrs[i].dev == ca->dev_idx) { 1079 unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + 1080 vstruct_sectors(&r->j, c->block_bits); 1081 1082 ja->cur_idx = r->ptrs[i].bucket; 1083 ja->sectors_free = ca->mi.bucket_size - wrote; 1084 goto found; 1085 } 1086 } 1087 } 1088 found: 1089 mutex_unlock(&jlist->lock); 1090 1091 if (ja->bucket_seq[ja->cur_idx] && 1092 ja->sectors_free == ca->mi.bucket_size) { 1093 #if 0 1094 /* 1095 * Debug code for ZNS support, where we (probably) want to be 1096 * correlated where we stopped in the journal to the zone write 1097 * points: 1098 */ 1099 bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); 1100 bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); 1101 for (i = 0; i < 3; i++) { 1102 unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; 1103 1104 bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); 1105 } 1106 #endif 1107 ja->sectors_free = 0; 1108 } 1109 1110 /* 1111 * Set dirty_idx to indicate the entire journal is full and needs to be 1112 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1113 * pinned when it first runs: 1114 */ 1115 ja->discard_idx = ja->dirty_idx_ondisk = 1116 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1117 out: 1118 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1119 kvpfree(buf.data, buf.size); 1120 percpu_ref_put(&ca->io_ref); 1121 closure_return(cl); 1122 return; 1123 err: 1124 mutex_lock(&jlist->lock); 1125 jlist->ret = ret; 1126 mutex_unlock(&jlist->lock); 1127 goto out; 1128 } 1129 1130 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 1131 struct journal_replay *j) 1132 { 1133 unsigned i; 1134 1135 for (i = 0; i < j->nr_ptrs; i++) { 1136 struct bch_dev *ca = 
bch_dev_bkey_exists(c, j->ptrs[i].dev); 1137 u64 offset; 1138 1139 div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); 1140 1141 if (i) 1142 prt_printf(out, " "); 1143 prt_printf(out, "%u:%u:%u (sector %llu)", 1144 j->ptrs[i].dev, 1145 j->ptrs[i].bucket, 1146 j->ptrs[i].bucket_offset, 1147 j->ptrs[i].sector); 1148 } 1149 } 1150 1151 int bch2_journal_read(struct bch_fs *c, 1152 u64 *last_seq, 1153 u64 *blacklist_seq, 1154 u64 *start_seq) 1155 { 1156 struct journal_list jlist; 1157 struct journal_replay *i, **_i, *prev = NULL; 1158 struct genradix_iter radix_iter; 1159 struct bch_dev *ca; 1160 unsigned iter; 1161 struct printbuf buf = PRINTBUF; 1162 bool degraded = false, last_write_torn = false; 1163 u64 seq; 1164 int ret = 0; 1165 1166 closure_init_stack(&jlist.cl); 1167 mutex_init(&jlist.lock); 1168 jlist.last_seq = 0; 1169 jlist.ret = 0; 1170 1171 for_each_member_device(ca, c, iter) { 1172 if (!c->opts.fsck && 1173 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1174 continue; 1175 1176 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1177 ca->mi.state == BCH_MEMBER_STATE_ro) && 1178 percpu_ref_tryget(&ca->io_ref)) 1179 closure_call(&ca->journal.read, 1180 bch2_journal_read_device, 1181 system_unbound_wq, 1182 &jlist.cl); 1183 else 1184 degraded = true; 1185 } 1186 1187 closure_sync(&jlist.cl); 1188 1189 if (jlist.ret) 1190 return jlist.ret; 1191 1192 *last_seq = 0; 1193 *start_seq = 0; 1194 *blacklist_seq = 0; 1195 1196 /* 1197 * Find most recent flush entry, and ignore newer non flush entries - 1198 * those entries will be blacklisted: 1199 */ 1200 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1201 enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; 1202 1203 i = *_i; 1204 1205 if (!i || i->ignore) 1206 continue; 1207 1208 if (!*start_seq) 1209 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1210 1211 if (JSET_NO_FLUSH(&i->j)) { 1212 i->ignore = true; 1213 continue; 1214 } 1215 1216 if (!last_write_torn && 
!i->csum_good) { 1217 last_write_torn = true; 1218 i->ignore = true; 1219 continue; 1220 } 1221 1222 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1223 c, le32_to_cpu(i->j.version), &i->j, NULL, 1224 jset_last_seq_newer_than_seq, 1225 "invalid journal entry: last_seq > seq (%llu > %llu)", 1226 le64_to_cpu(i->j.last_seq), 1227 le64_to_cpu(i->j.seq))) 1228 i->j.last_seq = i->j.seq; 1229 1230 *last_seq = le64_to_cpu(i->j.last_seq); 1231 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1232 break; 1233 } 1234 1235 if (!*start_seq) { 1236 bch_info(c, "journal read done, but no entries found"); 1237 return 0; 1238 } 1239 1240 if (!*last_seq) { 1241 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, 1242 "journal read done, but no entries found after dropping non-flushes"); 1243 return 0; 1244 } 1245 1246 bch_info(c, "journal read done, replaying entries %llu-%llu", 1247 *last_seq, *blacklist_seq - 1); 1248 1249 if (*start_seq != *blacklist_seq) 1250 bch_info(c, "dropped unflushed entries %llu-%llu", 1251 *blacklist_seq, *start_seq - 1); 1252 1253 /* Drop blacklisted entries and entries older than last_seq: */ 1254 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1255 i = *_i; 1256 1257 if (!i || i->ignore) 1258 continue; 1259 1260 seq = le64_to_cpu(i->j.seq); 1261 if (seq < *last_seq) { 1262 journal_replay_free(c, i); 1263 continue; 1264 } 1265 1266 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1267 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1268 jset_seq_blacklisted, 1269 "found blacklisted journal entry %llu", seq); 1270 i->ignore = true; 1271 } 1272 } 1273 1274 /* Check for missing entries: */ 1275 seq = *last_seq; 1276 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1277 i = *_i; 1278 1279 if (!i || i->ignore) 1280 continue; 1281 1282 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1283 1284 while (seq < le64_to_cpu(i->j.seq)) { 1285 u64 missing_start, missing_end; 1286 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 
1287 1288 while (seq < le64_to_cpu(i->j.seq) && 1289 bch2_journal_seq_is_blacklisted(c, seq, false)) 1290 seq++; 1291 1292 if (seq == le64_to_cpu(i->j.seq)) 1293 break; 1294 1295 missing_start = seq; 1296 1297 while (seq < le64_to_cpu(i->j.seq) && 1298 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1299 seq++; 1300 1301 if (prev) { 1302 bch2_journal_ptrs_to_text(&buf1, c, prev); 1303 prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1304 } else 1305 prt_printf(&buf1, "(none)"); 1306 bch2_journal_ptrs_to_text(&buf2, c, i); 1307 1308 missing_end = seq - 1; 1309 fsck_err(c, journal_entries_missing, 1310 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" 1311 " prev at %s\n" 1312 " next at %s", 1313 missing_start, missing_end, 1314 *last_seq, *blacklist_seq - 1, 1315 buf1.buf, buf2.buf); 1316 1317 printbuf_exit(&buf1); 1318 printbuf_exit(&buf2); 1319 } 1320 1321 prev = i; 1322 seq++; 1323 } 1324 1325 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1326 struct bch_replicas_padded replicas = { 1327 .e.data_type = BCH_DATA_journal, 1328 .e.nr_required = 1, 1329 }; 1330 unsigned ptr; 1331 1332 i = *_i; 1333 if (!i || i->ignore) 1334 continue; 1335 1336 for (ptr = 0; ptr < i->nr_ptrs; ptr++) { 1337 ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); 1338 1339 if (!i->ptrs[ptr].csum_good) 1340 bch_err_dev_offset(ca, i->ptrs[ptr].sector, 1341 "invalid journal checksum, seq %llu%s", 1342 le64_to_cpu(i->j.seq), 1343 i->csum_good ? 
" (had good copy on another device)" : ""); 1344 } 1345 1346 ret = jset_validate(c, 1347 bch_dev_bkey_exists(c, i->ptrs[0].dev), 1348 &i->j, 1349 i->ptrs[0].sector, 1350 READ); 1351 if (ret) 1352 goto err; 1353 1354 for (ptr = 0; ptr < i->nr_ptrs; ptr++) 1355 replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; 1356 1357 bch2_replicas_entry_sort(&replicas.e); 1358 1359 printbuf_reset(&buf); 1360 bch2_replicas_entry_to_text(&buf, &replicas.e); 1361 1362 if (!degraded && 1363 !bch2_replicas_marked(c, &replicas.e) && 1364 (le64_to_cpu(i->j.seq) == *last_seq || 1365 fsck_err(c, journal_entry_replicas_not_marked, 1366 "superblock not marked as containing replicas for journal entry %llu\n %s", 1367 le64_to_cpu(i->j.seq), buf.buf))) { 1368 ret = bch2_mark_replicas(c, &replicas.e); 1369 if (ret) 1370 goto err; 1371 } 1372 } 1373 err: 1374 fsck_err: 1375 printbuf_exit(&buf); 1376 return ret; 1377 } 1378 1379 /* journal write: */ 1380 1381 static void __journal_write_alloc(struct journal *j, 1382 struct journal_buf *w, 1383 struct dev_alloc_list *devs_sorted, 1384 unsigned sectors, 1385 unsigned *replicas, 1386 unsigned replicas_want) 1387 { 1388 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1389 struct journal_device *ja; 1390 struct bch_dev *ca; 1391 unsigned i; 1392 1393 if (*replicas >= replicas_want) 1394 return; 1395 1396 for (i = 0; i < devs_sorted->nr; i++) { 1397 ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); 1398 if (!ca) 1399 continue; 1400 1401 ja = &ca->journal; 1402 1403 /* 1404 * Check that we can use this device, and aren't already using 1405 * it: 1406 */ 1407 if (!ca->mi.durability || 1408 ca->mi.state != BCH_MEMBER_STATE_rw || 1409 !ja->nr || 1410 bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1411 sectors > ja->sectors_free) 1412 continue; 1413 1414 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1415 1416 bch2_bkey_append_ptr(&w->key, 1417 (struct bch_extent_ptr) { 1418 .offset = bucket_to_sector(ca, 1419 
/**
 * journal_write_alloc - decide where to write next journal entry
 *
 * @j:		journal object
 * @w:		journal buf (entry to be written)
 *
 * Tries to place @w on enough devices to reach c->opts.metadata_replicas,
 * first in the current journal bucket of each candidate device, then after
 * advancing devices whose current bucket is too full, and finally - if a
 * target was configured and was not sufficient - again with all devices.
 *
 * The result is judged against c->opts.metadata_replicas_required, so the
 * call can succeed with fewer replicas than were wanted.
 *
 * The caller visible in this file, bch2_journal_write(), calls this with
 * j->lock held.
 *
 * Returns: 0 on success, or -EROFS on failure
 */
static int journal_write_alloc(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_devs_mask devs;
	struct journal_device *ja;
	struct bch_dev *ca;
	struct dev_alloc_list devs_sorted;
	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
	/* Prefer the metadata target; fall back to the foreground target */
	unsigned target = c->opts.metadata_target ?:
		c->opts.foreground_target;
	unsigned i, replicas = 0, replicas_want =
		READ_ONCE(c->opts.metadata_replicas);

	/* c->devs[] is dereferenced under RCU below */
	rcu_read_lock();
retry:
	devs = target_rw_devs(c, BCH_DATA_journal, target);

	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);

	/* First attempt: use whatever room the current buckets have */
	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);

	if (replicas >= replicas_want)
		goto done;

	/*
	 * Not enough replicas yet: on each device where the entry does not fit
	 * in the current bucket but would fit in an empty one, move on to the
	 * next bucket if one is available.
	 */
	for (i = 0; i < devs_sorted.nr; i++) {
		ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
		if (!ca)
			continue;

		ja = &ca->journal;

		if (sectors > ja->sectors_free &&
		    sectors <= ca->mi.bucket_size &&
		    bch2_journal_dev_buckets_available(j, ja,
					journal_space_discarded)) {
			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
			ja->sectors_free = ca->mi.bucket_size;

			/*
			 * ja->bucket_seq[ja->cur_idx] must always have
			 * something sensible:
			 */
			ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
		}
	}

	/* Second attempt, now that some devices have a fresh bucket */
	__journal_write_alloc(j, w, &devs_sorted,
			      sectors, &replicas, replicas_want);

	if (replicas < replicas_want && target) {
		/* Retry from all devices: */
		target = 0;
		goto retry;
	}
done:
	rcu_read_unlock();

	BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);

	/* Success is measured against the required, not the wanted, count */
	return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
}
j->flush_write_time 1546 : j->noflush_write_time, j->write_start_time); 1547 1548 if (!w->devs_written.nr) { 1549 bch_err(c, "unable to write journal to sufficient devices"); 1550 err = -EIO; 1551 } else { 1552 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1553 w->devs_written); 1554 if (bch2_mark_replicas(c, &replicas.e)) 1555 err = -EIO; 1556 } 1557 1558 if (err) 1559 bch2_fatal_error(c); 1560 1561 spin_lock(&j->lock); 1562 seq = le64_to_cpu(w->data->seq); 1563 1564 if (seq >= j->pin.front) 1565 journal_seq_pin(j, seq)->devs = w->devs_written; 1566 1567 if (!err) { 1568 if (!JSET_NO_FLUSH(w->data)) { 1569 j->flushed_seq_ondisk = seq; 1570 j->last_seq_ondisk = w->last_seq; 1571 1572 bch2_do_discards(c); 1573 closure_wake_up(&c->freelist_wait); 1574 1575 bch2_reset_alloc_cursors(c); 1576 } 1577 } else if (!j->err_seq || seq < j->err_seq) 1578 j->err_seq = seq; 1579 1580 j->seq_ondisk = seq; 1581 1582 /* 1583 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1584 * more buckets: 1585 * 1586 * Must come before signaling write completion, for 1587 * bch2_fs_journal_stop(): 1588 */ 1589 if (j->watermark != BCH_WATERMARK_stripe) 1590 journal_reclaim_kick(&c->journal); 1591 1592 /* also must come before signalling write completion: */ 1593 closure_debug_destroy(cl); 1594 1595 v = atomic64_read(&j->reservations.counter); 1596 do { 1597 old.v = new.v = v; 1598 BUG_ON(journal_state_count(new, new.unwritten_idx)); 1599 1600 new.unwritten_idx++; 1601 } while ((v = atomic64_cmpxchg(&j->reservations.counter, 1602 old.v, new.v)) != old.v); 1603 1604 bch2_journal_reclaim_fast(j); 1605 bch2_journal_space_available(j); 1606 1607 closure_wake_up(&w->wait); 1608 journal_wake(j); 1609 1610 if (!journal_state_count(new, new.unwritten_idx) && 1611 journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { 1612 spin_unlock(&j->lock); 1613 closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); 1614 } else if (journal_last_unwritten_seq(j) == 
journal_cur_seq(j) && 1615 new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1616 struct journal_buf *buf = journal_cur_buf(j); 1617 long delta = buf->expires - jiffies; 1618 1619 /* 1620 * We don't close a journal entry to write it while there's 1621 * previous entries still in flight - the current journal entry 1622 * might want to be written now: 1623 */ 1624 1625 spin_unlock(&j->lock); 1626 mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); 1627 } else { 1628 spin_unlock(&j->lock); 1629 } 1630 } 1631 1632 static void journal_write_endio(struct bio *bio) 1633 { 1634 struct bch_dev *ca = bio->bi_private; 1635 struct journal *j = &ca->fs->journal; 1636 struct journal_buf *w = journal_last_unwritten_buf(j); 1637 unsigned long flags; 1638 1639 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, 1640 "error writing journal entry %llu: %s", 1641 le64_to_cpu(w->data->seq), 1642 bch2_blk_status_to_str(bio->bi_status)) || 1643 bch2_meta_write_fault("journal")) { 1644 spin_lock_irqsave(&j->err_lock, flags); 1645 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); 1646 spin_unlock_irqrestore(&j->err_lock, flags); 1647 } 1648 1649 closure_put(&j->io); 1650 percpu_ref_put(&ca->io_ref); 1651 } 1652 1653 static CLOSURE_CALLBACK(do_journal_write) 1654 { 1655 closure_type(j, struct journal, io); 1656 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1657 struct bch_dev *ca; 1658 struct journal_buf *w = journal_last_unwritten_buf(j); 1659 struct bch_extent_ptr *ptr; 1660 struct bio *bio; 1661 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1662 1663 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { 1664 ca = bch_dev_bkey_exists(c, ptr->dev); 1665 if (!percpu_ref_tryget(&ca->io_ref)) { 1666 /* XXX: fix this */ 1667 bch_err(c, "missing device for journal write\n"); 1668 continue; 1669 } 1670 1671 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], 1672 sectors); 1673 1674 bio = ca->journal.bio; 1675 
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1676 bio->bi_iter.bi_sector = ptr->offset; 1677 bio->bi_end_io = journal_write_endio; 1678 bio->bi_private = ca; 1679 1680 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1681 ca->prev_journal_sector = bio->bi_iter.bi_sector; 1682 1683 if (!JSET_NO_FLUSH(w->data)) 1684 bio->bi_opf |= REQ_FUA; 1685 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) 1686 bio->bi_opf |= REQ_PREFLUSH; 1687 1688 bch2_bio_map(bio, w->data, sectors << 9); 1689 1690 trace_and_count(c, journal_write, bio); 1691 closure_bio_submit(bio, cl); 1692 1693 ca->journal.bucket_seq[ca->journal.cur_idx] = 1694 le64_to_cpu(w->data->seq); 1695 } 1696 1697 continue_at(cl, journal_write_done, c->io_complete_wq); 1698 } 1699 1700 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) 1701 { 1702 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1703 struct jset_entry *start, *end, *i, *next, *prev = NULL; 1704 struct jset *jset = w->data; 1705 unsigned sectors, bytes, u64s; 1706 bool validate_before_checksum = false; 1707 unsigned long btree_roots_have = 0; 1708 int ret; 1709 1710 /* 1711 * Simple compaction, dropping empty jset_entries (from journal 1712 * reservations that weren't fully used) and merging jset_entries that 1713 * can be. 
1714 * 1715 * If we wanted to be really fancy here, we could sort all the keys in 1716 * the jset and drop keys that were overwritten - probably not worth it: 1717 */ 1718 vstruct_for_each_safe(jset, i, next) { 1719 unsigned u64s = le16_to_cpu(i->u64s); 1720 1721 /* Empty entry: */ 1722 if (!u64s) 1723 continue; 1724 1725 /* 1726 * New btree roots are set by journalling them; when the journal 1727 * entry gets written we have to propagate them to 1728 * c->btree_roots 1729 * 1730 * But, every journal entry we write has to contain all the 1731 * btree roots (at least for now); so after we copy btree roots 1732 * to c->btree_roots we have to get any missing btree roots and 1733 * add them to this journal entry: 1734 */ 1735 if (i->type == BCH_JSET_ENTRY_btree_root) { 1736 bch2_journal_entry_to_btree_root(c, i); 1737 __set_bit(i->btree_id, &btree_roots_have); 1738 } 1739 1740 /* Can we merge with previous entry? */ 1741 if (prev && 1742 i->btree_id == prev->btree_id && 1743 i->level == prev->level && 1744 i->type == prev->type && 1745 i->type == BCH_JSET_ENTRY_btree_keys && 1746 le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { 1747 memmove_u64s_down(vstruct_next(prev), 1748 i->_data, 1749 u64s); 1750 le16_add_cpu(&prev->u64s, u64s); 1751 continue; 1752 } 1753 1754 /* Couldn't merge, move i into new position (after prev): */ 1755 prev = prev ? vstruct_next(prev) : jset->start; 1756 if (i != prev) 1757 memmove_u64s_down(prev, i, jset_u64s(u64s)); 1758 } 1759 1760 prev = prev ? 
vstruct_next(prev) : jset->start; 1761 jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); 1762 1763 start = end = vstruct_last(jset); 1764 1765 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1766 1767 bch2_journal_super_entries_add_common(c, &end, 1768 le64_to_cpu(jset->seq)); 1769 u64s = (u64 *) end - (u64 *) start; 1770 BUG_ON(u64s > j->entry_u64s_reserved); 1771 1772 le32_add_cpu(&jset->u64s, u64s); 1773 1774 sectors = vstruct_sectors(jset, c->block_bits); 1775 bytes = vstruct_bytes(jset); 1776 1777 if (sectors > w->sectors) { 1778 bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 1779 vstruct_bytes(jset), w->sectors << 9, 1780 u64s, w->u64s_reserved, j->entry_u64s_reserved); 1781 return -EINVAL; 1782 } 1783 1784 jset->magic = cpu_to_le64(jset_magic(c)); 1785 jset->version = cpu_to_le32(c->sb.version); 1786 1787 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 1788 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1789 1790 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1791 j->last_empty_seq = le64_to_cpu(jset->seq); 1792 1793 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1794 validate_before_checksum = true; 1795 1796 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 1797 validate_before_checksum = true; 1798 1799 if (validate_before_checksum && 1800 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1801 return ret; 1802 1803 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), 1804 jset->encrypted_start, 1805 vstruct_end(jset) - (void *) jset->encrypted_start); 1806 if (bch2_fs_fatal_err_on(ret, c, 1807 "error decrypting journal entry: %i", ret)) 1808 return ret; 1809 1810 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), 1811 journal_nonce(jset), jset); 1812 1813 if (!validate_before_checksum && 1814 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1815 return ret; 1816 1817 memset((void *) jset + bytes, 0, (sectors << 9) - bytes); 
1818 return 0; 1819 } 1820 1821 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) 1822 { 1823 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1824 int error = bch2_journal_error(j); 1825 1826 /* 1827 * If the journal is in an error state - we did an emergency shutdown - 1828 * we prefer to continue doing journal writes. We just mark them as 1829 * noflush so they'll never be used, but they'll still be visible by the 1830 * list_journal tool - this helps in debugging. 1831 * 1832 * There's a caveat: the first journal write after marking the 1833 * superblock dirty must always be a flush write, because on startup 1834 * from a clean shutdown we didn't necessarily read the journal and the 1835 * new journal write might overwrite whatever was in the journal 1836 * previously - we can't leave the journal without any flush writes in 1837 * it. 1838 * 1839 * So if we're in an error state, and we're still starting up, we don't 1840 * write anything at all. 
/*
 * Closure callback that writes out the oldest unwritten journal buffer.
 *
 * Steps, in order: choose flush vs. noflush, grow the buffer if a larger size
 * is wanted, prepare the entry (compaction, encryption, checksum), allocate
 * space on devices, mark the replicas in use, optionally submit separate flush
 * bios, then continue with do_journal_write().
 *
 * With c->opts.nochanges set no I/O is issued and the closure continues
 * straight to journal_write_done().  On any error the filesystem is flagged
 * with bch2_fatal_error() and the closure likewise continues to
 * journal_write_done().
 */
CLOSURE_CALLBACK(bch2_journal_write)
{
	closure_type(j, struct journal, io);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_dev *ca;
	struct journal_buf *w = journal_last_unwritten_buf(j);
	struct bch_replicas_padded replicas;
	struct bio *bio;
	struct printbuf journal_debug_buf = PRINTBUF;
	unsigned i, nr_rw_members = 0;
	int ret;

	/* Journal writes must not happen while the superblock is marked clean */
	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));

	/* Consumed by journal_write_done() for the write time statistics */
	j->write_start_time = local_clock();

	spin_lock(&j->lock);
	ret = bch2_journal_write_pick_flush(j, w);
	spin_unlock(&j->lock);
	if (ret)
		goto err;

	journal_buf_realloc(j, w);

	ret = bch2_journal_write_prep(j, w);
	if (ret)
		goto err;

	/*
	 * Allocate space for the write; on failure, run discards and try again
	 * for as long as j->can_discard is set.  Note that the loop is left
	 * with j->lock still held.
	 */
	while (1) {
		spin_lock(&j->lock);
		ret = journal_write_alloc(j, w);
		if (!ret || !j->can_discard)
			break;

		spin_unlock(&j->lock);
		bch2_journal_do_discards(j);
	}

	if (ret) {
		/* Capture the journal state while still holding j->lock */
		__bch2_journal_debug_to_text(&journal_debug_buf, j);
		spin_unlock(&j->lock);
		bch_err(c, "Unable to allocate journal write:\n%s",
			journal_debug_buf.buf);
		printbuf_exit(&journal_debug_buf);
		goto err;
	}

	/*
	 * write is allocated, no longer need to account for it in
	 * bch2_journal_space_available():
	 */
	w->sectors = 0;

	/*
	 * journal entry has been compacted and allocated, recalculate space
	 * available:
	 */
	bch2_journal_space_available(j);
	spin_unlock(&j->lock);

	/* Start from every device in the key; failed devices are dropped at endio */
	w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));

	if (c->opts.nochanges)
		goto no_io;

	for_each_rw_member(ca, c, i)
		nr_rw_members++;

	/*
	 * With more than one rw member, flushes are issued as separate bios to
	 * every rw member (below); do_journal_write() then leaves REQ_PREFLUSH
	 * off the data write.
	 */
	if (nr_rw_members > 1)
		w->separate_flush = true;

	/*
	 * Mark journal replicas before we submit the write to guarantee
	 * recovery will find the journal entries after a crash.
	 */
	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
				 w->devs_written);
	ret = bch2_mark_replicas(c, &replicas.e);
	if (ret)
		goto err;

	if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
		for_each_rw_member(ca, c, i) {
			/* Dropped again by journal_write_endio() */
			percpu_ref_get(&ca->io_ref);

			bio = ca->journal.bio;
			bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
			bio->bi_end_io = journal_write_endio;
			bio->bi_private = ca;
			closure_bio_submit(bio, cl);
		}
	}

	continue_at(cl, do_journal_write, c->io_complete_wq);
	return;
no_io:
	continue_at(cl, journal_write_done, c->io_complete_wq);
	return;
err:
	bch2_fatal_error(c);
	continue_at(cl, journal_write_done, c->io_complete_wq);
}