1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "btree_io.h" 6 #include "btree_update_interior.h" 7 #include "btree_write_buffer.h" 8 #include "buckets.h" 9 #include "checksum.h" 10 #include "disk_groups.h" 11 #include "error.h" 12 #include "journal.h" 13 #include "journal_io.h" 14 #include "journal_reclaim.h" 15 #include "journal_seq_blacklist.h" 16 #include "replicas.h" 17 #include "sb-clean.h" 18 #include "trace.h" 19 20 static struct nonce journal_nonce(const struct jset *jset) 21 { 22 return (struct nonce) {{ 23 [0] = 0, 24 [1] = ((__le32 *) &jset->seq)[0], 25 [2] = ((__le32 *) &jset->seq)[1], 26 [3] = BCH_NONCE_JOURNAL, 27 }}; 28 } 29 30 static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) 31 { 32 if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { 33 *csum = (struct bch_csum) {}; 34 return false; 35 } 36 37 *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); 38 return !bch2_crc_cmp(j->csum, *csum); 39 } 40 41 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) 42 { 43 return (seq - c->journal_entries_base_seq) & (~0U >> 1); 44 } 45 46 static void __journal_replay_free(struct bch_fs *c, 47 struct journal_replay *i) 48 { 49 struct journal_replay **p = 50 genradix_ptr(&c->journal_entries, 51 journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); 52 53 BUG_ON(*p != i); 54 *p = NULL; 55 kvpfree(i, offsetof(struct journal_replay, j) + 56 vstruct_bytes(&i->j)); 57 } 58 59 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) 60 { 61 i->ignore = true; 62 63 if (!c->opts.read_entire_journal) 64 __journal_replay_free(c, i); 65 } 66 67 struct journal_list { 68 struct closure cl; 69 u64 last_seq; 70 struct mutex lock; 71 int ret; 72 }; 73 74 #define JOURNAL_ENTRY_ADD_OK 0 75 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 76 77 /* 78 * Given a journal entry we just read, add it to the list of journal entries to 79 * be replayed: 80 */ 81 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, 82 struct journal_ptr entry_ptr, 83 struct journal_list *jlist, struct jset *j) 84 { 85 struct genradix_iter iter; 86 struct journal_replay **_i, *i, *dup; 87 struct journal_ptr *ptr; 88 size_t bytes = vstruct_bytes(j); 89 u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; 90 int ret = JOURNAL_ENTRY_ADD_OK; 91 92 /* Is this entry older than the range we need? */ 93 if (!c->opts.read_entire_journal && 94 le64_to_cpu(j->seq) < jlist->last_seq) 95 return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; 96 97 /* 98 * genradixes are indexed by a ulong, not a u64, so we can't index them 99 * by sequence number directly: Assume instead that they will all fall 100 * within the range of +-2billion of the filrst one we find. 101 */ 102 if (!c->journal_entries_base_seq) 103 c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); 104 105 /* Drop entries we don't need anymore */ 106 if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { 107 genradix_for_each_from(&c->journal_entries, iter, _i, 108 journal_entry_radix_idx(c, jlist->last_seq)) { 109 i = *_i; 110 111 if (!i || i->ignore) 112 continue; 113 114 if (le64_to_cpu(i->j.seq) >= last_seq) 115 break; 116 journal_replay_free(c, i); 117 } 118 } 119 120 jlist->last_seq = max(jlist->last_seq, last_seq); 121 122 _i = genradix_ptr_alloc(&c->journal_entries, 123 journal_entry_radix_idx(c, le64_to_cpu(j->seq)), 124 GFP_KERNEL); 125 if (!_i) 126 return -BCH_ERR_ENOMEM_journal_entry_add; 127 128 /* 129 * Duplicate journal entries? If so we want the one that didn't have a 130 * checksum error: 131 */ 132 dup = *_i; 133 if (dup) { 134 if (bytes == vstruct_bytes(&dup->j) && 135 !memcmp(j, &dup->j, bytes)) { 136 i = dup; 137 goto found; 138 } 139 140 if (!entry_ptr.csum_good) { 141 i = dup; 142 goto found; 143 } 144 145 if (!dup->csum_good) 146 goto replace; 147 148 fsck_err(c, journal_entry_replicas_data_mismatch, 149 "found duplicate but non identical journal entries (seq %llu)", 150 le64_to_cpu(j->seq)); 151 i = dup; 152 goto found; 153 } 154 replace: 155 i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 156 if (!i) 157 return -BCH_ERR_ENOMEM_journal_entry_add; 158 159 i->nr_ptrs = 0; 160 i->csum_good = entry_ptr.csum_good; 161 i->ignore = false; 162 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 163 i->ptrs[i->nr_ptrs++] = entry_ptr; 164 165 if (dup) { 166 if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { 167 bch_err(c, "found too many copies of journal entry %llu", 168 le64_to_cpu(i->j.seq)); 169 dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; 170 } 171 172 /* The first ptr should represent the jset we kept: */ 173 memcpy(i->ptrs + i->nr_ptrs, 174 dup->ptrs, 175 sizeof(dup->ptrs[0]) * dup->nr_ptrs); 176 i->nr_ptrs += dup->nr_ptrs; 177 __journal_replay_free(c, dup); 178 } 179 180 *_i = i; 181 return 0; 182 found: 183 for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { 184 if (ptr->dev == ca->dev_idx) { 185 bch_err(c, "duplicate journal entry %llu on same device", 186 le64_to_cpu(i->j.seq)); 187 goto out; 188 } 189 } 190 191 if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { 192 bch_err(c, "found too many copies of journal entry %llu", 193 le64_to_cpu(i->j.seq)); 194 goto out; 195 } 196 197 i->ptrs[i->nr_ptrs++] = entry_ptr; 198 out: 199 fsck_err: 200 return ret; 201 } 202 203 /* this fills in a range with empty jset_entries: */ 204 static void journal_entry_null_range(void *start, void *end) 205 { 206 struct jset_entry *entry; 207 208 for (entry = start; entry != end; entry = vstruct_next(entry)) 209 memset(entry, 0, sizeof(*entry)); 210 } 211 212 #define JOURNAL_ENTRY_REREAD 5 213 #define JOURNAL_ENTRY_NONE 6 214 #define JOURNAL_ENTRY_BAD 7 215 216 static void journal_entry_err_msg(struct printbuf *out, 217 u32 version, 218 struct jset *jset, 219 struct jset_entry *entry) 220 { 221 prt_str(out, "invalid journal entry, version="); 222 bch2_version_to_text(out, version); 223 224 if (entry) { 225 prt_str(out, " type="); 226 prt_str(out, bch2_jset_entry_types[entry->type]); 227 } 228 229 if (!jset) { 230 prt_printf(out, " in superblock"); 231 } else { 232 233 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); 234 235 if (entry) 236 prt_printf(out, " offset=%zi/%u", 237 (u64 *) entry - jset->_data, 238 le32_to_cpu(jset->u64s)); 239 } 240 241 prt_str(out, ": "); 242 } 243 244 #define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ 245 ({ \ 246 struct printbuf _buf = PRINTBUF; \ 247 \ 248 journal_entry_err_msg(&_buf, version, jset, entry); \ 249 prt_printf(&_buf, msg, ##__VA_ARGS__); \ 250 \ 251 switch (flags & BKEY_INVALID_WRITE) { \ 252 case READ: \ 253 mustfix_fsck_err(c, _err, "%s", _buf.buf); \ 254 break; \ 255 case WRITE: \ 256 bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ 257 bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ 258 if (bch2_fs_inconsistent(c)) { \ 259 ret = -BCH_ERR_fsck_errors_not_fixed; \ 260 goto fsck_err; \ 261 } \ 262 break; \ 263 } \ 264 \ 265 printbuf_exit(&_buf); \ 266 true; \ 267 }) 268 269 #define journal_entry_err_on(cond, ...) \ 270 ((cond) ? journal_entry_err(__VA_ARGS__) : false) 271 272 #define FSCK_DELETED_KEY 5 273 274 static int journal_validate_key(struct bch_fs *c, 275 struct jset *jset, 276 struct jset_entry *entry, 277 unsigned level, enum btree_id btree_id, 278 struct bkey_i *k, 279 unsigned version, int big_endian, 280 enum bkey_invalid_flags flags) 281 { 282 int write = flags & BKEY_INVALID_WRITE; 283 void *next = vstruct_next(entry); 284 struct printbuf buf = PRINTBUF; 285 int ret = 0; 286 287 if (journal_entry_err_on(!k->k.u64s, 288 c, version, jset, entry, 289 journal_entry_bkey_u64s_0, 290 "k->u64s 0")) { 291 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 292 journal_entry_null_range(vstruct_next(entry), next); 293 return FSCK_DELETED_KEY; 294 } 295 296 if (journal_entry_err_on((void *) bkey_next(k) > 297 (void *) vstruct_next(entry), 298 c, version, jset, entry, 299 journal_entry_bkey_past_end, 300 "extends past end of journal entry")) { 301 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 302 journal_entry_null_range(vstruct_next(entry), next); 303 return FSCK_DELETED_KEY; 304 } 305 306 if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, 307 c, version, jset, entry, 308 journal_entry_bkey_bad_format, 309 "bad format %u", k->k.format)) { 310 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 311 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 312 journal_entry_null_range(vstruct_next(entry), next); 313 return FSCK_DELETED_KEY; 314 } 315 316 if (!write) 317 bch2_bkey_compat(level, btree_id, version, big_endian, 318 write, NULL, bkey_to_packed(k)); 319 320 if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), 321 __btree_node_type(level, btree_id), write, &buf)) { 322 printbuf_reset(&buf); 323 journal_entry_err_msg(&buf, version, jset, entry); 324 prt_newline(&buf); 325 printbuf_indent_add(&buf, 2); 326 327 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); 328 prt_newline(&buf); 329 bch2_bkey_invalid(c, bkey_i_to_s_c(k), 330 __btree_node_type(level, btree_id), write, &buf); 331 332 mustfix_fsck_err(c, journal_entry_bkey_invalid, 333 "%s", buf.buf); 334 335 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 336 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 337 journal_entry_null_range(vstruct_next(entry), next); 338 339 printbuf_exit(&buf); 340 return FSCK_DELETED_KEY; 341 } 342 343 if (write) 344 bch2_bkey_compat(level, btree_id, version, big_endian, 345 write, NULL, bkey_to_packed(k)); 346 fsck_err: 347 printbuf_exit(&buf); 348 return ret; 349 } 350 351 static int journal_entry_btree_keys_validate(struct bch_fs *c, 352 struct jset *jset, 353 struct jset_entry *entry, 354 unsigned version, int big_endian, 355 enum bkey_invalid_flags flags) 356 { 357 struct bkey_i *k = entry->start; 358 359 while (k != vstruct_last(entry)) { 360 int ret = journal_validate_key(c, jset, entry, 361 entry->level, 362 entry->btree_id, 363 k, version, big_endian, 364 flags|BKEY_INVALID_JOURNAL); 365 if (ret == FSCK_DELETED_KEY) 366 continue; 367 368 k = bkey_next(k); 369 } 370 371 return 0; 372 } 373 374 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 375 struct jset_entry *entry) 376 { 377 struct bkey_i *k; 378 bool first = true; 379 380 jset_entry_for_each_key(entry, k) { 381 if (!first) { 382 prt_newline(out); 383 prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); 384 } 385 prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); 386 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); 387 first = false; 388 } 389 } 390 391 static int journal_entry_btree_root_validate(struct bch_fs *c, 392 struct jset *jset, 393 struct jset_entry *entry, 394 unsigned version, int big_endian, 395 enum bkey_invalid_flags flags) 396 { 397 struct bkey_i *k = entry->start; 398 int ret = 0; 399 400 if (journal_entry_err_on(!entry->u64s || 401 le16_to_cpu(entry->u64s) != k->k.u64s, 402 c, version, jset, entry, 403 journal_entry_btree_root_bad_size, 404 "invalid btree root journal entry: wrong number of keys")) { 405 void *next = vstruct_next(entry); 406 /* 407 * we don't want to null out this jset_entry, 408 * just the contents, so that later we can tell 409 * we were _supposed_ to have a btree root 410 */ 411 entry->u64s = 0; 412 journal_entry_null_range(vstruct_next(entry), next); 413 return 0; 414 } 415 416 ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k, 417 version, big_endian, flags); 418 if (ret == FSCK_DELETED_KEY) 419 ret = 0; 420 fsck_err: 421 return ret; 422 } 423 424 static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, 425 struct jset_entry *entry) 426 { 427 journal_entry_btree_keys_to_text(out, c, entry); 428 } 429 430 static int journal_entry_prio_ptrs_validate(struct bch_fs *c, 431 struct jset *jset, 432 struct jset_entry *entry, 433 unsigned version, int big_endian, 434 enum bkey_invalid_flags flags) 435 { 436 /* obsolete, don't care: */ 437 return 0; 438 } 439 440 static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 441 struct jset_entry *entry) 442 { 443 } 444 445 static int journal_entry_blacklist_validate(struct bch_fs *c, 446 struct jset *jset, 447 struct jset_entry *entry, 448 unsigned version, int big_endian, 449 enum bkey_invalid_flags flags) 450 { 451 int ret = 0; 452 453 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, 454 c, version, jset, entry, 455 journal_entry_blacklist_bad_size, 456 "invalid journal seq blacklist entry: bad size")) { 457 journal_entry_null_range(entry, vstruct_next(entry)); 458 } 459 fsck_err: 460 return ret; 461 } 462 463 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, 464 struct jset_entry *entry) 465 { 466 struct jset_entry_blacklist *bl = 467 container_of(entry, struct jset_entry_blacklist, entry); 468 469 prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); 470 } 471 472 static int journal_entry_blacklist_v2_validate(struct bch_fs *c, 473 struct jset *jset, 474 struct jset_entry *entry, 475 unsigned version, int big_endian, 476 enum bkey_invalid_flags flags) 477 { 478 struct jset_entry_blacklist_v2 *bl_entry; 479 int ret = 0; 480 481 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, 482 c, version, jset, entry, 483 journal_entry_blacklist_v2_bad_size, 484 "invalid journal seq blacklist entry: bad size")) { 485 journal_entry_null_range(entry, vstruct_next(entry)); 486 goto out; 487 } 488 489 bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); 490 491 if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > 492 le64_to_cpu(bl_entry->end), 493 c, version, jset, entry, 494 journal_entry_blacklist_v2_start_past_end, 495 "invalid journal seq blacklist entry: start > end")) { 496 journal_entry_null_range(entry, vstruct_next(entry)); 497 } 498 out: 499 fsck_err: 500 return ret; 501 } 502 503 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, 504 struct jset_entry *entry) 505 { 506 struct jset_entry_blacklist_v2 *bl = 507 container_of(entry, struct jset_entry_blacklist_v2, entry); 508 509 prt_printf(out, "start=%llu end=%llu", 510 le64_to_cpu(bl->start), 511 le64_to_cpu(bl->end)); 512 } 513 514 static int journal_entry_usage_validate(struct bch_fs *c, 515 struct jset *jset, 516 struct jset_entry *entry, 517 unsigned version, int big_endian, 518 enum bkey_invalid_flags flags) 519 { 520 struct jset_entry_usage *u = 521 container_of(entry, struct jset_entry_usage, entry); 522 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 523 int ret = 0; 524 525 if (journal_entry_err_on(bytes < sizeof(*u), 526 c, version, jset, entry, 527 journal_entry_usage_bad_size, 528 "invalid journal entry usage: bad size")) { 529 journal_entry_null_range(entry, vstruct_next(entry)); 530 return ret; 531 } 532 533 fsck_err: 534 return ret; 535 } 536 537 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, 538 struct jset_entry *entry) 539 { 540 struct jset_entry_usage *u = 541 container_of(entry, struct jset_entry_usage, entry); 542 543 prt_printf(out, "type=%s v=%llu", 544 bch2_fs_usage_types[u->entry.btree_id], 545 le64_to_cpu(u->v)); 546 } 547 548 static int journal_entry_data_usage_validate(struct bch_fs *c, 549 struct jset *jset, 550 struct jset_entry *entry, 551 unsigned version, int big_endian, 552 enum bkey_invalid_flags flags) 553 { 554 struct jset_entry_data_usage *u = 555 container_of(entry, struct jset_entry_data_usage, entry); 556 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 557 struct printbuf err = PRINTBUF; 558 int ret = 0; 559 560 if (journal_entry_err_on(bytes < sizeof(*u) || 561 bytes < sizeof(*u) + u->r.nr_devs, 562 c, version, jset, entry, 563 journal_entry_data_usage_bad_size, 564 "invalid journal entry usage: bad size")) { 565 journal_entry_null_range(entry, vstruct_next(entry)); 566 goto out; 567 } 568 569 if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err), 570 c, version, jset, entry, 571 journal_entry_data_usage_bad_size, 572 "invalid journal entry usage: %s", err.buf)) { 573 journal_entry_null_range(entry, vstruct_next(entry)); 574 goto out; 575 } 576 out: 577 fsck_err: 578 printbuf_exit(&err); 579 return ret; 580 } 581 582 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, 583 struct jset_entry *entry) 584 { 585 struct jset_entry_data_usage *u = 586 container_of(entry, struct jset_entry_data_usage, entry); 587 588 bch2_replicas_entry_to_text(out, &u->r); 589 prt_printf(out, "=%llu", le64_to_cpu(u->v)); 590 } 591 592 static int journal_entry_clock_validate(struct bch_fs *c, 593 struct jset *jset, 594 struct jset_entry *entry, 595 unsigned version, int big_endian, 596 enum bkey_invalid_flags flags) 597 { 598 struct jset_entry_clock *clock = 599 container_of(entry, struct jset_entry_clock, entry); 600 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 601 int ret = 0; 602 603 if (journal_entry_err_on(bytes != sizeof(*clock), 604 c, version, jset, entry, 605 journal_entry_clock_bad_size, 606 "bad size")) { 607 journal_entry_null_range(entry, vstruct_next(entry)); 608 return ret; 609 } 610 611 if (journal_entry_err_on(clock->rw > 1, 612 c, version, jset, entry, 613 journal_entry_clock_bad_rw, 614 "bad rw")) { 615 journal_entry_null_range(entry, vstruct_next(entry)); 616 return ret; 617 } 618 619 fsck_err: 620 return ret; 621 } 622 623 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, 624 struct jset_entry *entry) 625 { 626 struct jset_entry_clock *clock = 627 container_of(entry, struct jset_entry_clock, entry); 628 629 prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); 630 } 631 632 static int journal_entry_dev_usage_validate(struct bch_fs *c, 633 struct jset *jset, 634 struct jset_entry *entry, 635 unsigned version, int big_endian, 636 enum bkey_invalid_flags flags) 637 { 638 struct jset_entry_dev_usage *u = 639 container_of(entry, struct jset_entry_dev_usage, entry); 640 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 641 unsigned expected = sizeof(*u); 642 unsigned dev; 643 int ret = 0; 644 645 if (journal_entry_err_on(bytes < expected, 646 c, version, jset, entry, 647 journal_entry_dev_usage_bad_size, 648 "bad size (%u < %u)", 649 bytes, expected)) { 650 journal_entry_null_range(entry, vstruct_next(entry)); 651 return ret; 652 } 653 654 dev = le32_to_cpu(u->dev); 655 656 if (journal_entry_err_on(!bch2_dev_exists2(c, dev), 657 c, version, jset, entry, 658 journal_entry_dev_usage_bad_dev, 659 "bad dev")) { 660 journal_entry_null_range(entry, vstruct_next(entry)); 661 return ret; 662 } 663 664 if (journal_entry_err_on(u->pad, 665 c, version, jset, entry, 666 journal_entry_dev_usage_bad_pad, 667 "bad pad")) { 668 journal_entry_null_range(entry, vstruct_next(entry)); 669 return ret; 670 } 671 672 fsck_err: 673 return ret; 674 } 675 676 static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, 677 struct jset_entry *entry) 678 { 679 struct jset_entry_dev_usage *u = 680 container_of(entry, struct jset_entry_dev_usage, entry); 681 unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); 682 683 prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); 684 685 for (i = 0; i < nr_types; i++) { 686 if (i < BCH_DATA_NR) 687 prt_printf(out, " %s", bch2_data_types[i]); 688 else 689 prt_printf(out, " (unknown data type %u)", i); 690 prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", 691 le64_to_cpu(u->d[i].buckets), 692 le64_to_cpu(u->d[i].sectors), 693 le64_to_cpu(u->d[i].fragmented)); 694 } 695 } 696 697 static int journal_entry_log_validate(struct bch_fs *c, 698 struct jset *jset, 699 struct jset_entry *entry, 700 unsigned version, int big_endian, 701 enum bkey_invalid_flags flags) 702 { 703 return 0; 704 } 705 706 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 707 struct jset_entry *entry) 708 { 709 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 710 unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); 711 712 prt_printf(out, "%.*s", bytes, l->d); 713 } 714 715 static int journal_entry_overwrite_validate(struct bch_fs *c, 716 struct jset *jset, 717 struct jset_entry *entry, 718 unsigned version, int big_endian, 719 enum bkey_invalid_flags flags) 720 { 721 return journal_entry_btree_keys_validate(c, jset, entry, 722 version, big_endian, READ); 723 } 724 725 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 726 struct jset_entry *entry) 727 { 728 journal_entry_btree_keys_to_text(out, c, entry); 729 } 730 731 static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, 732 struct jset *jset, 733 struct jset_entry *entry, 734 unsigned version, int big_endian, 735 enum bkey_invalid_flags flags) 736 { 737 return journal_entry_btree_keys_validate(c, jset, entry, 738 version, big_endian, READ); 739 } 740 741 static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, 742 struct jset_entry *entry) 743 { 744 journal_entry_btree_keys_to_text(out, c, entry); 745 } 746 747 struct jset_entry_ops { 748 int (*validate)(struct bch_fs *, struct jset *, 749 struct jset_entry *, unsigned, int, 750 enum bkey_invalid_flags); 751 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 752 }; 753 754 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 755 #define x(f, nr) \ 756 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 757 .validate = journal_entry_##f##_validate, \ 758 .to_text = journal_entry_##f##_to_text, \ 759 }, 760 BCH_JSET_ENTRY_TYPES() 761 #undef x 762 }; 763 764 int bch2_journal_entry_validate(struct bch_fs *c, 765 struct jset *jset, 766 struct jset_entry *entry, 767 unsigned version, int big_endian, 768 enum bkey_invalid_flags flags) 769 { 770 return entry->type < BCH_JSET_ENTRY_NR 771 ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, 772 version, big_endian, flags) 773 : 0; 774 } 775 776 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, 777 struct jset_entry *entry) 778 { 779 if (entry->type < BCH_JSET_ENTRY_NR) { 780 prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); 781 bch2_jset_entry_ops[entry->type].to_text(out, c, entry); 782 } else { 783 prt_printf(out, "(unknown type %u)", entry->type); 784 } 785 } 786 787 static int jset_validate_entries(struct bch_fs *c, struct jset *jset, 788 enum bkey_invalid_flags flags) 789 { 790 unsigned version = le32_to_cpu(jset->version); 791 int ret = 0; 792 793 vstruct_for_each(jset, entry) { 794 if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), 795 c, version, jset, entry, 796 journal_entry_past_jset_end, 797 "journal entry extends past end of jset")) { 798 jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); 799 break; 800 } 801 802 ret = bch2_journal_entry_validate(c, jset, entry, 803 version, JSET_BIG_ENDIAN(jset), flags); 804 if (ret) 805 break; 806 } 807 fsck_err: 808 return ret; 809 } 810 811 static int jset_validate(struct bch_fs *c, 812 struct bch_dev *ca, 813 struct jset *jset, u64 sector, 814 enum bkey_invalid_flags flags) 815 { 816 unsigned version; 817 int ret = 0; 818 819 if (le64_to_cpu(jset->magic) != jset_magic(c)) 820 return JOURNAL_ENTRY_NONE; 821 822 version = le32_to_cpu(jset->version); 823 if (journal_entry_err_on(!bch2_version_compatible(version), 824 c, version, jset, NULL, 825 jset_unsupported_version, 826 "%s sector %llu seq %llu: incompatible journal entry version %u.%u", 827 ca ? ca->name : c->name, 828 sector, le64_to_cpu(jset->seq), 829 BCH_VERSION_MAJOR(version), 830 BCH_VERSION_MINOR(version))) { 831 /* don't try to continue: */ 832 return -EINVAL; 833 } 834 835 if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), 836 c, version, jset, NULL, 837 jset_unknown_csum, 838 "%s sector %llu seq %llu: journal entry with unknown csum type %llu", 839 ca ? ca->name : c->name, 840 sector, le64_to_cpu(jset->seq), 841 JSET_CSUM_TYPE(jset))) 842 ret = JOURNAL_ENTRY_BAD; 843 844 /* last_seq is ignored when JSET_NO_FLUSH is true */ 845 if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && 846 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), 847 c, version, jset, NULL, 848 jset_last_seq_newer_than_seq, 849 "invalid journal entry: last_seq > seq (%llu > %llu)", 850 le64_to_cpu(jset->last_seq), 851 le64_to_cpu(jset->seq))) { 852 jset->last_seq = jset->seq; 853 return JOURNAL_ENTRY_BAD; 854 } 855 856 ret = jset_validate_entries(c, jset, flags); 857 fsck_err: 858 return ret; 859 } 860 861 static int jset_validate_early(struct bch_fs *c, 862 struct bch_dev *ca, 863 struct jset *jset, u64 sector, 864 unsigned bucket_sectors_left, 865 unsigned sectors_read) 866 { 867 size_t bytes = vstruct_bytes(jset); 868 unsigned version; 869 enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; 870 int ret = 0; 871 872 if (le64_to_cpu(jset->magic) != jset_magic(c)) 873 return JOURNAL_ENTRY_NONE; 874 875 version = le32_to_cpu(jset->version); 876 if (journal_entry_err_on(!bch2_version_compatible(version), 877 c, version, jset, NULL, 878 jset_unsupported_version, 879 "%s sector %llu seq %llu: unknown journal entry version %u.%u", 880 ca ? ca->name : c->name, 881 sector, le64_to_cpu(jset->seq), 882 BCH_VERSION_MAJOR(version), 883 BCH_VERSION_MINOR(version))) { 884 /* don't try to continue: */ 885 return -EINVAL; 886 } 887 888 if (bytes > (sectors_read << 9) && 889 sectors_read < bucket_sectors_left) 890 return JOURNAL_ENTRY_REREAD; 891 892 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 893 c, version, jset, NULL, 894 jset_past_bucket_end, 895 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 896 ca ? ca->name : c->name, 897 sector, le64_to_cpu(jset->seq), bytes)) 898 le32_add_cpu(&jset->u64s, 899 -((bytes - (bucket_sectors_left << 9)) / 8)); 900 fsck_err: 901 return ret; 902 } 903 904 struct journal_read_buf { 905 void *data; 906 size_t size; 907 }; 908 909 static int journal_read_buf_realloc(struct journal_read_buf *b, 910 size_t new_size) 911 { 912 void *n; 913 914 /* the bios are sized for this many pages, max: */ 915 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 916 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 917 918 new_size = roundup_pow_of_two(new_size); 919 n = kvpmalloc(new_size, GFP_KERNEL); 920 if (!n) 921 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 922 923 kvpfree(b->data, b->size); 924 b->data = n; 925 b->size = new_size; 926 return 0; 927 } 928 929 static int journal_read_bucket(struct bch_dev *ca, 930 struct journal_read_buf *buf, 931 struct journal_list *jlist, 932 unsigned bucket) 933 { 934 struct bch_fs *c = ca->fs; 935 struct journal_device *ja = &ca->journal; 936 struct jset *j = NULL; 937 unsigned sectors, sectors_read = 0; 938 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 939 end = offset + ca->mi.bucket_size; 940 bool saw_bad = false, csum_good; 941 struct printbuf err = PRINTBUF; 942 int ret = 0; 943 944 pr_debug("reading %u", bucket); 945 946 while (offset < end) { 947 if (!sectors_read) { 948 struct bio *bio; 949 unsigned nr_bvecs; 950 reread: 951 sectors_read = min_t(unsigned, 952 end - offset, buf->size >> 9); 953 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 954 955 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 956 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 957 958 bio->bi_iter.bi_sector = offset; 959 bch2_bio_map(bio, buf->data, sectors_read << 9); 960 961 ret = submit_bio_wait(bio); 962 kfree(bio); 963 964 if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, 965 "journal read error: sector %llu", 966 offset) || 967 bch2_meta_read_fault("journal")) { 968 /* 969 * We don't error out of the recovery process 970 * here, since the relevant journal entry may be 971 * found on a different device, and missing or 972 * no journal entries will be handled later 973 */ 974 goto out; 975 } 976 977 j = buf->data; 978 } 979 980 ret = jset_validate_early(c, ca, j, offset, 981 end - offset, sectors_read); 982 switch (ret) { 983 case 0: 984 sectors = vstruct_sectors(j, c->block_bits); 985 break; 986 case JOURNAL_ENTRY_REREAD: 987 if (vstruct_bytes(j) > buf->size) { 988 ret = journal_read_buf_realloc(buf, 989 vstruct_bytes(j)); 990 if (ret) 991 goto err; 992 } 993 goto reread; 994 case JOURNAL_ENTRY_NONE: 995 if (!saw_bad) 996 goto out; 997 /* 998 * On checksum error we don't really trust the size 999 * field of the journal entry we read, so try reading 1000 * again at next block boundary: 1001 */ 1002 sectors = block_sectors(c); 1003 goto next_block; 1004 default: 1005 goto err; 1006 } 1007 1008 /* 1009 * This happens sometimes if we don't have discards on - 1010 * when we've partially overwritten a bucket with new 1011 * journal entries. We don't need the rest of the 1012 * bucket: 1013 */ 1014 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 1015 goto out; 1016 1017 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 1018 1019 enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); 1020 struct bch_csum csum; 1021 csum_good = jset_csum_good(c, j, &csum); 1022 1023 if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, 1024 "%s", 1025 (printbuf_reset(&err), 1026 prt_str(&err, "journal "), 1027 bch2_csum_err_msg(&err, csum_type, j->csum, csum), 1028 err.buf))) 1029 saw_bad = true; 1030 1031 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 1032 j->encrypted_start, 1033 vstruct_end(j) - (void *) j->encrypted_start); 1034 bch2_fs_fatal_err_on(ret, c, 1035 "error decrypting journal entry: %s", 1036 bch2_err_str(ret)); 1037 1038 mutex_lock(&jlist->lock); 1039 ret = journal_entry_add(c, ca, (struct journal_ptr) { 1040 .csum_good = csum_good, 1041 .dev = ca->dev_idx, 1042 .bucket = bucket, 1043 .bucket_offset = offset - 1044 bucket_to_sector(ca, ja->buckets[bucket]), 1045 .sector = offset, 1046 }, jlist, j); 1047 mutex_unlock(&jlist->lock); 1048 1049 switch (ret) { 1050 case JOURNAL_ENTRY_ADD_OK: 1051 break; 1052 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 1053 break; 1054 default: 1055 goto err; 1056 } 1057 next_block: 1058 pr_debug("next"); 1059 offset += sectors; 1060 sectors_read -= sectors; 1061 j = ((void *) j) + (sectors << 9); 1062 } 1063 1064 out: 1065 ret = 0; 1066 err: 1067 printbuf_exit(&err); 1068 return ret; 1069 } 1070 1071 static CLOSURE_CALLBACK(bch2_journal_read_device) 1072 { 1073 closure_type(ja, struct journal_device, read); 1074 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1075 struct bch_fs *c = ca->fs; 1076 struct journal_list *jlist = 1077 container_of(cl->parent, struct journal_list, cl); 1078 struct journal_replay *r, **_r; 1079 struct genradix_iter iter; 1080 struct journal_read_buf buf = { NULL, 0 }; 1081 unsigned i; 1082 int ret = 0; 1083 1084 if (!ja->nr) 1085 goto out; 1086 1087 ret = journal_read_buf_realloc(&buf, PAGE_SIZE); 1088 if (ret) 1089 goto err; 1090 1091 pr_debug("%u journal buckets", ja->nr); 1092 1093 for (i = 0; i < ja->nr; i++) { 1094 ret = journal_read_bucket(ca, &buf, jlist, i); 1095 if (ret) 1096 goto err; 1097 } 1098 1099 ja->sectors_free = ca->mi.bucket_size; 1100 1101 mutex_lock(&jlist->lock); 1102 genradix_for_each_reverse(&c->journal_entries, iter, _r) { 1103 r = *_r; 1104 1105 if (!r) 1106 continue; 1107 1108 for (i = 0; i < r->nr_ptrs; i++) { 1109 if (r->ptrs[i].dev == ca->dev_idx) { 1110 unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + 1111 vstruct_sectors(&r->j, c->block_bits); 1112 1113 ja->cur_idx = r->ptrs[i].bucket; 1114 ja->sectors_free = ca->mi.bucket_size - wrote; 1115 goto found; 1116 } 1117 } 1118 } 1119 found: 1120 mutex_unlock(&jlist->lock); 1121 1122 if (ja->bucket_seq[ja->cur_idx] && 1123 ja->sectors_free == ca->mi.bucket_size) { 1124 #if 0 1125 /* 1126 * Debug code for ZNS support, where we (probably) want to be 1127 * correlated where we stopped in the journal to the zone write 1128 * points: 1129 */ 1130 bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); 1131 bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); 1132 for (i = 0; i < 3; i++) { 1133 unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; 1134 1135 bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); 1136 } 1137 #endif 1138 ja->sectors_free = 0; 1139 } 1140 1141 /* 1142 * Set dirty_idx to indicate the entire journal is full and needs to be 1143 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1144 * pinned when it first runs: 1145 */ 1146 ja->discard_idx = ja->dirty_idx_ondisk = 1147 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1148 out: 1149 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1150 kvpfree(buf.data, buf.size); 1151 percpu_ref_put(&ca->io_ref); 1152 closure_return(cl); 1153 return; 1154 err: 1155 mutex_lock(&jlist->lock); 1156 jlist->ret = ret; 1157 mutex_unlock(&jlist->lock); 1158 goto out; 1159 } 1160 1161 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 1162 struct journal_replay *j) 1163 { 1164 unsigned i; 1165 1166 for (i = 0; i < j->nr_ptrs; i++) { 1167 struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); 1168 u64 offset; 1169 1170 div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); 1171 1172 if (i) 1173 prt_printf(out, " "); 1174 prt_printf(out, "%u:%u:%u (sector %llu)", 1175 j->ptrs[i].dev, 1176 j->ptrs[i].bucket, 1177 j->ptrs[i].bucket_offset, 1178 j->ptrs[i].sector); 1179 } 1180 } 1181 1182 int bch2_journal_read(struct bch_fs *c, 1183 u64 *last_seq, 1184 u64 *blacklist_seq, 1185 u64 *start_seq) 1186 { 1187 struct journal_list jlist; 1188 struct journal_replay *i, **_i, *prev = NULL; 1189 struct genradix_iter radix_iter; 1190 struct printbuf buf = PRINTBUF; 1191 bool degraded = false, last_write_torn = false; 1192 u64 seq; 1193 int ret = 0; 1194 1195 closure_init_stack(&jlist.cl); 1196 mutex_init(&jlist.lock); 1197 jlist.last_seq = 0; 1198 jlist.ret = 0; 1199 1200 for_each_member_device(c, ca) { 1201 if (!c->opts.fsck && 1202 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1203 continue; 1204 1205 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1206 ca->mi.state == BCH_MEMBER_STATE_ro) && 1207 percpu_ref_tryget(&ca->io_ref)) 1208 closure_call(&ca->journal.read, 1209 bch2_journal_read_device, 1210 system_unbound_wq, 1211 &jlist.cl); 1212 else 1213 degraded = true; 1214 } 1215 1216 closure_sync(&jlist.cl); 1217 1218 if (jlist.ret) 1219 return jlist.ret; 1220 1221 *last_seq = 0; 1222 *start_seq = 0; 1223 *blacklist_seq = 0; 1224 1225 /* 1226 * Find most recent flush entry, and ignore newer non flush entries - 1227 * those entries will be blacklisted: 1228 */ 1229 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1230 enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; 1231 1232 i = *_i; 1233 1234 if (!i || i->ignore) 1235 continue; 1236 1237 if (!*start_seq) 1238 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1239 1240 if (JSET_NO_FLUSH(&i->j)) { 1241 i->ignore = true; 1242 continue; 1243 } 1244 1245 if (!last_write_torn && !i->csum_good) { 1246 last_write_torn = true; 1247 i->ignore = true; 1248 continue; 1249 } 1250 1251 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1252 c, le32_to_cpu(i->j.version), &i->j, NULL, 1253 jset_last_seq_newer_than_seq, 1254 "invalid journal entry: last_seq > seq (%llu > %llu)", 1255 le64_to_cpu(i->j.last_seq), 1256 le64_to_cpu(i->j.seq))) 1257 i->j.last_seq = i->j.seq; 1258 1259 *last_seq = le64_to_cpu(i->j.last_seq); 1260 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1261 break; 1262 } 1263 1264 if (!*start_seq) { 1265 bch_info(c, "journal read done, but no entries found"); 1266 return 0; 1267 } 1268 1269 if (!*last_seq) { 1270 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, 1271 "journal read done, but no entries found after dropping non-flushes"); 1272 return 0; 1273 } 1274 1275 bch_info(c, "journal read done, replaying entries %llu-%llu", 1276 *last_seq, *blacklist_seq - 1); 1277 1278 if (*start_seq != *blacklist_seq) 1279 bch_info(c, "dropped unflushed entries %llu-%llu", 1280 *blacklist_seq, *start_seq - 1); 1281 1282 /* Drop blacklisted entries and entries older than last_seq: */ 1283 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1284 i = *_i; 1285 1286 if (!i || i->ignore) 1287 continue; 1288 1289 seq = le64_to_cpu(i->j.seq); 1290 if (seq < *last_seq) { 1291 journal_replay_free(c, i); 1292 continue; 1293 } 1294 1295 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1296 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1297 jset_seq_blacklisted, 1298 "found blacklisted journal entry %llu", seq); 1299 i->ignore = true; 1300 } 1301 } 1302 1303 /* Check for missing entries: */ 1304 seq = *last_seq; 1305 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1306 i = *_i; 1307 1308 if (!i || i->ignore) 1309 continue; 1310 1311 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1312 1313 while (seq < le64_to_cpu(i->j.seq)) { 1314 u64 missing_start, missing_end; 1315 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 1316 1317 while (seq < le64_to_cpu(i->j.seq) && 1318 bch2_journal_seq_is_blacklisted(c, seq, false)) 1319 seq++; 1320 1321 if (seq == le64_to_cpu(i->j.seq)) 1322 break; 1323 1324 missing_start = seq; 1325 1326 while (seq < le64_to_cpu(i->j.seq) && 1327 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1328 seq++; 1329 1330 if (prev) { 1331 bch2_journal_ptrs_to_text(&buf1, c, prev); 1332 prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1333 } else 1334 prt_printf(&buf1, "(none)"); 1335 bch2_journal_ptrs_to_text(&buf2, c, i); 1336 1337 missing_end = seq - 1; 1338 fsck_err(c, journal_entries_missing, 1339 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" 1340 " prev at %s\n" 1341 " next at %s", 1342 missing_start, missing_end, 1343 *last_seq, *blacklist_seq - 1, 1344 buf1.buf, buf2.buf); 1345 1346 printbuf_exit(&buf1); 1347 printbuf_exit(&buf2); 1348 } 1349 1350 prev = i; 1351 seq++; 1352 } 1353 1354 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1355 struct bch_replicas_padded replicas = { 1356 .e.data_type = BCH_DATA_journal, 1357 .e.nr_required = 1, 1358 }; 1359 unsigned ptr; 1360 1361 i = *_i; 1362 if (!i || i->ignore) 1363 continue; 1364 1365 for (ptr = 0; ptr < i->nr_ptrs; ptr++) { 1366 struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); 1367 1368 if (!i->ptrs[ptr].csum_good) 1369 bch_err_dev_offset(ca, i->ptrs[ptr].sector, 1370 "invalid journal checksum, seq %llu%s", 1371 le64_to_cpu(i->j.seq), 1372 i->csum_good ? " (had good copy on another device)" : ""); 1373 } 1374 1375 ret = jset_validate(c, 1376 bch_dev_bkey_exists(c, i->ptrs[0].dev), 1377 &i->j, 1378 i->ptrs[0].sector, 1379 READ); 1380 if (ret) 1381 goto err; 1382 1383 for (ptr = 0; ptr < i->nr_ptrs; ptr++) 1384 replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; 1385 1386 bch2_replicas_entry_sort(&replicas.e); 1387 1388 printbuf_reset(&buf); 1389 bch2_replicas_entry_to_text(&buf, &replicas.e); 1390 1391 if (!degraded && 1392 !bch2_replicas_marked(c, &replicas.e) && 1393 (le64_to_cpu(i->j.seq) == *last_seq || 1394 fsck_err(c, journal_entry_replicas_not_marked, 1395 "superblock not marked as containing replicas for journal entry %llu\n %s", 1396 le64_to_cpu(i->j.seq), buf.buf))) { 1397 ret = bch2_mark_replicas(c, &replicas.e); 1398 if (ret) 1399 goto err; 1400 } 1401 } 1402 err: 1403 fsck_err: 1404 printbuf_exit(&buf); 1405 return ret; 1406 } 1407 1408 /* journal write: */ 1409 1410 static void __journal_write_alloc(struct journal *j, 1411 struct journal_buf *w, 1412 struct dev_alloc_list *devs_sorted, 1413 unsigned sectors, 1414 unsigned *replicas, 1415 unsigned replicas_want) 1416 { 1417 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1418 struct journal_device *ja; 1419 struct bch_dev *ca; 1420 unsigned i; 1421 1422 if (*replicas >= replicas_want) 1423 return; 1424 1425 for (i = 0; i < devs_sorted->nr; i++) { 1426 ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); 1427 if (!ca) 1428 continue; 1429 1430 ja = &ca->journal; 1431 1432 /* 1433 * Check that we can use this device, and aren't already using 1434 * it: 1435 */ 1436 if (!ca->mi.durability || 1437 ca->mi.state != BCH_MEMBER_STATE_rw || 1438 !ja->nr || 1439 bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1440 sectors > ja->sectors_free) 1441 continue; 1442 1443 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1444 1445 bch2_bkey_append_ptr(&w->key, 1446 (struct bch_extent_ptr) { 1447 .offset = bucket_to_sector(ca, 1448 ja->buckets[ja->cur_idx]) + 1449 ca->mi.bucket_size - 1450 ja->sectors_free, 1451 .dev = ca->dev_idx, 1452 }); 1453 1454 ja->sectors_free -= sectors; 1455 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1456 1457 *replicas += ca->mi.durability; 1458 1459 if (*replicas >= replicas_want) 1460 break; 1461 } 1462 } 1463 1464 /** 1465 * journal_write_alloc - decide where to write next journal entry 1466 * 1467 * @j: journal object 1468 * @w: journal buf (entry to be written) 1469 * 1470 * Returns: 0 on success, or -EROFS on failure 1471 */ 1472 static int journal_write_alloc(struct journal *j, struct journal_buf *w) 1473 { 1474 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1475 struct bch_devs_mask devs; 1476 struct journal_device *ja; 1477 struct bch_dev *ca; 1478 struct dev_alloc_list devs_sorted; 1479 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1480 unsigned target = c->opts.metadata_target ?: 1481 c->opts.foreground_target; 1482 unsigned i, replicas = 0, replicas_want = 1483 READ_ONCE(c->opts.metadata_replicas); 1484 1485 rcu_read_lock(); 1486 retry: 1487 devs = target_rw_devs(c, BCH_DATA_journal, target); 1488 1489 devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); 1490 1491 __journal_write_alloc(j, w, &devs_sorted, 1492 sectors, &replicas, replicas_want); 1493 1494 if (replicas >= replicas_want) 1495 goto done; 1496 1497 for (i = 0; i < devs_sorted.nr; i++) { 1498 ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); 1499 if (!ca) 1500 continue; 1501 1502 ja = &ca->journal; 1503 1504 if (sectors > ja->sectors_free && 1505 sectors <= ca->mi.bucket_size && 1506 bch2_journal_dev_buckets_available(j, ja, 1507 journal_space_discarded)) { 1508 ja->cur_idx = (ja->cur_idx + 1) % ja->nr; 1509 ja->sectors_free = ca->mi.bucket_size; 1510 1511 /* 1512 * ja->bucket_seq[ja->cur_idx] must always have 1513 * something sensible: 1514 */ 1515 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1516 } 1517 } 1518 1519 __journal_write_alloc(j, w, &devs_sorted, 1520 sectors, &replicas, replicas_want); 1521 1522 if (replicas < replicas_want && target) { 1523 /* Retry from all devices: */ 1524 target = 0; 1525 goto retry; 1526 } 1527 done: 1528 rcu_read_unlock(); 1529 1530 BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); 1531 1532 return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; 1533 } 1534 1535 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1536 { 1537 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1538 1539 /* we aren't holding j->lock: */ 1540 unsigned new_size = READ_ONCE(j->buf_size_want); 1541 void *new_buf; 1542 1543 if (buf->buf_size >= new_size) 1544 return; 1545 1546 size_t btree_write_buffer_size = new_size / 64; 1547 1548 if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) 1549 return; 1550 1551 new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1552 if (!new_buf) 1553 return; 1554 1555 memcpy(new_buf, buf->data, buf->buf_size); 1556 1557 spin_lock(&j->lock); 1558 swap(buf->data, new_buf); 1559 swap(buf->buf_size, new_size); 1560 spin_unlock(&j->lock); 1561 1562 kvpfree(new_buf, new_size); 1563 } 1564 1565 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) 1566 { 1567 return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); 1568 } 1569 1570 static CLOSURE_CALLBACK(journal_write_done) 1571 { 1572 closure_type(j, struct journal, io); 1573 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1574 struct journal_buf *w = journal_last_unwritten_buf(j); 1575 struct bch_replicas_padded replicas; 1576 union journal_res_state old, new; 1577 u64 v, seq; 1578 int err = 0; 1579 1580 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) 1581 ? j->flush_write_time 1582 : j->noflush_write_time, j->write_start_time); 1583 1584 if (!w->devs_written.nr) { 1585 bch_err(c, "unable to write journal to sufficient devices"); 1586 err = -EIO; 1587 } else { 1588 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1589 w->devs_written); 1590 if (bch2_mark_replicas(c, &replicas.e)) 1591 err = -EIO; 1592 } 1593 1594 if (err) 1595 bch2_fatal_error(c); 1596 1597 spin_lock(&j->lock); 1598 seq = le64_to_cpu(w->data->seq); 1599 1600 if (seq >= j->pin.front) 1601 journal_seq_pin(j, seq)->devs = w->devs_written; 1602 1603 if (!err) { 1604 if (!JSET_NO_FLUSH(w->data)) { 1605 j->flushed_seq_ondisk = seq; 1606 j->last_seq_ondisk = w->last_seq; 1607 1608 bch2_do_discards(c); 1609 closure_wake_up(&c->freelist_wait); 1610 1611 bch2_reset_alloc_cursors(c); 1612 } 1613 } else if (!j->err_seq || seq < j->err_seq) 1614 j->err_seq = seq; 1615 1616 j->seq_ondisk = seq; 1617 1618 /* 1619 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1620 * more buckets: 1621 * 1622 * Must come before signaling write completion, for 1623 * bch2_fs_journal_stop(): 1624 */ 1625 if (j->watermark != BCH_WATERMARK_stripe) 1626 journal_reclaim_kick(&c->journal); 1627 1628 /* also must come before signalling write completion: */ 1629 closure_debug_destroy(cl); 1630 1631 v = atomic64_read(&j->reservations.counter); 1632 do { 1633 old.v = new.v = v; 1634 BUG_ON(journal_state_count(new, new.unwritten_idx)); 1635 1636 new.unwritten_idx++; 1637 } while ((v = atomic64_cmpxchg(&j->reservations.counter, 1638 old.v, new.v)) != old.v); 1639 1640 bch2_journal_reclaim_fast(j); 1641 bch2_journal_space_available(j); 1642 1643 track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], 1644 &j->max_in_flight_start, false); 1645 1646 closure_wake_up(&w->wait); 1647 journal_wake(j); 1648 1649 if (!journal_state_count(new, new.unwritten_idx) && 1650 journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { 1651 spin_unlock(&j->lock); 1652 closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); 1653 } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1654 new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1655 struct journal_buf *buf = journal_cur_buf(j); 1656 long delta = buf->expires - jiffies; 1657 1658 /* 1659 * We don't close a journal entry to write it while there's 1660 * previous entries still in flight - the current journal entry 1661 * might want to be written now: 1662 */ 1663 1664 spin_unlock(&j->lock); 1665 mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); 1666 } else { 1667 spin_unlock(&j->lock); 1668 } 1669 } 1670 1671 static void journal_write_endio(struct bio *bio) 1672 { 1673 struct bch_dev *ca = bio->bi_private; 1674 struct journal *j = &ca->fs->journal; 1675 struct journal_buf *w = journal_last_unwritten_buf(j); 1676 unsigned long flags; 1677 1678 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, 1679 "error writing journal entry %llu: %s", 1680 le64_to_cpu(w->data->seq), 1681 bch2_blk_status_to_str(bio->bi_status)) || 1682 bch2_meta_write_fault("journal")) { 1683 spin_lock_irqsave(&j->err_lock, flags); 1684 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); 1685 spin_unlock_irqrestore(&j->err_lock, flags); 1686 } 1687 1688 closure_put(&j->io); 1689 percpu_ref_put(&ca->io_ref); 1690 } 1691 1692 static CLOSURE_CALLBACK(do_journal_write) 1693 { 1694 closure_type(j, struct journal, io); 1695 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1696 struct bch_dev *ca; 1697 struct journal_buf *w = journal_last_unwritten_buf(j); 1698 struct bio *bio; 1699 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1700 1701 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { 1702 ca = bch_dev_bkey_exists(c, ptr->dev); 1703 if (!percpu_ref_tryget(&ca->io_ref)) { 1704 /* XXX: fix this */ 1705 bch_err(c, "missing device for journal write\n"); 1706 continue; 1707 } 1708 1709 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], 1710 sectors); 1711 1712 bio = ca->journal.bio; 1713 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1714 bio->bi_iter.bi_sector = ptr->offset; 1715 bio->bi_end_io = journal_write_endio; 1716 bio->bi_private = ca; 1717 1718 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1719 ca->prev_journal_sector = bio->bi_iter.bi_sector; 1720 1721 if (!JSET_NO_FLUSH(w->data)) 1722 bio->bi_opf |= REQ_FUA; 1723 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) 1724 bio->bi_opf |= REQ_PREFLUSH; 1725 1726 bch2_bio_map(bio, w->data, sectors << 9); 1727 1728 trace_and_count(c, journal_write, bio); 1729 closure_bio_submit(bio, cl); 1730 1731 ca->journal.bucket_seq[ca->journal.cur_idx] = 1732 le64_to_cpu(w->data->seq); 1733 } 1734 1735 continue_at(cl, journal_write_done, c->io_complete_wq); 1736 } 1737 1738 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) 1739 { 1740 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1741 struct jset_entry *start, *end; 1742 struct jset *jset = w->data; 1743 struct journal_keys_to_wb wb = { NULL }; 1744 unsigned sectors, bytes, u64s; 1745 unsigned long btree_roots_have = 0; 1746 bool validate_before_checksum = false; 1747 u64 seq = le64_to_cpu(jset->seq); 1748 int ret; 1749 1750 /* 1751 * Simple compaction, dropping empty jset_entries (from journal 1752 * reservations that weren't fully used) and merging jset_entries that 1753 * can be. 1754 * 1755 * If we wanted to be really fancy here, we could sort all the keys in 1756 * the jset and drop keys that were overwritten - probably not worth it: 1757 */ 1758 vstruct_for_each(jset, i) { 1759 unsigned u64s = le16_to_cpu(i->u64s); 1760 1761 /* Empty entry: */ 1762 if (!u64s) 1763 continue; 1764 1765 /* 1766 * New btree roots are set by journalling them; when the journal 1767 * entry gets written we have to propagate them to 1768 * c->btree_roots 1769 * 1770 * But, every journal entry we write has to contain all the 1771 * btree roots (at least for now); so after we copy btree roots 1772 * to c->btree_roots we have to get any missing btree roots and 1773 * add them to this journal entry: 1774 */ 1775 switch (i->type) { 1776 case BCH_JSET_ENTRY_btree_root: 1777 bch2_journal_entry_to_btree_root(c, i); 1778 __set_bit(i->btree_id, &btree_roots_have); 1779 break; 1780 case BCH_JSET_ENTRY_write_buffer_keys: 1781 EBUG_ON(!w->need_flush_to_write_buffer); 1782 1783 if (!wb.wb) 1784 bch2_journal_keys_to_write_buffer_start(c, &wb, seq); 1785 1786 struct bkey_i *k; 1787 jset_entry_for_each_key(i, k) { 1788 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); 1789 if (ret) { 1790 bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer"); 1791 bch2_journal_keys_to_write_buffer_end(c, &wb); 1792 return ret; 1793 } 1794 } 1795 i->type = BCH_JSET_ENTRY_btree_keys; 1796 break; 1797 } 1798 } 1799 1800 if (wb.wb) 1801 bch2_journal_keys_to_write_buffer_end(c, &wb); 1802 w->need_flush_to_write_buffer = false; 1803 1804 start = end = vstruct_last(jset); 1805 1806 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1807 1808 bch2_journal_super_entries_add_common(c, &end, seq); 1809 u64s = (u64 *) end - (u64 *) start; 1810 BUG_ON(u64s > j->entry_u64s_reserved); 1811 1812 le32_add_cpu(&jset->u64s, u64s); 1813 1814 sectors = vstruct_sectors(jset, c->block_bits); 1815 bytes = vstruct_bytes(jset); 1816 1817 if (sectors > w->sectors) { 1818 bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 1819 vstruct_bytes(jset), w->sectors << 9, 1820 u64s, w->u64s_reserved, j->entry_u64s_reserved); 1821 return -EINVAL; 1822 } 1823 1824 jset->magic = cpu_to_le64(jset_magic(c)); 1825 jset->version = cpu_to_le32(c->sb.version); 1826 1827 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 1828 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1829 1830 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1831 j->last_empty_seq = seq; 1832 1833 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1834 validate_before_checksum = true; 1835 1836 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 1837 validate_before_checksum = true; 1838 1839 if (validate_before_checksum && 1840 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1841 return ret; 1842 1843 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), 1844 jset->encrypted_start, 1845 vstruct_end(jset) - (void *) jset->encrypted_start); 1846 if (bch2_fs_fatal_err_on(ret, c, 1847 "error decrypting journal entry: %i", ret)) 1848 return ret; 1849 1850 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), 1851 journal_nonce(jset), jset); 1852 1853 if (!validate_before_checksum && 1854 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1855 return ret; 1856 1857 memset((void *) jset + bytes, 0, (sectors << 9) - bytes); 1858 return 0; 1859 } 1860 1861 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) 1862 { 1863 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1864 int error = bch2_journal_error(j); 1865 1866 /* 1867 * If the journal is in an error state - we did an emergency shutdown - 1868 * we prefer to continue doing journal writes. We just mark them as 1869 * noflush so they'll never be used, but they'll still be visible by the 1870 * list_journal tool - this helps in debugging. 1871 * 1872 * There's a caveat: the first journal write after marking the 1873 * superblock dirty must always be a flush write, because on startup 1874 * from a clean shutdown we didn't necessarily read the journal and the 1875 * new journal write might overwrite whatever was in the journal 1876 * previously - we can't leave the journal without any flush writes in 1877 * it. 1878 * 1879 * So if we're in an error state, and we're still starting up, we don't 1880 * write anything at all. 1881 */ 1882 if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags)) 1883 return -EIO; 1884 1885 if (error || 1886 w->noflush || 1887 (!w->must_flush && 1888 (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && 1889 test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { 1890 w->noflush = true; 1891 SET_JSET_NO_FLUSH(w->data, true); 1892 w->data->last_seq = 0; 1893 w->last_seq = 0; 1894 1895 j->nr_noflush_writes++; 1896 } else { 1897 j->last_flush_write = jiffies; 1898 j->nr_flush_writes++; 1899 clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); 1900 } 1901 1902 return 0; 1903 } 1904 1905 CLOSURE_CALLBACK(bch2_journal_write) 1906 { 1907 closure_type(j, struct journal, io); 1908 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1909 struct journal_buf *w = journal_last_unwritten_buf(j); 1910 struct bch_replicas_padded replicas; 1911 struct bio *bio; 1912 struct printbuf journal_debug_buf = PRINTBUF; 1913 unsigned nr_rw_members = 0; 1914 int ret; 1915 1916 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 1917 1918 j->write_start_time = local_clock(); 1919 1920 spin_lock(&j->lock); 1921 ret = bch2_journal_write_pick_flush(j, w); 1922 spin_unlock(&j->lock); 1923 if (ret) 1924 goto err; 1925 1926 mutex_lock(&j->buf_lock); 1927 journal_buf_realloc(j, w); 1928 1929 ret = bch2_journal_write_prep(j, w); 1930 mutex_unlock(&j->buf_lock); 1931 if (ret) 1932 goto err; 1933 1934 j->entry_bytes_written += vstruct_bytes(w->data); 1935 1936 while (1) { 1937 spin_lock(&j->lock); 1938 ret = journal_write_alloc(j, w); 1939 if (!ret || !j->can_discard) 1940 break; 1941 1942 spin_unlock(&j->lock); 1943 bch2_journal_do_discards(j); 1944 } 1945 1946 if (ret) { 1947 __bch2_journal_debug_to_text(&journal_debug_buf, j); 1948 spin_unlock(&j->lock); 1949 bch_err(c, "Unable to allocate journal write:\n%s", 1950 journal_debug_buf.buf); 1951 printbuf_exit(&journal_debug_buf); 1952 goto err; 1953 } 1954 1955 /* 1956 * write is allocated, no longer need to account for it in 1957 * bch2_journal_space_available(): 1958 */ 1959 w->sectors = 0; 1960 1961 /* 1962 * journal entry has been compacted and allocated, recalculate space 1963 * available: 1964 */ 1965 bch2_journal_space_available(j); 1966 spin_unlock(&j->lock); 1967 1968 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); 1969 1970 if (c->opts.nochanges) 1971 goto no_io; 1972 1973 for_each_rw_member(c, ca) 1974 nr_rw_members++; 1975 1976 if (nr_rw_members > 1) 1977 w->separate_flush = true; 1978 1979 /* 1980 * Mark journal replicas before we submit the write to guarantee 1981 * recovery will find the journal entries after a crash. 1982 */ 1983 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1984 w->devs_written); 1985 ret = bch2_mark_replicas(c, &replicas.e); 1986 if (ret) 1987 goto err; 1988 1989 if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { 1990 for_each_rw_member(c, ca) { 1991 percpu_ref_get(&ca->io_ref); 1992 1993 bio = ca->journal.bio; 1994 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); 1995 bio->bi_end_io = journal_write_endio; 1996 bio->bi_private = ca; 1997 closure_bio_submit(bio, cl); 1998 } 1999 } 2000 2001 continue_at(cl, do_journal_write, c->io_complete_wq); 2002 return; 2003 no_io: 2004 continue_at(cl, journal_write_done, c->io_complete_wq); 2005 return; 2006 err: 2007 bch2_fatal_error(c); 2008 continue_at(cl, journal_write_done, c->io_complete_wq); 2009 } 2010