1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "btree_io.h" 6 #include "btree_update_interior.h" 7 #include "buckets.h" 8 #include "checksum.h" 9 #include "disk_groups.h" 10 #include "error.h" 11 #include "journal.h" 12 #include "journal_io.h" 13 #include "journal_reclaim.h" 14 #include "journal_seq_blacklist.h" 15 #include "replicas.h" 16 #include "sb-clean.h" 17 #include "trace.h" 18 19 static struct nonce journal_nonce(const struct jset *jset) 20 { 21 return (struct nonce) {{ 22 [0] = 0, 23 [1] = ((__le32 *) &jset->seq)[0], 24 [2] = ((__le32 *) &jset->seq)[1], 25 [3] = BCH_NONCE_JOURNAL, 26 }}; 27 } 28 29 static bool jset_csum_good(struct bch_fs *c, struct jset *j) 30 { 31 return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && 32 !bch2_crc_cmp(j->csum, 33 csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); 34 } 35 36 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) 37 { 38 return (seq - c->journal_entries_base_seq) & (~0U >> 1); 39 } 40 41 static void __journal_replay_free(struct bch_fs *c, 42 struct journal_replay *i) 43 { 44 struct journal_replay **p = 45 genradix_ptr(&c->journal_entries, 46 journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); 47 48 BUG_ON(*p != i); 49 *p = NULL; 50 kvpfree(i, offsetof(struct journal_replay, j) + 51 vstruct_bytes(&i->j)); 52 } 53 54 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) 55 { 56 i->ignore = true; 57 58 if (!c->opts.read_entire_journal) 59 __journal_replay_free(c, i); 60 } 61 62 struct journal_list { 63 struct closure cl; 64 u64 last_seq; 65 struct mutex lock; 66 int ret; 67 }; 68 69 #define JOURNAL_ENTRY_ADD_OK 0 70 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 71 72 /* 73 * Given a journal entry we just read, add it to the list of journal entries to 74 * be replayed: 75 */ 76 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, 77 struct journal_ptr entry_ptr, 78 struct journal_list *jlist, struct jset *j) 79 { 80 struct genradix_iter iter; 81 struct journal_replay **_i, *i, *dup; 82 struct journal_ptr *ptr; 83 size_t bytes = vstruct_bytes(j); 84 u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; 85 int ret = JOURNAL_ENTRY_ADD_OK; 86 87 /* Is this entry older than the range we need? */ 88 if (!c->opts.read_entire_journal && 89 le64_to_cpu(j->seq) < jlist->last_seq) 90 return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; 91 92 /* 93 * genradixes are indexed by a ulong, not a u64, so we can't index them 94 * by sequence number directly: Assume instead that they will all fall 95 * within the range of +-2billion of the filrst one we find. 96 */ 97 if (!c->journal_entries_base_seq) 98 c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); 99 100 /* Drop entries we don't need anymore */ 101 if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { 102 genradix_for_each_from(&c->journal_entries, iter, _i, 103 journal_entry_radix_idx(c, jlist->last_seq)) { 104 i = *_i; 105 106 if (!i || i->ignore) 107 continue; 108 109 if (le64_to_cpu(i->j.seq) >= last_seq) 110 break; 111 journal_replay_free(c, i); 112 } 113 } 114 115 jlist->last_seq = max(jlist->last_seq, last_seq); 116 117 _i = genradix_ptr_alloc(&c->journal_entries, 118 journal_entry_radix_idx(c, le64_to_cpu(j->seq)), 119 GFP_KERNEL); 120 if (!_i) 121 return -BCH_ERR_ENOMEM_journal_entry_add; 122 123 /* 124 * Duplicate journal entries? If so we want the one that didn't have a 125 * checksum error: 126 */ 127 dup = *_i; 128 if (dup) { 129 if (bytes == vstruct_bytes(&dup->j) && 130 !memcmp(j, &dup->j, bytes)) { 131 i = dup; 132 goto found; 133 } 134 135 if (!entry_ptr.csum_good) { 136 i = dup; 137 goto found; 138 } 139 140 if (!dup->csum_good) 141 goto replace; 142 143 fsck_err(c, journal_entry_replicas_data_mismatch, 144 "found duplicate but non identical journal entries (seq %llu)", 145 le64_to_cpu(j->seq)); 146 i = dup; 147 goto found; 148 } 149 replace: 150 i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 151 if (!i) 152 return -BCH_ERR_ENOMEM_journal_entry_add; 153 154 i->nr_ptrs = 0; 155 i->csum_good = entry_ptr.csum_good; 156 i->ignore = false; 157 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 158 i->ptrs[i->nr_ptrs++] = entry_ptr; 159 160 if (dup) { 161 if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { 162 bch_err(c, "found too many copies of journal entry %llu", 163 le64_to_cpu(i->j.seq)); 164 dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; 165 } 166 167 /* The first ptr should represent the jset we kept: */ 168 memcpy(i->ptrs + i->nr_ptrs, 169 dup->ptrs, 170 sizeof(dup->ptrs[0]) * dup->nr_ptrs); 171 i->nr_ptrs += dup->nr_ptrs; 172 __journal_replay_free(c, dup); 173 } 174 175 *_i = i; 176 return 0; 177 found: 178 for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { 179 if (ptr->dev == ca->dev_idx) { 180 bch_err(c, "duplicate journal entry %llu on same device", 181 le64_to_cpu(i->j.seq)); 182 goto out; 183 } 184 } 185 186 if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { 187 bch_err(c, "found too many copies of journal entry %llu", 188 le64_to_cpu(i->j.seq)); 189 goto out; 190 } 191 192 i->ptrs[i->nr_ptrs++] = entry_ptr; 193 out: 194 fsck_err: 195 return ret; 196 } 197 198 /* this fills in a range with empty jset_entries: */ 199 static void journal_entry_null_range(void *start, void *end) 200 { 201 struct jset_entry *entry; 202 203 for (entry = start; entry != end; entry = vstruct_next(entry)) 204 memset(entry, 0, sizeof(*entry)); 205 } 206 207 #define JOURNAL_ENTRY_REREAD 5 208 #define JOURNAL_ENTRY_NONE 6 209 #define JOURNAL_ENTRY_BAD 7 210 211 static void journal_entry_err_msg(struct printbuf *out, 212 u32 version, 213 struct jset *jset, 214 struct jset_entry *entry) 215 { 216 prt_str(out, "invalid journal entry, version="); 217 bch2_version_to_text(out, version); 218 219 if (entry) { 220 prt_str(out, " type="); 221 prt_str(out, bch2_jset_entry_types[entry->type]); 222 } 223 224 if (!jset) { 225 prt_printf(out, " in superblock"); 226 } else { 227 228 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); 229 230 if (entry) 231 prt_printf(out, " offset=%zi/%u", 232 (u64 *) entry - jset->_data, 233 le32_to_cpu(jset->u64s)); 234 } 235 236 prt_str(out, ": "); 237 } 238 239 #define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ 240 ({ \ 241 struct printbuf _buf = PRINTBUF; \ 242 \ 243 journal_entry_err_msg(&_buf, version, jset, entry); \ 244 prt_printf(&_buf, msg, ##__VA_ARGS__); \ 245 \ 246 switch (flags & BKEY_INVALID_WRITE) { \ 247 case READ: \ 248 mustfix_fsck_err(c, _err, "%s", _buf.buf); \ 249 break; \ 250 case WRITE: \ 251 bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ 252 bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ 253 if (bch2_fs_inconsistent(c)) { \ 254 ret = -BCH_ERR_fsck_errors_not_fixed; \ 255 goto fsck_err; \ 256 } \ 257 break; \ 258 } \ 259 \ 260 printbuf_exit(&_buf); \ 261 true; \ 262 }) 263 264 #define journal_entry_err_on(cond, ...) \ 265 ((cond) ? journal_entry_err(__VA_ARGS__) : false) 266 267 #define FSCK_DELETED_KEY 5 268 269 static int journal_validate_key(struct bch_fs *c, 270 struct jset *jset, 271 struct jset_entry *entry, 272 unsigned level, enum btree_id btree_id, 273 struct bkey_i *k, 274 unsigned version, int big_endian, 275 enum bkey_invalid_flags flags) 276 { 277 int write = flags & BKEY_INVALID_WRITE; 278 void *next = vstruct_next(entry); 279 struct printbuf buf = PRINTBUF; 280 int ret = 0; 281 282 if (journal_entry_err_on(!k->k.u64s, 283 c, version, jset, entry, 284 journal_entry_bkey_u64s_0, 285 "k->u64s 0")) { 286 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 287 journal_entry_null_range(vstruct_next(entry), next); 288 return FSCK_DELETED_KEY; 289 } 290 291 if (journal_entry_err_on((void *) bkey_next(k) > 292 (void *) vstruct_next(entry), 293 c, version, jset, entry, 294 journal_entry_bkey_past_end, 295 "extends past end of journal entry")) { 296 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 297 journal_entry_null_range(vstruct_next(entry), next); 298 return FSCK_DELETED_KEY; 299 } 300 301 if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, 302 c, version, jset, entry, 303 journal_entry_bkey_bad_format, 304 "bad format %u", k->k.format)) { 305 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 306 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 307 journal_entry_null_range(vstruct_next(entry), next); 308 return FSCK_DELETED_KEY; 309 } 310 311 if (!write) 312 bch2_bkey_compat(level, btree_id, version, big_endian, 313 write, NULL, bkey_to_packed(k)); 314 315 if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), 316 __btree_node_type(level, btree_id), write, &buf)) { 317 printbuf_reset(&buf); 318 journal_entry_err_msg(&buf, version, jset, entry); 319 prt_newline(&buf); 320 printbuf_indent_add(&buf, 2); 321 322 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); 323 prt_newline(&buf); 324 bch2_bkey_invalid(c, bkey_i_to_s_c(k), 325 __btree_node_type(level, btree_id), write, &buf); 326 327 mustfix_fsck_err(c, journal_entry_bkey_invalid, 328 "%s", buf.buf); 329 330 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 331 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 332 journal_entry_null_range(vstruct_next(entry), next); 333 334 printbuf_exit(&buf); 335 return FSCK_DELETED_KEY; 336 } 337 338 if (write) 339 bch2_bkey_compat(level, btree_id, version, big_endian, 340 write, NULL, bkey_to_packed(k)); 341 fsck_err: 342 printbuf_exit(&buf); 343 return ret; 344 } 345 346 static int journal_entry_btree_keys_validate(struct bch_fs *c, 347 struct jset *jset, 348 struct jset_entry *entry, 349 unsigned version, int big_endian, 350 enum bkey_invalid_flags flags) 351 { 352 struct bkey_i *k = entry->start; 353 354 while (k != vstruct_last(entry)) { 355 int ret = journal_validate_key(c, jset, entry, 356 entry->level, 357 entry->btree_id, 358 k, version, big_endian, 359 flags|BKEY_INVALID_JOURNAL); 360 if (ret == FSCK_DELETED_KEY) 361 continue; 362 363 k = bkey_next(k); 364 } 365 366 return 0; 367 } 368 369 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 370 struct jset_entry *entry) 371 { 372 struct bkey_i *k; 373 bool first = true; 374 375 jset_entry_for_each_key(entry, k) { 376 if (!first) { 377 prt_newline(out); 378 prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); 379 } 380 prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); 381 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); 382 first = false; 383 } 384 } 385 386 static int journal_entry_btree_root_validate(struct bch_fs *c, 387 struct jset *jset, 388 struct jset_entry *entry, 389 unsigned version, int big_endian, 390 enum bkey_invalid_flags flags) 391 { 392 struct bkey_i *k = entry->start; 393 int ret = 0; 394 395 if (journal_entry_err_on(!entry->u64s || 396 le16_to_cpu(entry->u64s) != k->k.u64s, 397 c, version, jset, entry, 398 journal_entry_btree_root_bad_size, 399 "invalid btree root journal entry: wrong number of keys")) { 400 void *next = vstruct_next(entry); 401 /* 402 * we don't want to null out this jset_entry, 403 * just the contents, so that later we can tell 404 * we were _supposed_ to have a btree root 405 */ 406 entry->u64s = 0; 407 journal_entry_null_range(vstruct_next(entry), next); 408 return 0; 409 } 410 411 return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, 412 version, big_endian, flags); 413 fsck_err: 414 return ret; 415 } 416 417 static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, 418 struct jset_entry *entry) 419 { 420 journal_entry_btree_keys_to_text(out, c, entry); 421 } 422 423 static int journal_entry_prio_ptrs_validate(struct bch_fs *c, 424 struct jset *jset, 425 struct jset_entry *entry, 426 unsigned version, int big_endian, 427 enum bkey_invalid_flags flags) 428 { 429 /* obsolete, don't care: */ 430 return 0; 431 } 432 433 static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 434 struct jset_entry *entry) 435 { 436 } 437 438 static int journal_entry_blacklist_validate(struct bch_fs *c, 439 struct jset *jset, 440 struct jset_entry *entry, 441 unsigned version, int big_endian, 442 enum bkey_invalid_flags flags) 443 { 444 int ret = 0; 445 446 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, 447 c, version, jset, entry, 448 journal_entry_blacklist_bad_size, 449 "invalid journal seq blacklist entry: bad size")) { 450 journal_entry_null_range(entry, vstruct_next(entry)); 451 } 452 fsck_err: 453 return ret; 454 } 455 456 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, 457 struct jset_entry *entry) 458 { 459 struct jset_entry_blacklist *bl = 460 container_of(entry, struct jset_entry_blacklist, entry); 461 462 prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); 463 } 464 465 static int journal_entry_blacklist_v2_validate(struct bch_fs *c, 466 struct jset *jset, 467 struct jset_entry *entry, 468 unsigned version, int big_endian, 469 enum bkey_invalid_flags flags) 470 { 471 struct jset_entry_blacklist_v2 *bl_entry; 472 int ret = 0; 473 474 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, 475 c, version, jset, entry, 476 journal_entry_blacklist_v2_bad_size, 477 "invalid journal seq blacklist entry: bad size")) { 478 journal_entry_null_range(entry, vstruct_next(entry)); 479 goto out; 480 } 481 482 bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); 483 484 if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > 485 le64_to_cpu(bl_entry->end), 486 c, version, jset, entry, 487 journal_entry_blacklist_v2_start_past_end, 488 "invalid journal seq blacklist entry: start > end")) { 489 journal_entry_null_range(entry, vstruct_next(entry)); 490 } 491 out: 492 fsck_err: 493 return ret; 494 } 495 496 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, 497 struct jset_entry *entry) 498 { 499 struct jset_entry_blacklist_v2 *bl = 500 container_of(entry, struct jset_entry_blacklist_v2, entry); 501 502 prt_printf(out, "start=%llu end=%llu", 503 le64_to_cpu(bl->start), 504 le64_to_cpu(bl->end)); 505 } 506 507 static int journal_entry_usage_validate(struct bch_fs *c, 508 struct jset *jset, 509 struct jset_entry *entry, 510 unsigned version, int big_endian, 511 enum bkey_invalid_flags flags) 512 { 513 struct jset_entry_usage *u = 514 container_of(entry, struct jset_entry_usage, entry); 515 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 516 int ret = 0; 517 518 if (journal_entry_err_on(bytes < sizeof(*u), 519 c, version, jset, entry, 520 journal_entry_usage_bad_size, 521 "invalid journal entry usage: bad size")) { 522 journal_entry_null_range(entry, vstruct_next(entry)); 523 return ret; 524 } 525 526 fsck_err: 527 return ret; 528 } 529 530 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, 531 struct jset_entry *entry) 532 { 533 struct jset_entry_usage *u = 534 container_of(entry, struct jset_entry_usage, entry); 535 536 prt_printf(out, "type=%s v=%llu", 537 bch2_fs_usage_types[u->entry.btree_id], 538 le64_to_cpu(u->v)); 539 } 540 541 static int journal_entry_data_usage_validate(struct bch_fs *c, 542 struct jset *jset, 543 struct jset_entry *entry, 544 unsigned version, int big_endian, 545 enum bkey_invalid_flags flags) 546 { 547 struct jset_entry_data_usage *u = 548 container_of(entry, struct jset_entry_data_usage, entry); 549 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 550 int ret = 0; 551 552 if (journal_entry_err_on(bytes < sizeof(*u) || 553 bytes < sizeof(*u) + u->r.nr_devs, 554 c, version, jset, entry, 555 journal_entry_data_usage_bad_size, 556 "invalid journal entry usage: bad size")) { 557 journal_entry_null_range(entry, vstruct_next(entry)); 558 return ret; 559 } 560 561 fsck_err: 562 return ret; 563 } 564 565 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, 566 struct jset_entry *entry) 567 { 568 struct jset_entry_data_usage *u = 569 container_of(entry, struct jset_entry_data_usage, entry); 570 571 bch2_replicas_entry_to_text(out, &u->r); 572 prt_printf(out, "=%llu", le64_to_cpu(u->v)); 573 } 574 575 static int journal_entry_clock_validate(struct bch_fs *c, 576 struct jset *jset, 577 struct jset_entry *entry, 578 unsigned version, int big_endian, 579 enum bkey_invalid_flags flags) 580 { 581 struct jset_entry_clock *clock = 582 container_of(entry, struct jset_entry_clock, entry); 583 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 584 int ret = 0; 585 586 if (journal_entry_err_on(bytes != sizeof(*clock), 587 c, version, jset, entry, 588 journal_entry_clock_bad_size, 589 "bad size")) { 590 journal_entry_null_range(entry, vstruct_next(entry)); 591 return ret; 592 } 593 594 if (journal_entry_err_on(clock->rw > 1, 595 c, version, jset, entry, 596 journal_entry_clock_bad_rw, 597 "bad rw")) { 598 journal_entry_null_range(entry, vstruct_next(entry)); 599 return ret; 600 } 601 602 fsck_err: 603 return ret; 604 } 605 606 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, 607 struct jset_entry *entry) 608 { 609 struct jset_entry_clock *clock = 610 container_of(entry, struct jset_entry_clock, entry); 611 612 prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); 613 } 614 615 static int journal_entry_dev_usage_validate(struct bch_fs *c, 616 struct jset *jset, 617 struct jset_entry *entry, 618 unsigned version, int big_endian, 619 enum bkey_invalid_flags flags) 620 { 621 struct jset_entry_dev_usage *u = 622 container_of(entry, struct jset_entry_dev_usage, entry); 623 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 624 unsigned expected = sizeof(*u); 625 unsigned dev; 626 int ret = 0; 627 628 if (journal_entry_err_on(bytes < expected, 629 c, version, jset, entry, 630 journal_entry_dev_usage_bad_size, 631 "bad size (%u < %u)", 632 bytes, expected)) { 633 journal_entry_null_range(entry, vstruct_next(entry)); 634 return ret; 635 } 636 637 dev = le32_to_cpu(u->dev); 638 639 if (journal_entry_err_on(!bch2_dev_exists2(c, dev), 640 c, version, jset, entry, 641 journal_entry_dev_usage_bad_dev, 642 "bad dev")) { 643 journal_entry_null_range(entry, vstruct_next(entry)); 644 return ret; 645 } 646 647 if (journal_entry_err_on(u->pad, 648 c, version, jset, entry, 649 journal_entry_dev_usage_bad_pad, 650 "bad pad")) { 651 journal_entry_null_range(entry, vstruct_next(entry)); 652 return ret; 653 } 654 655 fsck_err: 656 return ret; 657 } 658 659 static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, 660 struct jset_entry *entry) 661 { 662 struct jset_entry_dev_usage *u = 663 container_of(entry, struct jset_entry_dev_usage, entry); 664 unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); 665 666 prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); 667 668 for (i = 0; i < nr_types; i++) { 669 if (i < BCH_DATA_NR) 670 prt_printf(out, " %s", bch2_data_types[i]); 671 else 672 prt_printf(out, " (unknown data type %u)", i); 673 prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", 674 le64_to_cpu(u->d[i].buckets), 675 le64_to_cpu(u->d[i].sectors), 676 le64_to_cpu(u->d[i].fragmented)); 677 } 678 679 prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); 680 } 681 682 static int journal_entry_log_validate(struct bch_fs *c, 683 struct jset *jset, 684 struct jset_entry *entry, 685 unsigned version, int big_endian, 686 enum bkey_invalid_flags flags) 687 { 688 return 0; 689 } 690 691 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 692 struct jset_entry *entry) 693 { 694 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 695 unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); 696 697 prt_printf(out, "%.*s", bytes, l->d); 698 } 699 700 static int journal_entry_overwrite_validate(struct bch_fs *c, 701 struct jset *jset, 702 struct jset_entry *entry, 703 unsigned version, int big_endian, 704 enum bkey_invalid_flags flags) 705 { 706 return journal_entry_btree_keys_validate(c, jset, entry, 707 version, big_endian, READ); 708 } 709 710 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 711 struct jset_entry *entry) 712 { 713 journal_entry_btree_keys_to_text(out, c, entry); 714 } 715 716 struct jset_entry_ops { 717 int (*validate)(struct bch_fs *, struct jset *, 718 struct jset_entry *, unsigned, int, 719 enum bkey_invalid_flags); 720 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 721 }; 722 723 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 724 #define x(f, nr) \ 725 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 726 .validate = journal_entry_##f##_validate, \ 727 .to_text = journal_entry_##f##_to_text, \ 728 }, 729 BCH_JSET_ENTRY_TYPES() 730 #undef x 731 }; 732 733 int bch2_journal_entry_validate(struct bch_fs *c, 734 struct jset *jset, 735 struct jset_entry *entry, 736 unsigned version, int big_endian, 737 enum bkey_invalid_flags flags) 738 { 739 return entry->type < BCH_JSET_ENTRY_NR 740 ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, 741 version, big_endian, flags) 742 : 0; 743 } 744 745 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, 746 struct jset_entry *entry) 747 { 748 if (entry->type < BCH_JSET_ENTRY_NR) { 749 prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); 750 bch2_jset_entry_ops[entry->type].to_text(out, c, entry); 751 } else { 752 prt_printf(out, "(unknown type %u)", entry->type); 753 } 754 } 755 756 static int jset_validate_entries(struct bch_fs *c, struct jset *jset, 757 enum bkey_invalid_flags flags) 758 { 759 struct jset_entry *entry; 760 unsigned version = le32_to_cpu(jset->version); 761 int ret = 0; 762 763 vstruct_for_each(jset, entry) { 764 if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), 765 c, version, jset, entry, 766 journal_entry_past_jset_end, 767 "journal entry extends past end of jset")) { 768 jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); 769 break; 770 } 771 772 ret = bch2_journal_entry_validate(c, jset, entry, 773 version, JSET_BIG_ENDIAN(jset), flags); 774 if (ret) 775 break; 776 } 777 fsck_err: 778 return ret; 779 } 780 781 static int jset_validate(struct bch_fs *c, 782 struct bch_dev *ca, 783 struct jset *jset, u64 sector, 784 enum bkey_invalid_flags flags) 785 { 786 unsigned version; 787 int ret = 0; 788 789 if (le64_to_cpu(jset->magic) != jset_magic(c)) 790 return JOURNAL_ENTRY_NONE; 791 792 version = le32_to_cpu(jset->version); 793 if (journal_entry_err_on(!bch2_version_compatible(version), 794 c, version, jset, NULL, 795 jset_unsupported_version, 796 "%s sector %llu seq %llu: incompatible journal entry version %u.%u", 797 ca ? ca->name : c->name, 798 sector, le64_to_cpu(jset->seq), 799 BCH_VERSION_MAJOR(version), 800 BCH_VERSION_MINOR(version))) { 801 /* don't try to continue: */ 802 return -EINVAL; 803 } 804 805 if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), 806 c, version, jset, NULL, 807 jset_unknown_csum, 808 "%s sector %llu seq %llu: journal entry with unknown csum type %llu", 809 ca ? ca->name : c->name, 810 sector, le64_to_cpu(jset->seq), 811 JSET_CSUM_TYPE(jset))) 812 ret = JOURNAL_ENTRY_BAD; 813 814 /* last_seq is ignored when JSET_NO_FLUSH is true */ 815 if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && 816 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), 817 c, version, jset, NULL, 818 jset_last_seq_newer_than_seq, 819 "invalid journal entry: last_seq > seq (%llu > %llu)", 820 le64_to_cpu(jset->last_seq), 821 le64_to_cpu(jset->seq))) { 822 jset->last_seq = jset->seq; 823 return JOURNAL_ENTRY_BAD; 824 } 825 826 ret = jset_validate_entries(c, jset, flags); 827 fsck_err: 828 return ret; 829 } 830 831 static int jset_validate_early(struct bch_fs *c, 832 struct bch_dev *ca, 833 struct jset *jset, u64 sector, 834 unsigned bucket_sectors_left, 835 unsigned sectors_read) 836 { 837 size_t bytes = vstruct_bytes(jset); 838 unsigned version; 839 enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; 840 int ret = 0; 841 842 if (le64_to_cpu(jset->magic) != jset_magic(c)) 843 return JOURNAL_ENTRY_NONE; 844 845 version = le32_to_cpu(jset->version); 846 if (journal_entry_err_on(!bch2_version_compatible(version), 847 c, version, jset, NULL, 848 jset_unsupported_version, 849 "%s sector %llu seq %llu: unknown journal entry version %u.%u", 850 ca ? ca->name : c->name, 851 sector, le64_to_cpu(jset->seq), 852 BCH_VERSION_MAJOR(version), 853 BCH_VERSION_MINOR(version))) { 854 /* don't try to continue: */ 855 return -EINVAL; 856 } 857 858 if (bytes > (sectors_read << 9) && 859 sectors_read < bucket_sectors_left) 860 return JOURNAL_ENTRY_REREAD; 861 862 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 863 c, version, jset, NULL, 864 jset_past_bucket_end, 865 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 866 ca ? ca->name : c->name, 867 sector, le64_to_cpu(jset->seq), bytes)) 868 le32_add_cpu(&jset->u64s, 869 -((bytes - (bucket_sectors_left << 9)) / 8)); 870 fsck_err: 871 return ret; 872 } 873 874 struct journal_read_buf { 875 void *data; 876 size_t size; 877 }; 878 879 static int journal_read_buf_realloc(struct journal_read_buf *b, 880 size_t new_size) 881 { 882 void *n; 883 884 /* the bios are sized for this many pages, max: */ 885 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 886 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 887 888 new_size = roundup_pow_of_two(new_size); 889 n = kvpmalloc(new_size, GFP_KERNEL); 890 if (!n) 891 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 892 893 kvpfree(b->data, b->size); 894 b->data = n; 895 b->size = new_size; 896 return 0; 897 } 898 899 static int journal_read_bucket(struct bch_dev *ca, 900 struct journal_read_buf *buf, 901 struct journal_list *jlist, 902 unsigned bucket) 903 { 904 struct bch_fs *c = ca->fs; 905 struct journal_device *ja = &ca->journal; 906 struct jset *j = NULL; 907 unsigned sectors, sectors_read = 0; 908 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 909 end = offset + ca->mi.bucket_size; 910 bool saw_bad = false, csum_good; 911 int ret = 0; 912 913 pr_debug("reading %u", bucket); 914 915 while (offset < end) { 916 if (!sectors_read) { 917 struct bio *bio; 918 unsigned nr_bvecs; 919 reread: 920 sectors_read = min_t(unsigned, 921 end - offset, buf->size >> 9); 922 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 923 924 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 925 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 926 927 bio->bi_iter.bi_sector = offset; 928 bch2_bio_map(bio, buf->data, sectors_read << 9); 929 930 ret = submit_bio_wait(bio); 931 kfree(bio); 932 933 if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, 934 "journal read error: sector %llu", 935 offset) || 936 bch2_meta_read_fault("journal")) { 937 /* 938 * We don't error out of the recovery process 939 * here, since the relevant journal entry may be 940 * found on a different device, and missing or 941 * no journal entries will be handled later 942 */ 943 return 0; 944 } 945 946 j = buf->data; 947 } 948 949 ret = jset_validate_early(c, ca, j, offset, 950 end - offset, sectors_read); 951 switch (ret) { 952 case 0: 953 sectors = vstruct_sectors(j, c->block_bits); 954 break; 955 case JOURNAL_ENTRY_REREAD: 956 if (vstruct_bytes(j) > buf->size) { 957 ret = journal_read_buf_realloc(buf, 958 vstruct_bytes(j)); 959 if (ret) 960 return ret; 961 } 962 goto reread; 963 case JOURNAL_ENTRY_NONE: 964 if (!saw_bad) 965 return 0; 966 /* 967 * On checksum error we don't really trust the size 968 * field of the journal entry we read, so try reading 969 * again at next block boundary: 970 */ 971 sectors = block_sectors(c); 972 goto next_block; 973 default: 974 return ret; 975 } 976 977 /* 978 * This happens sometimes if we don't have discards on - 979 * when we've partially overwritten a bucket with new 980 * journal entries. We don't need the rest of the 981 * bucket: 982 */ 983 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 984 return 0; 985 986 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 987 988 csum_good = jset_csum_good(c, j); 989 if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, 990 "journal checksum error")) 991 saw_bad = true; 992 993 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 994 j->encrypted_start, 995 vstruct_end(j) - (void *) j->encrypted_start); 996 bch2_fs_fatal_err_on(ret, c, 997 "error decrypting journal entry: %i", ret); 998 999 mutex_lock(&jlist->lock); 1000 ret = journal_entry_add(c, ca, (struct journal_ptr) { 1001 .csum_good = csum_good, 1002 .dev = ca->dev_idx, 1003 .bucket = bucket, 1004 .bucket_offset = offset - 1005 bucket_to_sector(ca, ja->buckets[bucket]), 1006 .sector = offset, 1007 }, jlist, j); 1008 mutex_unlock(&jlist->lock); 1009 1010 switch (ret) { 1011 case JOURNAL_ENTRY_ADD_OK: 1012 break; 1013 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 1014 break; 1015 default: 1016 return ret; 1017 } 1018 next_block: 1019 pr_debug("next"); 1020 offset += sectors; 1021 sectors_read -= sectors; 1022 j = ((void *) j) + (sectors << 9); 1023 } 1024 1025 return 0; 1026 } 1027 1028 static void bch2_journal_read_device(struct closure *cl) 1029 { 1030 struct journal_device *ja = 1031 container_of(cl, struct journal_device, read); 1032 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1033 struct bch_fs *c = ca->fs; 1034 struct journal_list *jlist = 1035 container_of(cl->parent, struct journal_list, cl); 1036 struct journal_replay *r, **_r; 1037 struct genradix_iter iter; 1038 struct journal_read_buf buf = { NULL, 0 }; 1039 unsigned i; 1040 int ret = 0; 1041 1042 if (!ja->nr) 1043 goto out; 1044 1045 ret = journal_read_buf_realloc(&buf, PAGE_SIZE); 1046 if (ret) 1047 goto err; 1048 1049 pr_debug("%u journal buckets", ja->nr); 1050 1051 for (i = 0; i < ja->nr; i++) { 1052 ret = journal_read_bucket(ca, &buf, jlist, i); 1053 if (ret) 1054 goto err; 1055 } 1056 1057 ja->sectors_free = ca->mi.bucket_size; 1058 1059 mutex_lock(&jlist->lock); 1060 genradix_for_each_reverse(&c->journal_entries, iter, _r) { 1061 r = *_r; 1062 1063 if (!r) 1064 continue; 1065 1066 for (i = 0; i < r->nr_ptrs; i++) { 1067 if (r->ptrs[i].dev == ca->dev_idx) { 1068 unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + 1069 vstruct_sectors(&r->j, c->block_bits); 1070 1071 ja->cur_idx = r->ptrs[i].bucket; 1072 ja->sectors_free = ca->mi.bucket_size - wrote; 1073 goto found; 1074 } 1075 } 1076 } 1077 found: 1078 mutex_unlock(&jlist->lock); 1079 1080 if (ja->bucket_seq[ja->cur_idx] && 1081 ja->sectors_free == ca->mi.bucket_size) { 1082 #if 0 1083 /* 1084 * Debug code for ZNS support, where we (probably) want to be 1085 * correlated where we stopped in the journal to the zone write 1086 * points: 1087 */ 1088 bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); 1089 bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); 1090 for (i = 0; i < 3; i++) { 1091 unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; 1092 1093 bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); 1094 } 1095 #endif 1096 ja->sectors_free = 0; 1097 } 1098 1099 /* 1100 * Set dirty_idx to indicate the entire journal is full and needs to be 1101 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1102 * pinned when it first runs: 1103 */ 1104 ja->discard_idx = ja->dirty_idx_ondisk = 1105 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1106 out: 1107 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1108 kvpfree(buf.data, buf.size); 1109 percpu_ref_put(&ca->io_ref); 1110 closure_return(cl); 1111 return; 1112 err: 1113 mutex_lock(&jlist->lock); 1114 jlist->ret = ret; 1115 mutex_unlock(&jlist->lock); 1116 goto out; 1117 } 1118 1119 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 1120 struct journal_replay *j) 1121 { 1122 unsigned i; 1123 1124 for (i = 0; i < j->nr_ptrs; i++) { 1125 struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); 1126 u64 offset; 1127 1128 div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); 1129 1130 if (i) 1131 prt_printf(out, " "); 1132 prt_printf(out, "%u:%u:%u (sector %llu)", 1133 j->ptrs[i].dev, 1134 j->ptrs[i].bucket, 1135 j->ptrs[i].bucket_offset, 1136 j->ptrs[i].sector); 1137 } 1138 } 1139 1140 int bch2_journal_read(struct bch_fs *c, 1141 u64 *last_seq, 1142 u64 *blacklist_seq, 1143 u64 *start_seq) 1144 { 1145 struct journal_list jlist; 1146 struct journal_replay *i, **_i, *prev = NULL; 1147 struct genradix_iter radix_iter; 1148 struct bch_dev *ca; 1149 unsigned iter; 1150 struct printbuf buf = PRINTBUF; 1151 bool degraded = false, last_write_torn = false; 1152 u64 seq; 1153 int ret = 0; 1154 1155 closure_init_stack(&jlist.cl); 1156 mutex_init(&jlist.lock); 1157 jlist.last_seq = 0; 1158 jlist.ret = 0; 1159 1160 for_each_member_device(ca, c, iter) { 1161 if (!c->opts.fsck && 1162 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1163 continue; 1164 1165 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1166 ca->mi.state == BCH_MEMBER_STATE_ro) && 1167 percpu_ref_tryget(&ca->io_ref)) 1168 closure_call(&ca->journal.read, 1169 bch2_journal_read_device, 1170 system_unbound_wq, 1171 &jlist.cl); 1172 else 1173 degraded = true; 1174 } 1175 1176 closure_sync(&jlist.cl); 1177 1178 if (jlist.ret) 1179 return jlist.ret; 1180 1181 *last_seq = 0; 1182 *start_seq = 0; 1183 *blacklist_seq = 0; 1184 1185 /* 1186 * Find most recent flush entry, and ignore newer non flush entries - 1187 * those entries will be blacklisted: 1188 */ 1189 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1190 enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; 1191 1192 i = *_i; 1193 1194 if (!i || i->ignore) 1195 continue; 1196 1197 if (!*start_seq) 1198 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1199 1200 if (JSET_NO_FLUSH(&i->j)) { 1201 i->ignore = true; 1202 continue; 1203 } 1204 1205 if (!last_write_torn && !i->csum_good) { 1206 last_write_torn = true; 1207 i->ignore = true; 1208 continue; 1209 } 1210 1211 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1212 c, le32_to_cpu(i->j.version), &i->j, NULL, 1213 jset_last_seq_newer_than_seq, 1214 "invalid journal entry: last_seq > seq (%llu > %llu)", 1215 le64_to_cpu(i->j.last_seq), 1216 le64_to_cpu(i->j.seq))) 1217 i->j.last_seq = i->j.seq; 1218 1219 *last_seq = le64_to_cpu(i->j.last_seq); 1220 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1221 break; 1222 } 1223 1224 if (!*start_seq) { 1225 bch_info(c, "journal read done, but no entries found"); 1226 return 0; 1227 } 1228 1229 if (!*last_seq) { 1230 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, 1231 "journal read done, but no entries found after dropping non-flushes"); 1232 return 0; 1233 } 1234 1235 bch_info(c, "journal read done, replaying entries %llu-%llu", 1236 *last_seq, *blacklist_seq - 1); 1237 1238 if (*start_seq != *blacklist_seq) 1239 bch_info(c, "dropped unflushed entries %llu-%llu", 1240 *blacklist_seq, *start_seq - 1); 1241 1242 /* Drop blacklisted entries and entries older than last_seq: */ 1243 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1244 i = *_i; 1245 1246 if (!i || i->ignore) 1247 continue; 1248 1249 seq = le64_to_cpu(i->j.seq); 1250 if (seq < *last_seq) { 1251 journal_replay_free(c, i); 1252 continue; 1253 } 1254 1255 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1256 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1257 jset_seq_blacklisted, 1258 "found blacklisted journal entry %llu", seq); 1259 i->ignore = true; 1260 } 1261 } 1262 1263 /* Check for missing entries: */ 1264 seq = *last_seq; 1265 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1266 i = *_i; 1267 1268 if (!i || i->ignore) 1269 continue; 1270 1271 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1272 1273 while (seq < le64_to_cpu(i->j.seq)) { 1274 u64 missing_start, missing_end; 1275 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 1276 1277 while (seq < le64_to_cpu(i->j.seq) && 1278 bch2_journal_seq_is_blacklisted(c, seq, false)) 1279 seq++; 1280 1281 if (seq == le64_to_cpu(i->j.seq)) 1282 break; 1283 1284 missing_start = seq; 1285 1286 while (seq < le64_to_cpu(i->j.seq) && 1287 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1288 seq++; 1289 1290 if (prev) { 1291 bch2_journal_ptrs_to_text(&buf1, c, prev); 1292 prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1293 } else 1294 prt_printf(&buf1, "(none)"); 1295 bch2_journal_ptrs_to_text(&buf2, c, i); 1296 1297 missing_end = seq - 1; 1298 fsck_err(c, journal_entries_missing, 1299 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" 1300 " prev at %s\n" 1301 " next at %s", 1302 missing_start, missing_end, 1303 *last_seq, *blacklist_seq - 1, 1304 buf1.buf, buf2.buf); 1305 1306 printbuf_exit(&buf1); 1307 printbuf_exit(&buf2); 1308 } 1309 1310 prev = i; 1311 seq++; 1312 } 1313 1314 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1315 struct bch_replicas_padded replicas = { 1316 .e.data_type = BCH_DATA_journal, 1317 .e.nr_required = 1, 1318 }; 1319 unsigned ptr; 1320 1321 i = *_i; 1322 if (!i || i->ignore) 1323 continue; 1324 1325 for (ptr = 0; ptr < i->nr_ptrs; ptr++) { 1326 ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); 1327 1328 if (!i->ptrs[ptr].csum_good) 1329 bch_err_dev_offset(ca, i->ptrs[ptr].sector, 1330 "invalid journal checksum, seq %llu%s", 1331 le64_to_cpu(i->j.seq), 1332 i->csum_good ? " (had good copy on another device)" : ""); 1333 } 1334 1335 ret = jset_validate(c, 1336 bch_dev_bkey_exists(c, i->ptrs[0].dev), 1337 &i->j, 1338 i->ptrs[0].sector, 1339 READ); 1340 if (ret) 1341 goto err; 1342 1343 for (ptr = 0; ptr < i->nr_ptrs; ptr++) 1344 replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; 1345 1346 bch2_replicas_entry_sort(&replicas.e); 1347 1348 printbuf_reset(&buf); 1349 bch2_replicas_entry_to_text(&buf, &replicas.e); 1350 1351 if (!degraded && 1352 !bch2_replicas_marked(c, &replicas.e) && 1353 (le64_to_cpu(i->j.seq) == *last_seq || 1354 fsck_err(c, journal_entry_replicas_not_marked, 1355 "superblock not marked as containing replicas for journal entry %llu\n %s", 1356 le64_to_cpu(i->j.seq), buf.buf))) { 1357 ret = bch2_mark_replicas(c, &replicas.e); 1358 if (ret) 1359 goto err; 1360 } 1361 } 1362 err: 1363 fsck_err: 1364 printbuf_exit(&buf); 1365 return ret; 1366 } 1367 1368 /* journal write: */ 1369 1370 static void __journal_write_alloc(struct journal *j, 1371 struct journal_buf *w, 1372 struct dev_alloc_list *devs_sorted, 1373 unsigned sectors, 1374 unsigned *replicas, 1375 unsigned replicas_want) 1376 { 1377 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1378 struct journal_device *ja; 1379 struct bch_dev *ca; 1380 unsigned i; 1381 1382 if (*replicas >= replicas_want) 1383 return; 1384 1385 for (i = 0; i < devs_sorted->nr; i++) { 1386 ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); 1387 if (!ca) 1388 continue; 1389 1390 ja = &ca->journal; 1391 1392 /* 1393 * Check that we can use this device, and aren't already using 1394 * it: 1395 */ 1396 if (!ca->mi.durability || 1397 ca->mi.state != BCH_MEMBER_STATE_rw || 1398 !ja->nr || 1399 bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1400 sectors > ja->sectors_free) 1401 continue; 1402 1403 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1404 1405 bch2_bkey_append_ptr(&w->key, 1406 (struct bch_extent_ptr) { 1407 .offset = bucket_to_sector(ca, 1408 ja->buckets[ja->cur_idx]) + 1409 ca->mi.bucket_size - 1410 ja->sectors_free, 1411 .dev = ca->dev_idx, 1412 }); 1413 1414 ja->sectors_free -= sectors; 1415 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1416 1417 *replicas += ca->mi.durability; 1418 1419 if (*replicas >= replicas_want) 1420 break; 1421 } 1422 } 1423 1424 /** 1425 * journal_write_alloc - decide where to write next journal entry 1426 * 1427 * @j: journal object 1428 * @w: journal buf (entry to be written) 1429 * 1430 * Returns: 0 on success, or -EROFS on failure 1431 */ 1432 static int journal_write_alloc(struct journal *j, struct journal_buf *w) 1433 { 1434 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1435 struct bch_devs_mask devs; 1436 struct journal_device *ja; 1437 struct bch_dev *ca; 1438 struct dev_alloc_list devs_sorted; 1439 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1440 unsigned target = c->opts.metadata_target ?: 1441 c->opts.foreground_target; 1442 unsigned i, replicas = 0, replicas_want = 1443 READ_ONCE(c->opts.metadata_replicas); 1444 1445 rcu_read_lock(); 1446 retry: 1447 devs = target_rw_devs(c, BCH_DATA_journal, target); 1448 1449 devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); 1450 1451 __journal_write_alloc(j, w, &devs_sorted, 1452 sectors, &replicas, replicas_want); 1453 1454 if (replicas >= replicas_want) 1455 goto done; 1456 1457 for (i = 0; i < devs_sorted.nr; i++) { 1458 ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); 1459 if (!ca) 1460 continue; 1461 1462 ja = &ca->journal; 1463 1464 if (sectors > ja->sectors_free && 1465 sectors <= ca->mi.bucket_size && 1466 bch2_journal_dev_buckets_available(j, ja, 1467 journal_space_discarded)) { 1468 ja->cur_idx = (ja->cur_idx + 1) % ja->nr; 1469 ja->sectors_free = ca->mi.bucket_size; 1470 1471 /* 1472 * ja->bucket_seq[ja->cur_idx] must always have 1473 * something sensible: 1474 */ 1475 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1476 } 1477 } 1478 1479 __journal_write_alloc(j, w, &devs_sorted, 1480 sectors, &replicas, replicas_want); 1481 1482 if (replicas < replicas_want && target) { 1483 /* Retry from all devices: */ 1484 target = 0; 1485 goto retry; 1486 } 1487 done: 1488 rcu_read_unlock(); 1489 1490 BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); 1491 1492 return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; 1493 } 1494 1495 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1496 { 1497 /* we aren't holding j->lock: */ 1498 unsigned new_size = READ_ONCE(j->buf_size_want); 1499 void *new_buf; 1500 1501 if (buf->buf_size >= new_size) 1502 return; 1503 1504 new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1505 if (!new_buf) 1506 return; 1507 1508 memcpy(new_buf, buf->data, buf->buf_size); 1509 1510 spin_lock(&j->lock); 1511 swap(buf->data, new_buf); 1512 swap(buf->buf_size, new_size); 1513 spin_unlock(&j->lock); 1514 1515 kvpfree(new_buf, new_size); 1516 } 1517 1518 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) 1519 { 1520 return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); 1521 } 1522 1523 static void journal_write_done(struct closure *cl) 1524 { 1525 struct journal *j = container_of(cl, struct journal, io); 1526 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1527 struct journal_buf *w = journal_last_unwritten_buf(j); 1528 struct bch_replicas_padded replicas; 1529 union journal_res_state old, new; 1530 u64 v, seq; 1531 int err = 0; 1532 1533 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) 1534 ? j->flush_write_time 1535 : j->noflush_write_time, j->write_start_time); 1536 1537 if (!w->devs_written.nr) { 1538 bch_err(c, "unable to write journal to sufficient devices"); 1539 err = -EIO; 1540 } else { 1541 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1542 w->devs_written); 1543 if (bch2_mark_replicas(c, &replicas.e)) 1544 err = -EIO; 1545 } 1546 1547 if (err) 1548 bch2_fatal_error(c); 1549 1550 spin_lock(&j->lock); 1551 seq = le64_to_cpu(w->data->seq); 1552 1553 if (seq >= j->pin.front) 1554 journal_seq_pin(j, seq)->devs = w->devs_written; 1555 1556 if (!err) { 1557 if (!JSET_NO_FLUSH(w->data)) { 1558 j->flushed_seq_ondisk = seq; 1559 j->last_seq_ondisk = w->last_seq; 1560 1561 bch2_do_discards(c); 1562 closure_wake_up(&c->freelist_wait); 1563 1564 bch2_reset_alloc_cursors(c); 1565 } 1566 } else if (!j->err_seq || seq < j->err_seq) 1567 j->err_seq = seq; 1568 1569 j->seq_ondisk = seq; 1570 1571 /* 1572 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1573 * more buckets: 1574 * 1575 * Must come before signaling write completion, for 1576 * bch2_fs_journal_stop(): 1577 */ 1578 if (j->watermark != BCH_WATERMARK_stripe) 1579 journal_reclaim_kick(&c->journal); 1580 1581 /* also must come before signalling write completion: */ 1582 closure_debug_destroy(cl); 1583 1584 v = atomic64_read(&j->reservations.counter); 1585 do { 1586 old.v = new.v = v; 1587 BUG_ON(journal_state_count(new, new.unwritten_idx)); 1588 1589 new.unwritten_idx++; 1590 } while ((v = atomic64_cmpxchg(&j->reservations.counter, 1591 old.v, new.v)) != old.v); 1592 1593 bch2_journal_space_available(j); 1594 1595 closure_wake_up(&w->wait); 1596 journal_wake(j); 1597 1598 if (!journal_state_count(new, new.unwritten_idx) && 1599 journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { 1600 spin_unlock(&j->lock); 1601 closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); 1602 } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1603 new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1604 struct journal_buf *buf = journal_cur_buf(j); 1605 long delta = buf->expires - jiffies; 1606 1607 /* 1608 * We don't close a journal entry to write it while there's 1609 * previous entries still in flight - the current journal entry 1610 * might want to be written now: 1611 */ 1612 1613 spin_unlock(&j->lock); 1614 mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); 1615 } else { 1616 spin_unlock(&j->lock); 1617 } 1618 } 1619 1620 static void journal_write_endio(struct bio *bio) 1621 { 1622 struct bch_dev *ca = bio->bi_private; 1623 struct journal *j = &ca->fs->journal; 1624 struct journal_buf *w = journal_last_unwritten_buf(j); 1625 unsigned long flags; 1626 1627 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, 1628 "error writing journal entry %llu: %s", 1629 le64_to_cpu(w->data->seq), 1630 bch2_blk_status_to_str(bio->bi_status)) || 1631 bch2_meta_write_fault("journal")) { 1632 spin_lock_irqsave(&j->err_lock, flags); 1633 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); 1634 spin_unlock_irqrestore(&j->err_lock, flags); 1635 } 1636 1637 closure_put(&j->io); 1638 percpu_ref_put(&ca->io_ref); 1639 } 1640 1641 static void do_journal_write(struct closure *cl) 1642 { 1643 struct journal *j = container_of(cl, struct journal, io); 1644 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1645 struct bch_dev *ca; 1646 struct journal_buf *w = journal_last_unwritten_buf(j); 1647 struct bch_extent_ptr *ptr; 1648 struct bio *bio; 1649 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1650 1651 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { 1652 ca = bch_dev_bkey_exists(c, ptr->dev); 1653 if (!percpu_ref_tryget(&ca->io_ref)) { 1654 /* XXX: fix this */ 1655 bch_err(c, "missing device for journal write\n"); 1656 continue; 1657 } 1658 1659 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], 1660 sectors); 1661 1662 bio = ca->journal.bio; 1663 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1664 bio->bi_iter.bi_sector = ptr->offset; 1665 bio->bi_end_io = journal_write_endio; 1666 bio->bi_private = ca; 1667 1668 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1669 ca->prev_journal_sector = bio->bi_iter.bi_sector; 1670 1671 if (!JSET_NO_FLUSH(w->data)) 1672 bio->bi_opf |= REQ_FUA; 1673 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) 1674 bio->bi_opf |= REQ_PREFLUSH; 1675 1676 bch2_bio_map(bio, w->data, sectors << 9); 1677 1678 trace_and_count(c, journal_write, bio); 1679 closure_bio_submit(bio, cl); 1680 1681 ca->journal.bucket_seq[ca->journal.cur_idx] = 1682 le64_to_cpu(w->data->seq); 1683 } 1684 1685 continue_at(cl, journal_write_done, c->io_complete_wq); 1686 } 1687 1688 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) 1689 { 1690 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1691 struct jset_entry *start, *end, *i, *next, *prev = NULL; 1692 struct jset *jset = w->data; 1693 unsigned sectors, bytes, u64s; 1694 bool validate_before_checksum = false; 1695 unsigned long btree_roots_have = 0; 1696 int ret; 1697 1698 /* 1699 * Simple compaction, dropping empty jset_entries (from journal 1700 * reservations that weren't fully used) and merging jset_entries that 1701 * can be. 1702 * 1703 * If we wanted to be really fancy here, we could sort all the keys in 1704 * the jset and drop keys that were overwritten - probably not worth it: 1705 */ 1706 vstruct_for_each_safe(jset, i, next) { 1707 unsigned u64s = le16_to_cpu(i->u64s); 1708 1709 /* Empty entry: */ 1710 if (!u64s) 1711 continue; 1712 1713 /* 1714 * New btree roots are set by journalling them; when the journal 1715 * entry gets written we have to propagate them to 1716 * c->btree_roots 1717 * 1718 * But, every journal entry we write has to contain all the 1719 * btree roots (at least for now); so after we copy btree roots 1720 * to c->btree_roots we have to get any missing btree roots and 1721 * add them to this journal entry: 1722 */ 1723 if (i->type == BCH_JSET_ENTRY_btree_root) { 1724 bch2_journal_entry_to_btree_root(c, i); 1725 __set_bit(i->btree_id, &btree_roots_have); 1726 } 1727 1728 /* Can we merge with previous entry? */ 1729 if (prev && 1730 i->btree_id == prev->btree_id && 1731 i->level == prev->level && 1732 i->type == prev->type && 1733 i->type == BCH_JSET_ENTRY_btree_keys && 1734 le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { 1735 memmove_u64s_down(vstruct_next(prev), 1736 i->_data, 1737 u64s); 1738 le16_add_cpu(&prev->u64s, u64s); 1739 continue; 1740 } 1741 1742 /* Couldn't merge, move i into new position (after prev): */ 1743 prev = prev ? vstruct_next(prev) : jset->start; 1744 if (i != prev) 1745 memmove_u64s_down(prev, i, jset_u64s(u64s)); 1746 } 1747 1748 prev = prev ? vstruct_next(prev) : jset->start; 1749 jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); 1750 1751 start = end = vstruct_last(jset); 1752 1753 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1754 1755 bch2_journal_super_entries_add_common(c, &end, 1756 le64_to_cpu(jset->seq)); 1757 u64s = (u64 *) end - (u64 *) start; 1758 BUG_ON(u64s > j->entry_u64s_reserved); 1759 1760 le32_add_cpu(&jset->u64s, u64s); 1761 1762 sectors = vstruct_sectors(jset, c->block_bits); 1763 bytes = vstruct_bytes(jset); 1764 1765 if (sectors > w->sectors) { 1766 bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 1767 vstruct_bytes(jset), w->sectors << 9, 1768 u64s, w->u64s_reserved, j->entry_u64s_reserved); 1769 return -EINVAL; 1770 } 1771 1772 jset->magic = cpu_to_le64(jset_magic(c)); 1773 jset->version = cpu_to_le32(c->sb.version); 1774 1775 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 1776 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1777 1778 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1779 j->last_empty_seq = le64_to_cpu(jset->seq); 1780 1781 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1782 validate_before_checksum = true; 1783 1784 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 1785 validate_before_checksum = true; 1786 1787 if (validate_before_checksum && 1788 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1789 return ret; 1790 1791 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), 1792 jset->encrypted_start, 1793 vstruct_end(jset) - (void *) jset->encrypted_start); 1794 if (bch2_fs_fatal_err_on(ret, c, 1795 "error decrypting journal entry: %i", ret)) 1796 return ret; 1797 1798 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), 1799 journal_nonce(jset), jset); 1800 1801 if (!validate_before_checksum && 1802 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1803 return ret; 1804 1805 memset((void *) jset + bytes, 0, (sectors << 9) - bytes); 1806 return 0; 1807 } 1808 1809 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) 1810 { 1811 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1812 int error = bch2_journal_error(j); 1813 1814 /* 1815 * If the journal is in an error state - we did an emergency shutdown - 1816 * we prefer to continue doing journal writes. We just mark them as 1817 * noflush so they'll never be used, but they'll still be visible by the 1818 * list_journal tool - this helps in debugging. 1819 * 1820 * There's a caveat: the first journal write after marking the 1821 * superblock dirty must always be a flush write, because on startup 1822 * from a clean shutdown we didn't necessarily read the journal and the 1823 * new journal write might overwrite whatever was in the journal 1824 * previously - we can't leave the journal without any flush writes in 1825 * it. 1826 * 1827 * So if we're in an error state, and we're still starting up, we don't 1828 * write anything at all. 1829 */ 1830 if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags)) 1831 return -EIO; 1832 1833 if (error || 1834 w->noflush || 1835 (!w->must_flush && 1836 (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && 1837 test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { 1838 w->noflush = true; 1839 SET_JSET_NO_FLUSH(w->data, true); 1840 w->data->last_seq = 0; 1841 w->last_seq = 0; 1842 1843 j->nr_noflush_writes++; 1844 } else { 1845 j->last_flush_write = jiffies; 1846 j->nr_flush_writes++; 1847 clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); 1848 } 1849 1850 return 0; 1851 } 1852 1853 void bch2_journal_write(struct closure *cl) 1854 { 1855 struct journal *j = container_of(cl, struct journal, io); 1856 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1857 struct bch_dev *ca; 1858 struct journal_buf *w = journal_last_unwritten_buf(j); 1859 struct bch_replicas_padded replicas; 1860 struct bio *bio; 1861 struct printbuf journal_debug_buf = PRINTBUF; 1862 unsigned i, nr_rw_members = 0; 1863 int ret; 1864 1865 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 1866 1867 j->write_start_time = local_clock(); 1868 1869 spin_lock(&j->lock); 1870 ret = bch2_journal_write_pick_flush(j, w); 1871 spin_unlock(&j->lock); 1872 if (ret) 1873 goto err; 1874 1875 journal_buf_realloc(j, w); 1876 1877 ret = bch2_journal_write_prep(j, w); 1878 if (ret) 1879 goto err; 1880 1881 while (1) { 1882 spin_lock(&j->lock); 1883 ret = journal_write_alloc(j, w); 1884 if (!ret || !j->can_discard) 1885 break; 1886 1887 spin_unlock(&j->lock); 1888 bch2_journal_do_discards(j); 1889 } 1890 1891 if (ret) { 1892 __bch2_journal_debug_to_text(&journal_debug_buf, j); 1893 spin_unlock(&j->lock); 1894 bch_err(c, "Unable to allocate journal write:\n%s", 1895 journal_debug_buf.buf); 1896 printbuf_exit(&journal_debug_buf); 1897 goto err; 1898 } 1899 1900 /* 1901 * write is allocated, no longer need to account for it in 1902 * bch2_journal_space_available(): 1903 */ 1904 w->sectors = 0; 1905 1906 /* 1907 * journal entry has been compacted and allocated, recalculate space 1908 * available: 1909 */ 1910 bch2_journal_space_available(j); 1911 spin_unlock(&j->lock); 1912 1913 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); 1914 1915 if (c->opts.nochanges) 1916 goto no_io; 1917 1918 for_each_rw_member(ca, c, i) 1919 nr_rw_members++; 1920 1921 if (nr_rw_members > 1) 1922 w->separate_flush = true; 1923 1924 /* 1925 * Mark journal replicas before we submit the write to guarantee 1926 * recovery will find the journal entries after a crash. 1927 */ 1928 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1929 w->devs_written); 1930 ret = bch2_mark_replicas(c, &replicas.e); 1931 if (ret) 1932 goto err; 1933 1934 if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { 1935 for_each_rw_member(ca, c, i) { 1936 percpu_ref_get(&ca->io_ref); 1937 1938 bio = ca->journal.bio; 1939 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); 1940 bio->bi_end_io = journal_write_endio; 1941 bio->bi_private = ca; 1942 closure_bio_submit(bio, cl); 1943 } 1944 } 1945 1946 continue_at(cl, do_journal_write, c->io_complete_wq); 1947 return; 1948 no_io: 1949 continue_at(cl, journal_write_done, c->io_complete_wq); 1950 return; 1951 err: 1952 bch2_fatal_error(c); 1953 continue_at(cl, journal_write_done, c->io_complete_wq); 1954 } 1955