1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "btree_io.h" 6 #include "btree_update_interior.h" 7 #include "buckets.h" 8 #include "checksum.h" 9 #include "disk_groups.h" 10 #include "error.h" 11 #include "journal.h" 12 #include "journal_io.h" 13 #include "journal_reclaim.h" 14 #include "journal_seq_blacklist.h" 15 #include "replicas.h" 16 #include "sb-clean.h" 17 #include "trace.h" 18 19 static struct nonce journal_nonce(const struct jset *jset) 20 { 21 return (struct nonce) {{ 22 [0] = 0, 23 [1] = ((__le32 *) &jset->seq)[0], 24 [2] = ((__le32 *) &jset->seq)[1], 25 [3] = BCH_NONCE_JOURNAL, 26 }}; 27 } 28 29 static bool jset_csum_good(struct bch_fs *c, struct jset *j) 30 { 31 return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && 32 !bch2_crc_cmp(j->csum, 33 csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); 34 } 35 36 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) 37 { 38 return (seq - c->journal_entries_base_seq) & (~0U >> 1); 39 } 40 41 static void __journal_replay_free(struct bch_fs *c, 42 struct journal_replay *i) 43 { 44 struct journal_replay **p = 45 genradix_ptr(&c->journal_entries, 46 journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); 47 48 BUG_ON(*p != i); 49 *p = NULL; 50 kvpfree(i, offsetof(struct journal_replay, j) + 51 vstruct_bytes(&i->j)); 52 } 53 54 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) 55 { 56 i->ignore = true; 57 58 if (!c->opts.read_entire_journal) 59 __journal_replay_free(c, i); 60 } 61 62 struct journal_list { 63 struct closure cl; 64 u64 last_seq; 65 struct mutex lock; 66 int ret; 67 }; 68 69 #define JOURNAL_ENTRY_ADD_OK 0 70 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 71 72 /* 73 * Given a journal entry we just read, add it to the list of journal entries to 74 * be replayed: 75 */ 76 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, 77 struct journal_ptr entry_ptr, 78 struct journal_list *jlist, struct jset *j) 79 { 80 struct genradix_iter iter; 81 struct journal_replay **_i, *i, *dup; 82 struct journal_ptr *ptr; 83 size_t bytes = vstruct_bytes(j); 84 u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; 85 int ret = JOURNAL_ENTRY_ADD_OK; 86 87 /* Is this entry older than the range we need? */ 88 if (!c->opts.read_entire_journal && 89 le64_to_cpu(j->seq) < jlist->last_seq) 90 return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; 91 92 /* 93 * genradixes are indexed by a ulong, not a u64, so we can't index them 94 * by sequence number directly: Assume instead that they will all fall 95 * within the range of +-2billion of the filrst one we find. 96 */ 97 if (!c->journal_entries_base_seq) 98 c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); 99 100 /* Drop entries we don't need anymore */ 101 if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { 102 genradix_for_each_from(&c->journal_entries, iter, _i, 103 journal_entry_radix_idx(c, jlist->last_seq)) { 104 i = *_i; 105 106 if (!i || i->ignore) 107 continue; 108 109 if (le64_to_cpu(i->j.seq) >= last_seq) 110 break; 111 journal_replay_free(c, i); 112 } 113 } 114 115 jlist->last_seq = max(jlist->last_seq, last_seq); 116 117 _i = genradix_ptr_alloc(&c->journal_entries, 118 journal_entry_radix_idx(c, le64_to_cpu(j->seq)), 119 GFP_KERNEL); 120 if (!_i) 121 return -BCH_ERR_ENOMEM_journal_entry_add; 122 123 /* 124 * Duplicate journal entries? If so we want the one that didn't have a 125 * checksum error: 126 */ 127 dup = *_i; 128 if (dup) { 129 if (bytes == vstruct_bytes(&dup->j) && 130 !memcmp(j, &dup->j, bytes)) { 131 i = dup; 132 goto found; 133 } 134 135 if (!entry_ptr.csum_good) { 136 i = dup; 137 goto found; 138 } 139 140 if (!dup->csum_good) 141 goto replace; 142 143 fsck_err(c, journal_entry_replicas_data_mismatch, 144 "found duplicate but non identical journal entries (seq %llu)", 145 le64_to_cpu(j->seq)); 146 i = dup; 147 goto found; 148 } 149 replace: 150 i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 151 if (!i) 152 return -BCH_ERR_ENOMEM_journal_entry_add; 153 154 i->nr_ptrs = 0; 155 i->csum_good = entry_ptr.csum_good; 156 i->ignore = false; 157 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 158 i->ptrs[i->nr_ptrs++] = entry_ptr; 159 160 if (dup) { 161 if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { 162 bch_err(c, "found too many copies of journal entry %llu", 163 le64_to_cpu(i->j.seq)); 164 dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; 165 } 166 167 /* The first ptr should represent the jset we kept: */ 168 memcpy(i->ptrs + i->nr_ptrs, 169 dup->ptrs, 170 sizeof(dup->ptrs[0]) * dup->nr_ptrs); 171 i->nr_ptrs += dup->nr_ptrs; 172 __journal_replay_free(c, dup); 173 } 174 175 *_i = i; 176 return 0; 177 found: 178 for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { 179 if (ptr->dev == ca->dev_idx) { 180 bch_err(c, "duplicate journal entry %llu on same device", 181 le64_to_cpu(i->j.seq)); 182 goto out; 183 } 184 } 185 186 if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { 187 bch_err(c, "found too many copies of journal entry %llu", 188 le64_to_cpu(i->j.seq)); 189 goto out; 190 } 191 192 i->ptrs[i->nr_ptrs++] = entry_ptr; 193 out: 194 fsck_err: 195 return ret; 196 } 197 198 /* this fills in a range with empty jset_entries: */ 199 static void journal_entry_null_range(void *start, void *end) 200 { 201 struct jset_entry *entry; 202 203 for (entry = start; entry != end; entry = vstruct_next(entry)) 204 memset(entry, 0, sizeof(*entry)); 205 } 206 207 #define JOURNAL_ENTRY_REREAD 5 208 #define JOURNAL_ENTRY_NONE 6 209 #define JOURNAL_ENTRY_BAD 7 210 211 static void journal_entry_err_msg(struct printbuf *out, 212 u32 version, 213 struct jset *jset, 214 struct jset_entry *entry) 215 { 216 prt_str(out, "invalid journal entry, version="); 217 bch2_version_to_text(out, version); 218 219 if (entry) { 220 prt_str(out, " type="); 221 prt_str(out, bch2_jset_entry_types[entry->type]); 222 } 223 224 if (!jset) { 225 prt_printf(out, " in superblock"); 226 } else { 227 228 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); 229 230 if (entry) 231 prt_printf(out, " offset=%zi/%u", 232 (u64 *) entry - jset->_data, 233 le32_to_cpu(jset->u64s)); 234 } 235 236 prt_str(out, ": "); 237 } 238 239 #define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ 240 ({ \ 241 struct printbuf _buf = PRINTBUF; \ 242 \ 243 journal_entry_err_msg(&_buf, version, jset, entry); \ 244 prt_printf(&_buf, msg, ##__VA_ARGS__); \ 245 \ 246 switch (flags & BKEY_INVALID_WRITE) { \ 247 case READ: \ 248 mustfix_fsck_err(c, _err, "%s", _buf.buf); \ 249 break; \ 250 case WRITE: \ 251 bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ 252 bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ 253 if (bch2_fs_inconsistent(c)) { \ 254 ret = -BCH_ERR_fsck_errors_not_fixed; \ 255 goto fsck_err; \ 256 } \ 257 break; \ 258 } \ 259 \ 260 printbuf_exit(&_buf); \ 261 true; \ 262 }) 263 264 #define journal_entry_err_on(cond, ...) \ 265 ((cond) ? journal_entry_err(__VA_ARGS__) : false) 266 267 #define FSCK_DELETED_KEY 5 268 269 static int journal_validate_key(struct bch_fs *c, 270 struct jset *jset, 271 struct jset_entry *entry, 272 unsigned level, enum btree_id btree_id, 273 struct bkey_i *k, 274 unsigned version, int big_endian, 275 enum bkey_invalid_flags flags) 276 { 277 int write = flags & BKEY_INVALID_WRITE; 278 void *next = vstruct_next(entry); 279 struct printbuf buf = PRINTBUF; 280 int ret = 0; 281 282 if (journal_entry_err_on(!k->k.u64s, 283 c, version, jset, entry, 284 journal_entry_bkey_u64s_0, 285 "k->u64s 0")) { 286 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 287 journal_entry_null_range(vstruct_next(entry), next); 288 return FSCK_DELETED_KEY; 289 } 290 291 if (journal_entry_err_on((void *) bkey_next(k) > 292 (void *) vstruct_next(entry), 293 c, version, jset, entry, 294 journal_entry_bkey_past_end, 295 "extends past end of journal entry")) { 296 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 297 journal_entry_null_range(vstruct_next(entry), next); 298 return FSCK_DELETED_KEY; 299 } 300 301 if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, 302 c, version, jset, entry, 303 journal_entry_bkey_bad_format, 304 "bad format %u", k->k.format)) { 305 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 306 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 307 journal_entry_null_range(vstruct_next(entry), next); 308 return FSCK_DELETED_KEY; 309 } 310 311 if (!write) 312 bch2_bkey_compat(level, btree_id, version, big_endian, 313 write, NULL, bkey_to_packed(k)); 314 315 if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), 316 __btree_node_type(level, btree_id), write, &buf)) { 317 printbuf_reset(&buf); 318 journal_entry_err_msg(&buf, version, jset, entry); 319 prt_newline(&buf); 320 printbuf_indent_add(&buf, 2); 321 322 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); 323 prt_newline(&buf); 324 bch2_bkey_invalid(c, bkey_i_to_s_c(k), 325 __btree_node_type(level, btree_id), write, &buf); 326 327 mustfix_fsck_err(c, journal_entry_bkey_invalid, 328 "%s", buf.buf); 329 330 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 331 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 332 journal_entry_null_range(vstruct_next(entry), next); 333 334 printbuf_exit(&buf); 335 return FSCK_DELETED_KEY; 336 } 337 338 if (write) 339 bch2_bkey_compat(level, btree_id, version, big_endian, 340 write, NULL, bkey_to_packed(k)); 341 fsck_err: 342 printbuf_exit(&buf); 343 return ret; 344 } 345 346 static int journal_entry_btree_keys_validate(struct bch_fs *c, 347 struct jset *jset, 348 struct jset_entry *entry, 349 unsigned version, int big_endian, 350 enum bkey_invalid_flags flags) 351 { 352 struct bkey_i *k = entry->start; 353 354 while (k != vstruct_last(entry)) { 355 int ret = journal_validate_key(c, jset, entry, 356 entry->level, 357 entry->btree_id, 358 k, version, big_endian, 359 flags|BKEY_INVALID_JOURNAL); 360 if (ret == FSCK_DELETED_KEY) 361 continue; 362 363 k = bkey_next(k); 364 } 365 366 return 0; 367 } 368 369 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 370 struct jset_entry *entry) 371 { 372 struct bkey_i *k; 373 bool first = true; 374 375 jset_entry_for_each_key(entry, k) { 376 if (!first) { 377 prt_newline(out); 378 prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); 379 } 380 prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); 381 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); 382 first = false; 383 } 384 } 385 386 static int journal_entry_btree_root_validate(struct bch_fs *c, 387 struct jset *jset, 388 struct jset_entry *entry, 389 unsigned version, int big_endian, 390 enum bkey_invalid_flags flags) 391 { 392 struct bkey_i *k = entry->start; 393 int ret = 0; 394 395 if (journal_entry_err_on(!entry->u64s || 396 le16_to_cpu(entry->u64s) != k->k.u64s, 397 c, version, jset, entry, 398 journal_entry_btree_root_bad_size, 399 "invalid btree root journal entry: wrong number of keys")) { 400 void *next = vstruct_next(entry); 401 /* 402 * we don't want to null out this jset_entry, 403 * just the contents, so that later we can tell 404 * we were _supposed_ to have a btree root 405 */ 406 entry->u64s = 0; 407 journal_entry_null_range(vstruct_next(entry), next); 408 return 0; 409 } 410 411 return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, 412 version, big_endian, flags); 413 fsck_err: 414 return ret; 415 } 416 417 static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, 418 struct jset_entry *entry) 419 { 420 journal_entry_btree_keys_to_text(out, c, entry); 421 } 422 423 static int journal_entry_prio_ptrs_validate(struct bch_fs *c, 424 struct jset *jset, 425 struct jset_entry *entry, 426 unsigned version, int big_endian, 427 enum bkey_invalid_flags flags) 428 { 429 /* obsolete, don't care: */ 430 return 0; 431 } 432 433 static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 434 struct jset_entry *entry) 435 { 436 } 437 438 static int journal_entry_blacklist_validate(struct bch_fs *c, 439 struct jset *jset, 440 struct jset_entry *entry, 441 unsigned version, int big_endian, 442 enum bkey_invalid_flags flags) 443 { 444 int ret = 0; 445 446 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, 447 c, version, jset, entry, 448 journal_entry_blacklist_bad_size, 449 "invalid journal seq blacklist entry: bad size")) { 450 journal_entry_null_range(entry, vstruct_next(entry)); 451 } 452 fsck_err: 453 return ret; 454 } 455 456 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, 457 struct jset_entry *entry) 458 { 459 struct jset_entry_blacklist *bl = 460 container_of(entry, struct jset_entry_blacklist, entry); 461 462 prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); 463 } 464 465 static int journal_entry_blacklist_v2_validate(struct bch_fs *c, 466 struct jset *jset, 467 struct jset_entry *entry, 468 unsigned version, int big_endian, 469 enum bkey_invalid_flags flags) 470 { 471 struct jset_entry_blacklist_v2 *bl_entry; 472 int ret = 0; 473 474 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, 475 c, version, jset, entry, 476 journal_entry_blacklist_v2_bad_size, 477 "invalid journal seq blacklist entry: bad size")) { 478 journal_entry_null_range(entry, vstruct_next(entry)); 479 goto out; 480 } 481 482 bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); 483 484 if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > 485 le64_to_cpu(bl_entry->end), 486 c, version, jset, entry, 487 journal_entry_blacklist_v2_start_past_end, 488 "invalid journal seq blacklist entry: start > end")) { 489 journal_entry_null_range(entry, vstruct_next(entry)); 490 } 491 out: 492 fsck_err: 493 return ret; 494 } 495 496 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, 497 struct jset_entry *entry) 498 { 499 struct jset_entry_blacklist_v2 *bl = 500 container_of(entry, struct jset_entry_blacklist_v2, entry); 501 502 prt_printf(out, "start=%llu end=%llu", 503 le64_to_cpu(bl->start), 504 le64_to_cpu(bl->end)); 505 } 506 507 static int journal_entry_usage_validate(struct bch_fs *c, 508 struct jset *jset, 509 struct jset_entry *entry, 510 unsigned version, int big_endian, 511 enum bkey_invalid_flags flags) 512 { 513 struct jset_entry_usage *u = 514 container_of(entry, struct jset_entry_usage, entry); 515 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 516 int ret = 0; 517 518 if (journal_entry_err_on(bytes < sizeof(*u), 519 c, version, jset, entry, 520 journal_entry_usage_bad_size, 521 "invalid journal entry usage: bad size")) { 522 journal_entry_null_range(entry, vstruct_next(entry)); 523 return ret; 524 } 525 526 fsck_err: 527 return ret; 528 } 529 530 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, 531 struct jset_entry *entry) 532 { 533 struct jset_entry_usage *u = 534 container_of(entry, struct jset_entry_usage, entry); 535 536 prt_printf(out, "type=%s v=%llu", 537 bch2_fs_usage_types[u->entry.btree_id], 538 le64_to_cpu(u->v)); 539 } 540 541 static int journal_entry_data_usage_validate(struct bch_fs *c, 542 struct jset *jset, 543 struct jset_entry *entry, 544 unsigned version, int big_endian, 545 enum bkey_invalid_flags flags) 546 { 547 struct jset_entry_data_usage *u = 548 container_of(entry, struct jset_entry_data_usage, entry); 549 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 550 int ret = 0; 551 552 if (journal_entry_err_on(bytes < sizeof(*u) || 553 bytes < sizeof(*u) + u->r.nr_devs, 554 c, version, jset, entry, 555 journal_entry_data_usage_bad_size, 556 "invalid journal entry usage: bad size")) { 557 journal_entry_null_range(entry, vstruct_next(entry)); 558 return ret; 559 } 560 561 fsck_err: 562 return ret; 563 } 564 565 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, 566 struct jset_entry *entry) 567 { 568 struct jset_entry_data_usage *u = 569 container_of(entry, struct jset_entry_data_usage, entry); 570 571 bch2_replicas_entry_to_text(out, &u->r); 572 prt_printf(out, "=%llu", le64_to_cpu(u->v)); 573 } 574 575 static int journal_entry_clock_validate(struct bch_fs *c, 576 struct jset *jset, 577 struct jset_entry *entry, 578 unsigned version, int big_endian, 579 enum bkey_invalid_flags flags) 580 { 581 struct jset_entry_clock *clock = 582 container_of(entry, struct jset_entry_clock, entry); 583 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 584 int ret = 0; 585 586 if (journal_entry_err_on(bytes != sizeof(*clock), 587 c, version, jset, entry, 588 journal_entry_clock_bad_size, 589 "bad size")) { 590 journal_entry_null_range(entry, vstruct_next(entry)); 591 return ret; 592 } 593 594 if (journal_entry_err_on(clock->rw > 1, 595 c, version, jset, entry, 596 journal_entry_clock_bad_rw, 597 "bad rw")) { 598 journal_entry_null_range(entry, vstruct_next(entry)); 599 return ret; 600 } 601 602 fsck_err: 603 return ret; 604 } 605 606 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, 607 struct jset_entry *entry) 608 { 609 struct jset_entry_clock *clock = 610 container_of(entry, struct jset_entry_clock, entry); 611 612 prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); 613 } 614 615 static int journal_entry_dev_usage_validate(struct bch_fs *c, 616 struct jset *jset, 617 struct jset_entry *entry, 618 unsigned version, int big_endian, 619 enum bkey_invalid_flags flags) 620 { 621 struct jset_entry_dev_usage *u = 622 container_of(entry, struct jset_entry_dev_usage, entry); 623 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 624 unsigned expected = sizeof(*u); 625 unsigned dev; 626 int ret = 0; 627 628 if (journal_entry_err_on(bytes < expected, 629 c, version, jset, entry, 630 journal_entry_dev_usage_bad_size, 631 "bad size (%u < %u)", 632 bytes, expected)) { 633 journal_entry_null_range(entry, vstruct_next(entry)); 634 return ret; 635 } 636 637 dev = le32_to_cpu(u->dev); 638 639 if (journal_entry_err_on(!bch2_dev_exists2(c, dev), 640 c, version, jset, entry, 641 journal_entry_dev_usage_bad_dev, 642 "bad dev")) { 643 journal_entry_null_range(entry, vstruct_next(entry)); 644 return ret; 645 } 646 647 if (journal_entry_err_on(u->pad, 648 c, version, jset, entry, 649 journal_entry_dev_usage_bad_pad, 650 "bad pad")) { 651 journal_entry_null_range(entry, vstruct_next(entry)); 652 return ret; 653 } 654 655 fsck_err: 656 return ret; 657 } 658 659 static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, 660 struct jset_entry *entry) 661 { 662 struct jset_entry_dev_usage *u = 663 container_of(entry, struct jset_entry_dev_usage, entry); 664 unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); 665 666 prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); 667 668 for (i = 0; i < nr_types; i++) { 669 if (i < BCH_DATA_NR) 670 prt_printf(out, " %s", bch2_data_types[i]); 671 else 672 prt_printf(out, " (unknown data type %u)", i); 673 prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", 674 le64_to_cpu(u->d[i].buckets), 675 le64_to_cpu(u->d[i].sectors), 676 le64_to_cpu(u->d[i].fragmented)); 677 } 678 679 prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); 680 } 681 682 static int journal_entry_log_validate(struct bch_fs *c, 683 struct jset *jset, 684 struct jset_entry *entry, 685 unsigned version, int big_endian, 686 enum bkey_invalid_flags flags) 687 { 688 return 0; 689 } 690 691 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 692 struct jset_entry *entry) 693 { 694 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 695 unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); 696 697 prt_printf(out, "%.*s", bytes, l->d); 698 } 699 700 static int journal_entry_overwrite_validate(struct bch_fs *c, 701 struct jset *jset, 702 struct jset_entry *entry, 703 unsigned version, int big_endian, 704 enum bkey_invalid_flags flags) 705 { 706 return journal_entry_btree_keys_validate(c, jset, entry, 707 version, big_endian, READ); 708 } 709 710 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 711 struct jset_entry *entry) 712 { 713 journal_entry_btree_keys_to_text(out, c, entry); 714 } 715 716 struct jset_entry_ops { 717 int (*validate)(struct bch_fs *, struct jset *, 718 struct jset_entry *, unsigned, int, 719 enum bkey_invalid_flags); 720 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 721 }; 722 723 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 724 #define x(f, nr) \ 725 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 726 .validate = journal_entry_##f##_validate, \ 727 .to_text = journal_entry_##f##_to_text, \ 728 }, 729 BCH_JSET_ENTRY_TYPES() 730 #undef x 731 }; 732 733 int bch2_journal_entry_validate(struct bch_fs *c, 734 struct jset *jset, 735 struct jset_entry *entry, 736 unsigned version, int big_endian, 737 enum bkey_invalid_flags flags) 738 { 739 return entry->type < BCH_JSET_ENTRY_NR 740 ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, 741 version, big_endian, flags) 742 : 0; 743 } 744 745 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, 746 struct jset_entry *entry) 747 { 748 if (entry->type < BCH_JSET_ENTRY_NR) { 749 prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); 750 bch2_jset_entry_ops[entry->type].to_text(out, c, entry); 751 } else { 752 prt_printf(out, "(unknown type %u)", entry->type); 753 } 754 } 755 756 static int jset_validate_entries(struct bch_fs *c, struct jset *jset, 757 enum bkey_invalid_flags flags) 758 { 759 struct jset_entry *entry; 760 unsigned version = le32_to_cpu(jset->version); 761 int ret = 0; 762 763 vstruct_for_each(jset, entry) { 764 if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), 765 c, version, jset, entry, 766 journal_entry_past_jset_end, 767 "journal entry extends past end of jset")) { 768 jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); 769 break; 770 } 771 772 ret = bch2_journal_entry_validate(c, jset, entry, 773 version, JSET_BIG_ENDIAN(jset), flags); 774 if (ret) 775 break; 776 } 777 fsck_err: 778 return ret; 779 } 780 781 static int jset_validate(struct bch_fs *c, 782 struct bch_dev *ca, 783 struct jset *jset, u64 sector, 784 enum bkey_invalid_flags flags) 785 { 786 unsigned version; 787 int ret = 0; 788 789 if (le64_to_cpu(jset->magic) != jset_magic(c)) 790 return JOURNAL_ENTRY_NONE; 791 792 version = le32_to_cpu(jset->version); 793 if (journal_entry_err_on(!bch2_version_compatible(version), 794 c, version, jset, NULL, 795 jset_unsupported_version, 796 "%s sector %llu seq %llu: incompatible journal entry version %u.%u", 797 ca ? ca->name : c->name, 798 sector, le64_to_cpu(jset->seq), 799 BCH_VERSION_MAJOR(version), 800 BCH_VERSION_MINOR(version))) { 801 /* don't try to continue: */ 802 return -EINVAL; 803 } 804 805 if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), 806 c, version, jset, NULL, 807 jset_unknown_csum, 808 "%s sector %llu seq %llu: journal entry with unknown csum type %llu", 809 ca ? ca->name : c->name, 810 sector, le64_to_cpu(jset->seq), 811 JSET_CSUM_TYPE(jset))) 812 ret = JOURNAL_ENTRY_BAD; 813 814 /* last_seq is ignored when JSET_NO_FLUSH is true */ 815 if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && 816 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), 817 c, version, jset, NULL, 818 jset_last_seq_newer_than_seq, 819 "invalid journal entry: last_seq > seq (%llu > %llu)", 820 le64_to_cpu(jset->last_seq), 821 le64_to_cpu(jset->seq))) { 822 jset->last_seq = jset->seq; 823 return JOURNAL_ENTRY_BAD; 824 } 825 826 ret = jset_validate_entries(c, jset, flags); 827 fsck_err: 828 return ret; 829 } 830 831 static int jset_validate_early(struct bch_fs *c, 832 struct bch_dev *ca, 833 struct jset *jset, u64 sector, 834 unsigned bucket_sectors_left, 835 unsigned sectors_read) 836 { 837 size_t bytes = vstruct_bytes(jset); 838 unsigned version; 839 enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; 840 int ret = 0; 841 842 if (le64_to_cpu(jset->magic) != jset_magic(c)) 843 return JOURNAL_ENTRY_NONE; 844 845 version = le32_to_cpu(jset->version); 846 if (journal_entry_err_on(!bch2_version_compatible(version), 847 c, version, jset, NULL, 848 jset_unsupported_version, 849 "%s sector %llu seq %llu: unknown journal entry version %u.%u", 850 ca ? ca->name : c->name, 851 sector, le64_to_cpu(jset->seq), 852 BCH_VERSION_MAJOR(version), 853 BCH_VERSION_MINOR(version))) { 854 /* don't try to continue: */ 855 return -EINVAL; 856 } 857 858 if (bytes > (sectors_read << 9) && 859 sectors_read < bucket_sectors_left) 860 return JOURNAL_ENTRY_REREAD; 861 862 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 863 c, version, jset, NULL, 864 jset_past_bucket_end, 865 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 866 ca ? ca->name : c->name, 867 sector, le64_to_cpu(jset->seq), bytes)) 868 le32_add_cpu(&jset->u64s, 869 -((bytes - (bucket_sectors_left << 9)) / 8)); 870 fsck_err: 871 return ret; 872 } 873 874 struct journal_read_buf { 875 void *data; 876 size_t size; 877 }; 878 879 static int journal_read_buf_realloc(struct journal_read_buf *b, 880 size_t new_size) 881 { 882 void *n; 883 884 /* the bios are sized for this many pages, max: */ 885 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 886 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 887 888 new_size = roundup_pow_of_two(new_size); 889 n = kvpmalloc(new_size, GFP_KERNEL); 890 if (!n) 891 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 892 893 kvpfree(b->data, b->size); 894 b->data = n; 895 b->size = new_size; 896 return 0; 897 } 898 899 static int journal_read_bucket(struct bch_dev *ca, 900 struct journal_read_buf *buf, 901 struct journal_list *jlist, 902 unsigned bucket) 903 { 904 struct bch_fs *c = ca->fs; 905 struct journal_device *ja = &ca->journal; 906 struct jset *j = NULL; 907 unsigned sectors, sectors_read = 0; 908 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 909 end = offset + ca->mi.bucket_size; 910 bool saw_bad = false, csum_good; 911 int ret = 0; 912 913 pr_debug("reading %u", bucket); 914 915 while (offset < end) { 916 if (!sectors_read) { 917 struct bio *bio; 918 unsigned nr_bvecs; 919 reread: 920 sectors_read = min_t(unsigned, 921 end - offset, buf->size >> 9); 922 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 923 924 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 925 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 926 927 bio->bi_iter.bi_sector = offset; 928 bch2_bio_map(bio, buf->data, sectors_read << 9); 929 930 ret = submit_bio_wait(bio); 931 kfree(bio); 932 933 if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, 934 "journal read error: sector %llu", 935 offset) || 936 bch2_meta_read_fault("journal")) { 937 /* 938 * We don't error out of the recovery process 939 * here, since the relevant journal entry may be 940 * found on a different device, and missing or 941 * no journal entries will be handled later 942 */ 943 return 0; 944 } 945 946 j = buf->data; 947 } 948 949 ret = jset_validate_early(c, ca, j, offset, 950 end - offset, sectors_read); 951 switch (ret) { 952 case 0: 953 sectors = vstruct_sectors(j, c->block_bits); 954 break; 955 case JOURNAL_ENTRY_REREAD: 956 if (vstruct_bytes(j) > buf->size) { 957 ret = journal_read_buf_realloc(buf, 958 vstruct_bytes(j)); 959 if (ret) 960 return ret; 961 } 962 goto reread; 963 case JOURNAL_ENTRY_NONE: 964 if (!saw_bad) 965 return 0; 966 /* 967 * On checksum error we don't really trust the size 968 * field of the journal entry we read, so try reading 969 * again at next block boundary: 970 */ 971 sectors = block_sectors(c); 972 goto next_block; 973 default: 974 return ret; 975 } 976 977 /* 978 * This happens sometimes if we don't have discards on - 979 * when we've partially overwritten a bucket with new 980 * journal entries. We don't need the rest of the 981 * bucket: 982 */ 983 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 984 return 0; 985 986 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 987 988 csum_good = jset_csum_good(c, j); 989 if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, 990 "journal checksum error")) 991 saw_bad = true; 992 993 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 994 j->encrypted_start, 995 vstruct_end(j) - (void *) j->encrypted_start); 996 bch2_fs_fatal_err_on(ret, c, 997 "error decrypting journal entry: %i", ret); 998 999 mutex_lock(&jlist->lock); 1000 ret = journal_entry_add(c, ca, (struct journal_ptr) { 1001 .csum_good = csum_good, 1002 .dev = ca->dev_idx, 1003 .bucket = bucket, 1004 .bucket_offset = offset - 1005 bucket_to_sector(ca, ja->buckets[bucket]), 1006 .sector = offset, 1007 }, jlist, j); 1008 mutex_unlock(&jlist->lock); 1009 1010 switch (ret) { 1011 case JOURNAL_ENTRY_ADD_OK: 1012 break; 1013 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 1014 break; 1015 default: 1016 return ret; 1017 } 1018 next_block: 1019 pr_debug("next"); 1020 offset += sectors; 1021 sectors_read -= sectors; 1022 j = ((void *) j) + (sectors << 9); 1023 } 1024 1025 return 0; 1026 } 1027 1028 static void bch2_journal_read_device(struct closure *cl) 1029 { 1030 struct journal_device *ja = 1031 container_of(cl, struct journal_device, read); 1032 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1033 struct bch_fs *c = ca->fs; 1034 struct journal_list *jlist = 1035 container_of(cl->parent, struct journal_list, cl); 1036 struct journal_replay *r, **_r; 1037 struct genradix_iter iter; 1038 struct journal_read_buf buf = { NULL, 0 }; 1039 unsigned i; 1040 int ret = 0; 1041 1042 if (!ja->nr) 1043 goto out; 1044 1045 ret = journal_read_buf_realloc(&buf, PAGE_SIZE); 1046 if (ret) 1047 goto err; 1048 1049 pr_debug("%u journal buckets", ja->nr); 1050 1051 for (i = 0; i < ja->nr; i++) { 1052 ret = journal_read_bucket(ca, &buf, jlist, i); 1053 if (ret) 1054 goto err; 1055 } 1056 1057 ja->sectors_free = ca->mi.bucket_size; 1058 1059 mutex_lock(&jlist->lock); 1060 genradix_for_each_reverse(&c->journal_entries, iter, _r) { 1061 r = *_r; 1062 1063 if (!r) 1064 continue; 1065 1066 for (i = 0; i < r->nr_ptrs; i++) { 1067 if (r->ptrs[i].dev == ca->dev_idx) { 1068 unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + 1069 vstruct_sectors(&r->j, c->block_bits); 1070 1071 ja->cur_idx = r->ptrs[i].bucket; 1072 ja->sectors_free = ca->mi.bucket_size - wrote; 1073 goto found; 1074 } 1075 } 1076 } 1077 found: 1078 mutex_unlock(&jlist->lock); 1079 1080 if (ja->bucket_seq[ja->cur_idx] && 1081 ja->sectors_free == ca->mi.bucket_size) { 1082 bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); 1083 bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); 1084 for (i = 0; i < 3; i++) { 1085 unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; 1086 1087 bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); 1088 } 1089 ja->sectors_free = 0; 1090 } 1091 1092 /* 1093 * Set dirty_idx to indicate the entire journal is full and needs to be 1094 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1095 * pinned when it first runs: 1096 */ 1097 ja->discard_idx = ja->dirty_idx_ondisk = 1098 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1099 out: 1100 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1101 kvpfree(buf.data, buf.size); 1102 percpu_ref_put(&ca->io_ref); 1103 closure_return(cl); 1104 return; 1105 err: 1106 mutex_lock(&jlist->lock); 1107 jlist->ret = ret; 1108 mutex_unlock(&jlist->lock); 1109 goto out; 1110 } 1111 1112 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 1113 struct journal_replay *j) 1114 { 1115 unsigned i; 1116 1117 for (i = 0; i < j->nr_ptrs; i++) { 1118 struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); 1119 u64 offset; 1120 1121 div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); 1122 1123 if (i) 1124 prt_printf(out, " "); 1125 prt_printf(out, "%u:%u:%u (sector %llu)", 1126 j->ptrs[i].dev, 1127 j->ptrs[i].bucket, 1128 j->ptrs[i].bucket_offset, 1129 j->ptrs[i].sector); 1130 } 1131 } 1132 1133 int bch2_journal_read(struct bch_fs *c, 1134 u64 *last_seq, 1135 u64 *blacklist_seq, 1136 u64 *start_seq) 1137 { 1138 struct journal_list jlist; 1139 struct journal_replay *i, **_i, *prev = NULL; 1140 struct genradix_iter radix_iter; 1141 struct bch_dev *ca; 1142 unsigned iter; 1143 struct printbuf buf = PRINTBUF; 1144 bool degraded = false, last_write_torn = false; 1145 u64 seq; 1146 int ret = 0; 1147 1148 closure_init_stack(&jlist.cl); 1149 mutex_init(&jlist.lock); 1150 jlist.last_seq = 0; 1151 jlist.ret = 0; 1152 1153 for_each_member_device(ca, c, iter) { 1154 if (!c->opts.fsck && 1155 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1156 continue; 1157 1158 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1159 ca->mi.state == BCH_MEMBER_STATE_ro) && 1160 percpu_ref_tryget(&ca->io_ref)) 1161 closure_call(&ca->journal.read, 1162 bch2_journal_read_device, 1163 system_unbound_wq, 1164 &jlist.cl); 1165 else 1166 degraded = true; 1167 } 1168 1169 closure_sync(&jlist.cl); 1170 1171 if (jlist.ret) 1172 return jlist.ret; 1173 1174 *last_seq = 0; 1175 *start_seq = 0; 1176 *blacklist_seq = 0; 1177 1178 /* 1179 * Find most recent flush entry, and ignore newer non flush entries - 1180 * those entries will be blacklisted: 1181 */ 1182 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1183 enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; 1184 1185 i = *_i; 1186 1187 if (!i || i->ignore) 1188 continue; 1189 1190 if (!*start_seq) 1191 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1192 1193 if (JSET_NO_FLUSH(&i->j)) { 1194 i->ignore = true; 1195 continue; 1196 } 1197 1198 if (!last_write_torn && !i->csum_good) { 1199 last_write_torn = true; 1200 i->ignore = true; 1201 continue; 1202 } 1203 1204 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1205 c, le32_to_cpu(i->j.version), &i->j, NULL, 1206 jset_last_seq_newer_than_seq, 1207 "invalid journal entry: last_seq > seq (%llu > %llu)", 1208 le64_to_cpu(i->j.last_seq), 1209 le64_to_cpu(i->j.seq))) 1210 i->j.last_seq = i->j.seq; 1211 1212 *last_seq = le64_to_cpu(i->j.last_seq); 1213 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1214 break; 1215 } 1216 1217 if (!*start_seq) { 1218 bch_info(c, "journal read done, but no entries found"); 1219 return 0; 1220 } 1221 1222 if (!*last_seq) { 1223 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, 1224 "journal read done, but no entries found after dropping non-flushes"); 1225 return 0; 1226 } 1227 1228 bch_info(c, "journal read done, replaying entries %llu-%llu", 1229 *last_seq, *blacklist_seq - 1); 1230 1231 if (*start_seq != *blacklist_seq) 1232 bch_info(c, "dropped unflushed entries %llu-%llu", 1233 *blacklist_seq, *start_seq - 1); 1234 1235 /* Drop blacklisted entries and entries older than last_seq: */ 1236 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1237 i = *_i; 1238 1239 if (!i || i->ignore) 1240 continue; 1241 1242 seq = le64_to_cpu(i->j.seq); 1243 if (seq < *last_seq) { 1244 journal_replay_free(c, i); 1245 continue; 1246 } 1247 1248 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1249 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1250 jset_seq_blacklisted, 1251 "found blacklisted journal entry %llu", seq); 1252 i->ignore = true; 1253 } 1254 } 1255 1256 /* Check for missing entries: */ 1257 seq = *last_seq; 1258 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1259 i = *_i; 1260 1261 if (!i || i->ignore) 1262 continue; 1263 1264 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1265 1266 while (seq < le64_to_cpu(i->j.seq)) { 1267 u64 missing_start, missing_end; 1268 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 1269 1270 while (seq < le64_to_cpu(i->j.seq) && 1271 bch2_journal_seq_is_blacklisted(c, seq, false)) 1272 seq++; 1273 1274 if (seq == le64_to_cpu(i->j.seq)) 1275 break; 1276 1277 missing_start = seq; 1278 1279 while (seq < le64_to_cpu(i->j.seq) && 1280 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1281 seq++; 1282 1283 if (prev) { 1284 bch2_journal_ptrs_to_text(&buf1, c, prev); 1285 prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1286 } else 1287 prt_printf(&buf1, "(none)"); 1288 bch2_journal_ptrs_to_text(&buf2, c, i); 1289 1290 missing_end = seq - 1; 1291 fsck_err(c, journal_entries_missing, 1292 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" 1293 " prev at %s\n" 1294 " next at %s", 1295 missing_start, missing_end, 1296 *last_seq, *blacklist_seq - 1, 1297 buf1.buf, buf2.buf); 1298 1299 printbuf_exit(&buf1); 1300 printbuf_exit(&buf2); 1301 } 1302 1303 prev = i; 1304 seq++; 1305 } 1306 1307 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1308 struct bch_replicas_padded replicas = { 1309 .e.data_type = BCH_DATA_journal, 1310 .e.nr_required = 1, 1311 }; 1312 unsigned ptr; 1313 1314 i = *_i; 1315 if (!i || i->ignore) 1316 continue; 1317 1318 for (ptr = 0; ptr < i->nr_ptrs; ptr++) { 1319 ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); 1320 1321 if (!i->ptrs[ptr].csum_good) 1322 bch_err_dev_offset(ca, i->ptrs[ptr].sector, 1323 "invalid journal checksum, seq %llu%s", 1324 le64_to_cpu(i->j.seq), 1325 i->csum_good ? " (had good copy on another device)" : ""); 1326 } 1327 1328 ret = jset_validate(c, 1329 bch_dev_bkey_exists(c, i->ptrs[0].dev), 1330 &i->j, 1331 i->ptrs[0].sector, 1332 READ); 1333 if (ret) 1334 goto err; 1335 1336 for (ptr = 0; ptr < i->nr_ptrs; ptr++) 1337 replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; 1338 1339 bch2_replicas_entry_sort(&replicas.e); 1340 1341 printbuf_reset(&buf); 1342 bch2_replicas_entry_to_text(&buf, &replicas.e); 1343 1344 if (!degraded && 1345 !bch2_replicas_marked(c, &replicas.e) && 1346 (le64_to_cpu(i->j.seq) == *last_seq || 1347 fsck_err(c, journal_entry_replicas_not_marked, 1348 "superblock not marked as containing replicas for journal entry %llu\n %s", 1349 le64_to_cpu(i->j.seq), buf.buf))) { 1350 ret = bch2_mark_replicas(c, &replicas.e); 1351 if (ret) 1352 goto err; 1353 } 1354 } 1355 err: 1356 fsck_err: 1357 printbuf_exit(&buf); 1358 return ret; 1359 } 1360 1361 /* journal write: */ 1362 1363 static void __journal_write_alloc(struct journal *j, 1364 struct journal_buf *w, 1365 struct dev_alloc_list *devs_sorted, 1366 unsigned sectors, 1367 unsigned *replicas, 1368 unsigned replicas_want) 1369 { 1370 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1371 struct journal_device *ja; 1372 struct bch_dev *ca; 1373 unsigned i; 1374 1375 if (*replicas >= replicas_want) 1376 return; 1377 1378 for (i = 0; i < devs_sorted->nr; i++) { 1379 ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); 1380 if (!ca) 1381 continue; 1382 1383 ja = &ca->journal; 1384 1385 /* 1386 * Check that we can use this device, and aren't already using 1387 * it: 1388 */ 1389 if (!ca->mi.durability || 1390 ca->mi.state != BCH_MEMBER_STATE_rw || 1391 !ja->nr || 1392 bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1393 sectors > ja->sectors_free) 1394 continue; 1395 1396 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1397 1398 bch2_bkey_append_ptr(&w->key, 1399 (struct bch_extent_ptr) { 1400 .offset = bucket_to_sector(ca, 1401 ja->buckets[ja->cur_idx]) + 1402 ca->mi.bucket_size - 1403 ja->sectors_free, 1404 .dev = ca->dev_idx, 1405 }); 1406 1407 ja->sectors_free -= sectors; 1408 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1409 1410 *replicas += ca->mi.durability; 1411 1412 if (*replicas >= replicas_want) 1413 break; 1414 } 1415 } 1416 1417 /** 1418 * journal_write_alloc - decide where to write next journal entry 1419 * 1420 * @j: journal object 1421 * @w: journal buf (entry to be written) 1422 * 1423 * Returns: 0 on success, or -EROFS on failure 1424 */ 1425 static int journal_write_alloc(struct journal *j, struct journal_buf *w) 1426 { 1427 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1428 struct bch_devs_mask devs; 1429 struct journal_device *ja; 1430 struct bch_dev *ca; 1431 struct dev_alloc_list devs_sorted; 1432 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1433 unsigned target = c->opts.metadata_target ?: 1434 c->opts.foreground_target; 1435 unsigned i, replicas = 0, replicas_want = 1436 READ_ONCE(c->opts.metadata_replicas); 1437 1438 rcu_read_lock(); 1439 retry: 1440 devs = target_rw_devs(c, BCH_DATA_journal, target); 1441 1442 devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); 1443 1444 __journal_write_alloc(j, w, &devs_sorted, 1445 sectors, &replicas, replicas_want); 1446 1447 if (replicas >= replicas_want) 1448 goto done; 1449 1450 for (i = 0; i < devs_sorted.nr; i++) { 1451 ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); 1452 if (!ca) 1453 continue; 1454 1455 ja = &ca->journal; 1456 1457 if (sectors > ja->sectors_free && 1458 sectors <= ca->mi.bucket_size && 1459 bch2_journal_dev_buckets_available(j, ja, 1460 journal_space_discarded)) { 1461 ja->cur_idx = (ja->cur_idx + 1) % ja->nr; 1462 ja->sectors_free = ca->mi.bucket_size; 1463 1464 /* 1465 * ja->bucket_seq[ja->cur_idx] must always have 1466 * something sensible: 1467 */ 1468 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1469 } 1470 } 1471 1472 __journal_write_alloc(j, w, &devs_sorted, 1473 sectors, &replicas, replicas_want); 1474 1475 if (replicas < replicas_want && target) { 1476 /* Retry from all devices: */ 1477 target = 0; 1478 goto retry; 1479 } 1480 done: 1481 rcu_read_unlock(); 1482 1483 BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); 1484 1485 return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; 1486 } 1487 1488 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1489 { 1490 /* we aren't holding j->lock: */ 1491 unsigned new_size = READ_ONCE(j->buf_size_want); 1492 void *new_buf; 1493 1494 if (buf->buf_size >= new_size) 1495 return; 1496 1497 new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1498 if (!new_buf) 1499 return; 1500 1501 memcpy(new_buf, buf->data, buf->buf_size); 1502 1503 spin_lock(&j->lock); 1504 swap(buf->data, new_buf); 1505 swap(buf->buf_size, new_size); 1506 spin_unlock(&j->lock); 1507 1508 kvpfree(new_buf, new_size); 1509 } 1510 1511 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) 1512 { 1513 return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); 1514 } 1515 1516 static void journal_write_done(struct closure *cl) 1517 { 1518 struct journal *j = container_of(cl, struct journal, io); 1519 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1520 struct journal_buf *w = journal_last_unwritten_buf(j); 1521 struct bch_replicas_padded replicas; 1522 union journal_res_state old, new; 1523 u64 v, seq; 1524 int err = 0; 1525 1526 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) 1527 ? j->flush_write_time 1528 : j->noflush_write_time, j->write_start_time); 1529 1530 if (!w->devs_written.nr) { 1531 bch_err(c, "unable to write journal to sufficient devices"); 1532 err = -EIO; 1533 } else { 1534 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1535 w->devs_written); 1536 if (bch2_mark_replicas(c, &replicas.e)) 1537 err = -EIO; 1538 } 1539 1540 if (err) 1541 bch2_fatal_error(c); 1542 1543 spin_lock(&j->lock); 1544 seq = le64_to_cpu(w->data->seq); 1545 1546 if (seq >= j->pin.front) 1547 journal_seq_pin(j, seq)->devs = w->devs_written; 1548 1549 if (!err) { 1550 if (!JSET_NO_FLUSH(w->data)) { 1551 j->flushed_seq_ondisk = seq; 1552 j->last_seq_ondisk = w->last_seq; 1553 1554 bch2_do_discards(c); 1555 closure_wake_up(&c->freelist_wait); 1556 1557 bch2_reset_alloc_cursors(c); 1558 } 1559 } else if (!j->err_seq || seq < j->err_seq) 1560 j->err_seq = seq; 1561 1562 j->seq_ondisk = seq; 1563 1564 /* 1565 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1566 * more buckets: 1567 * 1568 * Must come before signaling write completion, for 1569 * bch2_fs_journal_stop(): 1570 */ 1571 if (j->watermark != BCH_WATERMARK_stripe) 1572 journal_reclaim_kick(&c->journal); 1573 1574 /* also must come before signalling write completion: */ 1575 closure_debug_destroy(cl); 1576 1577 v = atomic64_read(&j->reservations.counter); 1578 do { 1579 old.v = new.v = v; 1580 BUG_ON(journal_state_count(new, new.unwritten_idx)); 1581 1582 new.unwritten_idx++; 1583 } while ((v = atomic64_cmpxchg(&j->reservations.counter, 1584 old.v, new.v)) != old.v); 1585 1586 bch2_journal_space_available(j); 1587 1588 closure_wake_up(&w->wait); 1589 journal_wake(j); 1590 1591 if (!journal_state_count(new, new.unwritten_idx) && 1592 journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { 1593 spin_unlock(&j->lock); 1594 closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); 1595 } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1596 new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1597 struct journal_buf *buf = journal_cur_buf(j); 1598 long delta = buf->expires - jiffies; 1599 1600 /* 1601 * We don't close a journal entry to write it while there's 1602 * previous entries still in flight - the current journal entry 1603 * might want to be written now: 1604 */ 1605 1606 spin_unlock(&j->lock); 1607 mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); 1608 } else { 1609 spin_unlock(&j->lock); 1610 } 1611 } 1612 1613 static void journal_write_endio(struct bio *bio) 1614 { 1615 struct bch_dev *ca = bio->bi_private; 1616 struct journal *j = &ca->fs->journal; 1617 struct journal_buf *w = journal_last_unwritten_buf(j); 1618 unsigned long flags; 1619 1620 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, 1621 "error writing journal entry %llu: %s", 1622 le64_to_cpu(w->data->seq), 1623 bch2_blk_status_to_str(bio->bi_status)) || 1624 bch2_meta_write_fault("journal")) { 1625 spin_lock_irqsave(&j->err_lock, flags); 1626 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); 1627 spin_unlock_irqrestore(&j->err_lock, flags); 1628 } 1629 1630 closure_put(&j->io); 1631 percpu_ref_put(&ca->io_ref); 1632 } 1633 1634 static void do_journal_write(struct closure *cl) 1635 { 1636 struct journal *j = container_of(cl, struct journal, io); 1637 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1638 struct bch_dev *ca; 1639 struct journal_buf *w = journal_last_unwritten_buf(j); 1640 struct bch_extent_ptr *ptr; 1641 struct bio *bio; 1642 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1643 1644 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { 1645 ca = bch_dev_bkey_exists(c, ptr->dev); 1646 if (!percpu_ref_tryget(&ca->io_ref)) { 1647 /* XXX: fix this */ 1648 bch_err(c, "missing device for journal write\n"); 1649 continue; 1650 } 1651 1652 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], 1653 sectors); 1654 1655 bio = ca->journal.bio; 1656 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1657 bio->bi_iter.bi_sector = ptr->offset; 1658 bio->bi_end_io = journal_write_endio; 1659 bio->bi_private = ca; 1660 1661 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1662 ca->prev_journal_sector = bio->bi_iter.bi_sector; 1663 1664 if (!JSET_NO_FLUSH(w->data)) 1665 bio->bi_opf |= REQ_FUA; 1666 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) 1667 bio->bi_opf |= REQ_PREFLUSH; 1668 1669 bch2_bio_map(bio, w->data, sectors << 9); 1670 1671 trace_and_count(c, journal_write, bio); 1672 closure_bio_submit(bio, cl); 1673 1674 ca->journal.bucket_seq[ca->journal.cur_idx] = 1675 le64_to_cpu(w->data->seq); 1676 } 1677 1678 continue_at(cl, journal_write_done, c->io_complete_wq); 1679 } 1680 1681 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) 1682 { 1683 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1684 struct jset_entry *start, *end, *i, *next, *prev = NULL; 1685 struct jset *jset = w->data; 1686 unsigned sectors, bytes, u64s; 1687 bool validate_before_checksum = false; 1688 unsigned long btree_roots_have = 0; 1689 int ret; 1690 1691 /* 1692 * Simple compaction, dropping empty jset_entries (from journal 1693 * reservations that weren't fully used) and merging jset_entries that 1694 * can be. 1695 * 1696 * If we wanted to be really fancy here, we could sort all the keys in 1697 * the jset and drop keys that were overwritten - probably not worth it: 1698 */ 1699 vstruct_for_each_safe(jset, i, next) { 1700 unsigned u64s = le16_to_cpu(i->u64s); 1701 1702 /* Empty entry: */ 1703 if (!u64s) 1704 continue; 1705 1706 /* 1707 * New btree roots are set by journalling them; when the journal 1708 * entry gets written we have to propagate them to 1709 * c->btree_roots 1710 * 1711 * But, every journal entry we write has to contain all the 1712 * btree roots (at least for now); so after we copy btree roots 1713 * to c->btree_roots we have to get any missing btree roots and 1714 * add them to this journal entry: 1715 */ 1716 if (i->type == BCH_JSET_ENTRY_btree_root) { 1717 bch2_journal_entry_to_btree_root(c, i); 1718 __set_bit(i->btree_id, &btree_roots_have); 1719 } 1720 1721 /* Can we merge with previous entry? */ 1722 if (prev && 1723 i->btree_id == prev->btree_id && 1724 i->level == prev->level && 1725 i->type == prev->type && 1726 i->type == BCH_JSET_ENTRY_btree_keys && 1727 le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { 1728 memmove_u64s_down(vstruct_next(prev), 1729 i->_data, 1730 u64s); 1731 le16_add_cpu(&prev->u64s, u64s); 1732 continue; 1733 } 1734 1735 /* Couldn't merge, move i into new position (after prev): */ 1736 prev = prev ? vstruct_next(prev) : jset->start; 1737 if (i != prev) 1738 memmove_u64s_down(prev, i, jset_u64s(u64s)); 1739 } 1740 1741 prev = prev ? vstruct_next(prev) : jset->start; 1742 jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); 1743 1744 start = end = vstruct_last(jset); 1745 1746 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1747 1748 bch2_journal_super_entries_add_common(c, &end, 1749 le64_to_cpu(jset->seq)); 1750 u64s = (u64 *) end - (u64 *) start; 1751 BUG_ON(u64s > j->entry_u64s_reserved); 1752 1753 le32_add_cpu(&jset->u64s, u64s); 1754 1755 sectors = vstruct_sectors(jset, c->block_bits); 1756 bytes = vstruct_bytes(jset); 1757 1758 if (sectors > w->sectors) { 1759 bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 1760 vstruct_bytes(jset), w->sectors << 9, 1761 u64s, w->u64s_reserved, j->entry_u64s_reserved); 1762 return -EINVAL; 1763 } 1764 1765 jset->magic = cpu_to_le64(jset_magic(c)); 1766 jset->version = cpu_to_le32(c->sb.version); 1767 1768 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 1769 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1770 1771 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1772 j->last_empty_seq = le64_to_cpu(jset->seq); 1773 1774 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1775 validate_before_checksum = true; 1776 1777 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 1778 validate_before_checksum = true; 1779 1780 if (validate_before_checksum && 1781 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1782 return ret; 1783 1784 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), 1785 jset->encrypted_start, 1786 vstruct_end(jset) - (void *) jset->encrypted_start); 1787 if (bch2_fs_fatal_err_on(ret, c, 1788 "error decrypting journal entry: %i", ret)) 1789 return ret; 1790 1791 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), 1792 journal_nonce(jset), jset); 1793 1794 if (!validate_before_checksum && 1795 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1796 return ret; 1797 1798 memset((void *) jset + bytes, 0, (sectors << 9) - bytes); 1799 return 0; 1800 } 1801 1802 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) 1803 { 1804 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1805 int error = bch2_journal_error(j); 1806 1807 /* 1808 * If the journal is in an error state - we did an emergency shutdown - 1809 * we prefer to continue doing journal writes. We just mark them as 1810 * noflush so they'll never be used, but they'll still be visible by the 1811 * list_journal tool - this helps in debugging. 1812 * 1813 * There's a caveat: the first journal write after marking the 1814 * superblock dirty must always be a flush write, because on startup 1815 * from a clean shutdown we didn't necessarily read the journal and the 1816 * new journal write might overwrite whatever was in the journal 1817 * previously - we can't leave the journal without any flush writes in 1818 * it. 1819 * 1820 * So if we're in an error state, and we're still starting up, we don't 1821 * write anything at all. 1822 */ 1823 if (error && test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags)) 1824 return -EIO; 1825 1826 if (error || 1827 w->noflush || 1828 (!w->must_flush && 1829 (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && 1830 test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) { 1831 w->noflush = true; 1832 SET_JSET_NO_FLUSH(w->data, true); 1833 w->data->last_seq = 0; 1834 w->last_seq = 0; 1835 1836 j->nr_noflush_writes++; 1837 } else { 1838 j->last_flush_write = jiffies; 1839 j->nr_flush_writes++; 1840 clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); 1841 } 1842 1843 return 0; 1844 } 1845 1846 void bch2_journal_write(struct closure *cl) 1847 { 1848 struct journal *j = container_of(cl, struct journal, io); 1849 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1850 struct bch_dev *ca; 1851 struct journal_buf *w = journal_last_unwritten_buf(j); 1852 struct bch_replicas_padded replicas; 1853 struct bio *bio; 1854 struct printbuf journal_debug_buf = PRINTBUF; 1855 unsigned i, nr_rw_members = 0; 1856 int ret; 1857 1858 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 1859 1860 j->write_start_time = local_clock(); 1861 1862 spin_lock(&j->lock); 1863 ret = bch2_journal_write_pick_flush(j, w); 1864 spin_unlock(&j->lock); 1865 if (ret) 1866 goto err; 1867 1868 journal_buf_realloc(j, w); 1869 1870 ret = bch2_journal_write_prep(j, w); 1871 if (ret) 1872 goto err; 1873 1874 while (1) { 1875 spin_lock(&j->lock); 1876 ret = journal_write_alloc(j, w); 1877 if (!ret || !j->can_discard) 1878 break; 1879 1880 spin_unlock(&j->lock); 1881 bch2_journal_do_discards(j); 1882 } 1883 1884 if (ret) { 1885 __bch2_journal_debug_to_text(&journal_debug_buf, j); 1886 spin_unlock(&j->lock); 1887 bch_err(c, "Unable to allocate journal write:\n%s", 1888 journal_debug_buf.buf); 1889 printbuf_exit(&journal_debug_buf); 1890 goto err; 1891 } 1892 1893 /* 1894 * write is allocated, no longer need to account for it in 1895 * bch2_journal_space_available(): 1896 */ 1897 w->sectors = 0; 1898 1899 /* 1900 * journal entry has been compacted and allocated, recalculate space 1901 * available: 1902 */ 1903 bch2_journal_space_available(j); 1904 spin_unlock(&j->lock); 1905 1906 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); 1907 1908 if (c->opts.nochanges) 1909 goto no_io; 1910 1911 for_each_rw_member(ca, c, i) 1912 nr_rw_members++; 1913 1914 if (nr_rw_members > 1) 1915 w->separate_flush = true; 1916 1917 /* 1918 * Mark journal replicas before we submit the write to guarantee 1919 * recovery will find the journal entries after a crash. 1920 */ 1921 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1922 w->devs_written); 1923 ret = bch2_mark_replicas(c, &replicas.e); 1924 if (ret) 1925 goto err; 1926 1927 if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { 1928 for_each_rw_member(ca, c, i) { 1929 percpu_ref_get(&ca->io_ref); 1930 1931 bio = ca->journal.bio; 1932 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); 1933 bio->bi_end_io = journal_write_endio; 1934 bio->bi_private = ca; 1935 closure_bio_submit(bio, cl); 1936 } 1937 } 1938 1939 continue_at(cl, do_journal_write, c->io_complete_wq); 1940 return; 1941 no_io: 1942 continue_at(cl, journal_write_done, c->io_complete_wq); 1943 return; 1944 err: 1945 bch2_fatal_error(c); 1946 continue_at(cl, journal_write_done, c->io_complete_wq); 1947 } 1948