1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "btree_io.h" 6 #include "btree_update_interior.h" 7 #include "btree_write_buffer.h" 8 #include "buckets.h" 9 #include "checksum.h" 10 #include "disk_groups.h" 11 #include "error.h" 12 #include "journal.h" 13 #include "journal_io.h" 14 #include "journal_reclaim.h" 15 #include "journal_seq_blacklist.h" 16 #include "replicas.h" 17 #include "sb-clean.h" 18 #include "trace.h" 19 20 #include <linux/ioprio.h> 21 #include <linux/string_choices.h> 22 23 void bch2_journal_pos_from_member_info_set(struct bch_fs *c) 24 { 25 lockdep_assert_held(&c->sb_lock); 26 27 for_each_member_device(c, ca) { 28 struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); 29 30 m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); 31 m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); 32 } 33 } 34 35 void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) 36 { 37 mutex_lock(&c->sb_lock); 38 for_each_member_device(c, ca) { 39 struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); 40 41 unsigned idx = le32_to_cpu(m.last_journal_bucket); 42 if (idx < ca->journal.nr) 43 ca->journal.cur_idx = idx; 44 unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); 45 if (offset <= ca->mi.bucket_size) 46 ca->journal.sectors_free = ca->mi.bucket_size - offset; 47 } 48 mutex_unlock(&c->sb_lock); 49 } 50 51 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 52 struct journal_replay *j) 53 { 54 darray_for_each(j->ptrs, i) { 55 if (i != j->ptrs.data) 56 prt_printf(out, " "); 57 prt_printf(out, "%u:%u:%u (sector %llu)", 58 i->dev, i->bucket, i->bucket_offset, i->sector); 59 } 60 } 61 62 static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, 63 struct journal_replay *j) 64 { 65 prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); 66 67 bch2_journal_ptrs_to_text(out, c, j); 68 69 for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { 70 struct jset_entry_datetime *datetime = 71 container_of(entry, struct jset_entry_datetime, entry); 72 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 73 break; 74 } 75 } 76 77 static struct nonce journal_nonce(const struct jset *jset) 78 { 79 return (struct nonce) {{ 80 [0] = 0, 81 [1] = ((__le32 *) &jset->seq)[0], 82 [2] = ((__le32 *) &jset->seq)[1], 83 [3] = BCH_NONCE_JOURNAL, 84 }}; 85 } 86 87 static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) 88 { 89 if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { 90 *csum = (struct bch_csum) {}; 91 return false; 92 } 93 94 *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); 95 return !bch2_crc_cmp(j->csum, *csum); 96 } 97 98 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) 99 { 100 return (seq - c->journal_entries_base_seq) & (~0U >> 1); 101 } 102 103 static void __journal_replay_free(struct bch_fs *c, 104 struct journal_replay *i) 105 { 106 struct journal_replay **p = 107 genradix_ptr(&c->journal_entries, 108 journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); 109 110 BUG_ON(*p != i); 111 *p = NULL; 112 kvfree(i); 113 } 114 115 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) 116 { 117 if (blacklisted) 118 i->ignore_blacklisted = true; 119 else 120 i->ignore_not_dirty = true; 121 122 if (!c->opts.read_entire_journal) 123 __journal_replay_free(c, i); 124 } 125 126 struct journal_list { 127 struct closure cl; 128 u64 last_seq; 129 struct mutex lock; 130 int ret; 131 }; 132 133 #define JOURNAL_ENTRY_ADD_OK 0 134 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 135 136 /* 137 * Given a journal entry we just read, add it to the list of journal entries to 138 * be replayed: 139 */ 140 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, 141 struct journal_ptr entry_ptr, 142 struct journal_list *jlist, struct jset *j) 143 { 144 struct genradix_iter iter; 145 struct journal_replay **_i, *i, *dup; 146 size_t bytes = vstruct_bytes(j); 147 u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; 148 struct printbuf buf = PRINTBUF; 149 int ret = JOURNAL_ENTRY_ADD_OK; 150 151 if (!c->journal.oldest_seq_found_ondisk || 152 le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) 153 c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); 154 155 /* Is this entry older than the range we need? */ 156 if (!c->opts.read_entire_journal && 157 le64_to_cpu(j->seq) < jlist->last_seq) 158 return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; 159 160 /* 161 * genradixes are indexed by a ulong, not a u64, so we can't index them 162 * by sequence number directly: Assume instead that they will all fall 163 * within the range of +-2billion of the filrst one we find. 164 */ 165 if (!c->journal_entries_base_seq) 166 c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); 167 168 /* Drop entries we don't need anymore */ 169 if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { 170 genradix_for_each_from(&c->journal_entries, iter, _i, 171 journal_entry_radix_idx(c, jlist->last_seq)) { 172 i = *_i; 173 174 if (journal_replay_ignore(i)) 175 continue; 176 177 if (le64_to_cpu(i->j.seq) >= last_seq) 178 break; 179 180 journal_replay_free(c, i, false); 181 } 182 } 183 184 jlist->last_seq = max(jlist->last_seq, last_seq); 185 186 _i = genradix_ptr_alloc(&c->journal_entries, 187 journal_entry_radix_idx(c, le64_to_cpu(j->seq)), 188 GFP_KERNEL); 189 if (!_i) 190 return -BCH_ERR_ENOMEM_journal_entry_add; 191 192 /* 193 * Duplicate journal entries? If so we want the one that didn't have a 194 * checksum error: 195 */ 196 dup = *_i; 197 if (dup) { 198 bool identical = bytes == vstruct_bytes(&dup->j) && 199 !memcmp(j, &dup->j, bytes); 200 bool not_identical = !identical && 201 entry_ptr.csum_good && 202 dup->csum_good; 203 204 bool same_device = false; 205 darray_for_each(dup->ptrs, ptr) 206 if (ptr->dev == ca->dev_idx) 207 same_device = true; 208 209 ret = darray_push(&dup->ptrs, entry_ptr); 210 if (ret) 211 goto out; 212 213 bch2_journal_replay_to_text(&buf, c, dup); 214 215 fsck_err_on(same_device, 216 c, journal_entry_dup_same_device, 217 "duplicate journal entry on same device\n%s", 218 buf.buf); 219 220 fsck_err_on(not_identical, 221 c, journal_entry_replicas_data_mismatch, 222 "found duplicate but non identical journal entries\n%s", 223 buf.buf); 224 225 if (entry_ptr.csum_good && !identical) 226 goto replace; 227 228 goto out; 229 } 230 replace: 231 i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 232 if (!i) 233 return -BCH_ERR_ENOMEM_journal_entry_add; 234 235 darray_init(&i->ptrs); 236 i->csum_good = entry_ptr.csum_good; 237 i->ignore_blacklisted = false; 238 i->ignore_not_dirty = false; 239 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 240 241 if (dup) { 242 /* The first ptr should represent the jset we kept: */ 243 darray_for_each(dup->ptrs, ptr) 244 darray_push(&i->ptrs, *ptr); 245 __journal_replay_free(c, dup); 246 } else { 247 darray_push(&i->ptrs, entry_ptr); 248 } 249 250 *_i = i; 251 out: 252 fsck_err: 253 printbuf_exit(&buf); 254 return ret; 255 } 256 257 /* this fills in a range with empty jset_entries: */ 258 static void journal_entry_null_range(void *start, void *end) 259 { 260 struct jset_entry *entry; 261 262 for (entry = start; entry != end; entry = vstruct_next(entry)) 263 memset(entry, 0, sizeof(*entry)); 264 } 265 266 #define JOURNAL_ENTRY_REREAD 5 267 #define JOURNAL_ENTRY_NONE 6 268 #define JOURNAL_ENTRY_BAD 7 269 270 static void journal_entry_err_msg(struct printbuf *out, 271 u32 version, 272 struct jset *jset, 273 struct jset_entry *entry) 274 { 275 prt_str(out, "invalid journal entry, version="); 276 bch2_version_to_text(out, version); 277 278 if (entry) { 279 prt_str(out, " type="); 280 bch2_prt_jset_entry_type(out, entry->type); 281 } 282 283 if (!jset) { 284 prt_printf(out, " in superblock"); 285 } else { 286 287 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); 288 289 if (entry) 290 prt_printf(out, " offset=%zi/%u", 291 (u64 *) entry - jset->_data, 292 le32_to_cpu(jset->u64s)); 293 } 294 295 prt_str(out, ": "); 296 } 297 298 #define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ 299 ({ \ 300 struct printbuf _buf = PRINTBUF; \ 301 \ 302 journal_entry_err_msg(&_buf, version, jset, entry); \ 303 prt_printf(&_buf, msg, ##__VA_ARGS__); \ 304 \ 305 switch (from.flags & BCH_VALIDATE_write) { \ 306 case READ: \ 307 mustfix_fsck_err(c, _err, "%s", _buf.buf); \ 308 break; \ 309 case WRITE: \ 310 bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ 311 if (bch2_fs_inconsistent(c, \ 312 "corrupt metadata before write: %s\n", _buf.buf)) {\ 313 ret = -BCH_ERR_fsck_errors_not_fixed; \ 314 goto fsck_err; \ 315 } \ 316 break; \ 317 } \ 318 \ 319 printbuf_exit(&_buf); \ 320 true; \ 321 }) 322 323 #define journal_entry_err_on(cond, ...) \ 324 ((cond) ? journal_entry_err(__VA_ARGS__) : false) 325 326 #define FSCK_DELETED_KEY 5 327 328 static int journal_validate_key(struct bch_fs *c, 329 struct jset *jset, 330 struct jset_entry *entry, 331 struct bkey_i *k, 332 struct bkey_validate_context from, 333 unsigned version, int big_endian) 334 { 335 enum bch_validate_flags flags = from.flags; 336 int write = flags & BCH_VALIDATE_write; 337 void *next = vstruct_next(entry); 338 int ret = 0; 339 340 if (journal_entry_err_on(!k->k.u64s, 341 c, version, jset, entry, 342 journal_entry_bkey_u64s_0, 343 "k->u64s 0")) { 344 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 345 journal_entry_null_range(vstruct_next(entry), next); 346 return FSCK_DELETED_KEY; 347 } 348 349 if (journal_entry_err_on((void *) bkey_next(k) > 350 (void *) vstruct_next(entry), 351 c, version, jset, entry, 352 journal_entry_bkey_past_end, 353 "extends past end of journal entry")) { 354 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 355 journal_entry_null_range(vstruct_next(entry), next); 356 return FSCK_DELETED_KEY; 357 } 358 359 if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, 360 c, version, jset, entry, 361 journal_entry_bkey_bad_format, 362 "bad format %u", k->k.format)) { 363 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 364 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 365 journal_entry_null_range(vstruct_next(entry), next); 366 return FSCK_DELETED_KEY; 367 } 368 369 if (!write) 370 bch2_bkey_compat(from.level, from.btree, version, big_endian, 371 write, NULL, bkey_to_packed(k)); 372 373 ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from); 374 if (ret == -BCH_ERR_fsck_delete_bkey) { 375 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 376 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 377 journal_entry_null_range(vstruct_next(entry), next); 378 return FSCK_DELETED_KEY; 379 } 380 if (ret) 381 goto fsck_err; 382 383 if (write) 384 bch2_bkey_compat(from.level, from.btree, version, big_endian, 385 write, NULL, bkey_to_packed(k)); 386 fsck_err: 387 return ret; 388 } 389 390 static int journal_entry_btree_keys_validate(struct bch_fs *c, 391 struct jset *jset, 392 struct jset_entry *entry, 393 unsigned version, int big_endian, 394 struct bkey_validate_context from) 395 { 396 struct bkey_i *k = entry->start; 397 398 from.level = entry->level; 399 from.btree = entry->btree_id; 400 401 while (k != vstruct_last(entry)) { 402 int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); 403 if (ret == FSCK_DELETED_KEY) 404 continue; 405 else if (ret) 406 return ret; 407 408 k = bkey_next(k); 409 } 410 411 return 0; 412 } 413 414 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 415 struct jset_entry *entry) 416 { 417 bool first = true; 418 419 jset_entry_for_each_key(entry, k) { 420 if (!first) { 421 prt_newline(out); 422 bch2_prt_jset_entry_type(out, entry->type); 423 prt_str(out, ": "); 424 } 425 bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); 426 prt_char(out, ' '); 427 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); 428 first = false; 429 } 430 } 431 432 static int journal_entry_btree_root_validate(struct bch_fs *c, 433 struct jset *jset, 434 struct jset_entry *entry, 435 unsigned version, int big_endian, 436 struct bkey_validate_context from) 437 { 438 struct bkey_i *k = entry->start; 439 int ret = 0; 440 441 from.root = true; 442 from.level = entry->level + 1; 443 from.btree = entry->btree_id; 444 445 if (journal_entry_err_on(!entry->u64s || 446 le16_to_cpu(entry->u64s) != k->k.u64s, 447 c, version, jset, entry, 448 journal_entry_btree_root_bad_size, 449 "invalid btree root journal entry: wrong number of keys")) { 450 void *next = vstruct_next(entry); 451 /* 452 * we don't want to null out this jset_entry, 453 * just the contents, so that later we can tell 454 * we were _supposed_ to have a btree root 455 */ 456 entry->u64s = 0; 457 journal_entry_null_range(vstruct_next(entry), next); 458 return 0; 459 } 460 461 ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); 462 if (ret == FSCK_DELETED_KEY) 463 ret = 0; 464 fsck_err: 465 return ret; 466 } 467 468 static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, 469 struct jset_entry *entry) 470 { 471 journal_entry_btree_keys_to_text(out, c, entry); 472 } 473 474 static int journal_entry_prio_ptrs_validate(struct bch_fs *c, 475 struct jset *jset, 476 struct jset_entry *entry, 477 unsigned version, int big_endian, 478 struct bkey_validate_context from) 479 { 480 /* obsolete, don't care: */ 481 return 0; 482 } 483 484 static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 485 struct jset_entry *entry) 486 { 487 } 488 489 static int journal_entry_blacklist_validate(struct bch_fs *c, 490 struct jset *jset, 491 struct jset_entry *entry, 492 unsigned version, int big_endian, 493 struct bkey_validate_context from) 494 { 495 int ret = 0; 496 497 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, 498 c, version, jset, entry, 499 journal_entry_blacklist_bad_size, 500 "invalid journal seq blacklist entry: bad size")) { 501 journal_entry_null_range(entry, vstruct_next(entry)); 502 } 503 fsck_err: 504 return ret; 505 } 506 507 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, 508 struct jset_entry *entry) 509 { 510 struct jset_entry_blacklist *bl = 511 container_of(entry, struct jset_entry_blacklist, entry); 512 513 prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); 514 } 515 516 static int journal_entry_blacklist_v2_validate(struct bch_fs *c, 517 struct jset *jset, 518 struct jset_entry *entry, 519 unsigned version, int big_endian, 520 struct bkey_validate_context from) 521 { 522 struct jset_entry_blacklist_v2 *bl_entry; 523 int ret = 0; 524 525 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, 526 c, version, jset, entry, 527 journal_entry_blacklist_v2_bad_size, 528 "invalid journal seq blacklist entry: bad size")) { 529 journal_entry_null_range(entry, vstruct_next(entry)); 530 goto out; 531 } 532 533 bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); 534 535 if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > 536 le64_to_cpu(bl_entry->end), 537 c, version, jset, entry, 538 journal_entry_blacklist_v2_start_past_end, 539 "invalid journal seq blacklist entry: start > end")) { 540 journal_entry_null_range(entry, vstruct_next(entry)); 541 } 542 out: 543 fsck_err: 544 return ret; 545 } 546 547 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, 548 struct jset_entry *entry) 549 { 550 struct jset_entry_blacklist_v2 *bl = 551 container_of(entry, struct jset_entry_blacklist_v2, entry); 552 553 prt_printf(out, "start=%llu end=%llu", 554 le64_to_cpu(bl->start), 555 le64_to_cpu(bl->end)); 556 } 557 558 static int journal_entry_usage_validate(struct bch_fs *c, 559 struct jset *jset, 560 struct jset_entry *entry, 561 unsigned version, int big_endian, 562 struct bkey_validate_context from) 563 { 564 struct jset_entry_usage *u = 565 container_of(entry, struct jset_entry_usage, entry); 566 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 567 int ret = 0; 568 569 if (journal_entry_err_on(bytes < sizeof(*u), 570 c, version, jset, entry, 571 journal_entry_usage_bad_size, 572 "invalid journal entry usage: bad size")) { 573 journal_entry_null_range(entry, vstruct_next(entry)); 574 return ret; 575 } 576 577 fsck_err: 578 return ret; 579 } 580 581 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, 582 struct jset_entry *entry) 583 { 584 struct jset_entry_usage *u = 585 container_of(entry, struct jset_entry_usage, entry); 586 587 prt_str(out, "type="); 588 bch2_prt_fs_usage_type(out, u->entry.btree_id); 589 prt_printf(out, " v=%llu", le64_to_cpu(u->v)); 590 } 591 592 static int journal_entry_data_usage_validate(struct bch_fs *c, 593 struct jset *jset, 594 struct jset_entry *entry, 595 unsigned version, int big_endian, 596 struct bkey_validate_context from) 597 { 598 struct jset_entry_data_usage *u = 599 container_of(entry, struct jset_entry_data_usage, entry); 600 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 601 struct printbuf err = PRINTBUF; 602 int ret = 0; 603 604 if (journal_entry_err_on(bytes < sizeof(*u) || 605 bytes < sizeof(*u) + u->r.nr_devs, 606 c, version, jset, entry, 607 journal_entry_data_usage_bad_size, 608 "invalid journal entry usage: bad size")) { 609 journal_entry_null_range(entry, vstruct_next(entry)); 610 goto out; 611 } 612 613 if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err), 614 c, version, jset, entry, 615 journal_entry_data_usage_bad_size, 616 "invalid journal entry usage: %s", err.buf)) { 617 journal_entry_null_range(entry, vstruct_next(entry)); 618 goto out; 619 } 620 out: 621 fsck_err: 622 printbuf_exit(&err); 623 return ret; 624 } 625 626 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, 627 struct jset_entry *entry) 628 { 629 struct jset_entry_data_usage *u = 630 container_of(entry, struct jset_entry_data_usage, entry); 631 632 bch2_replicas_entry_to_text(out, &u->r); 633 prt_printf(out, "=%llu", le64_to_cpu(u->v)); 634 } 635 636 static int journal_entry_clock_validate(struct bch_fs *c, 637 struct jset *jset, 638 struct jset_entry *entry, 639 unsigned version, int big_endian, 640 struct bkey_validate_context from) 641 { 642 struct jset_entry_clock *clock = 643 container_of(entry, struct jset_entry_clock, entry); 644 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 645 int ret = 0; 646 647 if (journal_entry_err_on(bytes != sizeof(*clock), 648 c, version, jset, entry, 649 journal_entry_clock_bad_size, 650 "bad size")) { 651 journal_entry_null_range(entry, vstruct_next(entry)); 652 return ret; 653 } 654 655 if (journal_entry_err_on(clock->rw > 1, 656 c, version, jset, entry, 657 journal_entry_clock_bad_rw, 658 "bad rw")) { 659 journal_entry_null_range(entry, vstruct_next(entry)); 660 return ret; 661 } 662 663 fsck_err: 664 return ret; 665 } 666 667 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, 668 struct jset_entry *entry) 669 { 670 struct jset_entry_clock *clock = 671 container_of(entry, struct jset_entry_clock, entry); 672 673 prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time)); 674 } 675 676 static int journal_entry_dev_usage_validate(struct bch_fs *c, 677 struct jset *jset, 678 struct jset_entry *entry, 679 unsigned version, int big_endian, 680 struct bkey_validate_context from) 681 { 682 struct jset_entry_dev_usage *u = 683 container_of(entry, struct jset_entry_dev_usage, entry); 684 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 685 unsigned expected = sizeof(*u); 686 int ret = 0; 687 688 if (journal_entry_err_on(bytes < expected, 689 c, version, jset, entry, 690 journal_entry_dev_usage_bad_size, 691 "bad size (%u < %u)", 692 bytes, expected)) { 693 journal_entry_null_range(entry, vstruct_next(entry)); 694 return ret; 695 } 696 697 if (journal_entry_err_on(u->pad, 698 c, version, jset, entry, 699 journal_entry_dev_usage_bad_pad, 700 "bad pad")) { 701 journal_entry_null_range(entry, vstruct_next(entry)); 702 return ret; 703 } 704 705 fsck_err: 706 return ret; 707 } 708 709 static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, 710 struct jset_entry *entry) 711 { 712 struct jset_entry_dev_usage *u = 713 container_of(entry, struct jset_entry_dev_usage, entry); 714 unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); 715 716 if (vstruct_bytes(entry) < sizeof(*u)) 717 return; 718 719 prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); 720 721 printbuf_indent_add(out, 2); 722 for (i = 0; i < nr_types; i++) { 723 prt_newline(out); 724 bch2_prt_data_type(out, i); 725 prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", 726 le64_to_cpu(u->d[i].buckets), 727 le64_to_cpu(u->d[i].sectors), 728 le64_to_cpu(u->d[i].fragmented)); 729 } 730 printbuf_indent_sub(out, 2); 731 } 732 733 static int journal_entry_log_validate(struct bch_fs *c, 734 struct jset *jset, 735 struct jset_entry *entry, 736 unsigned version, int big_endian, 737 struct bkey_validate_context from) 738 { 739 return 0; 740 } 741 742 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 743 struct jset_entry *entry) 744 { 745 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 746 747 prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d); 748 } 749 750 static int journal_entry_overwrite_validate(struct bch_fs *c, 751 struct jset *jset, 752 struct jset_entry *entry, 753 unsigned version, int big_endian, 754 struct bkey_validate_context from) 755 { 756 from.flags = 0; 757 return journal_entry_btree_keys_validate(c, jset, entry, 758 version, big_endian, from); 759 } 760 761 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 762 struct jset_entry *entry) 763 { 764 journal_entry_btree_keys_to_text(out, c, entry); 765 } 766 767 static int journal_entry_log_bkey_validate(struct bch_fs *c, 768 struct jset *jset, 769 struct jset_entry *entry, 770 unsigned version, int big_endian, 771 struct bkey_validate_context from) 772 { 773 from.flags = 0; 774 return journal_entry_btree_keys_validate(c, jset, entry, 775 version, big_endian, from); 776 } 777 778 static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c, 779 struct jset_entry *entry) 780 { 781 journal_entry_btree_keys_to_text(out, c, entry); 782 } 783 784 static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, 785 struct jset *jset, 786 struct jset_entry *entry, 787 unsigned version, int big_endian, 788 struct bkey_validate_context from) 789 { 790 return journal_entry_btree_keys_validate(c, jset, entry, 791 version, big_endian, from); 792 } 793 794 static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, 795 struct jset_entry *entry) 796 { 797 journal_entry_btree_keys_to_text(out, c, entry); 798 } 799 800 static int journal_entry_datetime_validate(struct bch_fs *c, 801 struct jset *jset, 802 struct jset_entry *entry, 803 unsigned version, int big_endian, 804 struct bkey_validate_context from) 805 { 806 unsigned bytes = vstruct_bytes(entry); 807 unsigned expected = 16; 808 int ret = 0; 809 810 if (journal_entry_err_on(vstruct_bytes(entry) < expected, 811 c, version, jset, entry, 812 journal_entry_dev_usage_bad_size, 813 "bad size (%u < %u)", 814 bytes, expected)) { 815 journal_entry_null_range(entry, vstruct_next(entry)); 816 return ret; 817 } 818 fsck_err: 819 return ret; 820 } 821 822 static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, 823 struct jset_entry *entry) 824 { 825 struct jset_entry_datetime *datetime = 826 container_of(entry, struct jset_entry_datetime, entry); 827 828 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 829 } 830 831 struct jset_entry_ops { 832 int (*validate)(struct bch_fs *, struct jset *, 833 struct jset_entry *, unsigned, int, 834 struct bkey_validate_context); 835 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 836 }; 837 838 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 839 #define x(f, nr) \ 840 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 841 .validate = journal_entry_##f##_validate, \ 842 .to_text = journal_entry_##f##_to_text, \ 843 }, 844 BCH_JSET_ENTRY_TYPES() 845 #undef x 846 }; 847 848 int bch2_journal_entry_validate(struct bch_fs *c, 849 struct jset *jset, 850 struct jset_entry *entry, 851 unsigned version, int big_endian, 852 struct bkey_validate_context from) 853 { 854 return entry->type < BCH_JSET_ENTRY_NR 855 ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, 856 version, big_endian, from) 857 : 0; 858 } 859 860 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, 861 struct jset_entry *entry) 862 { 863 bch2_prt_jset_entry_type(out, entry->type); 864 865 if (entry->type < BCH_JSET_ENTRY_NR) { 866 prt_str(out, ": "); 867 bch2_jset_entry_ops[entry->type].to_text(out, c, entry); 868 } 869 } 870 871 static int jset_validate_entries(struct bch_fs *c, struct jset *jset, 872 enum bch_validate_flags flags) 873 { 874 struct bkey_validate_context from = { 875 .flags = flags, 876 .from = BKEY_VALIDATE_journal, 877 .journal_seq = le64_to_cpu(jset->seq), 878 }; 879 880 unsigned version = le32_to_cpu(jset->version); 881 int ret = 0; 882 883 vstruct_for_each(jset, entry) { 884 from.journal_offset = (u64 *) entry - jset->_data; 885 886 if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), 887 c, version, jset, entry, 888 journal_entry_past_jset_end, 889 "journal entry extends past end of jset")) { 890 jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); 891 break; 892 } 893 894 ret = bch2_journal_entry_validate(c, jset, entry, version, 895 JSET_BIG_ENDIAN(jset), from); 896 if (ret) 897 break; 898 } 899 fsck_err: 900 return ret; 901 } 902 903 static int jset_validate(struct bch_fs *c, 904 struct bch_dev *ca, 905 struct jset *jset, u64 sector, 906 enum bch_validate_flags flags) 907 { 908 struct bkey_validate_context from = { 909 .flags = flags, 910 .from = BKEY_VALIDATE_journal, 911 .journal_seq = le64_to_cpu(jset->seq), 912 }; 913 int ret = 0; 914 915 if (le64_to_cpu(jset->magic) != jset_magic(c)) 916 return JOURNAL_ENTRY_NONE; 917 918 unsigned version = le32_to_cpu(jset->version); 919 if (journal_entry_err_on(!bch2_version_compatible(version), 920 c, version, jset, NULL, 921 jset_unsupported_version, 922 "%s sector %llu seq %llu: incompatible journal entry version %u.%u", 923 ca ? ca->name : c->name, 924 sector, le64_to_cpu(jset->seq), 925 BCH_VERSION_MAJOR(version), 926 BCH_VERSION_MINOR(version))) { 927 /* don't try to continue: */ 928 return -EINVAL; 929 } 930 931 if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), 932 c, version, jset, NULL, 933 jset_unknown_csum, 934 "%s sector %llu seq %llu: journal entry with unknown csum type %llu", 935 ca ? ca->name : c->name, 936 sector, le64_to_cpu(jset->seq), 937 JSET_CSUM_TYPE(jset))) 938 ret = JOURNAL_ENTRY_BAD; 939 940 /* last_seq is ignored when JSET_NO_FLUSH is true */ 941 if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && 942 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), 943 c, version, jset, NULL, 944 jset_last_seq_newer_than_seq, 945 "invalid journal entry: last_seq > seq (%llu > %llu)", 946 le64_to_cpu(jset->last_seq), 947 le64_to_cpu(jset->seq))) { 948 jset->last_seq = jset->seq; 949 return JOURNAL_ENTRY_BAD; 950 } 951 952 ret = jset_validate_entries(c, jset, flags); 953 fsck_err: 954 return ret; 955 } 956 957 static int jset_validate_early(struct bch_fs *c, 958 struct bch_dev *ca, 959 struct jset *jset, u64 sector, 960 unsigned bucket_sectors_left, 961 unsigned sectors_read) 962 { 963 struct bkey_validate_context from = { 964 .from = BKEY_VALIDATE_journal, 965 .journal_seq = le64_to_cpu(jset->seq), 966 }; 967 int ret = 0; 968 969 if (le64_to_cpu(jset->magic) != jset_magic(c)) 970 return JOURNAL_ENTRY_NONE; 971 972 unsigned version = le32_to_cpu(jset->version); 973 if (journal_entry_err_on(!bch2_version_compatible(version), 974 c, version, jset, NULL, 975 jset_unsupported_version, 976 "%s sector %llu seq %llu: unknown journal entry version %u.%u", 977 ca ? ca->name : c->name, 978 sector, le64_to_cpu(jset->seq), 979 BCH_VERSION_MAJOR(version), 980 BCH_VERSION_MINOR(version))) { 981 /* don't try to continue: */ 982 return -EINVAL; 983 } 984 985 size_t bytes = vstruct_bytes(jset); 986 if (bytes > (sectors_read << 9) && 987 sectors_read < bucket_sectors_left) 988 return JOURNAL_ENTRY_REREAD; 989 990 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 991 c, version, jset, NULL, 992 jset_past_bucket_end, 993 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 994 ca ? ca->name : c->name, 995 sector, le64_to_cpu(jset->seq), bytes)) 996 le32_add_cpu(&jset->u64s, 997 -((bytes - (bucket_sectors_left << 9)) / 8)); 998 fsck_err: 999 return ret; 1000 } 1001 1002 struct journal_read_buf { 1003 void *data; 1004 size_t size; 1005 }; 1006 1007 static int journal_read_buf_realloc(struct journal_read_buf *b, 1008 size_t new_size) 1009 { 1010 void *n; 1011 1012 /* the bios are sized for this many pages, max: */ 1013 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 1014 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 1015 1016 new_size = roundup_pow_of_two(new_size); 1017 n = kvmalloc(new_size, GFP_KERNEL); 1018 if (!n) 1019 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 1020 1021 kvfree(b->data); 1022 b->data = n; 1023 b->size = new_size; 1024 return 0; 1025 } 1026 1027 static int journal_read_bucket(struct bch_dev *ca, 1028 struct journal_read_buf *buf, 1029 struct journal_list *jlist, 1030 unsigned bucket) 1031 { 1032 struct bch_fs *c = ca->fs; 1033 struct journal_device *ja = &ca->journal; 1034 struct jset *j = NULL; 1035 unsigned sectors, sectors_read = 0; 1036 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 1037 end = offset + ca->mi.bucket_size; 1038 bool saw_bad = false, csum_good; 1039 struct printbuf err = PRINTBUF; 1040 int ret = 0; 1041 1042 pr_debug("reading %u", bucket); 1043 1044 while (offset < end) { 1045 if (!sectors_read) { 1046 struct bio *bio; 1047 unsigned nr_bvecs; 1048 reread: 1049 sectors_read = min_t(unsigned, 1050 end - offset, buf->size >> 9); 1051 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 1052 1053 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 1054 if (!bio) 1055 return -BCH_ERR_ENOMEM_journal_read_bucket; 1056 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 1057 1058 bio->bi_iter.bi_sector = offset; 1059 bch2_bio_map(bio, buf->data, sectors_read << 9); 1060 1061 u64 submit_time = local_clock(); 1062 ret = submit_bio_wait(bio); 1063 kfree(bio); 1064 1065 if (!ret && bch2_meta_read_fault("journal")) 1066 ret = -BCH_ERR_EIO_fault_injected; 1067 1068 bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, 1069 submit_time, !ret); 1070 1071 if (ret) { 1072 bch_err_dev_ratelimited(ca, 1073 "journal read error: sector %llu", offset); 1074 /* 1075 * We don't error out of the recovery process 1076 * here, since the relevant journal entry may be 1077 * found on a different device, and missing or 1078 * no journal entries will be handled later 1079 */ 1080 goto out; 1081 } 1082 1083 j = buf->data; 1084 } 1085 1086 ret = jset_validate_early(c, ca, j, offset, 1087 end - offset, sectors_read); 1088 switch (ret) { 1089 case 0: 1090 sectors = vstruct_sectors(j, c->block_bits); 1091 break; 1092 case JOURNAL_ENTRY_REREAD: 1093 if (vstruct_bytes(j) > buf->size) { 1094 ret = journal_read_buf_realloc(buf, 1095 vstruct_bytes(j)); 1096 if (ret) 1097 goto err; 1098 } 1099 goto reread; 1100 case JOURNAL_ENTRY_NONE: 1101 if (!saw_bad) 1102 goto out; 1103 /* 1104 * On checksum error we don't really trust the size 1105 * field of the journal entry we read, so try reading 1106 * again at next block boundary: 1107 */ 1108 sectors = block_sectors(c); 1109 goto next_block; 1110 default: 1111 goto err; 1112 } 1113 1114 if (le64_to_cpu(j->seq) > ja->highest_seq_found) { 1115 ja->highest_seq_found = le64_to_cpu(j->seq); 1116 ja->cur_idx = bucket; 1117 ja->sectors_free = ca->mi.bucket_size - 1118 bucket_remainder(ca, offset) - sectors; 1119 } 1120 1121 /* 1122 * This happens sometimes if we don't have discards on - 1123 * when we've partially overwritten a bucket with new 1124 * journal entries. We don't need the rest of the 1125 * bucket: 1126 */ 1127 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 1128 goto out; 1129 1130 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 1131 1132 enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); 1133 struct bch_csum csum; 1134 csum_good = jset_csum_good(c, j, &csum); 1135 1136 bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); 1137 1138 if (!csum_good) { 1139 bch_err_dev_ratelimited(ca, "%s", 1140 (printbuf_reset(&err), 1141 prt_str(&err, "journal "), 1142 bch2_csum_err_msg(&err, csum_type, j->csum, csum), 1143 err.buf)); 1144 saw_bad = true; 1145 } 1146 1147 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 1148 j->encrypted_start, 1149 vstruct_end(j) - (void *) j->encrypted_start); 1150 bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); 1151 1152 mutex_lock(&jlist->lock); 1153 ret = journal_entry_add(c, ca, (struct journal_ptr) { 1154 .csum_good = csum_good, 1155 .dev = ca->dev_idx, 1156 .bucket = bucket, 1157 .bucket_offset = offset - 1158 bucket_to_sector(ca, ja->buckets[bucket]), 1159 .sector = offset, 1160 }, jlist, j); 1161 mutex_unlock(&jlist->lock); 1162 1163 switch (ret) { 1164 case JOURNAL_ENTRY_ADD_OK: 1165 break; 1166 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 1167 break; 1168 default: 1169 goto err; 1170 } 1171 next_block: 1172 pr_debug("next"); 1173 offset += sectors; 1174 sectors_read -= sectors; 1175 j = ((void *) j) + (sectors << 9); 1176 } 1177 1178 out: 1179 ret = 0; 1180 err: 1181 printbuf_exit(&err); 1182 return ret; 1183 } 1184 1185 static CLOSURE_CALLBACK(bch2_journal_read_device) 1186 { 1187 closure_type(ja, struct journal_device, read); 1188 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1189 struct bch_fs *c = ca->fs; 1190 struct journal_list *jlist = 1191 container_of(cl->parent, struct journal_list, cl); 1192 struct journal_read_buf buf = { NULL, 0 }; 1193 unsigned i; 1194 int ret = 0; 1195 1196 if (!ja->nr) 1197 goto out; 1198 1199 ret = journal_read_buf_realloc(&buf, PAGE_SIZE); 1200 if (ret) 1201 goto err; 1202 1203 pr_debug("%u journal buckets", ja->nr); 1204 1205 for (i = 0; i < ja->nr; i++) { 1206 ret = journal_read_bucket(ca, &buf, jlist, i); 1207 if (ret) 1208 goto err; 1209 } 1210 1211 /* 1212 * Set dirty_idx to indicate the entire journal is full and needs to be 1213 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1214 * pinned when it first runs: 1215 */ 1216 ja->discard_idx = ja->dirty_idx_ondisk = 1217 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1218 out: 1219 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1220 kvfree(buf.data); 1221 percpu_ref_put(&ca->io_ref[READ]); 1222 closure_return(cl); 1223 return; 1224 err: 1225 mutex_lock(&jlist->lock); 1226 jlist->ret = ret; 1227 mutex_unlock(&jlist->lock); 1228 goto out; 1229 } 1230 1231 int bch2_journal_read(struct bch_fs *c, 1232 u64 *last_seq, 1233 u64 *blacklist_seq, 1234 u64 *start_seq) 1235 { 1236 struct journal_list jlist; 1237 struct journal_replay *i, **_i, *prev = NULL; 1238 struct genradix_iter radix_iter; 1239 struct printbuf buf = PRINTBUF; 1240 bool degraded = false, last_write_torn = false; 1241 u64 seq; 1242 int ret = 0; 1243 1244 closure_init_stack(&jlist.cl); 1245 mutex_init(&jlist.lock); 1246 jlist.last_seq = 0; 1247 jlist.ret = 0; 1248 1249 for_each_member_device(c, ca) { 1250 if (!c->opts.fsck && 1251 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1252 continue; 1253 1254 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1255 ca->mi.state == BCH_MEMBER_STATE_ro) && 1256 percpu_ref_tryget(&ca->io_ref[READ])) 1257 closure_call(&ca->journal.read, 1258 bch2_journal_read_device, 1259 system_unbound_wq, 1260 &jlist.cl); 1261 else 1262 degraded = true; 1263 } 1264 1265 closure_sync(&jlist.cl); 1266 1267 if (jlist.ret) 1268 return jlist.ret; 1269 1270 *last_seq = 0; 1271 *start_seq = 0; 1272 *blacklist_seq = 0; 1273 1274 /* 1275 * Find most recent flush entry, and ignore newer non flush entries - 1276 * those entries will be blacklisted: 1277 */ 1278 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1279 i = *_i; 1280 1281 if (journal_replay_ignore(i)) 1282 continue; 1283 1284 if (!*start_seq) 1285 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1286 1287 if (JSET_NO_FLUSH(&i->j)) { 1288 i->ignore_blacklisted = true; 1289 continue; 1290 } 1291 1292 if (!last_write_torn && !i->csum_good) { 1293 last_write_torn = true; 1294 i->ignore_blacklisted = true; 1295 continue; 1296 } 1297 1298 struct bkey_validate_context from = { 1299 .from = BKEY_VALIDATE_journal, 1300 .journal_seq = le64_to_cpu(i->j.seq), 1301 }; 1302 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1303 c, le32_to_cpu(i->j.version), &i->j, NULL, 1304 jset_last_seq_newer_than_seq, 1305 "invalid journal entry: last_seq > seq (%llu > %llu)", 1306 le64_to_cpu(i->j.last_seq), 1307 le64_to_cpu(i->j.seq))) 1308 i->j.last_seq = i->j.seq; 1309 1310 *last_seq = le64_to_cpu(i->j.last_seq); 1311 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1312 break; 1313 } 1314 1315 if (!*start_seq) { 1316 bch_info(c, "journal read done, but no entries found"); 1317 return 0; 1318 } 1319 1320 if (!*last_seq) { 1321 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, 1322 "journal read done, but no entries found after dropping non-flushes"); 1323 return 0; 1324 } 1325 1326 bch_info(c, "journal read done, replaying entries %llu-%llu", 1327 *last_seq, *blacklist_seq - 1); 1328 1329 if (*start_seq != *blacklist_seq) 1330 bch_info(c, "dropped unflushed entries %llu-%llu", 1331 *blacklist_seq, *start_seq - 1); 1332 1333 /* Drop blacklisted entries and entries older than last_seq: */ 1334 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1335 i = *_i; 1336 1337 if (journal_replay_ignore(i)) 1338 continue; 1339 1340 seq = le64_to_cpu(i->j.seq); 1341 if (seq < *last_seq) { 1342 journal_replay_free(c, i, false); 1343 continue; 1344 } 1345 1346 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1347 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1348 jset_seq_blacklisted, 1349 "found blacklisted journal entry %llu", seq); 1350 i->ignore_blacklisted = true; 1351 } 1352 } 1353 1354 /* Check for missing entries: */ 1355 seq = *last_seq; 1356 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1357 i = *_i; 1358 1359 if (journal_replay_ignore(i)) 1360 continue; 1361 1362 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1363 1364 while (seq < le64_to_cpu(i->j.seq)) { 1365 u64 missing_start, missing_end; 1366 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 1367 1368 while (seq < le64_to_cpu(i->j.seq) && 1369 bch2_journal_seq_is_blacklisted(c, seq, false)) 1370 seq++; 1371 1372 if (seq == le64_to_cpu(i->j.seq)) 1373 break; 1374 1375 missing_start = seq; 1376 1377 while (seq < le64_to_cpu(i->j.seq) && 1378 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1379 seq++; 1380 1381 if (prev) { 1382 bch2_journal_ptrs_to_text(&buf1, c, prev); 1383 prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1384 } else 1385 prt_printf(&buf1, "(none)"); 1386 bch2_journal_ptrs_to_text(&buf2, c, i); 1387 1388 missing_end = seq - 1; 1389 fsck_err(c, journal_entries_missing, 1390 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" 1391 "prev at %s\n" 1392 "next at %s, continue?", 1393 missing_start, missing_end, 1394 *last_seq, *blacklist_seq - 1, 1395 buf1.buf, buf2.buf); 1396 1397 printbuf_exit(&buf1); 1398 printbuf_exit(&buf2); 1399 } 1400 1401 prev = i; 1402 seq++; 1403 } 1404 1405 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1406 struct bch_replicas_padded replicas = { 1407 .e.data_type = BCH_DATA_journal, 1408 .e.nr_devs = 0, 1409 .e.nr_required = 1, 1410 }; 1411 1412 i = *_i; 1413 if (journal_replay_ignore(i)) 1414 continue; 1415 1416 darray_for_each(i->ptrs, ptr) { 1417 struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); 1418 1419 if (!ptr->csum_good) 1420 bch_err_dev_offset(ca, ptr->sector, 1421 "invalid journal checksum, seq %llu%s", 1422 le64_to_cpu(i->j.seq), 1423 i->csum_good ? " (had good copy on another device)" : ""); 1424 } 1425 1426 ret = jset_validate(c, 1427 bch2_dev_have_ref(c, i->ptrs.data[0].dev), 1428 &i->j, 1429 i->ptrs.data[0].sector, 1430 READ); 1431 if (ret) 1432 goto err; 1433 1434 darray_for_each(i->ptrs, ptr) 1435 replicas_entry_add_dev(&replicas.e, ptr->dev); 1436 1437 bch2_replicas_entry_sort(&replicas.e); 1438 1439 printbuf_reset(&buf); 1440 bch2_replicas_entry_to_text(&buf, &replicas.e); 1441 1442 if (!degraded && 1443 !bch2_replicas_marked(c, &replicas.e) && 1444 (le64_to_cpu(i->j.seq) == *last_seq || 1445 fsck_err(c, journal_entry_replicas_not_marked, 1446 "superblock not marked as containing replicas for journal entry %llu\n%s", 1447 le64_to_cpu(i->j.seq), buf.buf))) { 1448 ret = bch2_mark_replicas(c, &replicas.e); 1449 if (ret) 1450 goto err; 1451 } 1452 } 1453 err: 1454 fsck_err: 1455 printbuf_exit(&buf); 1456 return ret; 1457 } 1458 1459 /* journal write: */ 1460 1461 static void journal_advance_devs_to_next_bucket(struct journal *j, 1462 struct dev_alloc_list *devs, 1463 unsigned sectors, __le64 seq) 1464 { 1465 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1466 1467 darray_for_each(*devs, i) { 1468 struct bch_dev *ca = rcu_dereference(c->devs[*i]); 1469 if (!ca) 1470 continue; 1471 1472 struct journal_device *ja = &ca->journal; 1473 1474 if (sectors > ja->sectors_free && 1475 sectors <= ca->mi.bucket_size && 1476 bch2_journal_dev_buckets_available(j, ja, 1477 journal_space_discarded)) { 1478 ja->cur_idx = (ja->cur_idx + 1) % ja->nr; 1479 ja->sectors_free = ca->mi.bucket_size; 1480 1481 /* 1482 * ja->bucket_seq[ja->cur_idx] must always have 1483 * something sensible: 1484 */ 1485 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); 1486 } 1487 } 1488 } 1489 1490 static void __journal_write_alloc(struct journal *j, 1491 struct journal_buf *w, 1492 struct dev_alloc_list *devs, 1493 unsigned sectors, 1494 unsigned *replicas, 1495 unsigned replicas_want) 1496 { 1497 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1498 1499 darray_for_each(*devs, i) { 1500 struct bch_dev *ca = rcu_dereference(c->devs[*i]); 1501 if (!ca) 1502 continue; 1503 1504 struct journal_device *ja = &ca->journal; 1505 1506 /* 1507 * Check that we can use this device, and aren't already using 1508 * it: 1509 */ 1510 if (!ca->mi.durability || 1511 ca->mi.state != BCH_MEMBER_STATE_rw || 1512 !ja->nr || 1513 bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1514 sectors > ja->sectors_free) 1515 continue; 1516 1517 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1518 1519 bch2_bkey_append_ptr(&w->key, 1520 (struct bch_extent_ptr) { 1521 .offset = bucket_to_sector(ca, 1522 ja->buckets[ja->cur_idx]) + 1523 ca->mi.bucket_size - 1524 ja->sectors_free, 1525 .dev = ca->dev_idx, 1526 }); 1527 1528 ja->sectors_free -= sectors; 1529 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1530 1531 *replicas += ca->mi.durability; 1532 1533 if (*replicas >= replicas_want) 1534 break; 1535 } 1536 } 1537 1538 /** 1539 * journal_write_alloc - decide where to write next journal entry 1540 * 1541 * @j: journal object 1542 * @w: journal buf (entry to be written) 1543 * 1544 * Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure 1545 */ 1546 static int journal_write_alloc(struct journal *j, struct journal_buf *w) 1547 { 1548 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1549 struct bch_devs_mask devs; 1550 struct dev_alloc_list devs_sorted; 1551 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1552 unsigned target = c->opts.metadata_target ?: 1553 c->opts.foreground_target; 1554 unsigned replicas = 0, replicas_want = 1555 READ_ONCE(c->opts.metadata_replicas); 1556 unsigned replicas_need = min_t(unsigned, replicas_want, 1557 READ_ONCE(c->opts.metadata_replicas_required)); 1558 bool advance_done = false; 1559 1560 rcu_read_lock(); 1561 1562 /* We might run more than once if we have to stop and do discards: */ 1563 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); 1564 bkey_for_each_ptr(ptrs, p) { 1565 struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); 1566 if (ca) 1567 replicas += ca->mi.durability; 1568 } 1569 1570 retry_target: 1571 devs = target_rw_devs(c, BCH_DATA_journal, target); 1572 devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); 1573 retry_alloc: 1574 __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); 1575 1576 if (likely(replicas >= replicas_want)) 1577 goto done; 1578 1579 if (!advance_done) { 1580 journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); 1581 advance_done = true; 1582 goto retry_alloc; 1583 } 1584 1585 if (replicas < replicas_want && target) { 1586 /* Retry from all devices: */ 1587 target = 0; 1588 advance_done = false; 1589 goto retry_target; 1590 } 1591 done: 1592 rcu_read_unlock(); 1593 1594 BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); 1595 1596 return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; 1597 } 1598 1599 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1600 { 1601 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1602 1603 /* we aren't holding j->lock: */ 1604 unsigned new_size = READ_ONCE(j->buf_size_want); 1605 void *new_buf; 1606 1607 if (buf->buf_size >= new_size) 1608 return; 1609 1610 size_t btree_write_buffer_size = new_size / 64; 1611 1612 if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) 1613 return; 1614 1615 new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1616 if (!new_buf) 1617 return; 1618 1619 memcpy(new_buf, buf->data, buf->buf_size); 1620 1621 spin_lock(&j->lock); 1622 swap(buf->data, new_buf); 1623 swap(buf->buf_size, new_size); 1624 spin_unlock(&j->lock); 1625 1626 kvfree(new_buf); 1627 } 1628 1629 static CLOSURE_CALLBACK(journal_write_done) 1630 { 1631 closure_type(w, struct journal_buf, io); 1632 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1633 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1634 struct bch_replicas_padded replicas; 1635 u64 seq = le64_to_cpu(w->data->seq); 1636 int err = 0; 1637 1638 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) 1639 ? j->flush_write_time 1640 : j->noflush_write_time, j->write_start_time); 1641 1642 if (!w->devs_written.nr) { 1643 if (!bch2_journal_error(j)) 1644 bch_err(c, "unable to write journal to sufficient devices"); 1645 err = -BCH_ERR_journal_write_err; 1646 } else { 1647 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1648 w->devs_written); 1649 err = bch2_mark_replicas(c, &replicas.e); 1650 } 1651 1652 if (err) 1653 bch2_fatal_error(c); 1654 1655 closure_debug_destroy(cl); 1656 1657 spin_lock(&j->lock); 1658 if (seq >= j->pin.front) 1659 journal_seq_pin(j, seq)->devs = w->devs_written; 1660 if (err && (!j->err_seq || seq < j->err_seq)) 1661 j->err_seq = seq; 1662 w->write_done = true; 1663 1664 if (!j->free_buf || j->free_buf_size < w->buf_size) { 1665 swap(j->free_buf, w->data); 1666 swap(j->free_buf_size, w->buf_size); 1667 } 1668 1669 if (w->data) { 1670 void *buf = w->data; 1671 w->data = NULL; 1672 w->buf_size = 0; 1673 1674 spin_unlock(&j->lock); 1675 kvfree(buf); 1676 spin_lock(&j->lock); 1677 } 1678 1679 bool completed = false; 1680 bool do_discards = false; 1681 1682 for (seq = journal_last_unwritten_seq(j); 1683 seq <= journal_cur_seq(j); 1684 seq++) { 1685 w = j->buf + (seq & JOURNAL_BUF_MASK); 1686 if (!w->write_done) 1687 break; 1688 1689 if (!j->err_seq && !w->noflush) { 1690 j->flushed_seq_ondisk = seq; 1691 j->last_seq_ondisk = w->last_seq; 1692 1693 closure_wake_up(&c->freelist_wait); 1694 bch2_reset_alloc_cursors(c); 1695 } 1696 1697 j->seq_ondisk = seq; 1698 1699 /* 1700 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1701 * more buckets: 1702 * 1703 * Must come before signaling write completion, for 1704 * bch2_fs_journal_stop(): 1705 */ 1706 if (j->watermark != BCH_WATERMARK_stripe) 1707 journal_reclaim_kick(&c->journal); 1708 1709 closure_wake_up(&w->wait); 1710 completed = true; 1711 } 1712 1713 if (completed) { 1714 bch2_journal_reclaim_fast(j); 1715 bch2_journal_space_available(j); 1716 1717 track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); 1718 1719 journal_wake(j); 1720 } 1721 1722 if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1723 j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1724 struct journal_buf *buf = journal_cur_buf(j); 1725 long delta = buf->expires - jiffies; 1726 1727 /* 1728 * We don't close a journal entry to write it while there's 1729 * previous entries still in flight - the current journal entry 1730 * might want to be written now: 1731 */ 1732 mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); 1733 } 1734 1735 /* 1736 * We don't typically trigger journal writes from her - the next journal 1737 * write will be triggered immediately after the previous one is 1738 * allocated, in bch2_journal_write() - but the journal write error path 1739 * is special: 1740 */ 1741 bch2_journal_do_writes(j); 1742 spin_unlock(&j->lock); 1743 1744 if (do_discards) 1745 bch2_do_discards(c); 1746 } 1747 1748 static void journal_write_endio(struct bio *bio) 1749 { 1750 struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); 1751 struct bch_dev *ca = jbio->ca; 1752 struct journal *j = &ca->fs->journal; 1753 struct journal_buf *w = j->buf + jbio->buf_idx; 1754 1755 bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, 1756 jbio->submit_time, !bio->bi_status); 1757 1758 if (bio->bi_status) { 1759 bch_err_dev_ratelimited(ca, 1760 "error writing journal entry %llu: %s", 1761 le64_to_cpu(w->data->seq), 1762 bch2_blk_status_to_str(bio->bi_status)); 1763 1764 unsigned long flags; 1765 spin_lock_irqsave(&j->err_lock, flags); 1766 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); 1767 spin_unlock_irqrestore(&j->err_lock, flags); 1768 } 1769 1770 closure_put(&w->io); 1771 percpu_ref_put(&ca->io_ref[WRITE]); 1772 } 1773 1774 static CLOSURE_CALLBACK(journal_write_submit) 1775 { 1776 closure_type(w, struct journal_buf, io); 1777 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1778 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1779 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1780 1781 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { 1782 struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); 1783 if (!ca) { 1784 /* XXX: fix this */ 1785 bch_err(c, "missing device for journal write\n"); 1786 continue; 1787 } 1788 1789 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], 1790 sectors); 1791 1792 struct journal_device *ja = &ca->journal; 1793 struct journal_bio *jbio = ja->bio[w->idx]; 1794 struct bio *bio = &jbio->bio; 1795 1796 jbio->submit_time = local_clock(); 1797 1798 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1799 bio->bi_iter.bi_sector = ptr->offset; 1800 bio->bi_end_io = journal_write_endio; 1801 bio->bi_private = ca; 1802 bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); 1803 1804 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1805 ca->prev_journal_sector = bio->bi_iter.bi_sector; 1806 1807 if (!JSET_NO_FLUSH(w->data)) 1808 bio->bi_opf |= REQ_FUA; 1809 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) 1810 bio->bi_opf |= REQ_PREFLUSH; 1811 1812 bch2_bio_map(bio, w->data, sectors << 9); 1813 1814 trace_and_count(c, journal_write, bio); 1815 closure_bio_submit(bio, cl); 1816 1817 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1818 } 1819 1820 continue_at(cl, journal_write_done, j->wq); 1821 } 1822 1823 static CLOSURE_CALLBACK(journal_write_preflush) 1824 { 1825 closure_type(w, struct journal_buf, io); 1826 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1827 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1828 1829 /* 1830 * Wait for previous journal writes to comelete; they won't necessarily 1831 * be flushed if they're still in flight 1832 */ 1833 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1834 spin_lock(&j->lock); 1835 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1836 closure_wait(&j->async_wait, cl); 1837 spin_unlock(&j->lock); 1838 continue_at(cl, journal_write_preflush, j->wq); 1839 return; 1840 } 1841 spin_unlock(&j->lock); 1842 } 1843 1844 if (w->separate_flush) { 1845 for_each_rw_member(c, ca) { 1846 percpu_ref_get(&ca->io_ref[WRITE]); 1847 1848 struct journal_device *ja = &ca->journal; 1849 struct bio *bio = &ja->bio[w->idx]->bio; 1850 bio_reset(bio, ca->disk_sb.bdev, 1851 REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); 1852 bio->bi_end_io = journal_write_endio; 1853 bio->bi_private = ca; 1854 closure_bio_submit(bio, cl); 1855 } 1856 1857 continue_at(cl, journal_write_submit, j->wq); 1858 } else { 1859 /* 1860 * no need to punt to another work item if we're not waiting on 1861 * preflushes 1862 */ 1863 journal_write_submit(&cl->work); 1864 } 1865 } 1866 1867 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) 1868 { 1869 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1870 struct jset_entry *start, *end; 1871 struct jset *jset = w->data; 1872 struct journal_keys_to_wb wb = { NULL }; 1873 unsigned sectors, bytes, u64s; 1874 unsigned long btree_roots_have = 0; 1875 bool validate_before_checksum = false; 1876 u64 seq = le64_to_cpu(jset->seq); 1877 int ret; 1878 1879 /* 1880 * Simple compaction, dropping empty jset_entries (from journal 1881 * reservations that weren't fully used) and merging jset_entries that 1882 * can be. 1883 * 1884 * If we wanted to be really fancy here, we could sort all the keys in 1885 * the jset and drop keys that were overwritten - probably not worth it: 1886 */ 1887 vstruct_for_each(jset, i) { 1888 unsigned u64s = le16_to_cpu(i->u64s); 1889 1890 /* Empty entry: */ 1891 if (!u64s) 1892 continue; 1893 1894 /* 1895 * New btree roots are set by journalling them; when the journal 1896 * entry gets written we have to propagate them to 1897 * c->btree_roots 1898 * 1899 * But, every journal entry we write has to contain all the 1900 * btree roots (at least for now); so after we copy btree roots 1901 * to c->btree_roots we have to get any missing btree roots and 1902 * add them to this journal entry: 1903 */ 1904 switch (i->type) { 1905 case BCH_JSET_ENTRY_btree_root: 1906 bch2_journal_entry_to_btree_root(c, i); 1907 __set_bit(i->btree_id, &btree_roots_have); 1908 break; 1909 case BCH_JSET_ENTRY_write_buffer_keys: 1910 EBUG_ON(!w->need_flush_to_write_buffer); 1911 1912 if (!wb.wb) 1913 bch2_journal_keys_to_write_buffer_start(c, &wb, seq); 1914 1915 jset_entry_for_each_key(i, k) { 1916 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); 1917 if (ret) { 1918 bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", 1919 bch2_err_str(ret)); 1920 bch2_journal_keys_to_write_buffer_end(c, &wb); 1921 return ret; 1922 } 1923 } 1924 i->type = BCH_JSET_ENTRY_btree_keys; 1925 break; 1926 } 1927 } 1928 1929 if (wb.wb) { 1930 ret = bch2_journal_keys_to_write_buffer_end(c, &wb); 1931 if (ret) { 1932 bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", 1933 bch2_err_str(ret)); 1934 return ret; 1935 } 1936 } 1937 1938 spin_lock(&c->journal.lock); 1939 w->need_flush_to_write_buffer = false; 1940 spin_unlock(&c->journal.lock); 1941 1942 start = end = vstruct_last(jset); 1943 1944 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1945 1946 struct jset_entry_datetime *d = 1947 container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); 1948 d->entry.type = BCH_JSET_ENTRY_datetime; 1949 d->seconds = cpu_to_le64(ktime_get_real_seconds()); 1950 1951 bch2_journal_super_entries_add_common(c, &end, seq); 1952 u64s = (u64 *) end - (u64 *) start; 1953 1954 WARN_ON(u64s > j->entry_u64s_reserved); 1955 1956 le32_add_cpu(&jset->u64s, u64s); 1957 1958 sectors = vstruct_sectors(jset, c->block_bits); 1959 bytes = vstruct_bytes(jset); 1960 1961 if (sectors > w->sectors) { 1962 bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 1963 vstruct_bytes(jset), w->sectors << 9, 1964 u64s, w->u64s_reserved, j->entry_u64s_reserved); 1965 return -EINVAL; 1966 } 1967 1968 jset->magic = cpu_to_le64(jset_magic(c)); 1969 jset->version = cpu_to_le32(c->sb.version); 1970 1971 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 1972 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1973 1974 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1975 j->last_empty_seq = seq; 1976 1977 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1978 validate_before_checksum = true; 1979 1980 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 1981 validate_before_checksum = true; 1982 1983 if (validate_before_checksum && 1984 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1985 return ret; 1986 1987 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), 1988 jset->encrypted_start, 1989 vstruct_end(jset) - (void *) jset->encrypted_start); 1990 if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) 1991 return ret; 1992 1993 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), 1994 journal_nonce(jset), jset); 1995 1996 if (!validate_before_checksum && 1997 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1998 return ret; 1999 2000 memset((void *) jset + bytes, 0, (sectors << 9) - bytes); 2001 return 0; 2002 } 2003 2004 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) 2005 { 2006 struct bch_fs *c = container_of(j, struct bch_fs, journal); 2007 int error = bch2_journal_error(j); 2008 2009 /* 2010 * If the journal is in an error state - we did an emergency shutdown - 2011 * we prefer to continue doing journal writes. We just mark them as 2012 * noflush so they'll never be used, but they'll still be visible by the 2013 * list_journal tool - this helps in debugging. 2014 * 2015 * There's a caveat: the first journal write after marking the 2016 * superblock dirty must always be a flush write, because on startup 2017 * from a clean shutdown we didn't necessarily read the journal and the 2018 * new journal write might overwrite whatever was in the journal 2019 * previously - we can't leave the journal without any flush writes in 2020 * it. 2021 * 2022 * So if we're in an error state, and we're still starting up, we don't 2023 * write anything at all. 2024 */ 2025 if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) 2026 return error; 2027 2028 if (error || 2029 w->noflush || 2030 (!w->must_flush && 2031 time_before(jiffies, j->last_flush_write + 2032 msecs_to_jiffies(c->opts.journal_flush_delay)) && 2033 test_bit(JOURNAL_may_skip_flush, &j->flags))) { 2034 w->noflush = true; 2035 SET_JSET_NO_FLUSH(w->data, true); 2036 w->data->last_seq = 0; 2037 w->last_seq = 0; 2038 2039 j->nr_noflush_writes++; 2040 } else { 2041 w->must_flush = true; 2042 j->last_flush_write = jiffies; 2043 j->nr_flush_writes++; 2044 clear_bit(JOURNAL_need_flush_write, &j->flags); 2045 } 2046 2047 return 0; 2048 } 2049 2050 CLOSURE_CALLBACK(bch2_journal_write) 2051 { 2052 closure_type(w, struct journal_buf, io); 2053 struct journal *j = container_of(w, struct journal, buf[w->idx]); 2054 struct bch_fs *c = container_of(j, struct bch_fs, journal); 2055 struct bch_replicas_padded replicas; 2056 unsigned nr_rw_members = 0; 2057 int ret; 2058 2059 for_each_rw_member(c, ca) 2060 nr_rw_members++; 2061 2062 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 2063 BUG_ON(!w->write_started); 2064 BUG_ON(w->write_allocated); 2065 BUG_ON(w->write_done); 2066 2067 j->write_start_time = local_clock(); 2068 2069 spin_lock(&j->lock); 2070 if (nr_rw_members > 1) 2071 w->separate_flush = true; 2072 2073 ret = bch2_journal_write_pick_flush(j, w); 2074 spin_unlock(&j->lock); 2075 if (ret) 2076 goto err; 2077 2078 mutex_lock(&j->buf_lock); 2079 journal_buf_realloc(j, w); 2080 2081 ret = bch2_journal_write_prep(j, w); 2082 mutex_unlock(&j->buf_lock); 2083 if (ret) 2084 goto err; 2085 2086 j->entry_bytes_written += vstruct_bytes(w->data); 2087 2088 while (1) { 2089 spin_lock(&j->lock); 2090 ret = journal_write_alloc(j, w); 2091 if (!ret || !j->can_discard) 2092 break; 2093 2094 spin_unlock(&j->lock); 2095 bch2_journal_do_discards(j); 2096 } 2097 2098 if (ret && !bch2_journal_error(j)) { 2099 struct printbuf buf = PRINTBUF; 2100 buf.atomic++; 2101 2102 __bch2_journal_debug_to_text(&buf, j); 2103 spin_unlock(&j->lock); 2104 prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), 2105 le64_to_cpu(w->data->seq), 2106 vstruct_sectors(w->data, c->block_bits), 2107 bch2_err_str(ret)); 2108 bch2_print_string_as_lines(KERN_ERR, buf.buf); 2109 printbuf_exit(&buf); 2110 } 2111 if (ret) 2112 goto err; 2113 2114 /* 2115 * write is allocated, no longer need to account for it in 2116 * bch2_journal_space_available(): 2117 */ 2118 w->sectors = 0; 2119 w->write_allocated = true; 2120 2121 /* 2122 * journal entry has been compacted and allocated, recalculate space 2123 * available: 2124 */ 2125 bch2_journal_space_available(j); 2126 bch2_journal_do_writes(j); 2127 spin_unlock(&j->lock); 2128 2129 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); 2130 2131 if (c->opts.nochanges) 2132 goto no_io; 2133 2134 /* 2135 * Mark journal replicas before we submit the write to guarantee 2136 * recovery will find the journal entries after a crash. 2137 */ 2138 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 2139 w->devs_written); 2140 ret = bch2_mark_replicas(c, &replicas.e); 2141 if (ret) 2142 goto err; 2143 2144 if (!JSET_NO_FLUSH(w->data)) 2145 continue_at(cl, journal_write_preflush, j->wq); 2146 else 2147 continue_at(cl, journal_write_submit, j->wq); 2148 return; 2149 no_io: 2150 continue_at(cl, journal_write_done, j->wq); 2151 return; 2152 err: 2153 bch2_fatal_error(c); 2154 continue_at(cl, journal_write_done, j->wq); 2155 } 2156