1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "btree_io.h" 6 #include "btree_update_interior.h" 7 #include "btree_write_buffer.h" 8 #include "buckets.h" 9 #include "checksum.h" 10 #include "disk_groups.h" 11 #include "error.h" 12 #include "journal.h" 13 #include "journal_io.h" 14 #include "journal_reclaim.h" 15 #include "journal_seq_blacklist.h" 16 #include "replicas.h" 17 #include "sb-clean.h" 18 #include "trace.h" 19 20 void bch2_journal_pos_from_member_info_set(struct bch_fs *c) 21 { 22 lockdep_assert_held(&c->sb_lock); 23 24 for_each_member_device(c, ca) { 25 struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); 26 27 m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); 28 m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); 29 } 30 } 31 32 void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) 33 { 34 mutex_lock(&c->sb_lock); 35 for_each_member_device(c, ca) { 36 struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); 37 38 unsigned idx = le32_to_cpu(m.last_journal_bucket); 39 if (idx < ca->journal.nr) 40 ca->journal.cur_idx = idx; 41 unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); 42 if (offset <= ca->mi.bucket_size) 43 ca->journal.sectors_free = ca->mi.bucket_size - offset; 44 } 45 mutex_unlock(&c->sb_lock); 46 } 47 48 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 49 struct journal_replay *j) 50 { 51 darray_for_each(j->ptrs, i) { 52 if (i != j->ptrs.data) 53 prt_printf(out, " "); 54 prt_printf(out, "%u:%u:%u (sector %llu)", 55 i->dev, i->bucket, i->bucket_offset, i->sector); 56 } 57 } 58 59 static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, 60 struct journal_replay *j) 61 { 62 prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); 63 64 bch2_journal_ptrs_to_text(out, c, j); 65 66 for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { 67 struct jset_entry_datetime *datetime = 68 container_of(entry, struct jset_entry_datetime, entry); 69 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 70 break; 71 } 72 } 73 74 static struct nonce journal_nonce(const struct jset *jset) 75 { 76 return (struct nonce) {{ 77 [0] = 0, 78 [1] = ((__le32 *) &jset->seq)[0], 79 [2] = ((__le32 *) &jset->seq)[1], 80 [3] = BCH_NONCE_JOURNAL, 81 }}; 82 } 83 84 static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) 85 { 86 if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { 87 *csum = (struct bch_csum) {}; 88 return false; 89 } 90 91 *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); 92 return !bch2_crc_cmp(j->csum, *csum); 93 } 94 95 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) 96 { 97 return (seq - c->journal_entries_base_seq) & (~0U >> 1); 98 } 99 100 static void __journal_replay_free(struct bch_fs *c, 101 struct journal_replay *i) 102 { 103 struct journal_replay **p = 104 genradix_ptr(&c->journal_entries, 105 journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); 106 107 BUG_ON(*p != i); 108 *p = NULL; 109 kvfree(i); 110 } 111 112 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) 113 { 114 if (blacklisted) 115 i->ignore_blacklisted = true; 116 else 117 i->ignore_not_dirty = true; 118 119 if (!c->opts.read_entire_journal) 120 __journal_replay_free(c, i); 121 } 122 123 struct journal_list { 124 struct closure cl; 125 u64 last_seq; 126 struct mutex lock; 127 int ret; 128 }; 129 130 #define JOURNAL_ENTRY_ADD_OK 0 131 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 132 133 /* 134 * Given a journal entry we just read, add it to the list of journal entries to 135 * be replayed: 136 */ 137 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, 138 struct journal_ptr entry_ptr, 139 struct journal_list *jlist, struct jset *j) 140 { 141 struct genradix_iter iter; 142 struct journal_replay **_i, *i, *dup; 143 size_t bytes = vstruct_bytes(j); 144 u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; 145 struct printbuf buf = PRINTBUF; 146 int ret = JOURNAL_ENTRY_ADD_OK; 147 148 if (!c->journal.oldest_seq_found_ondisk || 149 le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) 150 c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); 151 152 /* Is this entry older than the range we need? */ 153 if (!c->opts.read_entire_journal && 154 le64_to_cpu(j->seq) < jlist->last_seq) 155 return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; 156 157 /* 158 * genradixes are indexed by a ulong, not a u64, so we can't index them 159 * by sequence number directly: Assume instead that they will all fall 160 * within the range of +-2billion of the filrst one we find. 161 */ 162 if (!c->journal_entries_base_seq) 163 c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); 164 165 /* Drop entries we don't need anymore */ 166 if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { 167 genradix_for_each_from(&c->journal_entries, iter, _i, 168 journal_entry_radix_idx(c, jlist->last_seq)) { 169 i = *_i; 170 171 if (journal_replay_ignore(i)) 172 continue; 173 174 if (le64_to_cpu(i->j.seq) >= last_seq) 175 break; 176 177 journal_replay_free(c, i, false); 178 } 179 } 180 181 jlist->last_seq = max(jlist->last_seq, last_seq); 182 183 _i = genradix_ptr_alloc(&c->journal_entries, 184 journal_entry_radix_idx(c, le64_to_cpu(j->seq)), 185 GFP_KERNEL); 186 if (!_i) 187 return -BCH_ERR_ENOMEM_journal_entry_add; 188 189 /* 190 * Duplicate journal entries? If so we want the one that didn't have a 191 * checksum error: 192 */ 193 dup = *_i; 194 if (dup) { 195 bool identical = bytes == vstruct_bytes(&dup->j) && 196 !memcmp(j, &dup->j, bytes); 197 bool not_identical = !identical && 198 entry_ptr.csum_good && 199 dup->csum_good; 200 201 bool same_device = false; 202 darray_for_each(dup->ptrs, ptr) 203 if (ptr->dev == ca->dev_idx) 204 same_device = true; 205 206 ret = darray_push(&dup->ptrs, entry_ptr); 207 if (ret) 208 goto out; 209 210 bch2_journal_replay_to_text(&buf, c, dup); 211 212 fsck_err_on(same_device, 213 c, journal_entry_dup_same_device, 214 "duplicate journal entry on same device\n %s", 215 buf.buf); 216 217 fsck_err_on(not_identical, 218 c, journal_entry_replicas_data_mismatch, 219 "found duplicate but non identical journal entries\n %s", 220 buf.buf); 221 222 if (entry_ptr.csum_good && !identical) 223 goto replace; 224 225 goto out; 226 } 227 replace: 228 i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 229 if (!i) 230 return -BCH_ERR_ENOMEM_journal_entry_add; 231 232 darray_init(&i->ptrs); 233 i->csum_good = entry_ptr.csum_good; 234 i->ignore_blacklisted = false; 235 i->ignore_not_dirty = false; 236 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 237 238 if (dup) { 239 /* The first ptr should represent the jset we kept: */ 240 darray_for_each(dup->ptrs, ptr) 241 darray_push(&i->ptrs, *ptr); 242 __journal_replay_free(c, dup); 243 } else { 244 darray_push(&i->ptrs, entry_ptr); 245 } 246 247 *_i = i; 248 out: 249 fsck_err: 250 printbuf_exit(&buf); 251 return ret; 252 } 253 254 /* this fills in a range with empty jset_entries: */ 255 static void journal_entry_null_range(void *start, void *end) 256 { 257 struct jset_entry *entry; 258 259 for (entry = start; entry != end; entry = vstruct_next(entry)) 260 memset(entry, 0, sizeof(*entry)); 261 } 262 263 #define JOURNAL_ENTRY_REREAD 5 264 #define JOURNAL_ENTRY_NONE 6 265 #define JOURNAL_ENTRY_BAD 7 266 267 static void journal_entry_err_msg(struct printbuf *out, 268 u32 version, 269 struct jset *jset, 270 struct jset_entry *entry) 271 { 272 prt_str(out, "invalid journal entry, version="); 273 bch2_version_to_text(out, version); 274 275 if (entry) { 276 prt_str(out, " type="); 277 bch2_prt_jset_entry_type(out, entry->type); 278 } 279 280 if (!jset) { 281 prt_printf(out, " in superblock"); 282 } else { 283 284 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); 285 286 if (entry) 287 prt_printf(out, " offset=%zi/%u", 288 (u64 *) entry - jset->_data, 289 le32_to_cpu(jset->u64s)); 290 } 291 292 prt_str(out, ": "); 293 } 294 295 #define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ 296 ({ \ 297 struct printbuf _buf = PRINTBUF; \ 298 \ 299 journal_entry_err_msg(&_buf, version, jset, entry); \ 300 prt_printf(&_buf, msg, ##__VA_ARGS__); \ 301 \ 302 switch (flags & BCH_VALIDATE_write) { \ 303 case READ: \ 304 mustfix_fsck_err(c, _err, "%s", _buf.buf); \ 305 break; \ 306 case WRITE: \ 307 bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ 308 bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ 309 if (bch2_fs_inconsistent(c)) { \ 310 ret = -BCH_ERR_fsck_errors_not_fixed; \ 311 goto fsck_err; \ 312 } \ 313 break; \ 314 } \ 315 \ 316 printbuf_exit(&_buf); \ 317 true; \ 318 }) 319 320 #define journal_entry_err_on(cond, ...) \ 321 ((cond) ? journal_entry_err(__VA_ARGS__) : false) 322 323 #define FSCK_DELETED_KEY 5 324 325 static int journal_validate_key(struct bch_fs *c, 326 struct jset *jset, 327 struct jset_entry *entry, 328 unsigned level, enum btree_id btree_id, 329 struct bkey_i *k, 330 unsigned version, int big_endian, 331 enum bch_validate_flags flags) 332 { 333 int write = flags & BCH_VALIDATE_write; 334 void *next = vstruct_next(entry); 335 int ret = 0; 336 337 if (journal_entry_err_on(!k->k.u64s, 338 c, version, jset, entry, 339 journal_entry_bkey_u64s_0, 340 "k->u64s 0")) { 341 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 342 journal_entry_null_range(vstruct_next(entry), next); 343 return FSCK_DELETED_KEY; 344 } 345 346 if (journal_entry_err_on((void *) bkey_next(k) > 347 (void *) vstruct_next(entry), 348 c, version, jset, entry, 349 journal_entry_bkey_past_end, 350 "extends past end of journal entry")) { 351 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 352 journal_entry_null_range(vstruct_next(entry), next); 353 return FSCK_DELETED_KEY; 354 } 355 356 if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, 357 c, version, jset, entry, 358 journal_entry_bkey_bad_format, 359 "bad format %u", k->k.format)) { 360 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 361 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 362 journal_entry_null_range(vstruct_next(entry), next); 363 return FSCK_DELETED_KEY; 364 } 365 366 if (!write) 367 bch2_bkey_compat(level, btree_id, version, big_endian, 368 write, NULL, bkey_to_packed(k)); 369 370 ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), 371 __btree_node_type(level, btree_id), write); 372 if (ret == -BCH_ERR_fsck_delete_bkey) { 373 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 374 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 375 journal_entry_null_range(vstruct_next(entry), next); 376 return FSCK_DELETED_KEY; 377 } 378 if (ret) 379 goto fsck_err; 380 381 if (write) 382 bch2_bkey_compat(level, btree_id, version, big_endian, 383 write, NULL, bkey_to_packed(k)); 384 fsck_err: 385 return ret; 386 } 387 388 static int journal_entry_btree_keys_validate(struct bch_fs *c, 389 struct jset *jset, 390 struct jset_entry *entry, 391 unsigned version, int big_endian, 392 enum bch_validate_flags flags) 393 { 394 struct bkey_i *k = entry->start; 395 396 while (k != vstruct_last(entry)) { 397 int ret = journal_validate_key(c, jset, entry, 398 entry->level, 399 entry->btree_id, 400 k, version, big_endian, 401 flags|BCH_VALIDATE_journal); 402 if (ret == FSCK_DELETED_KEY) 403 continue; 404 else if (ret) 405 return ret; 406 407 k = bkey_next(k); 408 } 409 410 return 0; 411 } 412 413 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 414 struct jset_entry *entry) 415 { 416 bool first = true; 417 418 jset_entry_for_each_key(entry, k) { 419 if (!first) { 420 prt_newline(out); 421 bch2_prt_jset_entry_type(out, entry->type); 422 prt_str(out, ": "); 423 } 424 prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); 425 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); 426 first = false; 427 } 428 } 429 430 static int journal_entry_btree_root_validate(struct bch_fs *c, 431 struct jset *jset, 432 struct jset_entry *entry, 433 unsigned version, int big_endian, 434 enum bch_validate_flags flags) 435 { 436 struct bkey_i *k = entry->start; 437 int ret = 0; 438 439 if (journal_entry_err_on(!entry->u64s || 440 le16_to_cpu(entry->u64s) != k->k.u64s, 441 c, version, jset, entry, 442 journal_entry_btree_root_bad_size, 443 "invalid btree root journal entry: wrong number of keys")) { 444 void *next = vstruct_next(entry); 445 /* 446 * we don't want to null out this jset_entry, 447 * just the contents, so that later we can tell 448 * we were _supposed_ to have a btree root 449 */ 450 entry->u64s = 0; 451 journal_entry_null_range(vstruct_next(entry), next); 452 return 0; 453 } 454 455 ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k, 456 version, big_endian, flags); 457 if (ret == FSCK_DELETED_KEY) 458 ret = 0; 459 fsck_err: 460 return ret; 461 } 462 463 static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, 464 struct jset_entry *entry) 465 { 466 journal_entry_btree_keys_to_text(out, c, entry); 467 } 468 469 static int journal_entry_prio_ptrs_validate(struct bch_fs *c, 470 struct jset *jset, 471 struct jset_entry *entry, 472 unsigned version, int big_endian, 473 enum bch_validate_flags flags) 474 { 475 /* obsolete, don't care: */ 476 return 0; 477 } 478 479 static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 480 struct jset_entry *entry) 481 { 482 } 483 484 static int journal_entry_blacklist_validate(struct bch_fs *c, 485 struct jset *jset, 486 struct jset_entry *entry, 487 unsigned version, int big_endian, 488 enum bch_validate_flags flags) 489 { 490 int ret = 0; 491 492 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, 493 c, version, jset, entry, 494 journal_entry_blacklist_bad_size, 495 "invalid journal seq blacklist entry: bad size")) { 496 journal_entry_null_range(entry, vstruct_next(entry)); 497 } 498 fsck_err: 499 return ret; 500 } 501 502 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, 503 struct jset_entry *entry) 504 { 505 struct jset_entry_blacklist *bl = 506 container_of(entry, struct jset_entry_blacklist, entry); 507 508 prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); 509 } 510 511 static int journal_entry_blacklist_v2_validate(struct bch_fs *c, 512 struct jset *jset, 513 struct jset_entry *entry, 514 unsigned version, int big_endian, 515 enum bch_validate_flags flags) 516 { 517 struct jset_entry_blacklist_v2 *bl_entry; 518 int ret = 0; 519 520 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, 521 c, version, jset, entry, 522 journal_entry_blacklist_v2_bad_size, 523 "invalid journal seq blacklist entry: bad size")) { 524 journal_entry_null_range(entry, vstruct_next(entry)); 525 goto out; 526 } 527 528 bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); 529 530 if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > 531 le64_to_cpu(bl_entry->end), 532 c, version, jset, entry, 533 journal_entry_blacklist_v2_start_past_end, 534 "invalid journal seq blacklist entry: start > end")) { 535 journal_entry_null_range(entry, vstruct_next(entry)); 536 } 537 out: 538 fsck_err: 539 return ret; 540 } 541 542 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, 543 struct jset_entry *entry) 544 { 545 struct jset_entry_blacklist_v2 *bl = 546 container_of(entry, struct jset_entry_blacklist_v2, entry); 547 548 prt_printf(out, "start=%llu end=%llu", 549 le64_to_cpu(bl->start), 550 le64_to_cpu(bl->end)); 551 } 552 553 static int journal_entry_usage_validate(struct bch_fs *c, 554 struct jset *jset, 555 struct jset_entry *entry, 556 unsigned version, int big_endian, 557 enum bch_validate_flags flags) 558 { 559 struct jset_entry_usage *u = 560 container_of(entry, struct jset_entry_usage, entry); 561 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 562 int ret = 0; 563 564 if (journal_entry_err_on(bytes < sizeof(*u), 565 c, version, jset, entry, 566 journal_entry_usage_bad_size, 567 "invalid journal entry usage: bad size")) { 568 journal_entry_null_range(entry, vstruct_next(entry)); 569 return ret; 570 } 571 572 fsck_err: 573 return ret; 574 } 575 576 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, 577 struct jset_entry *entry) 578 { 579 struct jset_entry_usage *u = 580 container_of(entry, struct jset_entry_usage, entry); 581 582 prt_str(out, "type="); 583 bch2_prt_fs_usage_type(out, u->entry.btree_id); 584 prt_printf(out, " v=%llu", le64_to_cpu(u->v)); 585 } 586 587 static int journal_entry_data_usage_validate(struct bch_fs *c, 588 struct jset *jset, 589 struct jset_entry *entry, 590 unsigned version, int big_endian, 591 enum bch_validate_flags flags) 592 { 593 struct jset_entry_data_usage *u = 594 container_of(entry, struct jset_entry_data_usage, entry); 595 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 596 struct printbuf err = PRINTBUF; 597 int ret = 0; 598 599 if (journal_entry_err_on(bytes < sizeof(*u) || 600 bytes < sizeof(*u) + u->r.nr_devs, 601 c, version, jset, entry, 602 journal_entry_data_usage_bad_size, 603 "invalid journal entry usage: bad size")) { 604 journal_entry_null_range(entry, vstruct_next(entry)); 605 goto out; 606 } 607 608 if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err), 609 c, version, jset, entry, 610 journal_entry_data_usage_bad_size, 611 "invalid journal entry usage: %s", err.buf)) { 612 journal_entry_null_range(entry, vstruct_next(entry)); 613 goto out; 614 } 615 out: 616 fsck_err: 617 printbuf_exit(&err); 618 return ret; 619 } 620 621 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, 622 struct jset_entry *entry) 623 { 624 struct jset_entry_data_usage *u = 625 container_of(entry, struct jset_entry_data_usage, entry); 626 627 bch2_replicas_entry_to_text(out, &u->r); 628 prt_printf(out, "=%llu", le64_to_cpu(u->v)); 629 } 630 631 static int journal_entry_clock_validate(struct bch_fs *c, 632 struct jset *jset, 633 struct jset_entry *entry, 634 unsigned version, int big_endian, 635 enum bch_validate_flags flags) 636 { 637 struct jset_entry_clock *clock = 638 container_of(entry, struct jset_entry_clock, entry); 639 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 640 int ret = 0; 641 642 if (journal_entry_err_on(bytes != sizeof(*clock), 643 c, version, jset, entry, 644 journal_entry_clock_bad_size, 645 "bad size")) { 646 journal_entry_null_range(entry, vstruct_next(entry)); 647 return ret; 648 } 649 650 if (journal_entry_err_on(clock->rw > 1, 651 c, version, jset, entry, 652 journal_entry_clock_bad_rw, 653 "bad rw")) { 654 journal_entry_null_range(entry, vstruct_next(entry)); 655 return ret; 656 } 657 658 fsck_err: 659 return ret; 660 } 661 662 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, 663 struct jset_entry *entry) 664 { 665 struct jset_entry_clock *clock = 666 container_of(entry, struct jset_entry_clock, entry); 667 668 prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); 669 } 670 671 static int journal_entry_dev_usage_validate(struct bch_fs *c, 672 struct jset *jset, 673 struct jset_entry *entry, 674 unsigned version, int big_endian, 675 enum bch_validate_flags flags) 676 { 677 struct jset_entry_dev_usage *u = 678 container_of(entry, struct jset_entry_dev_usage, entry); 679 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 680 unsigned expected = sizeof(*u); 681 int ret = 0; 682 683 if (journal_entry_err_on(bytes < expected, 684 c, version, jset, entry, 685 journal_entry_dev_usage_bad_size, 686 "bad size (%u < %u)", 687 bytes, expected)) { 688 journal_entry_null_range(entry, vstruct_next(entry)); 689 return ret; 690 } 691 692 if (journal_entry_err_on(u->pad, 693 c, version, jset, entry, 694 journal_entry_dev_usage_bad_pad, 695 "bad pad")) { 696 journal_entry_null_range(entry, vstruct_next(entry)); 697 return ret; 698 } 699 700 fsck_err: 701 return ret; 702 } 703 704 static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, 705 struct jset_entry *entry) 706 { 707 struct jset_entry_dev_usage *u = 708 container_of(entry, struct jset_entry_dev_usage, entry); 709 unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); 710 711 prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); 712 713 printbuf_indent_add(out, 2); 714 for (i = 0; i < nr_types; i++) { 715 prt_newline(out); 716 bch2_prt_data_type(out, i); 717 prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", 718 le64_to_cpu(u->d[i].buckets), 719 le64_to_cpu(u->d[i].sectors), 720 le64_to_cpu(u->d[i].fragmented)); 721 } 722 printbuf_indent_sub(out, 2); 723 } 724 725 static int journal_entry_log_validate(struct bch_fs *c, 726 struct jset *jset, 727 struct jset_entry *entry, 728 unsigned version, int big_endian, 729 enum bch_validate_flags flags) 730 { 731 return 0; 732 } 733 734 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 735 struct jset_entry *entry) 736 { 737 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 738 unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); 739 740 prt_printf(out, "%.*s", bytes, l->d); 741 } 742 743 static int journal_entry_overwrite_validate(struct bch_fs *c, 744 struct jset *jset, 745 struct jset_entry *entry, 746 unsigned version, int big_endian, 747 enum bch_validate_flags flags) 748 { 749 return journal_entry_btree_keys_validate(c, jset, entry, 750 version, big_endian, READ); 751 } 752 753 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 754 struct jset_entry *entry) 755 { 756 journal_entry_btree_keys_to_text(out, c, entry); 757 } 758 759 static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, 760 struct jset *jset, 761 struct jset_entry *entry, 762 unsigned version, int big_endian, 763 enum bch_validate_flags flags) 764 { 765 return journal_entry_btree_keys_validate(c, jset, entry, 766 version, big_endian, READ); 767 } 768 769 static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, 770 struct jset_entry *entry) 771 { 772 journal_entry_btree_keys_to_text(out, c, entry); 773 } 774 775 static int journal_entry_datetime_validate(struct bch_fs *c, 776 struct jset *jset, 777 struct jset_entry *entry, 778 unsigned version, int big_endian, 779 enum bch_validate_flags flags) 780 { 781 unsigned bytes = vstruct_bytes(entry); 782 unsigned expected = 16; 783 int ret = 0; 784 785 if (journal_entry_err_on(vstruct_bytes(entry) < expected, 786 c, version, jset, entry, 787 journal_entry_dev_usage_bad_size, 788 "bad size (%u < %u)", 789 bytes, expected)) { 790 journal_entry_null_range(entry, vstruct_next(entry)); 791 return ret; 792 } 793 fsck_err: 794 return ret; 795 } 796 797 static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, 798 struct jset_entry *entry) 799 { 800 struct jset_entry_datetime *datetime = 801 container_of(entry, struct jset_entry_datetime, entry); 802 803 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 804 } 805 806 struct jset_entry_ops { 807 int (*validate)(struct bch_fs *, struct jset *, 808 struct jset_entry *, unsigned, int, 809 enum bch_validate_flags); 810 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 811 }; 812 813 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 814 #define x(f, nr) \ 815 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 816 .validate = journal_entry_##f##_validate, \ 817 .to_text = journal_entry_##f##_to_text, \ 818 }, 819 BCH_JSET_ENTRY_TYPES() 820 #undef x 821 }; 822 823 int bch2_journal_entry_validate(struct bch_fs *c, 824 struct jset *jset, 825 struct jset_entry *entry, 826 unsigned version, int big_endian, 827 enum bch_validate_flags flags) 828 { 829 return entry->type < BCH_JSET_ENTRY_NR 830 ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, 831 version, big_endian, flags) 832 : 0; 833 } 834 835 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, 836 struct jset_entry *entry) 837 { 838 bch2_prt_jset_entry_type(out, entry->type); 839 840 if (entry->type < BCH_JSET_ENTRY_NR) { 841 prt_str(out, ": "); 842 bch2_jset_entry_ops[entry->type].to_text(out, c, entry); 843 } 844 } 845 846 static int jset_validate_entries(struct bch_fs *c, struct jset *jset, 847 enum bch_validate_flags flags) 848 { 849 unsigned version = le32_to_cpu(jset->version); 850 int ret = 0; 851 852 vstruct_for_each(jset, entry) { 853 if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), 854 c, version, jset, entry, 855 journal_entry_past_jset_end, 856 "journal entry extends past end of jset")) { 857 jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); 858 break; 859 } 860 861 ret = bch2_journal_entry_validate(c, jset, entry, 862 version, JSET_BIG_ENDIAN(jset), flags); 863 if (ret) 864 break; 865 } 866 fsck_err: 867 return ret; 868 } 869 870 static int jset_validate(struct bch_fs *c, 871 struct bch_dev *ca, 872 struct jset *jset, u64 sector, 873 enum bch_validate_flags flags) 874 { 875 unsigned version; 876 int ret = 0; 877 878 if (le64_to_cpu(jset->magic) != jset_magic(c)) 879 return JOURNAL_ENTRY_NONE; 880 881 version = le32_to_cpu(jset->version); 882 if (journal_entry_err_on(!bch2_version_compatible(version), 883 c, version, jset, NULL, 884 jset_unsupported_version, 885 "%s sector %llu seq %llu: incompatible journal entry version %u.%u", 886 ca ? ca->name : c->name, 887 sector, le64_to_cpu(jset->seq), 888 BCH_VERSION_MAJOR(version), 889 BCH_VERSION_MINOR(version))) { 890 /* don't try to continue: */ 891 return -EINVAL; 892 } 893 894 if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), 895 c, version, jset, NULL, 896 jset_unknown_csum, 897 "%s sector %llu seq %llu: journal entry with unknown csum type %llu", 898 ca ? ca->name : c->name, 899 sector, le64_to_cpu(jset->seq), 900 JSET_CSUM_TYPE(jset))) 901 ret = JOURNAL_ENTRY_BAD; 902 903 /* last_seq is ignored when JSET_NO_FLUSH is true */ 904 if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && 905 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), 906 c, version, jset, NULL, 907 jset_last_seq_newer_than_seq, 908 "invalid journal entry: last_seq > seq (%llu > %llu)", 909 le64_to_cpu(jset->last_seq), 910 le64_to_cpu(jset->seq))) { 911 jset->last_seq = jset->seq; 912 return JOURNAL_ENTRY_BAD; 913 } 914 915 ret = jset_validate_entries(c, jset, flags); 916 fsck_err: 917 return ret; 918 } 919 920 static int jset_validate_early(struct bch_fs *c, 921 struct bch_dev *ca, 922 struct jset *jset, u64 sector, 923 unsigned bucket_sectors_left, 924 unsigned sectors_read) 925 { 926 size_t bytes = vstruct_bytes(jset); 927 unsigned version; 928 enum bch_validate_flags flags = BCH_VALIDATE_journal; 929 int ret = 0; 930 931 if (le64_to_cpu(jset->magic) != jset_magic(c)) 932 return JOURNAL_ENTRY_NONE; 933 934 version = le32_to_cpu(jset->version); 935 if (journal_entry_err_on(!bch2_version_compatible(version), 936 c, version, jset, NULL, 937 jset_unsupported_version, 938 "%s sector %llu seq %llu: unknown journal entry version %u.%u", 939 ca ? ca->name : c->name, 940 sector, le64_to_cpu(jset->seq), 941 BCH_VERSION_MAJOR(version), 942 BCH_VERSION_MINOR(version))) { 943 /* don't try to continue: */ 944 return -EINVAL; 945 } 946 947 if (bytes > (sectors_read << 9) && 948 sectors_read < bucket_sectors_left) 949 return JOURNAL_ENTRY_REREAD; 950 951 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 952 c, version, jset, NULL, 953 jset_past_bucket_end, 954 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 955 ca ? ca->name : c->name, 956 sector, le64_to_cpu(jset->seq), bytes)) 957 le32_add_cpu(&jset->u64s, 958 -((bytes - (bucket_sectors_left << 9)) / 8)); 959 fsck_err: 960 return ret; 961 } 962 963 struct journal_read_buf { 964 void *data; 965 size_t size; 966 }; 967 968 static int journal_read_buf_realloc(struct journal_read_buf *b, 969 size_t new_size) 970 { 971 void *n; 972 973 /* the bios are sized for this many pages, max: */ 974 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 975 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 976 977 new_size = roundup_pow_of_two(new_size); 978 n = kvmalloc(new_size, GFP_KERNEL); 979 if (!n) 980 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 981 982 kvfree(b->data); 983 b->data = n; 984 b->size = new_size; 985 return 0; 986 } 987 988 static int journal_read_bucket(struct bch_dev *ca, 989 struct journal_read_buf *buf, 990 struct journal_list *jlist, 991 unsigned bucket) 992 { 993 struct bch_fs *c = ca->fs; 994 struct journal_device *ja = &ca->journal; 995 struct jset *j = NULL; 996 unsigned sectors, sectors_read = 0; 997 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 998 end = offset + ca->mi.bucket_size; 999 bool saw_bad = false, csum_good; 1000 struct printbuf err = PRINTBUF; 1001 int ret = 0; 1002 1003 pr_debug("reading %u", bucket); 1004 1005 while (offset < end) { 1006 if (!sectors_read) { 1007 struct bio *bio; 1008 unsigned nr_bvecs; 1009 reread: 1010 sectors_read = min_t(unsigned, 1011 end - offset, buf->size >> 9); 1012 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 1013 1014 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 1015 if (!bio) 1016 return -BCH_ERR_ENOMEM_journal_read_bucket; 1017 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 1018 1019 bio->bi_iter.bi_sector = offset; 1020 bch2_bio_map(bio, buf->data, sectors_read << 9); 1021 1022 ret = submit_bio_wait(bio); 1023 kfree(bio); 1024 1025 if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, 1026 "journal read error: sector %llu", 1027 offset) || 1028 bch2_meta_read_fault("journal")) { 1029 /* 1030 * We don't error out of the recovery process 1031 * here, since the relevant journal entry may be 1032 * found on a different device, and missing or 1033 * no journal entries will be handled later 1034 */ 1035 goto out; 1036 } 1037 1038 j = buf->data; 1039 } 1040 1041 ret = jset_validate_early(c, ca, j, offset, 1042 end - offset, sectors_read); 1043 switch (ret) { 1044 case 0: 1045 sectors = vstruct_sectors(j, c->block_bits); 1046 break; 1047 case JOURNAL_ENTRY_REREAD: 1048 if (vstruct_bytes(j) > buf->size) { 1049 ret = journal_read_buf_realloc(buf, 1050 vstruct_bytes(j)); 1051 if (ret) 1052 goto err; 1053 } 1054 goto reread; 1055 case JOURNAL_ENTRY_NONE: 1056 if (!saw_bad) 1057 goto out; 1058 /* 1059 * On checksum error we don't really trust the size 1060 * field of the journal entry we read, so try reading 1061 * again at next block boundary: 1062 */ 1063 sectors = block_sectors(c); 1064 goto next_block; 1065 default: 1066 goto err; 1067 } 1068 1069 if (le64_to_cpu(j->seq) > ja->highest_seq_found) { 1070 ja->highest_seq_found = le64_to_cpu(j->seq); 1071 ja->cur_idx = bucket; 1072 ja->sectors_free = ca->mi.bucket_size - 1073 bucket_remainder(ca, offset) - sectors; 1074 } 1075 1076 /* 1077 * This happens sometimes if we don't have discards on - 1078 * when we've partially overwritten a bucket with new 1079 * journal entries. We don't need the rest of the 1080 * bucket: 1081 */ 1082 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 1083 goto out; 1084 1085 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 1086 1087 enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); 1088 struct bch_csum csum; 1089 csum_good = jset_csum_good(c, j, &csum); 1090 1091 if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, 1092 "%s", 1093 (printbuf_reset(&err), 1094 prt_str(&err, "journal "), 1095 bch2_csum_err_msg(&err, csum_type, j->csum, csum), 1096 err.buf))) 1097 saw_bad = true; 1098 1099 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 1100 j->encrypted_start, 1101 vstruct_end(j) - (void *) j->encrypted_start); 1102 bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); 1103 1104 mutex_lock(&jlist->lock); 1105 ret = journal_entry_add(c, ca, (struct journal_ptr) { 1106 .csum_good = csum_good, 1107 .dev = ca->dev_idx, 1108 .bucket = bucket, 1109 .bucket_offset = offset - 1110 bucket_to_sector(ca, ja->buckets[bucket]), 1111 .sector = offset, 1112 }, jlist, j); 1113 mutex_unlock(&jlist->lock); 1114 1115 switch (ret) { 1116 case JOURNAL_ENTRY_ADD_OK: 1117 break; 1118 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 1119 break; 1120 default: 1121 goto err; 1122 } 1123 next_block: 1124 pr_debug("next"); 1125 offset += sectors; 1126 sectors_read -= sectors; 1127 j = ((void *) j) + (sectors << 9); 1128 } 1129 1130 out: 1131 ret = 0; 1132 err: 1133 printbuf_exit(&err); 1134 return ret; 1135 } 1136 1137 static CLOSURE_CALLBACK(bch2_journal_read_device) 1138 { 1139 closure_type(ja, struct journal_device, read); 1140 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1141 struct bch_fs *c = ca->fs; 1142 struct journal_list *jlist = 1143 container_of(cl->parent, struct journal_list, cl); 1144 struct journal_read_buf buf = { NULL, 0 }; 1145 unsigned i; 1146 int ret = 0; 1147 1148 if (!ja->nr) 1149 goto out; 1150 1151 ret = journal_read_buf_realloc(&buf, PAGE_SIZE); 1152 if (ret) 1153 goto err; 1154 1155 pr_debug("%u journal buckets", ja->nr); 1156 1157 for (i = 0; i < ja->nr; i++) { 1158 ret = journal_read_bucket(ca, &buf, jlist, i); 1159 if (ret) 1160 goto err; 1161 } 1162 1163 /* 1164 * Set dirty_idx to indicate the entire journal is full and needs to be 1165 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1166 * pinned when it first runs: 1167 */ 1168 ja->discard_idx = ja->dirty_idx_ondisk = 1169 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1170 out: 1171 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1172 kvfree(buf.data); 1173 percpu_ref_put(&ca->io_ref); 1174 closure_return(cl); 1175 return; 1176 err: 1177 mutex_lock(&jlist->lock); 1178 jlist->ret = ret; 1179 mutex_unlock(&jlist->lock); 1180 goto out; 1181 } 1182 1183 int bch2_journal_read(struct bch_fs *c, 1184 u64 *last_seq, 1185 u64 *blacklist_seq, 1186 u64 *start_seq) 1187 { 1188 struct journal_list jlist; 1189 struct journal_replay *i, **_i, *prev = NULL; 1190 struct genradix_iter radix_iter; 1191 struct printbuf buf = PRINTBUF; 1192 bool degraded = false, last_write_torn = false; 1193 u64 seq; 1194 int ret = 0; 1195 1196 closure_init_stack(&jlist.cl); 1197 mutex_init(&jlist.lock); 1198 jlist.last_seq = 0; 1199 jlist.ret = 0; 1200 1201 for_each_member_device(c, ca) { 1202 if (!c->opts.fsck && 1203 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1204 continue; 1205 1206 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1207 ca->mi.state == BCH_MEMBER_STATE_ro) && 1208 percpu_ref_tryget(&ca->io_ref)) 1209 closure_call(&ca->journal.read, 1210 bch2_journal_read_device, 1211 system_unbound_wq, 1212 &jlist.cl); 1213 else 1214 degraded = true; 1215 } 1216 1217 closure_sync(&jlist.cl); 1218 1219 if (jlist.ret) 1220 return jlist.ret; 1221 1222 *last_seq = 0; 1223 *start_seq = 0; 1224 *blacklist_seq = 0; 1225 1226 /* 1227 * Find most recent flush entry, and ignore newer non flush entries - 1228 * those entries will be blacklisted: 1229 */ 1230 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1231 enum bch_validate_flags flags = BCH_VALIDATE_journal; 1232 1233 i = *_i; 1234 1235 if (journal_replay_ignore(i)) 1236 continue; 1237 1238 if (!*start_seq) 1239 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1240 1241 if (JSET_NO_FLUSH(&i->j)) { 1242 i->ignore_blacklisted = true; 1243 continue; 1244 } 1245 1246 if (!last_write_torn && !i->csum_good) { 1247 last_write_torn = true; 1248 i->ignore_blacklisted = true; 1249 continue; 1250 } 1251 1252 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1253 c, le32_to_cpu(i->j.version), &i->j, NULL, 1254 jset_last_seq_newer_than_seq, 1255 "invalid journal entry: last_seq > seq (%llu > %llu)", 1256 le64_to_cpu(i->j.last_seq), 1257 le64_to_cpu(i->j.seq))) 1258 i->j.last_seq = i->j.seq; 1259 1260 *last_seq = le64_to_cpu(i->j.last_seq); 1261 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1262 break; 1263 } 1264 1265 if (!*start_seq) { 1266 bch_info(c, "journal read done, but no entries found"); 1267 return 0; 1268 } 1269 1270 if (!*last_seq) { 1271 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, 1272 "journal read done, but no entries found after dropping non-flushes"); 1273 return 0; 1274 } 1275 1276 bch_info(c, "journal read done, replaying entries %llu-%llu", 1277 *last_seq, *blacklist_seq - 1); 1278 1279 if (*start_seq != *blacklist_seq) 1280 bch_info(c, "dropped unflushed entries %llu-%llu", 1281 *blacklist_seq, *start_seq - 1); 1282 1283 /* Drop blacklisted entries and entries older than last_seq: */ 1284 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1285 i = *_i; 1286 1287 if (journal_replay_ignore(i)) 1288 continue; 1289 1290 seq = le64_to_cpu(i->j.seq); 1291 if (seq < *last_seq) { 1292 journal_replay_free(c, i, false); 1293 continue; 1294 } 1295 1296 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1297 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1298 jset_seq_blacklisted, 1299 "found blacklisted journal entry %llu", seq); 1300 i->ignore_blacklisted = true; 1301 } 1302 } 1303 1304 /* Check for missing entries: */ 1305 seq = *last_seq; 1306 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1307 i = *_i; 1308 1309 if (journal_replay_ignore(i)) 1310 continue; 1311 1312 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1313 1314 while (seq < le64_to_cpu(i->j.seq)) { 1315 u64 missing_start, missing_end; 1316 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 1317 1318 while (seq < le64_to_cpu(i->j.seq) && 1319 bch2_journal_seq_is_blacklisted(c, seq, false)) 1320 seq++; 1321 1322 if (seq == le64_to_cpu(i->j.seq)) 1323 break; 1324 1325 missing_start = seq; 1326 1327 while (seq < le64_to_cpu(i->j.seq) && 1328 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1329 seq++; 1330 1331 if (prev) { 1332 bch2_journal_ptrs_to_text(&buf1, c, prev); 1333 prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1334 } else 1335 prt_printf(&buf1, "(none)"); 1336 bch2_journal_ptrs_to_text(&buf2, c, i); 1337 1338 missing_end = seq - 1; 1339 fsck_err(c, journal_entries_missing, 1340 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" 1341 " prev at %s\n" 1342 " next at %s, continue?", 1343 missing_start, missing_end, 1344 *last_seq, *blacklist_seq - 1, 1345 buf1.buf, buf2.buf); 1346 1347 printbuf_exit(&buf1); 1348 printbuf_exit(&buf2); 1349 } 1350 1351 prev = i; 1352 seq++; 1353 } 1354 1355 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1356 struct bch_replicas_padded replicas = { 1357 .e.data_type = BCH_DATA_journal, 1358 .e.nr_devs = 0, 1359 .e.nr_required = 1, 1360 }; 1361 1362 i = *_i; 1363 if (journal_replay_ignore(i)) 1364 continue; 1365 1366 darray_for_each(i->ptrs, ptr) { 1367 struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); 1368 1369 if (!ptr->csum_good) 1370 bch_err_dev_offset(ca, ptr->sector, 1371 "invalid journal checksum, seq %llu%s", 1372 le64_to_cpu(i->j.seq), 1373 i->csum_good ? " (had good copy on another device)" : ""); 1374 } 1375 1376 ret = jset_validate(c, 1377 bch2_dev_have_ref(c, i->ptrs.data[0].dev), 1378 &i->j, 1379 i->ptrs.data[0].sector, 1380 READ); 1381 if (ret) 1382 goto err; 1383 1384 darray_for_each(i->ptrs, ptr) 1385 replicas_entry_add_dev(&replicas.e, ptr->dev); 1386 1387 bch2_replicas_entry_sort(&replicas.e); 1388 1389 printbuf_reset(&buf); 1390 bch2_replicas_entry_to_text(&buf, &replicas.e); 1391 1392 if (!degraded && 1393 !bch2_replicas_marked(c, &replicas.e) && 1394 (le64_to_cpu(i->j.seq) == *last_seq || 1395 fsck_err(c, journal_entry_replicas_not_marked, 1396 "superblock not marked as containing replicas for journal entry %llu\n %s", 1397 le64_to_cpu(i->j.seq), buf.buf))) { 1398 ret = bch2_mark_replicas(c, &replicas.e); 1399 if (ret) 1400 goto err; 1401 } 1402 } 1403 err: 1404 fsck_err: 1405 printbuf_exit(&buf); 1406 return ret; 1407 } 1408 1409 /* journal write: */ 1410 1411 static void __journal_write_alloc(struct journal *j, 1412 struct journal_buf *w, 1413 struct dev_alloc_list *devs_sorted, 1414 unsigned sectors, 1415 unsigned *replicas, 1416 unsigned replicas_want) 1417 { 1418 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1419 struct journal_device *ja; 1420 struct bch_dev *ca; 1421 unsigned i; 1422 1423 if (*replicas >= replicas_want) 1424 return; 1425 1426 for (i = 0; i < devs_sorted->nr; i++) { 1427 ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); 1428 if (!ca) 1429 continue; 1430 1431 ja = &ca->journal; 1432 1433 /* 1434 * Check that we can use this device, and aren't already using 1435 * it: 1436 */ 1437 if (!ca->mi.durability || 1438 ca->mi.state != BCH_MEMBER_STATE_rw || 1439 !ja->nr || 1440 bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1441 sectors > ja->sectors_free) 1442 continue; 1443 1444 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1445 1446 bch2_bkey_append_ptr(&w->key, 1447 (struct bch_extent_ptr) { 1448 .offset = bucket_to_sector(ca, 1449 ja->buckets[ja->cur_idx]) + 1450 ca->mi.bucket_size - 1451 ja->sectors_free, 1452 .dev = ca->dev_idx, 1453 }); 1454 1455 ja->sectors_free -= sectors; 1456 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1457 1458 *replicas += ca->mi.durability; 1459 1460 if (*replicas >= replicas_want) 1461 break; 1462 } 1463 } 1464 1465 /** 1466 * journal_write_alloc - decide where to write next journal entry 1467 * 1468 * @j: journal object 1469 * @w: journal buf (entry to be written) 1470 * 1471 * Returns: 0 on success, or -EROFS on failure 1472 */ 1473 static int journal_write_alloc(struct journal *j, struct journal_buf *w) 1474 { 1475 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1476 struct bch_devs_mask devs; 1477 struct journal_device *ja; 1478 struct bch_dev *ca; 1479 struct dev_alloc_list devs_sorted; 1480 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1481 unsigned target = c->opts.metadata_target ?: 1482 c->opts.foreground_target; 1483 unsigned i, replicas = 0, replicas_want = 1484 READ_ONCE(c->opts.metadata_replicas); 1485 unsigned replicas_need = min_t(unsigned, replicas_want, 1486 READ_ONCE(c->opts.metadata_replicas_required)); 1487 1488 rcu_read_lock(); 1489 retry: 1490 devs = target_rw_devs(c, BCH_DATA_journal, target); 1491 1492 devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); 1493 1494 __journal_write_alloc(j, w, &devs_sorted, 1495 sectors, &replicas, replicas_want); 1496 1497 if (replicas >= replicas_want) 1498 goto done; 1499 1500 for (i = 0; i < devs_sorted.nr; i++) { 1501 ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); 1502 if (!ca) 1503 continue; 1504 1505 ja = &ca->journal; 1506 1507 if (sectors > ja->sectors_free && 1508 sectors <= ca->mi.bucket_size && 1509 bch2_journal_dev_buckets_available(j, ja, 1510 journal_space_discarded)) { 1511 ja->cur_idx = (ja->cur_idx + 1) % ja->nr; 1512 ja->sectors_free = ca->mi.bucket_size; 1513 1514 /* 1515 * ja->bucket_seq[ja->cur_idx] must always have 1516 * something sensible: 1517 */ 1518 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1519 } 1520 } 1521 1522 __journal_write_alloc(j, w, &devs_sorted, 1523 sectors, &replicas, replicas_want); 1524 1525 if (replicas < replicas_want && target) { 1526 /* Retry from all devices: */ 1527 target = 0; 1528 goto retry; 1529 } 1530 done: 1531 rcu_read_unlock(); 1532 1533 BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); 1534 1535 return replicas >= replicas_need ? 0 : -EROFS; 1536 } 1537 1538 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1539 { 1540 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1541 1542 /* we aren't holding j->lock: */ 1543 unsigned new_size = READ_ONCE(j->buf_size_want); 1544 void *new_buf; 1545 1546 if (buf->buf_size >= new_size) 1547 return; 1548 1549 size_t btree_write_buffer_size = new_size / 64; 1550 1551 if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) 1552 return; 1553 1554 new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1555 if (!new_buf) 1556 return; 1557 1558 memcpy(new_buf, buf->data, buf->buf_size); 1559 1560 spin_lock(&j->lock); 1561 swap(buf->data, new_buf); 1562 swap(buf->buf_size, new_size); 1563 spin_unlock(&j->lock); 1564 1565 kvfree(new_buf); 1566 } 1567 1568 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) 1569 { 1570 return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); 1571 } 1572 1573 static CLOSURE_CALLBACK(journal_write_done) 1574 { 1575 closure_type(w, struct journal_buf, io); 1576 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1577 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1578 struct bch_replicas_padded replicas; 1579 union journal_res_state old, new; 1580 u64 seq = le64_to_cpu(w->data->seq); 1581 int err = 0; 1582 1583 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) 1584 ? j->flush_write_time 1585 : j->noflush_write_time, j->write_start_time); 1586 1587 if (!w->devs_written.nr) { 1588 bch_err(c, "unable to write journal to sufficient devices"); 1589 err = -EIO; 1590 } else { 1591 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1592 w->devs_written); 1593 if (bch2_mark_replicas(c, &replicas.e)) 1594 err = -EIO; 1595 } 1596 1597 if (err) 1598 bch2_fatal_error(c); 1599 1600 closure_debug_destroy(cl); 1601 1602 spin_lock(&j->lock); 1603 if (seq >= j->pin.front) 1604 journal_seq_pin(j, seq)->devs = w->devs_written; 1605 if (err && (!j->err_seq || seq < j->err_seq)) 1606 j->err_seq = seq; 1607 w->write_done = true; 1608 1609 bool completed = false; 1610 1611 for (seq = journal_last_unwritten_seq(j); 1612 seq <= journal_cur_seq(j); 1613 seq++) { 1614 w = j->buf + (seq & JOURNAL_BUF_MASK); 1615 if (!w->write_done) 1616 break; 1617 1618 if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { 1619 j->flushed_seq_ondisk = seq; 1620 j->last_seq_ondisk = w->last_seq; 1621 1622 bch2_do_discards(c); 1623 closure_wake_up(&c->freelist_wait); 1624 bch2_reset_alloc_cursors(c); 1625 } 1626 1627 j->seq_ondisk = seq; 1628 1629 /* 1630 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1631 * more buckets: 1632 * 1633 * Must come before signaling write completion, for 1634 * bch2_fs_journal_stop(): 1635 */ 1636 if (j->watermark != BCH_WATERMARK_stripe) 1637 journal_reclaim_kick(&c->journal); 1638 1639 old.v = atomic64_read(&j->reservations.counter); 1640 do { 1641 new.v = old.v; 1642 BUG_ON(journal_state_count(new, new.unwritten_idx)); 1643 BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); 1644 1645 new.unwritten_idx++; 1646 } while (!atomic64_try_cmpxchg(&j->reservations.counter, 1647 &old.v, new.v)); 1648 1649 closure_wake_up(&w->wait); 1650 completed = true; 1651 } 1652 1653 if (completed) { 1654 bch2_journal_reclaim_fast(j); 1655 bch2_journal_space_available(j); 1656 1657 track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); 1658 1659 journal_wake(j); 1660 } 1661 1662 if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1663 new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1664 struct journal_buf *buf = journal_cur_buf(j); 1665 long delta = buf->expires - jiffies; 1666 1667 /* 1668 * We don't close a journal entry to write it while there's 1669 * previous entries still in flight - the current journal entry 1670 * might want to be written now: 1671 */ 1672 mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); 1673 } 1674 1675 /* 1676 * We don't typically trigger journal writes from her - the next journal 1677 * write will be triggered immediately after the previous one is 1678 * allocated, in bch2_journal_write() - but the journal write error path 1679 * is special: 1680 */ 1681 bch2_journal_do_writes(j); 1682 spin_unlock(&j->lock); 1683 } 1684 1685 static void journal_write_endio(struct bio *bio) 1686 { 1687 struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); 1688 struct bch_dev *ca = jbio->ca; 1689 struct journal *j = &ca->fs->journal; 1690 struct journal_buf *w = j->buf + jbio->buf_idx; 1691 1692 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, 1693 "error writing journal entry %llu: %s", 1694 le64_to_cpu(w->data->seq), 1695 bch2_blk_status_to_str(bio->bi_status)) || 1696 bch2_meta_write_fault("journal")) { 1697 unsigned long flags; 1698 1699 spin_lock_irqsave(&j->err_lock, flags); 1700 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); 1701 spin_unlock_irqrestore(&j->err_lock, flags); 1702 } 1703 1704 closure_put(&w->io); 1705 percpu_ref_put(&ca->io_ref); 1706 } 1707 1708 static CLOSURE_CALLBACK(journal_write_submit) 1709 { 1710 closure_type(w, struct journal_buf, io); 1711 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1712 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1713 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1714 1715 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { 1716 struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); 1717 if (!ca) { 1718 /* XXX: fix this */ 1719 bch_err(c, "missing device for journal write\n"); 1720 continue; 1721 } 1722 1723 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], 1724 sectors); 1725 1726 struct journal_device *ja = &ca->journal; 1727 struct bio *bio = &ja->bio[w->idx]->bio; 1728 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1729 bio->bi_iter.bi_sector = ptr->offset; 1730 bio->bi_end_io = journal_write_endio; 1731 bio->bi_private = ca; 1732 1733 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1734 ca->prev_journal_sector = bio->bi_iter.bi_sector; 1735 1736 if (!JSET_NO_FLUSH(w->data)) 1737 bio->bi_opf |= REQ_FUA; 1738 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) 1739 bio->bi_opf |= REQ_PREFLUSH; 1740 1741 bch2_bio_map(bio, w->data, sectors << 9); 1742 1743 trace_and_count(c, journal_write, bio); 1744 closure_bio_submit(bio, cl); 1745 1746 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1747 } 1748 1749 continue_at(cl, journal_write_done, j->wq); 1750 } 1751 1752 static CLOSURE_CALLBACK(journal_write_preflush) 1753 { 1754 closure_type(w, struct journal_buf, io); 1755 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1756 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1757 1758 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1759 spin_lock(&j->lock); 1760 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1761 closure_wait(&j->async_wait, cl); 1762 spin_unlock(&j->lock); 1763 continue_at(cl, journal_write_preflush, j->wq); 1764 return; 1765 } 1766 spin_unlock(&j->lock); 1767 } 1768 1769 if (w->separate_flush) { 1770 for_each_rw_member(c, ca) { 1771 percpu_ref_get(&ca->io_ref); 1772 1773 struct journal_device *ja = &ca->journal; 1774 struct bio *bio = &ja->bio[w->idx]->bio; 1775 bio_reset(bio, ca->disk_sb.bdev, 1776 REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); 1777 bio->bi_end_io = journal_write_endio; 1778 bio->bi_private = ca; 1779 closure_bio_submit(bio, cl); 1780 } 1781 1782 continue_at(cl, journal_write_submit, j->wq); 1783 } else { 1784 /* 1785 * no need to punt to another work item if we're not waiting on 1786 * preflushes 1787 */ 1788 journal_write_submit(&cl->work); 1789 } 1790 } 1791 1792 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) 1793 { 1794 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1795 struct jset_entry *start, *end; 1796 struct jset *jset = w->data; 1797 struct journal_keys_to_wb wb = { NULL }; 1798 unsigned sectors, bytes, u64s; 1799 unsigned long btree_roots_have = 0; 1800 bool validate_before_checksum = false; 1801 u64 seq = le64_to_cpu(jset->seq); 1802 int ret; 1803 1804 /* 1805 * Simple compaction, dropping empty jset_entries (from journal 1806 * reservations that weren't fully used) and merging jset_entries that 1807 * can be. 1808 * 1809 * If we wanted to be really fancy here, we could sort all the keys in 1810 * the jset and drop keys that were overwritten - probably not worth it: 1811 */ 1812 vstruct_for_each(jset, i) { 1813 unsigned u64s = le16_to_cpu(i->u64s); 1814 1815 /* Empty entry: */ 1816 if (!u64s) 1817 continue; 1818 1819 /* 1820 * New btree roots are set by journalling them; when the journal 1821 * entry gets written we have to propagate them to 1822 * c->btree_roots 1823 * 1824 * But, every journal entry we write has to contain all the 1825 * btree roots (at least for now); so after we copy btree roots 1826 * to c->btree_roots we have to get any missing btree roots and 1827 * add them to this journal entry: 1828 */ 1829 switch (i->type) { 1830 case BCH_JSET_ENTRY_btree_root: 1831 bch2_journal_entry_to_btree_root(c, i); 1832 __set_bit(i->btree_id, &btree_roots_have); 1833 break; 1834 case BCH_JSET_ENTRY_write_buffer_keys: 1835 EBUG_ON(!w->need_flush_to_write_buffer); 1836 1837 if (!wb.wb) 1838 bch2_journal_keys_to_write_buffer_start(c, &wb, seq); 1839 1840 jset_entry_for_each_key(i, k) { 1841 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); 1842 if (ret) { 1843 bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", 1844 bch2_err_str(ret)); 1845 bch2_journal_keys_to_write_buffer_end(c, &wb); 1846 return ret; 1847 } 1848 } 1849 i->type = BCH_JSET_ENTRY_btree_keys; 1850 break; 1851 } 1852 } 1853 1854 if (wb.wb) { 1855 ret = bch2_journal_keys_to_write_buffer_end(c, &wb); 1856 if (ret) { 1857 bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", 1858 bch2_err_str(ret)); 1859 return ret; 1860 } 1861 } 1862 1863 spin_lock(&c->journal.lock); 1864 w->need_flush_to_write_buffer = false; 1865 spin_unlock(&c->journal.lock); 1866 1867 start = end = vstruct_last(jset); 1868 1869 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1870 1871 struct jset_entry_datetime *d = 1872 container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); 1873 d->entry.type = BCH_JSET_ENTRY_datetime; 1874 d->seconds = cpu_to_le64(ktime_get_real_seconds()); 1875 1876 bch2_journal_super_entries_add_common(c, &end, seq); 1877 u64s = (u64 *) end - (u64 *) start; 1878 1879 WARN_ON(u64s > j->entry_u64s_reserved); 1880 1881 le32_add_cpu(&jset->u64s, u64s); 1882 1883 sectors = vstruct_sectors(jset, c->block_bits); 1884 bytes = vstruct_bytes(jset); 1885 1886 if (sectors > w->sectors) { 1887 bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 1888 vstruct_bytes(jset), w->sectors << 9, 1889 u64s, w->u64s_reserved, j->entry_u64s_reserved); 1890 return -EINVAL; 1891 } 1892 1893 jset->magic = cpu_to_le64(jset_magic(c)); 1894 jset->version = cpu_to_le32(c->sb.version); 1895 1896 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 1897 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1898 1899 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1900 j->last_empty_seq = seq; 1901 1902 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1903 validate_before_checksum = true; 1904 1905 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 1906 validate_before_checksum = true; 1907 1908 if (validate_before_checksum && 1909 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1910 return ret; 1911 1912 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), 1913 jset->encrypted_start, 1914 vstruct_end(jset) - (void *) jset->encrypted_start); 1915 if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) 1916 return ret; 1917 1918 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), 1919 journal_nonce(jset), jset); 1920 1921 if (!validate_before_checksum && 1922 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1923 return ret; 1924 1925 memset((void *) jset + bytes, 0, (sectors << 9) - bytes); 1926 return 0; 1927 } 1928 1929 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) 1930 { 1931 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1932 int error = bch2_journal_error(j); 1933 1934 /* 1935 * If the journal is in an error state - we did an emergency shutdown - 1936 * we prefer to continue doing journal writes. We just mark them as 1937 * noflush so they'll never be used, but they'll still be visible by the 1938 * list_journal tool - this helps in debugging. 1939 * 1940 * There's a caveat: the first journal write after marking the 1941 * superblock dirty must always be a flush write, because on startup 1942 * from a clean shutdown we didn't necessarily read the journal and the 1943 * new journal write might overwrite whatever was in the journal 1944 * previously - we can't leave the journal without any flush writes in 1945 * it. 1946 * 1947 * So if we're in an error state, and we're still starting up, we don't 1948 * write anything at all. 1949 */ 1950 if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) 1951 return -EIO; 1952 1953 if (error || 1954 w->noflush || 1955 (!w->must_flush && 1956 time_before(jiffies, j->last_flush_write + 1957 msecs_to_jiffies(c->opts.journal_flush_delay)) && 1958 test_bit(JOURNAL_may_skip_flush, &j->flags))) { 1959 w->noflush = true; 1960 SET_JSET_NO_FLUSH(w->data, true); 1961 w->data->last_seq = 0; 1962 w->last_seq = 0; 1963 1964 j->nr_noflush_writes++; 1965 } else { 1966 w->must_flush = true; 1967 j->last_flush_write = jiffies; 1968 j->nr_flush_writes++; 1969 clear_bit(JOURNAL_need_flush_write, &j->flags); 1970 } 1971 1972 return 0; 1973 } 1974 1975 CLOSURE_CALLBACK(bch2_journal_write) 1976 { 1977 closure_type(w, struct journal_buf, io); 1978 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1979 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1980 struct bch_replicas_padded replicas; 1981 unsigned nr_rw_members = 0; 1982 int ret; 1983 1984 for_each_rw_member(c, ca) 1985 nr_rw_members++; 1986 1987 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 1988 BUG_ON(!w->write_started); 1989 BUG_ON(w->write_allocated); 1990 BUG_ON(w->write_done); 1991 1992 j->write_start_time = local_clock(); 1993 1994 spin_lock(&j->lock); 1995 if (nr_rw_members > 1) 1996 w->separate_flush = true; 1997 1998 ret = bch2_journal_write_pick_flush(j, w); 1999 spin_unlock(&j->lock); 2000 if (ret) 2001 goto err; 2002 2003 mutex_lock(&j->buf_lock); 2004 journal_buf_realloc(j, w); 2005 2006 ret = bch2_journal_write_prep(j, w); 2007 mutex_unlock(&j->buf_lock); 2008 if (ret) 2009 goto err; 2010 2011 j->entry_bytes_written += vstruct_bytes(w->data); 2012 2013 while (1) { 2014 spin_lock(&j->lock); 2015 ret = journal_write_alloc(j, w); 2016 if (!ret || !j->can_discard) 2017 break; 2018 2019 spin_unlock(&j->lock); 2020 bch2_journal_do_discards(j); 2021 } 2022 2023 if (ret) { 2024 struct printbuf buf = PRINTBUF; 2025 buf.atomic++; 2026 2027 prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"), 2028 le64_to_cpu(w->data->seq), 2029 bch2_err_str(ret)); 2030 __bch2_journal_debug_to_text(&buf, j); 2031 spin_unlock(&j->lock); 2032 bch2_print_string_as_lines(KERN_ERR, buf.buf); 2033 printbuf_exit(&buf); 2034 goto err; 2035 } 2036 2037 /* 2038 * write is allocated, no longer need to account for it in 2039 * bch2_journal_space_available(): 2040 */ 2041 w->sectors = 0; 2042 w->write_allocated = true; 2043 2044 /* 2045 * journal entry has been compacted and allocated, recalculate space 2046 * available: 2047 */ 2048 bch2_journal_space_available(j); 2049 bch2_journal_do_writes(j); 2050 spin_unlock(&j->lock); 2051 2052 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); 2053 2054 if (c->opts.nochanges) 2055 goto no_io; 2056 2057 /* 2058 * Mark journal replicas before we submit the write to guarantee 2059 * recovery will find the journal entries after a crash. 2060 */ 2061 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 2062 w->devs_written); 2063 ret = bch2_mark_replicas(c, &replicas.e); 2064 if (ret) 2065 goto err; 2066 2067 if (!JSET_NO_FLUSH(w->data)) 2068 continue_at(cl, journal_write_preflush, j->wq); 2069 else 2070 continue_at(cl, journal_write_submit, j->wq); 2071 return; 2072 no_io: 2073 continue_at(cl, journal_write_done, j->wq); 2074 return; 2075 err: 2076 bch2_fatal_error(c); 2077 continue_at(cl, journal_write_done, j->wq); 2078 } 2079