1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "btree_io.h" 6 #include "btree_update_interior.h" 7 #include "btree_write_buffer.h" 8 #include "buckets.h" 9 #include "checksum.h" 10 #include "disk_groups.h" 11 #include "error.h" 12 #include "journal.h" 13 #include "journal_io.h" 14 #include "journal_reclaim.h" 15 #include "journal_seq_blacklist.h" 16 #include "replicas.h" 17 #include "sb-clean.h" 18 #include "trace.h" 19 20 void bch2_journal_pos_from_member_info_set(struct bch_fs *c) 21 { 22 lockdep_assert_held(&c->sb_lock); 23 24 for_each_member_device(c, ca) { 25 struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); 26 27 m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); 28 m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); 29 } 30 } 31 32 void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) 33 { 34 mutex_lock(&c->sb_lock); 35 for_each_member_device(c, ca) { 36 struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); 37 38 unsigned idx = le32_to_cpu(m.last_journal_bucket); 39 if (idx < ca->journal.nr) 40 ca->journal.cur_idx = idx; 41 unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); 42 if (offset <= ca->mi.bucket_size) 43 ca->journal.sectors_free = ca->mi.bucket_size - offset; 44 } 45 mutex_unlock(&c->sb_lock); 46 } 47 48 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 49 struct journal_replay *j) 50 { 51 darray_for_each(j->ptrs, i) { 52 if (i != j->ptrs.data) 53 prt_printf(out, " "); 54 prt_printf(out, "%u:%u:%u (sector %llu)", 55 i->dev, i->bucket, i->bucket_offset, i->sector); 56 } 57 } 58 59 static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, 60 struct journal_replay *j) 61 { 62 prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); 63 64 bch2_journal_ptrs_to_text(out, c, j); 65 66 for_each_jset_entry_type(entry, 
&j->j, BCH_JSET_ENTRY_datetime) { 67 struct jset_entry_datetime *datetime = 68 container_of(entry, struct jset_entry_datetime, entry); 69 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 70 break; 71 } 72 } 73 74 static struct nonce journal_nonce(const struct jset *jset) 75 { 76 return (struct nonce) {{ 77 [0] = 0, 78 [1] = ((__le32 *) &jset->seq)[0], 79 [2] = ((__le32 *) &jset->seq)[1], 80 [3] = BCH_NONCE_JOURNAL, 81 }}; 82 } 83 84 static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) 85 { 86 if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { 87 *csum = (struct bch_csum) {}; 88 return false; 89 } 90 91 *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); 92 return !bch2_crc_cmp(j->csum, *csum); 93 } 94 95 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) 96 { 97 return (seq - c->journal_entries_base_seq) & (~0U >> 1); 98 } 99 100 static void __journal_replay_free(struct bch_fs *c, 101 struct journal_replay *i) 102 { 103 struct journal_replay **p = 104 genradix_ptr(&c->journal_entries, 105 journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); 106 107 BUG_ON(*p != i); 108 *p = NULL; 109 kvfree(i); 110 } 111 112 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) 113 { 114 if (blacklisted) 115 i->ignore_blacklisted = true; 116 else 117 i->ignore_not_dirty = true; 118 119 if (!c->opts.read_entire_journal) 120 __journal_replay_free(c, i); 121 } 122 123 struct journal_list { 124 struct closure cl; 125 u64 last_seq; 126 struct mutex lock; 127 int ret; 128 }; 129 130 #define JOURNAL_ENTRY_ADD_OK 0 131 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 132 133 /* 134 * Given a journal entry we just read, add it to the list of journal entries to 135 * be replayed: 136 */ 137 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, 138 struct journal_ptr entry_ptr, 139 struct journal_list *jlist, struct jset *j) 140 { 141 struct genradix_iter iter; 142 struct 
journal_replay **_i, *i, *dup; 143 size_t bytes = vstruct_bytes(j); 144 u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; 145 struct printbuf buf = PRINTBUF; 146 int ret = JOURNAL_ENTRY_ADD_OK; 147 148 if (!c->journal.oldest_seq_found_ondisk || 149 le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) 150 c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); 151 152 /* Is this entry older than the range we need? */ 153 if (!c->opts.read_entire_journal && 154 le64_to_cpu(j->seq) < jlist->last_seq) 155 return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; 156 157 /* 158 * genradixes are indexed by a ulong, not a u64, so we can't index them 159 * by sequence number directly: Assume instead that they will all fall 160 * within the range of +-2billion of the filrst one we find. 161 */ 162 if (!c->journal_entries_base_seq) 163 c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); 164 165 /* Drop entries we don't need anymore */ 166 if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { 167 genradix_for_each_from(&c->journal_entries, iter, _i, 168 journal_entry_radix_idx(c, jlist->last_seq)) { 169 i = *_i; 170 171 if (journal_replay_ignore(i)) 172 continue; 173 174 if (le64_to_cpu(i->j.seq) >= last_seq) 175 break; 176 177 journal_replay_free(c, i, false); 178 } 179 } 180 181 jlist->last_seq = max(jlist->last_seq, last_seq); 182 183 _i = genradix_ptr_alloc(&c->journal_entries, 184 journal_entry_radix_idx(c, le64_to_cpu(j->seq)), 185 GFP_KERNEL); 186 if (!_i) 187 return -BCH_ERR_ENOMEM_journal_entry_add; 188 189 /* 190 * Duplicate journal entries? 
If so we want the one that didn't have a 191 * checksum error: 192 */ 193 dup = *_i; 194 if (dup) { 195 bool identical = bytes == vstruct_bytes(&dup->j) && 196 !memcmp(j, &dup->j, bytes); 197 bool not_identical = !identical && 198 entry_ptr.csum_good && 199 dup->csum_good; 200 201 bool same_device = false; 202 darray_for_each(dup->ptrs, ptr) 203 if (ptr->dev == ca->dev_idx) 204 same_device = true; 205 206 ret = darray_push(&dup->ptrs, entry_ptr); 207 if (ret) 208 goto out; 209 210 bch2_journal_replay_to_text(&buf, c, dup); 211 212 fsck_err_on(same_device, 213 c, journal_entry_dup_same_device, 214 "duplicate journal entry on same device\n %s", 215 buf.buf); 216 217 fsck_err_on(not_identical, 218 c, journal_entry_replicas_data_mismatch, 219 "found duplicate but non identical journal entries\n %s", 220 buf.buf); 221 222 if (entry_ptr.csum_good && !identical) 223 goto replace; 224 225 goto out; 226 } 227 replace: 228 i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 229 if (!i) 230 return -BCH_ERR_ENOMEM_journal_entry_add; 231 232 darray_init(&i->ptrs); 233 i->csum_good = entry_ptr.csum_good; 234 i->ignore_blacklisted = false; 235 i->ignore_not_dirty = false; 236 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 237 238 if (dup) { 239 /* The first ptr should represent the jset we kept: */ 240 darray_for_each(dup->ptrs, ptr) 241 darray_push(&i->ptrs, *ptr); 242 __journal_replay_free(c, dup); 243 } else { 244 darray_push(&i->ptrs, entry_ptr); 245 } 246 247 *_i = i; 248 out: 249 fsck_err: 250 printbuf_exit(&buf); 251 return ret; 252 } 253 254 /* this fills in a range with empty jset_entries: */ 255 static void journal_entry_null_range(void *start, void *end) 256 { 257 struct jset_entry *entry; 258 259 for (entry = start; entry != end; entry = vstruct_next(entry)) 260 memset(entry, 0, sizeof(*entry)); 261 } 262 263 #define JOURNAL_ENTRY_REREAD 5 264 #define JOURNAL_ENTRY_NONE 6 265 #define JOURNAL_ENTRY_BAD 7 266 267 static void 
journal_entry_err_msg(struct printbuf *out,
		      u32 version,
		      struct jset *jset,
		      struct jset_entry *entry)
{
	/*
	 * Emit the common prefix for a journal validation error: version,
	 * entry type (if @entry is given), and either " in superblock" when
	 * there is no @jset or the jset's seq plus the entry's u64 offset.
	 */
	prt_str(out, "invalid journal entry, version=");
	bch2_version_to_text(out, version);

	if (entry) {
		prt_str(out, " type=");
		bch2_prt_jset_entry_type(out, entry->type);
	}

	if (!jset) {
		prt_printf(out, " in superblock");
	} else {

		prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));

		if (entry)
			prt_printf(out, " offset=%zi/%u",
				   (u64 *) entry - jset->_data,
				   le32_to_cpu(jset->u64s));
	}

	prt_str(out, ": ");
}

/*
 * Report a journal validation error. Evaluates to true.
 *
 * This macro relies on names in the calling function: a `flags` variable, an
 * `int ret`, and an `fsck_err:` label. On the read side the error goes through
 * mustfix_fsck_err(); on the write side it is counted and logged, and if
 * bch2_fs_inconsistent() returns true, ret is set and we jump to fsck_err.
 *
 * NOTE(review): the `goto fsck_err` in the WRITE case skips
 * printbuf_exit(&_buf); mustfix_fsck_err() may behave likewise - confirm.
 */
#define journal_entry_err(c, version, jset, entry, _err, msg, ...)	\
({									\
	struct printbuf _buf = PRINTBUF;				\
									\
	journal_entry_err_msg(&_buf, version, jset, entry);		\
	prt_printf(&_buf, msg, ##__VA_ARGS__);				\
									\
	switch (flags & BCH_VALIDATE_write) {				\
	case READ:							\
		mustfix_fsck_err(c, _err, "%s", _buf.buf);		\
		break;							\
	case WRITE:							\
		bch2_sb_error_count(c, BCH_FSCK_ERR_##_err);		\
		bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
		if (bch2_fs_inconsistent(c)) {				\
			ret = -BCH_ERR_fsck_errors_not_fixed;		\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
									\
	printbuf_exit(&_buf);						\
	true;								\
})

/* Report the error only if @cond holds; evaluates to whether it was reported */
#define journal_entry_err_on(cond, ...)					\
	((cond) ? journal_entry_err(__VA_ARGS__) : false)

/* Returned by journal_validate_key() when it removed the key at @k */
#define FSCK_DELETED_KEY	5

/*
 * Validate a single key @k inside journal entry @entry.
 *
 * Returns 0 if the key is fine, FSCK_DELETED_KEY if the key was dropped from
 * the entry (in which case @k now refers to whatever followed it, or to the
 * new end of the entry), or a negative error code.
 */
static int journal_validate_key(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned level, enum btree_id btree_id,
				struct bkey_i *k,
				unsigned version, int big_endian,
				enum bch_validate_flags flags)
{
	int write = flags & BCH_VALIDATE_write;
	/* end of the entry as it was on entry to this function */
	void *next = vstruct_next(entry);
	int ret = 0;

	/* A zero length key: we can't step past it, truncate the entry here */
	if (journal_entry_err_on(!k->k.u64s,
				 c, version, jset, entry,
				 journal_entry_bkey_u64s_0,
				 "k->u64s 0")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	/* Key runs off the end of the entry: likewise truncate at @k */
	if (journal_entry_err_on((void *) bkey_next(k) >
				 (void *) vstruct_next(entry),
				 c, version, jset, entry,
				 journal_entry_bkey_past_end,
				 "extends past end of journal entry")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	/*
	 * Bad format: drop just this key by shifting the following keys down
	 * over it, then null out the space freed at the end of the entry.
	 */
	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
				 c, version, jset, entry,
				 journal_entry_bkey_bad_format,
				 "bad format %u", k->k.format)) {
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	/* on read, compat conversion runs before validation */
	if (!write)
		bch2_bkey_compat(level, btree_id, version, big_endian,
				 write, NULL, bkey_to_packed(k));

	ret = bch2_bkey_validate(c, bkey_i_to_s_c(k),
				 __btree_node_type(level, btree_id), write);
	if (ret == -BCH_ERR_fsck_delete_bkey) {
		/* same removal as for the bad format case above */
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}
	if (ret)
		goto fsck_err;

	/* on write, compat conversion runs after validation */
	if (write)
		bch2_bkey_compat(level, btree_id, version, big_endian,
				 write, NULL, bkey_to_packed(k));
fsck_err:
	return ret;
}

/*
 * Validate every key in a btree_keys entry, using the entry's own level and
 * btree id. Returns 0 or the first error from journal_validate_key().
 */
static int journal_entry_btree_keys_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian,
					     enum bch_validate_flags flags)
{
	struct bkey_i *k = entry->start;

	while (k != vstruct_last(entry)) {
		int ret = journal_validate_key(c, jset, entry,
					       entry->level,
					       entry->btree_id,
					       k, version, big_endian,
					       flags|BCH_VALIDATE_journal);
		/* key was removed in place: re-examine the same position */
		if (ret == FSCK_DELETED_KEY)
			continue;
		else if (ret)
			return ret;

		k = bkey_next(k);
	}

	return 0;
}

/*
 * Print each key of the entry with its btree and level; keys after the first
 * start on a new line prefixed with the entry type.
 */
static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	bool first = true;

	jset_entry_for_each_key(entry, k) {
		if (!first) {
			prt_newline(out);
			bch2_prt_jset_entry_type(out, entry->type);
			prt_str(out, ": ");
		}
		prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
		bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
		first = false;
	}
}

/*
 * Validate a btree root entry, which must contain exactly one key (the
 * entry's u64s must equal that key's u64s).
 */
static int journal_entry_btree_root_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian,
					     enum bch_validate_flags flags)
{
	struct bkey_i *k = entry->start;
	int ret = 0;

	if (journal_entry_err_on(!entry->u64s ||
				 le16_to_cpu(entry->u64s) != k->k.u64s,
				 c, version, jset, entry,
				 journal_entry_btree_root_bad_size,
				 "invalid btree root journal entry: wrong number of keys")) {
		void *next = vstruct_next(entry);
		/*
		 * we don't want to null out this jset_entry,
		 * just the contents, so that later we can tell
		 * we were _supposed_ to have a btree root
		 */
		entry->u64s = 0;
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
				   version, big_endian, flags);
	/* a deleted root key is not reported as an error to the caller */
	if (ret == FSCK_DELETED_KEY)
		ret = 0;
fsck_err:
	return ret;
}

/* A btree root entry prints the same way as a btree_keys entry */
static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bch_validate_flags flags)
{
	/* obsolete, don't care: */
	return 0;
}

/* Obsolete entry type: nothing to print */
static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
}

/*
 * A blacklist entry must be exactly one u64 long; otherwise it is reported
 * and zeroed out with journal_entry_null_range().
 */
static int journal_entry_blacklist_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bch_validate_flags flags)
{
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
				 c, version, jset, entry,
				 journal_entry_blacklist_bad_size,
		"invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
fsck_err:
	return ret;
}

/* Print the blacklisted sequence number */
static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_blacklist *bl =
		container_of(entry, struct jset_entry_blacklist, entry);

	prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
}

/*
 * A v2 blacklist entry must be exactly two u64s long and have start <= end;
 * otherwise it is reported and zeroed out.
 */
static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bch_validate_flags flags)
{
	struct jset_entry_blacklist_v2 *bl_entry;
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_bad_size,
		"invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
				 le64_to_cpu(bl_entry->end),
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_start_past_end,
		"invalid journal seq blacklist entry: start > end")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
out:
fsck_err:
	return ret;
}

/* Print the blacklisted sequence number range */
static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
					       struct jset_entry *entry)
{
	struct jset_entry_blacklist_v2 *bl =
		container_of(entry, struct jset_entry_blacklist_v2, entry);

	prt_printf(out, "start=%llu end=%llu",
		   le64_to_cpu(bl->start),
		   le64_to_cpu(bl->end));
}

/*
 * A usage entry must be at least sizeof(struct jset_entry_usage) bytes;
 * otherwise it is reported and zeroed out.
 */
static int journal_entry_usage_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bch_validate_flags flags)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u),
				 c, version, jset, entry,
				 journal_entry_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

/* Print the usage type (stored in the entry's btree_id field) and value */
static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);

	prt_str(out, "type=");
	bch2_prt_fs_usage_type(out, u->entry.btree_id);
	prt_printf(out, " v=%llu", le64_to_cpu(u->v));
}

/*
 * A data_usage entry must be large enough for the fixed struct plus
 * r.nr_devs trailing bytes, and its replicas entry must pass
 * bch2_replicas_entry_validate(); otherwise it is reported and zeroed out.
 */
static int journal_entry_data_usage_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bch_validate_flags flags)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	struct printbuf err = PRINTBUF;
	int ret = 0;

	/* nr_devs is only read once we know the fixed part is present */
	if (journal_entry_err_on(bytes < sizeof(*u) ||
				 bytes < sizeof(*u) + u->r.nr_devs,
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err),
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: %s", err.buf)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}
out:
fsck_err:
	printbuf_exit(&err);
	return ret;
}

/* Print the replicas entry followed by "=<value>" */
static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);

	bch2_replicas_entry_to_text(out, &u->r);
	prt_printf(out, "=%llu", le64_to_cpu(u->v));
}

/*
 * A clock entry must be exactly sizeof(struct jset_entry_clock) bytes and
 * have rw of 0 or 1; otherwise it is reported and zeroed out.
 */
static int journal_entry_clock_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bch_validate_flags flags)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes != sizeof(*clock),
				 c, version, jset, entry,
				 journal_entry_clock_bad_size,
				 "bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(clock->rw > 1,
				 c, version, jset, entry,
				 journal_entry_clock_bad_rw,
				 "bad rw")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

/* Print "write=<time>" when rw is nonzero, "read=<time>" otherwise */
static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);

	prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
}

/*
 * A dev_usage entry must be at least sizeof(struct jset_entry_dev_usage)
 * bytes and have a zero pad field; otherwise it is reported and zeroed out.
 */
static int journal_entry_dev_usage_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				enum bch_validate_flags flags)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	unsigned expected = sizeof(*u);
	int ret = 0;

	if (journal_entry_err_on(bytes < expected,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_size,
				 "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(u->pad,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_pad,
				 "bad pad")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

/*
 * Print the device index, then one indented line per data type with its
 * bucket, sector and fragmented counts.
 */
static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);

	prt_printf(out, "dev=%u", le32_to_cpu(u->dev));

	printbuf_indent_add(out, 2);
	for (i = 0; i < nr_types; i++) {
		prt_newline(out);
		bch2_prt_data_type(out, i);
		prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
			   le64_to_cpu(u->d[i].buckets),
			   le64_to_cpu(u->d[i].sectors),
			   le64_to_cpu(u->d[i].fragmented));
	}
	printbuf_indent_sub(out, 2);
}

static int
journal_entry_log_validate(struct bch_fs *c, 726 struct jset *jset, 727 struct jset_entry *entry, 728 unsigned version, int big_endian, 729 enum bch_validate_flags flags) 730 { 731 return 0; 732 } 733 734 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 735 struct jset_entry *entry) 736 { 737 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 738 unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); 739 740 prt_printf(out, "%.*s", bytes, l->d); 741 } 742 743 static int journal_entry_overwrite_validate(struct bch_fs *c, 744 struct jset *jset, 745 struct jset_entry *entry, 746 unsigned version, int big_endian, 747 enum bch_validate_flags flags) 748 { 749 return journal_entry_btree_keys_validate(c, jset, entry, 750 version, big_endian, READ); 751 } 752 753 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 754 struct jset_entry *entry) 755 { 756 journal_entry_btree_keys_to_text(out, c, entry); 757 } 758 759 static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, 760 struct jset *jset, 761 struct jset_entry *entry, 762 unsigned version, int big_endian, 763 enum bch_validate_flags flags) 764 { 765 return journal_entry_btree_keys_validate(c, jset, entry, 766 version, big_endian, READ); 767 } 768 769 static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, 770 struct jset_entry *entry) 771 { 772 journal_entry_btree_keys_to_text(out, c, entry); 773 } 774 775 static int journal_entry_datetime_validate(struct bch_fs *c, 776 struct jset *jset, 777 struct jset_entry *entry, 778 unsigned version, int big_endian, 779 enum bch_validate_flags flags) 780 { 781 unsigned bytes = vstruct_bytes(entry); 782 unsigned expected = 16; 783 int ret = 0; 784 785 if (journal_entry_err_on(vstruct_bytes(entry) < expected, 786 c, version, jset, entry, 787 journal_entry_dev_usage_bad_size, 788 "bad size (%u < %u)", 789 bytes, expected)) { 
790 journal_entry_null_range(entry, vstruct_next(entry)); 791 return ret; 792 } 793 fsck_err: 794 return ret; 795 } 796 797 static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, 798 struct jset_entry *entry) 799 { 800 struct jset_entry_datetime *datetime = 801 container_of(entry, struct jset_entry_datetime, entry); 802 803 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 804 } 805 806 struct jset_entry_ops { 807 int (*validate)(struct bch_fs *, struct jset *, 808 struct jset_entry *, unsigned, int, 809 enum bch_validate_flags); 810 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 811 }; 812 813 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 814 #define x(f, nr) \ 815 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 816 .validate = journal_entry_##f##_validate, \ 817 .to_text = journal_entry_##f##_to_text, \ 818 }, 819 BCH_JSET_ENTRY_TYPES() 820 #undef x 821 }; 822 823 int bch2_journal_entry_validate(struct bch_fs *c, 824 struct jset *jset, 825 struct jset_entry *entry, 826 unsigned version, int big_endian, 827 enum bch_validate_flags flags) 828 { 829 return entry->type < BCH_JSET_ENTRY_NR 830 ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, entry, 831 version, big_endian, flags) 832 : 0; 833 } 834 835 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, 836 struct jset_entry *entry) 837 { 838 bch2_prt_jset_entry_type(out, entry->type); 839 840 if (entry->type < BCH_JSET_ENTRY_NR) { 841 prt_str(out, ": "); 842 bch2_jset_entry_ops[entry->type].to_text(out, c, entry); 843 } 844 } 845 846 static int jset_validate_entries(struct bch_fs *c, struct jset *jset, 847 enum bch_validate_flags flags) 848 { 849 unsigned version = le32_to_cpu(jset->version); 850 int ret = 0; 851 852 vstruct_for_each(jset, entry) { 853 if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), 854 c, version, jset, entry, 855 journal_entry_past_jset_end, 856 "journal entry extends past end of jset")) { 857 jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); 858 break; 859 } 860 861 ret = bch2_journal_entry_validate(c, jset, entry, 862 version, JSET_BIG_ENDIAN(jset), flags); 863 if (ret) 864 break; 865 } 866 fsck_err: 867 return ret; 868 } 869 870 static int jset_validate(struct bch_fs *c, 871 struct bch_dev *ca, 872 struct jset *jset, u64 sector, 873 enum bch_validate_flags flags) 874 { 875 unsigned version; 876 int ret = 0; 877 878 if (le64_to_cpu(jset->magic) != jset_magic(c)) 879 return JOURNAL_ENTRY_NONE; 880 881 version = le32_to_cpu(jset->version); 882 if (journal_entry_err_on(!bch2_version_compatible(version), 883 c, version, jset, NULL, 884 jset_unsupported_version, 885 "%s sector %llu seq %llu: incompatible journal entry version %u.%u", 886 ca ? ca->name : c->name, 887 sector, le64_to_cpu(jset->seq), 888 BCH_VERSION_MAJOR(version), 889 BCH_VERSION_MINOR(version))) { 890 /* don't try to continue: */ 891 return -EINVAL; 892 } 893 894 if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), 895 c, version, jset, NULL, 896 jset_unknown_csum, 897 "%s sector %llu seq %llu: journal entry with unknown csum type %llu", 898 ca ? 
ca->name : c->name, 899 sector, le64_to_cpu(jset->seq), 900 JSET_CSUM_TYPE(jset))) 901 ret = JOURNAL_ENTRY_BAD; 902 903 /* last_seq is ignored when JSET_NO_FLUSH is true */ 904 if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && 905 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), 906 c, version, jset, NULL, 907 jset_last_seq_newer_than_seq, 908 "invalid journal entry: last_seq > seq (%llu > %llu)", 909 le64_to_cpu(jset->last_seq), 910 le64_to_cpu(jset->seq))) { 911 jset->last_seq = jset->seq; 912 return JOURNAL_ENTRY_BAD; 913 } 914 915 ret = jset_validate_entries(c, jset, flags); 916 fsck_err: 917 return ret; 918 } 919 920 static int jset_validate_early(struct bch_fs *c, 921 struct bch_dev *ca, 922 struct jset *jset, u64 sector, 923 unsigned bucket_sectors_left, 924 unsigned sectors_read) 925 { 926 size_t bytes = vstruct_bytes(jset); 927 unsigned version; 928 enum bch_validate_flags flags = BCH_VALIDATE_journal; 929 int ret = 0; 930 931 if (le64_to_cpu(jset->magic) != jset_magic(c)) 932 return JOURNAL_ENTRY_NONE; 933 934 version = le32_to_cpu(jset->version); 935 if (journal_entry_err_on(!bch2_version_compatible(version), 936 c, version, jset, NULL, 937 jset_unsupported_version, 938 "%s sector %llu seq %llu: unknown journal entry version %u.%u", 939 ca ? ca->name : c->name, 940 sector, le64_to_cpu(jset->seq), 941 BCH_VERSION_MAJOR(version), 942 BCH_VERSION_MINOR(version))) { 943 /* don't try to continue: */ 944 return -EINVAL; 945 } 946 947 if (bytes > (sectors_read << 9) && 948 sectors_read < bucket_sectors_left) 949 return JOURNAL_ENTRY_REREAD; 950 951 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 952 c, version, jset, NULL, 953 jset_past_bucket_end, 954 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 955 ca ? 
ca->name : c->name, 956 sector, le64_to_cpu(jset->seq), bytes)) 957 le32_add_cpu(&jset->u64s, 958 -((bytes - (bucket_sectors_left << 9)) / 8)); 959 fsck_err: 960 return ret; 961 } 962 963 struct journal_read_buf { 964 void *data; 965 size_t size; 966 }; 967 968 static int journal_read_buf_realloc(struct journal_read_buf *b, 969 size_t new_size) 970 { 971 void *n; 972 973 /* the bios are sized for this many pages, max: */ 974 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 975 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 976 977 new_size = roundup_pow_of_two(new_size); 978 n = kvmalloc(new_size, GFP_KERNEL); 979 if (!n) 980 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 981 982 kvfree(b->data); 983 b->data = n; 984 b->size = new_size; 985 return 0; 986 } 987 988 static int journal_read_bucket(struct bch_dev *ca, 989 struct journal_read_buf *buf, 990 struct journal_list *jlist, 991 unsigned bucket) 992 { 993 struct bch_fs *c = ca->fs; 994 struct journal_device *ja = &ca->journal; 995 struct jset *j = NULL; 996 unsigned sectors, sectors_read = 0; 997 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 998 end = offset + ca->mi.bucket_size; 999 bool saw_bad = false, csum_good; 1000 struct printbuf err = PRINTBUF; 1001 int ret = 0; 1002 1003 pr_debug("reading %u", bucket); 1004 1005 while (offset < end) { 1006 if (!sectors_read) { 1007 struct bio *bio; 1008 unsigned nr_bvecs; 1009 reread: 1010 sectors_read = min_t(unsigned, 1011 end - offset, buf->size >> 9); 1012 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 1013 1014 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 1015 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 1016 1017 bio->bi_iter.bi_sector = offset; 1018 bch2_bio_map(bio, buf->data, sectors_read << 9); 1019 1020 ret = submit_bio_wait(bio); 1021 kfree(bio); 1022 1023 if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, 1024 "journal read error: sector %llu", 1025 offset) || 1026 bch2_meta_read_fault("journal")) { 1027 /* 1028 * 
We don't error out of the recovery process 1029 * here, since the relevant journal entry may be 1030 * found on a different device, and missing or 1031 * no journal entries will be handled later 1032 */ 1033 goto out; 1034 } 1035 1036 j = buf->data; 1037 } 1038 1039 ret = jset_validate_early(c, ca, j, offset, 1040 end - offset, sectors_read); 1041 switch (ret) { 1042 case 0: 1043 sectors = vstruct_sectors(j, c->block_bits); 1044 break; 1045 case JOURNAL_ENTRY_REREAD: 1046 if (vstruct_bytes(j) > buf->size) { 1047 ret = journal_read_buf_realloc(buf, 1048 vstruct_bytes(j)); 1049 if (ret) 1050 goto err; 1051 } 1052 goto reread; 1053 case JOURNAL_ENTRY_NONE: 1054 if (!saw_bad) 1055 goto out; 1056 /* 1057 * On checksum error we don't really trust the size 1058 * field of the journal entry we read, so try reading 1059 * again at next block boundary: 1060 */ 1061 sectors = block_sectors(c); 1062 goto next_block; 1063 default: 1064 goto err; 1065 } 1066 1067 if (le64_to_cpu(j->seq) > ja->highest_seq_found) { 1068 ja->highest_seq_found = le64_to_cpu(j->seq); 1069 ja->cur_idx = bucket; 1070 ja->sectors_free = ca->mi.bucket_size - 1071 bucket_remainder(ca, offset) - sectors; 1072 } 1073 1074 /* 1075 * This happens sometimes if we don't have discards on - 1076 * when we've partially overwritten a bucket with new 1077 * journal entries. 
We don't need the rest of the 1078 * bucket: 1079 */ 1080 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 1081 goto out; 1082 1083 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 1084 1085 enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); 1086 struct bch_csum csum; 1087 csum_good = jset_csum_good(c, j, &csum); 1088 1089 if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, 1090 "%s", 1091 (printbuf_reset(&err), 1092 prt_str(&err, "journal "), 1093 bch2_csum_err_msg(&err, csum_type, j->csum, csum), 1094 err.buf))) 1095 saw_bad = true; 1096 1097 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 1098 j->encrypted_start, 1099 vstruct_end(j) - (void *) j->encrypted_start); 1100 bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); 1101 1102 mutex_lock(&jlist->lock); 1103 ret = journal_entry_add(c, ca, (struct journal_ptr) { 1104 .csum_good = csum_good, 1105 .dev = ca->dev_idx, 1106 .bucket = bucket, 1107 .bucket_offset = offset - 1108 bucket_to_sector(ca, ja->buckets[bucket]), 1109 .sector = offset, 1110 }, jlist, j); 1111 mutex_unlock(&jlist->lock); 1112 1113 switch (ret) { 1114 case JOURNAL_ENTRY_ADD_OK: 1115 break; 1116 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 1117 break; 1118 default: 1119 goto err; 1120 } 1121 next_block: 1122 pr_debug("next"); 1123 offset += sectors; 1124 sectors_read -= sectors; 1125 j = ((void *) j) + (sectors << 9); 1126 } 1127 1128 out: 1129 ret = 0; 1130 err: 1131 printbuf_exit(&err); 1132 return ret; 1133 } 1134 1135 static CLOSURE_CALLBACK(bch2_journal_read_device) 1136 { 1137 closure_type(ja, struct journal_device, read); 1138 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1139 struct bch_fs *c = ca->fs; 1140 struct journal_list *jlist = 1141 container_of(cl->parent, struct journal_list, cl); 1142 struct journal_read_buf buf = { NULL, 0 }; 1143 unsigned i; 1144 int ret = 0; 1145 1146 if (!ja->nr) 1147 goto out; 1148 1149 ret = journal_read_buf_realloc(&buf, PAGE_SIZE); 1150 if 
(ret) 1151 goto err; 1152 1153 pr_debug("%u journal buckets", ja->nr); 1154 1155 for (i = 0; i < ja->nr; i++) { 1156 ret = journal_read_bucket(ca, &buf, jlist, i); 1157 if (ret) 1158 goto err; 1159 } 1160 1161 /* 1162 * Set dirty_idx to indicate the entire journal is full and needs to be 1163 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1164 * pinned when it first runs: 1165 */ 1166 ja->discard_idx = ja->dirty_idx_ondisk = 1167 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1168 out: 1169 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1170 kvfree(buf.data); 1171 percpu_ref_put(&ca->io_ref); 1172 closure_return(cl); 1173 return; 1174 err: 1175 mutex_lock(&jlist->lock); 1176 jlist->ret = ret; 1177 mutex_unlock(&jlist->lock); 1178 goto out; 1179 } 1180 1181 int bch2_journal_read(struct bch_fs *c, 1182 u64 *last_seq, 1183 u64 *blacklist_seq, 1184 u64 *start_seq) 1185 { 1186 struct journal_list jlist; 1187 struct journal_replay *i, **_i, *prev = NULL; 1188 struct genradix_iter radix_iter; 1189 struct printbuf buf = PRINTBUF; 1190 bool degraded = false, last_write_torn = false; 1191 u64 seq; 1192 int ret = 0; 1193 1194 closure_init_stack(&jlist.cl); 1195 mutex_init(&jlist.lock); 1196 jlist.last_seq = 0; 1197 jlist.ret = 0; 1198 1199 for_each_member_device(c, ca) { 1200 if (!c->opts.fsck && 1201 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1202 continue; 1203 1204 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1205 ca->mi.state == BCH_MEMBER_STATE_ro) && 1206 percpu_ref_tryget(&ca->io_ref)) 1207 closure_call(&ca->journal.read, 1208 bch2_journal_read_device, 1209 system_unbound_wq, 1210 &jlist.cl); 1211 else 1212 degraded = true; 1213 } 1214 1215 closure_sync(&jlist.cl); 1216 1217 if (jlist.ret) 1218 return jlist.ret; 1219 1220 *last_seq = 0; 1221 *start_seq = 0; 1222 *blacklist_seq = 0; 1223 1224 /* 1225 * Find most recent flush entry, and ignore newer non flush entries - 1226 * those entries will be blacklisted: 1227 
*/ 1228 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1229 enum bch_validate_flags flags = BCH_VALIDATE_journal; 1230 1231 i = *_i; 1232 1233 if (journal_replay_ignore(i)) 1234 continue; 1235 1236 if (!*start_seq) 1237 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1238 1239 if (JSET_NO_FLUSH(&i->j)) { 1240 i->ignore_blacklisted = true; 1241 continue; 1242 } 1243 1244 if (!last_write_torn && !i->csum_good) { 1245 last_write_torn = true; 1246 i->ignore_blacklisted = true; 1247 continue; 1248 } 1249 1250 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1251 c, le32_to_cpu(i->j.version), &i->j, NULL, 1252 jset_last_seq_newer_than_seq, 1253 "invalid journal entry: last_seq > seq (%llu > %llu)", 1254 le64_to_cpu(i->j.last_seq), 1255 le64_to_cpu(i->j.seq))) 1256 i->j.last_seq = i->j.seq; 1257 1258 *last_seq = le64_to_cpu(i->j.last_seq); 1259 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1260 break; 1261 } 1262 1263 if (!*start_seq) { 1264 bch_info(c, "journal read done, but no entries found"); 1265 return 0; 1266 } 1267 1268 if (!*last_seq) { 1269 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, 1270 "journal read done, but no entries found after dropping non-flushes"); 1271 return 0; 1272 } 1273 1274 bch_info(c, "journal read done, replaying entries %llu-%llu", 1275 *last_seq, *blacklist_seq - 1); 1276 1277 if (*start_seq != *blacklist_seq) 1278 bch_info(c, "dropped unflushed entries %llu-%llu", 1279 *blacklist_seq, *start_seq - 1); 1280 1281 /* Drop blacklisted entries and entries older than last_seq: */ 1282 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1283 i = *_i; 1284 1285 if (journal_replay_ignore(i)) 1286 continue; 1287 1288 seq = le64_to_cpu(i->j.seq); 1289 if (seq < *last_seq) { 1290 journal_replay_free(c, i, false); 1291 continue; 1292 } 1293 1294 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1295 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1296 jset_seq_blacklisted, 1297 "found 
blacklisted journal entry %llu", seq); 1298 i->ignore_blacklisted = true; 1299 } 1300 } 1301 1302 /* Check for missing entries: */ 1303 seq = *last_seq; 1304 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1305 i = *_i; 1306 1307 if (journal_replay_ignore(i)) 1308 continue; 1309 1310 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1311 1312 while (seq < le64_to_cpu(i->j.seq)) { 1313 u64 missing_start, missing_end; 1314 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 1315 1316 while (seq < le64_to_cpu(i->j.seq) && 1317 bch2_journal_seq_is_blacklisted(c, seq, false)) 1318 seq++; 1319 1320 if (seq == le64_to_cpu(i->j.seq)) 1321 break; 1322 1323 missing_start = seq; 1324 1325 while (seq < le64_to_cpu(i->j.seq) && 1326 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1327 seq++; 1328 1329 if (prev) { 1330 bch2_journal_ptrs_to_text(&buf1, c, prev); 1331 prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1332 } else 1333 prt_printf(&buf1, "(none)"); 1334 bch2_journal_ptrs_to_text(&buf2, c, i); 1335 1336 missing_end = seq - 1; 1337 fsck_err(c, journal_entries_missing, 1338 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" 1339 " prev at %s\n" 1340 " next at %s, continue?", 1341 missing_start, missing_end, 1342 *last_seq, *blacklist_seq - 1, 1343 buf1.buf, buf2.buf); 1344 1345 printbuf_exit(&buf1); 1346 printbuf_exit(&buf2); 1347 } 1348 1349 prev = i; 1350 seq++; 1351 } 1352 1353 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1354 struct bch_replicas_padded replicas = { 1355 .e.data_type = BCH_DATA_journal, 1356 .e.nr_required = 1, 1357 }; 1358 1359 i = *_i; 1360 if (journal_replay_ignore(i)) 1361 continue; 1362 1363 darray_for_each(i->ptrs, ptr) { 1364 struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); 1365 1366 if (!ptr->csum_good) 1367 bch_err_dev_offset(ca, ptr->sector, 1368 "invalid journal checksum, seq %llu%s", 1369 le64_to_cpu(i->j.seq), 1370 i->csum_good ? 
" (had good copy on another device)" : ""); 1371 } 1372 1373 ret = jset_validate(c, 1374 bch2_dev_have_ref(c, i->ptrs.data[0].dev), 1375 &i->j, 1376 i->ptrs.data[0].sector, 1377 READ); 1378 if (ret) 1379 goto err; 1380 1381 darray_for_each(i->ptrs, ptr) 1382 replicas.e.devs[replicas.e.nr_devs++] = ptr->dev; 1383 1384 bch2_replicas_entry_sort(&replicas.e); 1385 1386 printbuf_reset(&buf); 1387 bch2_replicas_entry_to_text(&buf, &replicas.e); 1388 1389 if (!degraded && 1390 !bch2_replicas_marked(c, &replicas.e) && 1391 (le64_to_cpu(i->j.seq) == *last_seq || 1392 fsck_err(c, journal_entry_replicas_not_marked, 1393 "superblock not marked as containing replicas for journal entry %llu\n %s", 1394 le64_to_cpu(i->j.seq), buf.buf))) { 1395 ret = bch2_mark_replicas(c, &replicas.e); 1396 if (ret) 1397 goto err; 1398 } 1399 } 1400 err: 1401 fsck_err: 1402 printbuf_exit(&buf); 1403 return ret; 1404 } 1405 1406 /* journal write: */ 1407 1408 static void __journal_write_alloc(struct journal *j, 1409 struct journal_buf *w, 1410 struct dev_alloc_list *devs_sorted, 1411 unsigned sectors, 1412 unsigned *replicas, 1413 unsigned replicas_want) 1414 { 1415 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1416 struct journal_device *ja; 1417 struct bch_dev *ca; 1418 unsigned i; 1419 1420 if (*replicas >= replicas_want) 1421 return; 1422 1423 for (i = 0; i < devs_sorted->nr; i++) { 1424 ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); 1425 if (!ca) 1426 continue; 1427 1428 ja = &ca->journal; 1429 1430 /* 1431 * Check that we can use this device, and aren't already using 1432 * it: 1433 */ 1434 if (!ca->mi.durability || 1435 ca->mi.state != BCH_MEMBER_STATE_rw || 1436 !ja->nr || 1437 bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1438 sectors > ja->sectors_free) 1439 continue; 1440 1441 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1442 1443 bch2_bkey_append_ptr(&w->key, 1444 (struct bch_extent_ptr) { 1445 .offset = bucket_to_sector(ca, 1446 
ja->buckets[ja->cur_idx]) + 1447 ca->mi.bucket_size - 1448 ja->sectors_free, 1449 .dev = ca->dev_idx, 1450 }); 1451 1452 ja->sectors_free -= sectors; 1453 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1454 1455 *replicas += ca->mi.durability; 1456 1457 if (*replicas >= replicas_want) 1458 break; 1459 } 1460 } 1461 1462 /** 1463 * journal_write_alloc - decide where to write next journal entry 1464 * 1465 * @j: journal object 1466 * @w: journal buf (entry to be written) 1467 * 1468 * Returns: 0 on success, or -EROFS on failure 1469 */ 1470 static int journal_write_alloc(struct journal *j, struct journal_buf *w) 1471 { 1472 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1473 struct bch_devs_mask devs; 1474 struct journal_device *ja; 1475 struct bch_dev *ca; 1476 struct dev_alloc_list devs_sorted; 1477 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1478 unsigned target = c->opts.metadata_target ?: 1479 c->opts.foreground_target; 1480 unsigned i, replicas = 0, replicas_want = 1481 READ_ONCE(c->opts.metadata_replicas); 1482 unsigned replicas_need = min_t(unsigned, replicas_want, 1483 READ_ONCE(c->opts.metadata_replicas_required)); 1484 1485 rcu_read_lock(); 1486 retry: 1487 devs = target_rw_devs(c, BCH_DATA_journal, target); 1488 1489 devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); 1490 1491 __journal_write_alloc(j, w, &devs_sorted, 1492 sectors, &replicas, replicas_want); 1493 1494 if (replicas >= replicas_want) 1495 goto done; 1496 1497 for (i = 0; i < devs_sorted.nr; i++) { 1498 ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); 1499 if (!ca) 1500 continue; 1501 1502 ja = &ca->journal; 1503 1504 if (sectors > ja->sectors_free && 1505 sectors <= ca->mi.bucket_size && 1506 bch2_journal_dev_buckets_available(j, ja, 1507 journal_space_discarded)) { 1508 ja->cur_idx = (ja->cur_idx + 1) % ja->nr; 1509 ja->sectors_free = ca->mi.bucket_size; 1510 1511 /* 1512 * ja->bucket_seq[ja->cur_idx] must always have 1513 * something 
sensible: 1514 */ 1515 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1516 } 1517 } 1518 1519 __journal_write_alloc(j, w, &devs_sorted, 1520 sectors, &replicas, replicas_want); 1521 1522 if (replicas < replicas_want && target) { 1523 /* Retry from all devices: */ 1524 target = 0; 1525 goto retry; 1526 } 1527 done: 1528 rcu_read_unlock(); 1529 1530 BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); 1531 1532 return replicas >= replicas_need ? 0 : -EROFS; 1533 } 1534 1535 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1536 { 1537 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1538 1539 /* we aren't holding j->lock: */ 1540 unsigned new_size = READ_ONCE(j->buf_size_want); 1541 void *new_buf; 1542 1543 if (buf->buf_size >= new_size) 1544 return; 1545 1546 size_t btree_write_buffer_size = new_size / 64; 1547 1548 if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) 1549 return; 1550 1551 new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1552 if (!new_buf) 1553 return; 1554 1555 memcpy(new_buf, buf->data, buf->buf_size); 1556 1557 spin_lock(&j->lock); 1558 swap(buf->data, new_buf); 1559 swap(buf->buf_size, new_size); 1560 spin_unlock(&j->lock); 1561 1562 kvfree(new_buf); 1563 } 1564 1565 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) 1566 { 1567 return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); 1568 } 1569 1570 static CLOSURE_CALLBACK(journal_write_done) 1571 { 1572 closure_type(w, struct journal_buf, io); 1573 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1574 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1575 struct bch_replicas_padded replicas; 1576 union journal_res_state old, new; 1577 u64 seq = le64_to_cpu(w->data->seq); 1578 int err = 0; 1579 1580 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) 1581 ? 
j->flush_write_time 1582 : j->noflush_write_time, j->write_start_time); 1583 1584 if (!w->devs_written.nr) { 1585 bch_err(c, "unable to write journal to sufficient devices"); 1586 err = -EIO; 1587 } else { 1588 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1589 w->devs_written); 1590 if (bch2_mark_replicas(c, &replicas.e)) 1591 err = -EIO; 1592 } 1593 1594 if (err) 1595 bch2_fatal_error(c); 1596 1597 closure_debug_destroy(cl); 1598 1599 spin_lock(&j->lock); 1600 if (seq >= j->pin.front) 1601 journal_seq_pin(j, seq)->devs = w->devs_written; 1602 if (err && (!j->err_seq || seq < j->err_seq)) 1603 j->err_seq = seq; 1604 w->write_done = true; 1605 1606 bool completed = false; 1607 1608 for (seq = journal_last_unwritten_seq(j); 1609 seq <= journal_cur_seq(j); 1610 seq++) { 1611 w = j->buf + (seq & JOURNAL_BUF_MASK); 1612 if (!w->write_done) 1613 break; 1614 1615 if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { 1616 j->flushed_seq_ondisk = seq; 1617 j->last_seq_ondisk = w->last_seq; 1618 1619 bch2_do_discards(c); 1620 closure_wake_up(&c->freelist_wait); 1621 bch2_reset_alloc_cursors(c); 1622 } 1623 1624 j->seq_ondisk = seq; 1625 1626 /* 1627 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1628 * more buckets: 1629 * 1630 * Must come before signaling write completion, for 1631 * bch2_fs_journal_stop(): 1632 */ 1633 if (j->watermark != BCH_WATERMARK_stripe) 1634 journal_reclaim_kick(&c->journal); 1635 1636 old.v = atomic64_read(&j->reservations.counter); 1637 do { 1638 new.v = old.v; 1639 BUG_ON(journal_state_count(new, new.unwritten_idx)); 1640 BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); 1641 1642 new.unwritten_idx++; 1643 } while (!atomic64_try_cmpxchg(&j->reservations.counter, 1644 &old.v, new.v)); 1645 1646 closure_wake_up(&w->wait); 1647 completed = true; 1648 } 1649 1650 if (completed) { 1651 bch2_journal_reclaim_fast(j); 1652 bch2_journal_space_available(j); 1653 1654 
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); 1655 1656 journal_wake(j); 1657 } 1658 1659 if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1660 new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1661 struct journal_buf *buf = journal_cur_buf(j); 1662 long delta = buf->expires - jiffies; 1663 1664 /* 1665 * We don't close a journal entry to write it while there's 1666 * previous entries still in flight - the current journal entry 1667 * might want to be written now: 1668 */ 1669 mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); 1670 } 1671 1672 /* 1673 * We don't typically trigger journal writes from her - the next journal 1674 * write will be triggered immediately after the previous one is 1675 * allocated, in bch2_journal_write() - but the journal write error path 1676 * is special: 1677 */ 1678 bch2_journal_do_writes(j); 1679 spin_unlock(&j->lock); 1680 } 1681 1682 static void journal_write_endio(struct bio *bio) 1683 { 1684 struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); 1685 struct bch_dev *ca = jbio->ca; 1686 struct journal *j = &ca->fs->journal; 1687 struct journal_buf *w = j->buf + jbio->buf_idx; 1688 1689 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, 1690 "error writing journal entry %llu: %s", 1691 le64_to_cpu(w->data->seq), 1692 bch2_blk_status_to_str(bio->bi_status)) || 1693 bch2_meta_write_fault("journal")) { 1694 unsigned long flags; 1695 1696 spin_lock_irqsave(&j->err_lock, flags); 1697 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); 1698 spin_unlock_irqrestore(&j->err_lock, flags); 1699 } 1700 1701 closure_put(&w->io); 1702 percpu_ref_put(&ca->io_ref); 1703 } 1704 1705 static CLOSURE_CALLBACK(journal_write_submit) 1706 { 1707 closure_type(w, struct journal_buf, io); 1708 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1709 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1710 unsigned sectors = vstruct_sectors(w->data, 
c->block_bits); 1711 1712 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { 1713 struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); 1714 if (!ca) { 1715 /* XXX: fix this */ 1716 bch_err(c, "missing device for journal write\n"); 1717 continue; 1718 } 1719 1720 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], 1721 sectors); 1722 1723 struct journal_device *ja = &ca->journal; 1724 struct bio *bio = &ja->bio[w->idx]->bio; 1725 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1726 bio->bi_iter.bi_sector = ptr->offset; 1727 bio->bi_end_io = journal_write_endio; 1728 bio->bi_private = ca; 1729 1730 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1731 ca->prev_journal_sector = bio->bi_iter.bi_sector; 1732 1733 if (!JSET_NO_FLUSH(w->data)) 1734 bio->bi_opf |= REQ_FUA; 1735 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) 1736 bio->bi_opf |= REQ_PREFLUSH; 1737 1738 bch2_bio_map(bio, w->data, sectors << 9); 1739 1740 trace_and_count(c, journal_write, bio); 1741 closure_bio_submit(bio, cl); 1742 1743 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1744 } 1745 1746 continue_at(cl, journal_write_done, j->wq); 1747 } 1748 1749 static CLOSURE_CALLBACK(journal_write_preflush) 1750 { 1751 closure_type(w, struct journal_buf, io); 1752 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1753 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1754 1755 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1756 spin_lock(&j->lock); 1757 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1758 closure_wait(&j->async_wait, cl); 1759 spin_unlock(&j->lock); 1760 continue_at(cl, journal_write_preflush, j->wq); 1761 return; 1762 } 1763 spin_unlock(&j->lock); 1764 } 1765 1766 if (w->separate_flush) { 1767 for_each_rw_member(c, ca) { 1768 percpu_ref_get(&ca->io_ref); 1769 1770 struct journal_device *ja = &ca->journal; 1771 struct bio *bio = &ja->bio[w->idx]->bio; 1772 bio_reset(bio, ca->disk_sb.bdev, 1773 
REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); 1774 bio->bi_end_io = journal_write_endio; 1775 bio->bi_private = ca; 1776 closure_bio_submit(bio, cl); 1777 } 1778 1779 continue_at(cl, journal_write_submit, j->wq); 1780 } else { 1781 /* 1782 * no need to punt to another work item if we're not waiting on 1783 * preflushes 1784 */ 1785 journal_write_submit(&cl->work); 1786 } 1787 } 1788 1789 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) 1790 { 1791 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1792 struct jset_entry *start, *end; 1793 struct jset *jset = w->data; 1794 struct journal_keys_to_wb wb = { NULL }; 1795 unsigned sectors, bytes, u64s; 1796 unsigned long btree_roots_have = 0; 1797 bool validate_before_checksum = false; 1798 u64 seq = le64_to_cpu(jset->seq); 1799 int ret; 1800 1801 /* 1802 * Simple compaction, dropping empty jset_entries (from journal 1803 * reservations that weren't fully used) and merging jset_entries that 1804 * can be. 
1805 * 1806 * If we wanted to be really fancy here, we could sort all the keys in 1807 * the jset and drop keys that were overwritten - probably not worth it: 1808 */ 1809 vstruct_for_each(jset, i) { 1810 unsigned u64s = le16_to_cpu(i->u64s); 1811 1812 /* Empty entry: */ 1813 if (!u64s) 1814 continue; 1815 1816 /* 1817 * New btree roots are set by journalling them; when the journal 1818 * entry gets written we have to propagate them to 1819 * c->btree_roots 1820 * 1821 * But, every journal entry we write has to contain all the 1822 * btree roots (at least for now); so after we copy btree roots 1823 * to c->btree_roots we have to get any missing btree roots and 1824 * add them to this journal entry: 1825 */ 1826 switch (i->type) { 1827 case BCH_JSET_ENTRY_btree_root: 1828 bch2_journal_entry_to_btree_root(c, i); 1829 __set_bit(i->btree_id, &btree_roots_have); 1830 break; 1831 case BCH_JSET_ENTRY_write_buffer_keys: 1832 EBUG_ON(!w->need_flush_to_write_buffer); 1833 1834 if (!wb.wb) 1835 bch2_journal_keys_to_write_buffer_start(c, &wb, seq); 1836 1837 jset_entry_for_each_key(i, k) { 1838 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); 1839 if (ret) { 1840 bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", 1841 bch2_err_str(ret)); 1842 bch2_journal_keys_to_write_buffer_end(c, &wb); 1843 return ret; 1844 } 1845 } 1846 i->type = BCH_JSET_ENTRY_btree_keys; 1847 break; 1848 } 1849 } 1850 1851 if (wb.wb) { 1852 ret = bch2_journal_keys_to_write_buffer_end(c, &wb); 1853 if (ret) { 1854 bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", 1855 bch2_err_str(ret)); 1856 return ret; 1857 } 1858 } 1859 1860 spin_lock(&c->journal.lock); 1861 w->need_flush_to_write_buffer = false; 1862 spin_unlock(&c->journal.lock); 1863 1864 start = end = vstruct_last(jset); 1865 1866 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1867 1868 struct jset_entry_datetime *d = 1869 container_of(jset_entry_init(&end, sizeof(*d)), 
struct jset_entry_datetime, entry); 1870 d->entry.type = BCH_JSET_ENTRY_datetime; 1871 d->seconds = cpu_to_le64(ktime_get_real_seconds()); 1872 1873 bch2_journal_super_entries_add_common(c, &end, seq); 1874 u64s = (u64 *) end - (u64 *) start; 1875 1876 WARN_ON(u64s > j->entry_u64s_reserved); 1877 1878 le32_add_cpu(&jset->u64s, u64s); 1879 1880 sectors = vstruct_sectors(jset, c->block_bits); 1881 bytes = vstruct_bytes(jset); 1882 1883 if (sectors > w->sectors) { 1884 bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 1885 vstruct_bytes(jset), w->sectors << 9, 1886 u64s, w->u64s_reserved, j->entry_u64s_reserved); 1887 return -EINVAL; 1888 } 1889 1890 jset->magic = cpu_to_le64(jset_magic(c)); 1891 jset->version = cpu_to_le32(c->sb.version); 1892 1893 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 1894 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1895 1896 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1897 j->last_empty_seq = seq; 1898 1899 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1900 validate_before_checksum = true; 1901 1902 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 1903 validate_before_checksum = true; 1904 1905 if (validate_before_checksum && 1906 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1907 return ret; 1908 1909 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), 1910 jset->encrypted_start, 1911 vstruct_end(jset) - (void *) jset->encrypted_start); 1912 if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) 1913 return ret; 1914 1915 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), 1916 journal_nonce(jset), jset); 1917 1918 if (!validate_before_checksum && 1919 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1920 return ret; 1921 1922 memset((void *) jset + bytes, 0, (sectors << 9) - bytes); 1923 return 0; 1924 } 1925 1926 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) 
1927 { 1928 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1929 int error = bch2_journal_error(j); 1930 1931 /* 1932 * If the journal is in an error state - we did an emergency shutdown - 1933 * we prefer to continue doing journal writes. We just mark them as 1934 * noflush so they'll never be used, but they'll still be visible by the 1935 * list_journal tool - this helps in debugging. 1936 * 1937 * There's a caveat: the first journal write after marking the 1938 * superblock dirty must always be a flush write, because on startup 1939 * from a clean shutdown we didn't necessarily read the journal and the 1940 * new journal write might overwrite whatever was in the journal 1941 * previously - we can't leave the journal without any flush writes in 1942 * it. 1943 * 1944 * So if we're in an error state, and we're still starting up, we don't 1945 * write anything at all. 1946 */ 1947 if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) 1948 return -EIO; 1949 1950 if (error || 1951 w->noflush || 1952 (!w->must_flush && 1953 (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && 1954 test_bit(JOURNAL_may_skip_flush, &j->flags))) { 1955 w->noflush = true; 1956 SET_JSET_NO_FLUSH(w->data, true); 1957 w->data->last_seq = 0; 1958 w->last_seq = 0; 1959 1960 j->nr_noflush_writes++; 1961 } else { 1962 w->must_flush = true; 1963 j->last_flush_write = jiffies; 1964 j->nr_flush_writes++; 1965 clear_bit(JOURNAL_need_flush_write, &j->flags); 1966 } 1967 1968 return 0; 1969 } 1970 1971 CLOSURE_CALLBACK(bch2_journal_write) 1972 { 1973 closure_type(w, struct journal_buf, io); 1974 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1975 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1976 struct bch_replicas_padded replicas; 1977 unsigned nr_rw_members = 0; 1978 int ret; 1979 1980 for_each_rw_member(c, ca) 1981 nr_rw_members++; 1982 1983 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 1984 BUG_ON(!w->write_started); 1985 
/*
 * Closure callback: top-level driver for writing one journal buffer.
 *
 * Picks flush vs. noflush, finalizes the entry (bch2_journal_write_prep()),
 * allocates space on the journal devices, marks the replicas entry and then
 * continues at the preflush or submit stage.  With the nochanges option, or
 * on any error, it goes straight to journal_write_done(); errors also call
 * bch2_fatal_error().
 */
CLOSURE_CALLBACK(bch2_journal_write)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_replicas_padded replicas;
	unsigned nr_rw_members = 0;
	int ret;

	for_each_rw_member(c, ca)
		nr_rw_members++;

	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
	BUG_ON(!w->write_started);
	BUG_ON(w->write_allocated);
	BUG_ON(w->write_done);

	j->write_start_time = local_clock();

	spin_lock(&j->lock);
	/* With more than one rw device, the preflush is issued separately */
	if (nr_rw_members > 1)
		w->separate_flush = true;

	ret = bch2_journal_write_pick_flush(j, w);
	spin_unlock(&j->lock);
	if (ret)
		goto err;

	/* Buffer resize and entry finalization both run under buf_lock */
	mutex_lock(&j->buf_lock);
	journal_buf_realloc(j, w);

	ret = bch2_journal_write_prep(j, w);
	mutex_unlock(&j->buf_lock);
	if (ret)
		goto err;

	j->entry_bytes_written += vstruct_bytes(w->data);

	/*
	 * Allocate space for the write, running discards and retrying for as
	 * long as allocation fails and j->can_discard is set.
	 *
	 * Note: the loop is left via break with j->lock still held.
	 */
	while (1) {
		spin_lock(&j->lock);
		ret = journal_write_alloc(j, w);
		if (!ret || !j->can_discard)
			break;

		spin_unlock(&j->lock);
		bch2_journal_do_discards(j);
	}

	if (ret) {
		struct printbuf buf = PRINTBUF;
		/* j->lock is held while the message is built */
		buf.atomic++;

		prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"),
			   le64_to_cpu(w->data->seq),
			   bch2_err_str(ret));
		__bch2_journal_debug_to_text(&buf, j);
		spin_unlock(&j->lock);
		bch2_print_string_as_lines(KERN_ERR, buf.buf);
		printbuf_exit(&buf);
		goto err;
	}

	/*
	 * write is allocated, no longer need to account for it in
	 * bch2_journal_space_available():
	 */
	w->sectors = 0;
	w->write_allocated = true;

	/*
	 * journal entry has been compacted and allocated, recalculate space
	 * available:
	 */
	bch2_journal_space_available(j);
	bch2_journal_do_writes(j);
	spin_unlock(&j->lock);

	/* Start from every device in the key; journal_write_endio() drops failures */
	w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));

	if (c->opts.nochanges)
		goto no_io;

	/*
	 * Mark journal replicas before we submit the write to guarantee
	 * recovery will find the journal entries after a crash.
	 */
	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
				 w->devs_written);
	ret = bch2_mark_replicas(c, &replicas.e);
	if (ret)
		goto err;

	/* Flush writes go through the preflush stage first */
	if (!JSET_NO_FLUSH(w->data))
		continue_at(cl, journal_write_preflush, j->wq);
	else
		continue_at(cl, journal_write_submit, j->wq);
	return;
no_io:
	/* nochanges: skip the I/O but still run completion */
	continue_at(cl, journal_write_done, j->wq);
	return;
err:
	bch2_fatal_error(c);
	continue_at(cl, journal_write_done, j->wq);
}