1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "btree_io.h" 6 #include "btree_update_interior.h" 7 #include "btree_write_buffer.h" 8 #include "buckets.h" 9 #include "checksum.h" 10 #include "disk_groups.h" 11 #include "error.h" 12 #include "journal.h" 13 #include "journal_io.h" 14 #include "journal_reclaim.h" 15 #include "journal_seq_blacklist.h" 16 #include "replicas.h" 17 #include "sb-clean.h" 18 #include "trace.h" 19 20 void bch2_journal_pos_from_member_info_set(struct bch_fs *c) 21 { 22 lockdep_assert_held(&c->sb_lock); 23 24 for_each_member_device(c, ca) { 25 struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); 26 27 m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); 28 m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); 29 } 30 } 31 32 void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) 33 { 34 mutex_lock(&c->sb_lock); 35 for_each_member_device(c, ca) { 36 struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); 37 38 unsigned idx = le32_to_cpu(m.last_journal_bucket); 39 if (idx < ca->journal.nr) 40 ca->journal.cur_idx = idx; 41 unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); 42 if (offset <= ca->mi.bucket_size) 43 ca->journal.sectors_free = ca->mi.bucket_size - offset; 44 } 45 mutex_unlock(&c->sb_lock); 46 } 47 48 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 49 struct journal_replay *j) 50 { 51 darray_for_each(j->ptrs, i) { 52 if (i != j->ptrs.data) 53 prt_printf(out, " "); 54 prt_printf(out, "%u:%u:%u (sector %llu)", 55 i->dev, i->bucket, i->bucket_offset, i->sector); 56 } 57 } 58 59 static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, 60 struct journal_replay *j) 61 { 62 prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); 63 64 bch2_journal_ptrs_to_text(out, c, j); 65 66 for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { 67 struct jset_entry_datetime *datetime = 68 container_of(entry, struct jset_entry_datetime, entry); 69 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 70 break; 71 } 72 } 73 74 static struct nonce journal_nonce(const struct jset *jset) 75 { 76 return (struct nonce) {{ 77 [0] = 0, 78 [1] = ((__le32 *) &jset->seq)[0], 79 [2] = ((__le32 *) &jset->seq)[1], 80 [3] = BCH_NONCE_JOURNAL, 81 }}; 82 } 83 84 static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) 85 { 86 if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { 87 *csum = (struct bch_csum) {}; 88 return false; 89 } 90 91 *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); 92 return !bch2_crc_cmp(j->csum, *csum); 93 } 94 95 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) 96 { 97 return (seq - c->journal_entries_base_seq) & (~0U >> 1); 98 } 99 100 static void __journal_replay_free(struct bch_fs *c, 101 struct journal_replay *i) 102 { 103 struct journal_replay **p = 104 genradix_ptr(&c->journal_entries, 105 journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); 106 107 BUG_ON(*p != i); 108 *p = NULL; 109 kvfree(i); 110 } 111 112 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) 113 { 114 if (blacklisted) 115 i->ignore_blacklisted = true; 116 else 117 i->ignore_not_dirty = true; 118 119 if (!c->opts.read_entire_journal) 120 __journal_replay_free(c, i); 121 } 122 123 struct journal_list { 124 struct closure cl; 125 u64 last_seq; 126 struct mutex lock; 127 int ret; 128 }; 129 130 #define JOURNAL_ENTRY_ADD_OK 0 131 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 132 133 /* 134 * Given a journal entry we just read, add it to the list of journal entries to 135 * be replayed: 136 */ 137 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, 138 struct journal_ptr entry_ptr, 139 struct journal_list *jlist, struct jset *j) 140 { 141 struct genradix_iter iter; 142 struct journal_replay **_i, *i, *dup; 143 size_t bytes = vstruct_bytes(j); 144 u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; 145 struct printbuf buf = PRINTBUF; 146 int ret = JOURNAL_ENTRY_ADD_OK; 147 148 if (!c->journal.oldest_seq_found_ondisk || 149 le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) 150 c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); 151 152 /* Is this entry older than the range we need? */ 153 if (!c->opts.read_entire_journal && 154 le64_to_cpu(j->seq) < jlist->last_seq) 155 return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; 156 157 /* 158 * genradixes are indexed by a ulong, not a u64, so we can't index them 159 * by sequence number directly: Assume instead that they will all fall 160 * within the range of +-2billion of the filrst one we find. 161 */ 162 if (!c->journal_entries_base_seq) 163 c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); 164 165 /* Drop entries we don't need anymore */ 166 if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { 167 genradix_for_each_from(&c->journal_entries, iter, _i, 168 journal_entry_radix_idx(c, jlist->last_seq)) { 169 i = *_i; 170 171 if (journal_replay_ignore(i)) 172 continue; 173 174 if (le64_to_cpu(i->j.seq) >= last_seq) 175 break; 176 177 journal_replay_free(c, i, false); 178 } 179 } 180 181 jlist->last_seq = max(jlist->last_seq, last_seq); 182 183 _i = genradix_ptr_alloc(&c->journal_entries, 184 journal_entry_radix_idx(c, le64_to_cpu(j->seq)), 185 GFP_KERNEL); 186 if (!_i) 187 return -BCH_ERR_ENOMEM_journal_entry_add; 188 189 /* 190 * Duplicate journal entries? If so we want the one that didn't have a 191 * checksum error: 192 */ 193 dup = *_i; 194 if (dup) { 195 bool identical = bytes == vstruct_bytes(&dup->j) && 196 !memcmp(j, &dup->j, bytes); 197 bool not_identical = !identical && 198 entry_ptr.csum_good && 199 dup->csum_good; 200 201 bool same_device = false; 202 darray_for_each(dup->ptrs, ptr) 203 if (ptr->dev == ca->dev_idx) 204 same_device = true; 205 206 ret = darray_push(&dup->ptrs, entry_ptr); 207 if (ret) 208 goto out; 209 210 bch2_journal_replay_to_text(&buf, c, dup); 211 212 fsck_err_on(same_device, 213 c, journal_entry_dup_same_device, 214 "duplicate journal entry on same device\n %s", 215 buf.buf); 216 217 fsck_err_on(not_identical, 218 c, journal_entry_replicas_data_mismatch, 219 "found duplicate but non identical journal entries\n %s", 220 buf.buf); 221 222 if (entry_ptr.csum_good && !identical) 223 goto replace; 224 225 goto out; 226 } 227 replace: 228 i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 229 if (!i) 230 return -BCH_ERR_ENOMEM_journal_entry_add; 231 232 darray_init(&i->ptrs); 233 i->csum_good = entry_ptr.csum_good; 234 i->ignore_blacklisted = false; 235 i->ignore_not_dirty = false; 236 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 237 238 if (dup) { 239 /* The first ptr should represent the jset we kept: */ 240 darray_for_each(dup->ptrs, ptr) 241 darray_push(&i->ptrs, *ptr); 242 __journal_replay_free(c, dup); 243 } else { 244 darray_push(&i->ptrs, entry_ptr); 245 } 246 247 *_i = i; 248 out: 249 fsck_err: 250 printbuf_exit(&buf); 251 return ret; 252 } 253 254 /* this fills in a range with empty jset_entries: */ 255 static void journal_entry_null_range(void *start, void *end) 256 { 257 struct jset_entry *entry; 258 259 for (entry = start; entry != end; entry = vstruct_next(entry)) 260 memset(entry, 0, sizeof(*entry)); 261 } 262 263 #define JOURNAL_ENTRY_REREAD 5 264 #define JOURNAL_ENTRY_NONE 6 265 #define JOURNAL_ENTRY_BAD 7 266 267 static void journal_entry_err_msg(struct printbuf *out, 268 u32 version, 269 struct jset *jset, 270 struct jset_entry *entry) 271 { 272 prt_str(out, "invalid journal entry, version="); 273 bch2_version_to_text(out, version); 274 275 if (entry) { 276 prt_str(out, " type="); 277 bch2_prt_jset_entry_type(out, entry->type); 278 } 279 280 if (!jset) { 281 prt_printf(out, " in superblock"); 282 } else { 283 284 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); 285 286 if (entry) 287 prt_printf(out, " offset=%zi/%u", 288 (u64 *) entry - jset->_data, 289 le32_to_cpu(jset->u64s)); 290 } 291 292 prt_str(out, ": "); 293 } 294 295 #define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ 296 ({ \ 297 struct printbuf _buf = PRINTBUF; \ 298 \ 299 journal_entry_err_msg(&_buf, version, jset, entry); \ 300 prt_printf(&_buf, msg, ##__VA_ARGS__); \ 301 \ 302 switch (flags & BCH_VALIDATE_write) { \ 303 case READ: \ 304 mustfix_fsck_err(c, _err, "%s", _buf.buf); \ 305 break; \ 306 case WRITE: \ 307 bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ 308 bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ 309 if (bch2_fs_inconsistent(c)) { \ 310 ret = -BCH_ERR_fsck_errors_not_fixed; \ 311 goto fsck_err; \ 312 } \ 313 break; \ 314 } \ 315 \ 316 printbuf_exit(&_buf); \ 317 true; \ 318 }) 319 320 #define journal_entry_err_on(cond, ...) \ 321 ((cond) ? journal_entry_err(__VA_ARGS__) : false) 322 323 #define FSCK_DELETED_KEY 5 324 325 static int journal_validate_key(struct bch_fs *c, 326 struct jset *jset, 327 struct jset_entry *entry, 328 unsigned level, enum btree_id btree_id, 329 struct bkey_i *k, 330 unsigned version, int big_endian, 331 enum bch_validate_flags flags) 332 { 333 int write = flags & BCH_VALIDATE_write; 334 void *next = vstruct_next(entry); 335 int ret = 0; 336 337 if (journal_entry_err_on(!k->k.u64s, 338 c, version, jset, entry, 339 journal_entry_bkey_u64s_0, 340 "k->u64s 0")) { 341 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 342 journal_entry_null_range(vstruct_next(entry), next); 343 return FSCK_DELETED_KEY; 344 } 345 346 if (journal_entry_err_on((void *) bkey_next(k) > 347 (void *) vstruct_next(entry), 348 c, version, jset, entry, 349 journal_entry_bkey_past_end, 350 "extends past end of journal entry")) { 351 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 352 journal_entry_null_range(vstruct_next(entry), next); 353 return FSCK_DELETED_KEY; 354 } 355 356 if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, 357 c, version, jset, entry, 358 journal_entry_bkey_bad_format, 359 "bad format %u", k->k.format)) { 360 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 361 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 362 journal_entry_null_range(vstruct_next(entry), next); 363 return FSCK_DELETED_KEY; 364 } 365 366 if (!write) 367 bch2_bkey_compat(level, btree_id, version, big_endian, 368 write, NULL, bkey_to_packed(k)); 369 370 ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), 371 __btree_node_type(level, btree_id), write); 372 if (ret == -BCH_ERR_fsck_delete_bkey) { 373 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 374 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 375 journal_entry_null_range(vstruct_next(entry), next); 376 return FSCK_DELETED_KEY; 377 } 378 if (ret) 379 goto fsck_err; 380 381 if (write) 382 bch2_bkey_compat(level, btree_id, version, big_endian, 383 write, NULL, bkey_to_packed(k)); 384 fsck_err: 385 return ret; 386 } 387 388 static int journal_entry_btree_keys_validate(struct bch_fs *c, 389 struct jset *jset, 390 struct jset_entry *entry, 391 unsigned version, int big_endian, 392 enum bch_validate_flags flags) 393 { 394 struct bkey_i *k = entry->start; 395 396 while (k != vstruct_last(entry)) { 397 int ret = journal_validate_key(c, jset, entry, 398 entry->level, 399 entry->btree_id, 400 k, version, big_endian, 401 flags|BCH_VALIDATE_journal); 402 if (ret == FSCK_DELETED_KEY) 403 continue; 404 else if (ret) 405 return ret; 406 407 k = bkey_next(k); 408 } 409 410 return 0; 411 } 412 413 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 414 struct jset_entry *entry) 415 { 416 bool first = true; 417 418 jset_entry_for_each_key(entry, k) { 419 if (!first) { 420 prt_newline(out); 421 bch2_prt_jset_entry_type(out, entry->type); 422 prt_str(out, ": "); 423 } 424 prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); 425 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); 426 first = false; 427 } 428 } 429 430 static int journal_entry_btree_root_validate(struct bch_fs *c, 431 struct jset *jset, 432 struct jset_entry *entry, 433 unsigned version, int big_endian, 434 enum bch_validate_flags flags) 435 { 436 struct bkey_i *k = entry->start; 437 int ret = 0; 438 439 if (journal_entry_err_on(!entry->u64s || 440 le16_to_cpu(entry->u64s) != k->k.u64s, 441 c, version, jset, entry, 442 journal_entry_btree_root_bad_size, 443 "invalid btree root journal entry: wrong number of keys")) { 444 void *next = vstruct_next(entry); 445 /* 446 * we don't want to null out this jset_entry, 447 * just the contents, so that later we can tell 448 * we were _supposed_ to have a btree root 449 */ 450 entry->u64s = 0; 451 journal_entry_null_range(vstruct_next(entry), next); 452 return 0; 453 } 454 455 ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k, 456 version, big_endian, flags); 457 if (ret == FSCK_DELETED_KEY) 458 ret = 0; 459 fsck_err: 460 return ret; 461 } 462 463 static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, 464 struct jset_entry *entry) 465 { 466 journal_entry_btree_keys_to_text(out, c, entry); 467 } 468 469 static int journal_entry_prio_ptrs_validate(struct bch_fs *c, 470 struct jset *jset, 471 struct jset_entry *entry, 472 unsigned version, int big_endian, 473 enum bch_validate_flags flags) 474 { 475 /* obsolete, don't care: */ 476 return 0; 477 } 478 479 static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 480 struct jset_entry *entry) 481 { 482 } 483 484 static int journal_entry_blacklist_validate(struct bch_fs *c, 485 struct jset *jset, 486 struct jset_entry *entry, 487 unsigned version, int big_endian, 488 enum bch_validate_flags flags) 489 { 490 int ret = 0; 491 492 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, 493 c, version, jset, entry, 494 journal_entry_blacklist_bad_size, 495 "invalid journal seq blacklist entry: bad size")) { 496 journal_entry_null_range(entry, vstruct_next(entry)); 497 } 498 fsck_err: 499 return ret; 500 } 501 502 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, 503 struct jset_entry *entry) 504 { 505 struct jset_entry_blacklist *bl = 506 container_of(entry, struct jset_entry_blacklist, entry); 507 508 prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); 509 } 510 511 static int journal_entry_blacklist_v2_validate(struct bch_fs *c, 512 struct jset *jset, 513 struct jset_entry *entry, 514 unsigned version, int big_endian, 515 enum bch_validate_flags flags) 516 { 517 struct jset_entry_blacklist_v2 *bl_entry; 518 int ret = 0; 519 520 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, 521 c, version, jset, entry, 522 journal_entry_blacklist_v2_bad_size, 523 "invalid journal seq blacklist entry: bad size")) { 524 journal_entry_null_range(entry, vstruct_next(entry)); 525 goto out; 526 } 527 528 bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); 529 530 if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > 531 le64_to_cpu(bl_entry->end), 532 c, version, jset, entry, 533 journal_entry_blacklist_v2_start_past_end, 534 "invalid journal seq blacklist entry: start > end")) { 535 journal_entry_null_range(entry, vstruct_next(entry)); 536 } 537 out: 538 fsck_err: 539 return ret; 540 } 541 542 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, 543 struct jset_entry *entry) 544 { 545 struct jset_entry_blacklist_v2 *bl = 546 container_of(entry, struct jset_entry_blacklist_v2, entry); 547 548 prt_printf(out, "start=%llu end=%llu", 549 le64_to_cpu(bl->start), 550 le64_to_cpu(bl->end)); 551 } 552 553 static int journal_entry_usage_validate(struct bch_fs *c, 554 struct jset *jset, 555 struct jset_entry *entry, 556 unsigned version, int big_endian, 557 enum bch_validate_flags flags) 558 { 559 struct jset_entry_usage *u = 560 container_of(entry, struct jset_entry_usage, entry); 561 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 562 int ret = 0; 563 564 if (journal_entry_err_on(bytes < sizeof(*u), 565 c, version, jset, entry, 566 journal_entry_usage_bad_size, 567 "invalid journal entry usage: bad size")) { 568 journal_entry_null_range(entry, vstruct_next(entry)); 569 return ret; 570 } 571 572 fsck_err: 573 return ret; 574 } 575 576 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, 577 struct jset_entry *entry) 578 { 579 struct jset_entry_usage *u = 580 container_of(entry, struct jset_entry_usage, entry); 581 582 prt_str(out, "type="); 583 bch2_prt_fs_usage_type(out, u->entry.btree_id); 584 prt_printf(out, " v=%llu", le64_to_cpu(u->v)); 585 } 586 587 static int journal_entry_data_usage_validate(struct bch_fs *c, 588 struct jset *jset, 589 struct jset_entry *entry, 590 unsigned version, int big_endian, 591 enum bch_validate_flags flags) 592 { 593 struct jset_entry_data_usage *u = 594 container_of(entry, struct jset_entry_data_usage, entry); 595 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 596 struct printbuf err = PRINTBUF; 597 int ret = 0; 598 599 if (journal_entry_err_on(bytes < sizeof(*u) || 600 bytes < sizeof(*u) + u->r.nr_devs, 601 c, version, jset, entry, 602 journal_entry_data_usage_bad_size, 603 "invalid journal entry usage: bad size")) { 604 journal_entry_null_range(entry, vstruct_next(entry)); 605 goto out; 606 } 607 608 if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err), 609 c, version, jset, entry, 610 journal_entry_data_usage_bad_size, 611 "invalid journal entry usage: %s", err.buf)) { 612 journal_entry_null_range(entry, vstruct_next(entry)); 613 goto out; 614 } 615 out: 616 fsck_err: 617 printbuf_exit(&err); 618 return ret; 619 } 620 621 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, 622 struct jset_entry *entry) 623 { 624 struct jset_entry_data_usage *u = 625 container_of(entry, struct jset_entry_data_usage, entry); 626 627 bch2_replicas_entry_to_text(out, &u->r); 628 prt_printf(out, "=%llu", le64_to_cpu(u->v)); 629 } 630 631 static int journal_entry_clock_validate(struct bch_fs *c, 632 struct jset *jset, 633 struct jset_entry *entry, 634 unsigned version, int big_endian, 635 enum bch_validate_flags flags) 636 { 637 struct jset_entry_clock *clock = 638 container_of(entry, struct jset_entry_clock, entry); 639 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 640 int ret = 0; 641 642 if (journal_entry_err_on(bytes != sizeof(*clock), 643 c, version, jset, entry, 644 journal_entry_clock_bad_size, 645 "bad size")) { 646 journal_entry_null_range(entry, vstruct_next(entry)); 647 return ret; 648 } 649 650 if (journal_entry_err_on(clock->rw > 1, 651 c, version, jset, entry, 652 journal_entry_clock_bad_rw, 653 "bad rw")) { 654 journal_entry_null_range(entry, vstruct_next(entry)); 655 return ret; 656 } 657 658 fsck_err: 659 return ret; 660 } 661 662 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, 663 struct jset_entry *entry) 664 { 665 struct jset_entry_clock *clock = 666 container_of(entry, struct jset_entry_clock, entry); 667 668 prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); 669 } 670 671 static int journal_entry_dev_usage_validate(struct bch_fs *c, 672 struct jset *jset, 673 struct jset_entry *entry, 674 unsigned version, int big_endian, 675 enum bch_validate_flags flags) 676 { 677 struct jset_entry_dev_usage *u = 678 container_of(entry, struct jset_entry_dev_usage, entry); 679 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 680 unsigned expected = sizeof(*u); 681 int ret = 0; 682 683 if (journal_entry_err_on(bytes < expected, 684 c, version, jset, entry, 685 journal_entry_dev_usage_bad_size, 686 "bad size (%u < %u)", 687 bytes, expected)) { 688 journal_entry_null_range(entry, vstruct_next(entry)); 689 return ret; 690 } 691 692 if (journal_entry_err_on(u->pad, 693 c, version, jset, entry, 694 journal_entry_dev_usage_bad_pad, 695 "bad pad")) { 696 journal_entry_null_range(entry, vstruct_next(entry)); 697 return ret; 698 } 699 700 fsck_err: 701 return ret; 702 } 703 704 static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, 705 struct jset_entry *entry) 706 { 707 struct jset_entry_dev_usage *u = 708 container_of(entry, struct jset_entry_dev_usage, entry); 709 unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); 710 711 prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); 712 713 printbuf_indent_add(out, 2); 714 for (i = 0; i < nr_types; i++) { 715 prt_newline(out); 716 bch2_prt_data_type(out, i); 717 prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", 718 le64_to_cpu(u->d[i].buckets), 719 le64_to_cpu(u->d[i].sectors), 720 le64_to_cpu(u->d[i].fragmented)); 721 } 722 printbuf_indent_sub(out, 2); 723 } 724 725 static int journal_entry_log_validate(struct bch_fs *c, 726 struct jset *jset, 727 struct jset_entry *entry, 728 unsigned version, int big_endian, 729 enum bch_validate_flags flags) 730 { 731 return 0; 732 } 733 734 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 735 struct jset_entry *entry) 736 { 737 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 738 unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); 739 740 prt_printf(out, "%.*s", bytes, l->d); 741 } 742 743 static int journal_entry_overwrite_validate(struct bch_fs *c, 744 struct jset *jset, 745 struct jset_entry *entry, 746 unsigned version, int big_endian, 747 enum bch_validate_flags flags) 748 { 749 return journal_entry_btree_keys_validate(c, jset, entry, 750 version, big_endian, READ); 751 } 752 753 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 754 struct jset_entry *entry) 755 { 756 journal_entry_btree_keys_to_text(out, c, entry); 757 } 758 759 static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, 760 struct jset *jset, 761 struct jset_entry *entry, 762 unsigned version, int big_endian, 763 enum bch_validate_flags flags) 764 { 765 return journal_entry_btree_keys_validate(c, jset, entry, 766 version, big_endian, READ); 767 } 768 769 static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, 770 struct jset_entry *entry) 771 { 772 journal_entry_btree_keys_to_text(out, c, entry); 773 } 774 775 static int journal_entry_datetime_validate(struct bch_fs *c, 776 struct jset *jset, 777 struct jset_entry *entry, 778 unsigned version, int big_endian, 779 enum bch_validate_flags flags) 780 { 781 unsigned bytes = vstruct_bytes(entry); 782 unsigned expected = 16; 783 int ret = 0; 784 785 if (journal_entry_err_on(vstruct_bytes(entry) < expected, 786 c, version, jset, entry, 787 journal_entry_dev_usage_bad_size, 788 "bad size (%u < %u)", 789 bytes, expected)) { 790 journal_entry_null_range(entry, vstruct_next(entry)); 791 return ret; 792 } 793 fsck_err: 794 return ret; 795 } 796 797 static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, 798 struct jset_entry *entry) 799 { 800 struct jset_entry_datetime *datetime = 801 container_of(entry, struct jset_entry_datetime, entry); 802 803 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 804 } 805 806 struct jset_entry_ops { 807 int (*validate)(struct bch_fs *, struct jset *, 808 struct jset_entry *, unsigned, int, 809 enum bch_validate_flags); 810 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 811 }; 812 813 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 814 #define x(f, nr) \ 815 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 816 .validate = journal_entry_##f##_validate, \ 817 .to_text = journal_entry_##f##_to_text, \ 818 }, 819 BCH_JSET_ENTRY_TYPES() 820 #undef x 821 }; 822 823 int bch2_journal_entry_validate(struct bch_fs *c, 824 struct jset *jset, 825 struct jset_entry *entry, 826 unsigned version, int big_endian, 827 enum bch_validate_flags flags) 828 { 829 return entry->type < BCH_JSET_ENTRY_NR 830 ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, 831 version, big_endian, flags) 832 : 0; 833 } 834 835 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, 836 struct jset_entry *entry) 837 { 838 bch2_prt_jset_entry_type(out, entry->type); 839 840 if (entry->type < BCH_JSET_ENTRY_NR) { 841 prt_str(out, ": "); 842 bch2_jset_entry_ops[entry->type].to_text(out, c, entry); 843 } 844 } 845 846 static int jset_validate_entries(struct bch_fs *c, struct jset *jset, 847 enum bch_validate_flags flags) 848 { 849 unsigned version = le32_to_cpu(jset->version); 850 int ret = 0; 851 852 vstruct_for_each(jset, entry) { 853 if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), 854 c, version, jset, entry, 855 journal_entry_past_jset_end, 856 "journal entry extends past end of jset")) { 857 jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); 858 break; 859 } 860 861 ret = bch2_journal_entry_validate(c, jset, entry, 862 version, JSET_BIG_ENDIAN(jset), flags); 863 if (ret) 864 break; 865 } 866 fsck_err: 867 return ret; 868 } 869 870 static int jset_validate(struct bch_fs *c, 871 struct bch_dev *ca, 872 struct jset *jset, u64 sector, 873 enum bch_validate_flags flags) 874 { 875 unsigned version; 876 int ret = 0; 877 878 if (le64_to_cpu(jset->magic) != jset_magic(c)) 879 return JOURNAL_ENTRY_NONE; 880 881 version = le32_to_cpu(jset->version); 882 if (journal_entry_err_on(!bch2_version_compatible(version), 883 c, version, jset, NULL, 884 jset_unsupported_version, 885 "%s sector %llu seq %llu: incompatible journal entry version %u.%u", 886 ca ? ca->name : c->name, 887 sector, le64_to_cpu(jset->seq), 888 BCH_VERSION_MAJOR(version), 889 BCH_VERSION_MINOR(version))) { 890 /* don't try to continue: */ 891 return -EINVAL; 892 } 893 894 if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), 895 c, version, jset, NULL, 896 jset_unknown_csum, 897 "%s sector %llu seq %llu: journal entry with unknown csum type %llu", 898 ca ? ca->name : c->name, 899 sector, le64_to_cpu(jset->seq), 900 JSET_CSUM_TYPE(jset))) 901 ret = JOURNAL_ENTRY_BAD; 902 903 /* last_seq is ignored when JSET_NO_FLUSH is true */ 904 if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && 905 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), 906 c, version, jset, NULL, 907 jset_last_seq_newer_than_seq, 908 "invalid journal entry: last_seq > seq (%llu > %llu)", 909 le64_to_cpu(jset->last_seq), 910 le64_to_cpu(jset->seq))) { 911 jset->last_seq = jset->seq; 912 return JOURNAL_ENTRY_BAD; 913 } 914 915 ret = jset_validate_entries(c, jset, flags); 916 fsck_err: 917 return ret; 918 } 919 920 static int jset_validate_early(struct bch_fs *c, 921 struct bch_dev *ca, 922 struct jset *jset, u64 sector, 923 unsigned bucket_sectors_left, 924 unsigned sectors_read) 925 { 926 size_t bytes = vstruct_bytes(jset); 927 unsigned version; 928 enum bch_validate_flags flags = BCH_VALIDATE_journal; 929 int ret = 0; 930 931 if (le64_to_cpu(jset->magic) != jset_magic(c)) 932 return JOURNAL_ENTRY_NONE; 933 934 version = le32_to_cpu(jset->version); 935 if (journal_entry_err_on(!bch2_version_compatible(version), 936 c, version, jset, NULL, 937 jset_unsupported_version, 938 "%s sector %llu seq %llu: unknown journal entry version %u.%u", 939 ca ? ca->name : c->name, 940 sector, le64_to_cpu(jset->seq), 941 BCH_VERSION_MAJOR(version), 942 BCH_VERSION_MINOR(version))) { 943 /* don't try to continue: */ 944 return -EINVAL; 945 } 946 947 if (bytes > (sectors_read << 9) && 948 sectors_read < bucket_sectors_left) 949 return JOURNAL_ENTRY_REREAD; 950 951 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 952 c, version, jset, NULL, 953 jset_past_bucket_end, 954 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 955 ca ? ca->name : c->name, 956 sector, le64_to_cpu(jset->seq), bytes)) 957 le32_add_cpu(&jset->u64s, 958 -((bytes - (bucket_sectors_left << 9)) / 8)); 959 fsck_err: 960 return ret; 961 } 962 963 struct journal_read_buf { 964 void *data; 965 size_t size; 966 }; 967 968 static int journal_read_buf_realloc(struct journal_read_buf *b, 969 size_t new_size) 970 { 971 void *n; 972 973 /* the bios are sized for this many pages, max: */ 974 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 975 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 976 977 new_size = roundup_pow_of_two(new_size); 978 n = kvmalloc(new_size, GFP_KERNEL); 979 if (!n) 980 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 981 982 kvfree(b->data); 983 b->data = n; 984 b->size = new_size; 985 return 0; 986 } 987 988 static int journal_read_bucket(struct bch_dev *ca, 989 struct journal_read_buf *buf, 990 struct journal_list *jlist, 991 unsigned bucket) 992 { 993 struct bch_fs *c = ca->fs; 994 struct journal_device *ja = &ca->journal; 995 struct jset *j = NULL; 996 unsigned sectors, sectors_read = 0; 997 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 998 end = offset + ca->mi.bucket_size; 999 bool saw_bad = false, csum_good; 1000 struct printbuf err = PRINTBUF; 1001 int ret = 0; 1002 1003 pr_debug("reading %u", bucket); 1004 1005 while (offset < end) { 1006 if (!sectors_read) { 1007 struct bio *bio; 1008 unsigned nr_bvecs; 1009 reread: 1010 sectors_read = min_t(unsigned, 1011 end - offset, buf->size >> 9); 1012 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 1013 1014 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 1015 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 1016 1017 bio->bi_iter.bi_sector = offset; 1018 bch2_bio_map(bio, buf->data, sectors_read << 9); 1019 1020 ret = submit_bio_wait(bio); 1021 kfree(bio); 1022 1023 if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, 1024 "journal read error: sector %llu", 1025 offset) || 1026 bch2_meta_read_fault("journal")) { 1027 /* 1028 * We don't error out of the recovery process 1029 * here, since the relevant journal entry may be 1030 * found on a different device, and missing or 1031 * no journal entries will be handled later 1032 */ 1033 goto out; 1034 } 1035 1036 j = buf->data; 1037 } 1038 1039 ret = jset_validate_early(c, ca, j, offset, 1040 end - offset, sectors_read); 1041 switch (ret) { 1042 case 0: 1043 sectors = vstruct_sectors(j, c->block_bits); 1044 break; 1045 case JOURNAL_ENTRY_REREAD: 1046 if (vstruct_bytes(j) > buf->size) { 1047 ret = journal_read_buf_realloc(buf, 1048 vstruct_bytes(j)); 1049 if (ret) 1050 goto err; 1051 } 1052 goto reread; 1053 case JOURNAL_ENTRY_NONE: 1054 if (!saw_bad) 1055 goto out; 1056 /* 1057 * On checksum error we don't really trust the size 1058 * field of the journal entry we read, so try reading 1059 * again at next block boundary: 1060 */ 1061 sectors = block_sectors(c); 1062 goto next_block; 1063 default: 1064 goto err; 1065 } 1066 1067 if (le64_to_cpu(j->seq) > ja->highest_seq_found) { 1068 ja->highest_seq_found = le64_to_cpu(j->seq); 1069 ja->cur_idx = bucket; 1070 ja->sectors_free = ca->mi.bucket_size - 1071 bucket_remainder(ca, offset) - sectors; 1072 } 1073 1074 /* 1075 * This happens sometimes if we don't have discards on - 1076 * when we've partially overwritten a bucket with new 1077 * journal entries. We don't need the rest of the 1078 * bucket: 1079 */ 1080 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 1081 goto out; 1082 1083 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 1084 1085 enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); 1086 struct bch_csum csum; 1087 csum_good = jset_csum_good(c, j, &csum); 1088 1089 if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, 1090 "%s", 1091 (printbuf_reset(&err), 1092 prt_str(&err, "journal "), 1093 bch2_csum_err_msg(&err, csum_type, j->csum, csum), 1094 err.buf))) 1095 saw_bad = true; 1096 1097 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 1098 j->encrypted_start, 1099 vstruct_end(j) - (void *) j->encrypted_start); 1100 bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); 1101 1102 mutex_lock(&jlist->lock); 1103 ret = journal_entry_add(c, ca, (struct journal_ptr) { 1104 .csum_good = csum_good, 1105 .dev = ca->dev_idx, 1106 .bucket = bucket, 1107 .bucket_offset = offset - 1108 bucket_to_sector(ca, ja->buckets[bucket]), 1109 .sector = offset, 1110 }, jlist, j); 1111 mutex_unlock(&jlist->lock); 1112 1113 switch (ret) { 1114 case JOURNAL_ENTRY_ADD_OK: 1115 break; 1116 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 1117 break; 1118 default: 1119 goto err; 1120 } 1121 next_block: 1122 pr_debug("next"); 1123 offset += sectors; 1124 sectors_read -= sectors; 1125 j = ((void *) j) + (sectors << 9); 1126 } 1127 1128 out: 1129 ret = 0; 1130 err: 1131 printbuf_exit(&err); 1132 return ret; 1133 } 1134 1135 static CLOSURE_CALLBACK(bch2_journal_read_device) 1136 { 1137 closure_type(ja, struct journal_device, read); 1138 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1139 struct bch_fs *c = ca->fs; 1140 struct journal_list *jlist = 1141 container_of(cl->parent, struct journal_list, cl); 1142 struct journal_read_buf buf = { NULL, 0 }; 1143 unsigned i; 1144 int ret = 0; 1145 1146 if (!ja->nr) 1147 goto out; 1148 1149 ret = journal_read_buf_realloc(&buf, PAGE_SIZE); 1150 if (ret) 1151 goto err; 1152 1153 pr_debug("%u journal buckets", ja->nr); 1154 1155 for (i = 0; i < ja->nr; i++) { 1156 ret = journal_read_bucket(ca, &buf, jlist, i); 1157 if (ret) 1158 goto err; 1159 } 1160 1161 /* 1162 * Set dirty_idx to indicate the entire journal is full and needs to be 1163 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1164 * pinned when it first runs: 1165 */ 1166 ja->discard_idx = ja->dirty_idx_ondisk = 1167 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1168 out: 1169 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1170 kvfree(buf.data); 1171 percpu_ref_put(&ca->io_ref); 1172 closure_return(cl); 1173 return; 1174 err: 1175 mutex_lock(&jlist->lock); 1176 jlist->ret = ret; 1177 mutex_unlock(&jlist->lock); 1178 goto out; 1179 } 1180 1181 int bch2_journal_read(struct bch_fs *c, 1182 u64 *last_seq, 1183 u64 *blacklist_seq, 1184 u64 *start_seq) 1185 { 1186 struct journal_list jlist; 1187 struct journal_replay *i, **_i, *prev = NULL; 1188 struct genradix_iter radix_iter; 1189 struct printbuf buf = PRINTBUF; 1190 bool degraded = false, last_write_torn = false; 1191 u64 seq; 1192 int ret = 0; 1193 1194 closure_init_stack(&jlist.cl); 1195 mutex_init(&jlist.lock); 1196 jlist.last_seq = 0; 1197 jlist.ret = 0; 1198 1199 for_each_member_device(c, ca) { 1200 if (!c->opts.fsck && 1201 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1202 continue; 1203 1204 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1205 ca->mi.state == BCH_MEMBER_STATE_ro) && 1206 percpu_ref_tryget(&ca->io_ref)) 1207 closure_call(&ca->journal.read, 1208 bch2_journal_read_device, 1209 system_unbound_wq, 1210 &jlist.cl); 1211 else 1212 degraded = true; 1213 } 1214 1215 closure_sync(&jlist.cl); 1216 1217 if (jlist.ret) 1218 return jlist.ret; 1219 1220 *last_seq = 0; 1221 *start_seq = 0; 1222 *blacklist_seq = 0; 1223 1224 /* 1225 * Find most recent flush entry, and ignore newer non flush entries - 1226 * those entries will be blacklisted: 1227 */ 1228 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1229 enum bch_validate_flags flags = BCH_VALIDATE_journal; 1230 1231 i = *_i; 1232 1233 if (journal_replay_ignore(i)) 1234 continue; 1235 1236 if (!*start_seq) 1237 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1238 1239 if (JSET_NO_FLUSH(&i->j)) { 1240 i->ignore_blacklisted = true; 1241 continue; 1242 } 1243 1244 if (!last_write_torn && !i->csum_good) { 1245 last_write_torn = true; 1246 i->ignore_blacklisted = true; 1247 continue; 1248 } 1249 1250 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1251 c, le32_to_cpu(i->j.version), &i->j, NULL, 1252 jset_last_seq_newer_than_seq, 1253 "invalid journal entry: last_seq > seq (%llu > %llu)", 1254 le64_to_cpu(i->j.last_seq), 1255 le64_to_cpu(i->j.seq))) 1256 i->j.last_seq = i->j.seq; 1257 1258 *last_seq = le64_to_cpu(i->j.last_seq); 1259 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1260 break; 1261 } 1262 1263 if (!*start_seq) { 1264 bch_info(c, "journal read done, but no entries found"); 1265 return 0; 1266 } 1267 1268 if (!*last_seq) { 1269 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, 1270 "journal read done, but no entries found after dropping non-flushes"); 1271 return 0; 1272 } 1273 1274 bch_info(c, "journal read done, replaying entries %llu-%llu", 1275 *last_seq, *blacklist_seq - 1); 1276 1277 if (*start_seq != *blacklist_seq) 1278 bch_info(c, "dropped unflushed entries %llu-%llu", 1279 *blacklist_seq, *start_seq - 1); 1280 1281 /* Drop blacklisted entries and entries older than last_seq: */ 1282 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1283 i = *_i; 1284 1285 if (journal_replay_ignore(i)) 1286 continue; 1287 1288 seq = le64_to_cpu(i->j.seq); 1289 if (seq < *last_seq) { 1290 journal_replay_free(c, i, false); 1291 continue; 1292 } 1293 1294 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1295 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1296 jset_seq_blacklisted, 1297 "found blacklisted journal entry %llu", seq); 1298 i->ignore_blacklisted = true; 1299 } 1300 } 1301 1302 /* Check for missing entries: */ 1303 seq = *last_seq; 1304 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1305 i = *_i; 1306 1307 if (journal_replay_ignore(i)) 1308 continue; 1309 1310 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1311 1312 while (seq < le64_to_cpu(i->j.seq)) { 1313 u64 missing_start, missing_end; 1314 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 1315 1316 while (seq < le64_to_cpu(i->j.seq) && 1317 bch2_journal_seq_is_blacklisted(c, seq, false)) 1318 seq++; 1319 1320 if (seq == le64_to_cpu(i->j.seq)) 1321 break; 1322 1323 missing_start = seq; 1324 1325 while (seq < le64_to_cpu(i->j.seq) && 1326 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1327 seq++; 1328 1329 if (prev) { 1330 bch2_journal_ptrs_to_text(&buf1, c, prev); 1331 prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1332 } else 1333 prt_printf(&buf1, "(none)"); 1334 bch2_journal_ptrs_to_text(&buf2, c, i); 1335 1336 missing_end = seq - 1; 1337 fsck_err(c, journal_entries_missing, 1338 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" 1339 " prev at %s\n" 1340 " next at %s, continue?", 1341 missing_start, missing_end, 1342 *last_seq, *blacklist_seq - 1, 1343 buf1.buf, buf2.buf); 1344 1345 printbuf_exit(&buf1); 1346 printbuf_exit(&buf2); 1347 } 1348 1349 prev = i; 1350 seq++; 1351 } 1352 1353 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1354 struct bch_replicas_padded replicas = { 1355 .e.data_type = BCH_DATA_journal, 1356 .e.nr_devs = 0, 1357 .e.nr_required = 1, 1358 }; 1359 1360 i = *_i; 1361 if (journal_replay_ignore(i)) 1362 continue; 1363 1364 darray_for_each(i->ptrs, ptr) { 1365 struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); 1366 1367 if (!ptr->csum_good) 1368 bch_err_dev_offset(ca, ptr->sector, 1369 "invalid journal checksum, seq %llu%s", 1370 le64_to_cpu(i->j.seq), 1371 i->csum_good ? " (had good copy on another device)" : ""); 1372 } 1373 1374 ret = jset_validate(c, 1375 bch2_dev_have_ref(c, i->ptrs.data[0].dev), 1376 &i->j, 1377 i->ptrs.data[0].sector, 1378 READ); 1379 if (ret) 1380 goto err; 1381 1382 darray_for_each(i->ptrs, ptr) 1383 replicas_entry_add_dev(&replicas.e, ptr->dev); 1384 1385 bch2_replicas_entry_sort(&replicas.e); 1386 1387 printbuf_reset(&buf); 1388 bch2_replicas_entry_to_text(&buf, &replicas.e); 1389 1390 if (!degraded && 1391 !bch2_replicas_marked(c, &replicas.e) && 1392 (le64_to_cpu(i->j.seq) == *last_seq || 1393 fsck_err(c, journal_entry_replicas_not_marked, 1394 "superblock not marked as containing replicas for journal entry %llu\n %s", 1395 le64_to_cpu(i->j.seq), buf.buf))) { 1396 ret = bch2_mark_replicas(c, &replicas.e); 1397 if (ret) 1398 goto err; 1399 } 1400 } 1401 err: 1402 fsck_err: 1403 printbuf_exit(&buf); 1404 return ret; 1405 } 1406 1407 /* journal write: */ 1408 1409 static void __journal_write_alloc(struct journal *j, 1410 struct journal_buf *w, 1411 struct dev_alloc_list *devs_sorted, 1412 unsigned sectors, 1413 unsigned *replicas, 1414 unsigned replicas_want) 1415 { 1416 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1417 struct journal_device *ja; 1418 struct bch_dev *ca; 1419 unsigned i; 1420 1421 if (*replicas >= replicas_want) 1422 return; 1423 1424 for (i = 0; i < devs_sorted->nr; i++) { 1425 ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); 1426 if (!ca) 1427 continue; 1428 1429 ja = &ca->journal; 1430 1431 /* 1432 * Check that we can use this device, and aren't already using 1433 * it: 1434 */ 1435 if (!ca->mi.durability || 1436 ca->mi.state != BCH_MEMBER_STATE_rw || 1437 !ja->nr || 1438 bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1439 sectors > ja->sectors_free) 1440 continue; 1441 1442 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1443 1444 bch2_bkey_append_ptr(&w->key, 1445 (struct bch_extent_ptr) { 1446 .offset = bucket_to_sector(ca, 1447 ja->buckets[ja->cur_idx]) + 1448 ca->mi.bucket_size - 1449 ja->sectors_free, 1450 .dev = ca->dev_idx, 1451 }); 1452 1453 ja->sectors_free -= sectors; 1454 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1455 1456 *replicas += ca->mi.durability; 1457 1458 if (*replicas >= replicas_want) 1459 break; 1460 } 1461 } 1462 1463 /** 1464 * journal_write_alloc - decide where to write next journal entry 1465 * 1466 * @j: journal object 1467 * @w: journal buf (entry to be written) 1468 * 1469 * Returns: 0 on success, or -EROFS on failure 1470 */ 1471 static int journal_write_alloc(struct journal *j, struct journal_buf *w) 1472 { 1473 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1474 struct bch_devs_mask devs; 1475 struct journal_device *ja; 1476 struct bch_dev *ca; 1477 struct dev_alloc_list devs_sorted; 1478 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1479 unsigned target = c->opts.metadata_target ?: 1480 c->opts.foreground_target; 1481 unsigned i, replicas = 0, replicas_want = 1482 READ_ONCE(c->opts.metadata_replicas); 1483 unsigned replicas_need = min_t(unsigned, replicas_want, 1484 READ_ONCE(c->opts.metadata_replicas_required)); 1485 1486 rcu_read_lock(); 1487 retry: 1488 devs = target_rw_devs(c, BCH_DATA_journal, target); 1489 1490 devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); 1491 1492 __journal_write_alloc(j, w, &devs_sorted, 1493 sectors, &replicas, replicas_want); 1494 1495 if (replicas >= replicas_want) 1496 goto done; 1497 1498 for (i = 0; i < devs_sorted.nr; i++) { 1499 ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); 1500 if (!ca) 1501 continue; 1502 1503 ja = &ca->journal; 1504 1505 if (sectors > ja->sectors_free && 1506 sectors <= ca->mi.bucket_size && 1507 bch2_journal_dev_buckets_available(j, ja, 1508 journal_space_discarded)) { 1509 ja->cur_idx = (ja->cur_idx + 1) % ja->nr; 1510 ja->sectors_free = ca->mi.bucket_size; 1511 1512 /* 1513 * ja->bucket_seq[ja->cur_idx] must always have 1514 * something sensible: 1515 */ 1516 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1517 } 1518 } 1519 1520 __journal_write_alloc(j, w, &devs_sorted, 1521 sectors, &replicas, replicas_want); 1522 1523 if (replicas < replicas_want && target) { 1524 /* Retry from all devices: */ 1525 target = 0; 1526 goto retry; 1527 } 1528 done: 1529 rcu_read_unlock(); 1530 1531 BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); 1532 1533 return replicas >= replicas_need ? 0 : -EROFS; 1534 } 1535 1536 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1537 { 1538 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1539 1540 /* we aren't holding j->lock: */ 1541 unsigned new_size = READ_ONCE(j->buf_size_want); 1542 void *new_buf; 1543 1544 if (buf->buf_size >= new_size) 1545 return; 1546 1547 size_t btree_write_buffer_size = new_size / 64; 1548 1549 if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) 1550 return; 1551 1552 new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1553 if (!new_buf) 1554 return; 1555 1556 memcpy(new_buf, buf->data, buf->buf_size); 1557 1558 spin_lock(&j->lock); 1559 swap(buf->data, new_buf); 1560 swap(buf->buf_size, new_size); 1561 spin_unlock(&j->lock); 1562 1563 kvfree(new_buf); 1564 } 1565 1566 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) 1567 { 1568 return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); 1569 } 1570 1571 static CLOSURE_CALLBACK(journal_write_done) 1572 { 1573 closure_type(w, struct journal_buf, io); 1574 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1575 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1576 struct bch_replicas_padded replicas; 1577 union journal_res_state old, new; 1578 u64 seq = le64_to_cpu(w->data->seq); 1579 int err = 0; 1580 1581 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) 1582 ? j->flush_write_time 1583 : j->noflush_write_time, j->write_start_time); 1584 1585 if (!w->devs_written.nr) { 1586 bch_err(c, "unable to write journal to sufficient devices"); 1587 err = -EIO; 1588 } else { 1589 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1590 w->devs_written); 1591 if (bch2_mark_replicas(c, &replicas.e)) 1592 err = -EIO; 1593 } 1594 1595 if (err) 1596 bch2_fatal_error(c); 1597 1598 closure_debug_destroy(cl); 1599 1600 spin_lock(&j->lock); 1601 if (seq >= j->pin.front) 1602 journal_seq_pin(j, seq)->devs = w->devs_written; 1603 if (err && (!j->err_seq || seq < j->err_seq)) 1604 j->err_seq = seq; 1605 w->write_done = true; 1606 1607 bool completed = false; 1608 1609 for (seq = journal_last_unwritten_seq(j); 1610 seq <= journal_cur_seq(j); 1611 seq++) { 1612 w = j->buf + (seq & JOURNAL_BUF_MASK); 1613 if (!w->write_done) 1614 break; 1615 1616 if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { 1617 j->flushed_seq_ondisk = seq; 1618 j->last_seq_ondisk = w->last_seq; 1619 1620 bch2_do_discards(c); 1621 closure_wake_up(&c->freelist_wait); 1622 bch2_reset_alloc_cursors(c); 1623 } 1624 1625 j->seq_ondisk = seq; 1626 1627 /* 1628 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1629 * more buckets: 1630 * 1631 * Must come before signaling write completion, for 1632 * bch2_fs_journal_stop(): 1633 */ 1634 if (j->watermark != BCH_WATERMARK_stripe) 1635 journal_reclaim_kick(&c->journal); 1636 1637 old.v = atomic64_read(&j->reservations.counter); 1638 do { 1639 new.v = old.v; 1640 BUG_ON(journal_state_count(new, new.unwritten_idx)); 1641 BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); 1642 1643 new.unwritten_idx++; 1644 } while (!atomic64_try_cmpxchg(&j->reservations.counter, 1645 &old.v, new.v)); 1646 1647 closure_wake_up(&w->wait); 1648 completed = true; 1649 } 1650 1651 if (completed) { 1652 bch2_journal_reclaim_fast(j); 1653 bch2_journal_space_available(j); 1654 1655 track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); 1656 1657 journal_wake(j); 1658 } 1659 1660 if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1661 new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1662 struct journal_buf *buf = journal_cur_buf(j); 1663 long delta = buf->expires - jiffies; 1664 1665 /* 1666 * We don't close a journal entry to write it while there's 1667 * previous entries still in flight - the current journal entry 1668 * might want to be written now: 1669 */ 1670 mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); 1671 } 1672 1673 /* 1674 * We don't typically trigger journal writes from her - the next journal 1675 * write will be triggered immediately after the previous one is 1676 * allocated, in bch2_journal_write() - but the journal write error path 1677 * is special: 1678 */ 1679 bch2_journal_do_writes(j); 1680 spin_unlock(&j->lock); 1681 } 1682 1683 static void journal_write_endio(struct bio *bio) 1684 { 1685 struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); 1686 struct bch_dev *ca = jbio->ca; 1687 struct journal *j = &ca->fs->journal; 1688 struct journal_buf *w = j->buf + jbio->buf_idx; 1689 1690 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, 1691 "error writing journal entry %llu: %s", 1692 le64_to_cpu(w->data->seq), 1693 bch2_blk_status_to_str(bio->bi_status)) || 1694 bch2_meta_write_fault("journal")) { 1695 unsigned long flags; 1696 1697 spin_lock_irqsave(&j->err_lock, flags); 1698 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); 1699 spin_unlock_irqrestore(&j->err_lock, flags); 1700 } 1701 1702 closure_put(&w->io); 1703 percpu_ref_put(&ca->io_ref); 1704 } 1705 1706 static CLOSURE_CALLBACK(journal_write_submit) 1707 { 1708 closure_type(w, struct journal_buf, io); 1709 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1710 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1711 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1712 1713 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { 1714 struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); 1715 if (!ca) { 1716 /* XXX: fix this */ 1717 bch_err(c, "missing device for journal write\n"); 1718 continue; 1719 } 1720 1721 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], 1722 sectors); 1723 1724 struct journal_device *ja = &ca->journal; 1725 struct bio *bio = &ja->bio[w->idx]->bio; 1726 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1727 bio->bi_iter.bi_sector = ptr->offset; 1728 bio->bi_end_io = journal_write_endio; 1729 bio->bi_private = ca; 1730 1731 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1732 ca->prev_journal_sector = bio->bi_iter.bi_sector; 1733 1734 if (!JSET_NO_FLUSH(w->data)) 1735 bio->bi_opf |= REQ_FUA; 1736 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) 1737 bio->bi_opf |= REQ_PREFLUSH; 1738 1739 bch2_bio_map(bio, w->data, sectors << 9); 1740 1741 trace_and_count(c, journal_write, bio); 1742 closure_bio_submit(bio, cl); 1743 1744 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1745 } 1746 1747 continue_at(cl, journal_write_done, j->wq); 1748 } 1749 1750 static CLOSURE_CALLBACK(journal_write_preflush) 1751 { 1752 closure_type(w, struct journal_buf, io); 1753 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1754 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1755 1756 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1757 spin_lock(&j->lock); 1758 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1759 closure_wait(&j->async_wait, cl); 1760 spin_unlock(&j->lock); 1761 continue_at(cl, journal_write_preflush, j->wq); 1762 return; 1763 } 1764 spin_unlock(&j->lock); 1765 } 1766 1767 if (w->separate_flush) { 1768 for_each_rw_member(c, ca) { 1769 percpu_ref_get(&ca->io_ref); 1770 1771 struct journal_device *ja = &ca->journal; 1772 struct bio *bio = &ja->bio[w->idx]->bio; 1773 bio_reset(bio, ca->disk_sb.bdev, 1774 REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); 1775 bio->bi_end_io = journal_write_endio; 1776 bio->bi_private = ca; 1777 closure_bio_submit(bio, cl); 1778 } 1779 1780 continue_at(cl, journal_write_submit, j->wq); 1781 } else { 1782 /* 1783 * no need to punt to another work item if we're not waiting on 1784 * preflushes 1785 */ 1786 journal_write_submit(&cl->work); 1787 } 1788 } 1789 1790 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) 1791 { 1792 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1793 struct jset_entry *start, *end; 1794 struct jset *jset = w->data; 1795 struct journal_keys_to_wb wb = { NULL }; 1796 unsigned sectors, bytes, u64s; 1797 unsigned long btree_roots_have = 0; 1798 bool validate_before_checksum = false; 1799 u64 seq = le64_to_cpu(jset->seq); 1800 int ret; 1801 1802 /* 1803 * Simple compaction, dropping empty jset_entries (from journal 1804 * reservations that weren't fully used) and merging jset_entries that 1805 * can be. 1806 * 1807 * If we wanted to be really fancy here, we could sort all the keys in 1808 * the jset and drop keys that were overwritten - probably not worth it: 1809 */ 1810 vstruct_for_each(jset, i) { 1811 unsigned u64s = le16_to_cpu(i->u64s); 1812 1813 /* Empty entry: */ 1814 if (!u64s) 1815 continue; 1816 1817 /* 1818 * New btree roots are set by journalling them; when the journal 1819 * entry gets written we have to propagate them to 1820 * c->btree_roots 1821 * 1822 * But, every journal entry we write has to contain all the 1823 * btree roots (at least for now); so after we copy btree roots 1824 * to c->btree_roots we have to get any missing btree roots and 1825 * add them to this journal entry: 1826 */ 1827 switch (i->type) { 1828 case BCH_JSET_ENTRY_btree_root: 1829 bch2_journal_entry_to_btree_root(c, i); 1830 __set_bit(i->btree_id, &btree_roots_have); 1831 break; 1832 case BCH_JSET_ENTRY_write_buffer_keys: 1833 EBUG_ON(!w->need_flush_to_write_buffer); 1834 1835 if (!wb.wb) 1836 bch2_journal_keys_to_write_buffer_start(c, &wb, seq); 1837 1838 jset_entry_for_each_key(i, k) { 1839 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); 1840 if (ret) { 1841 bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", 1842 bch2_err_str(ret)); 1843 bch2_journal_keys_to_write_buffer_end(c, &wb); 1844 return ret; 1845 } 1846 } 1847 i->type = BCH_JSET_ENTRY_btree_keys; 1848 break; 1849 } 1850 } 1851 1852 if (wb.wb) { 1853 ret = bch2_journal_keys_to_write_buffer_end(c, &wb); 1854 if (ret) { 1855 bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", 1856 bch2_err_str(ret)); 1857 return ret; 1858 } 1859 } 1860 1861 spin_lock(&c->journal.lock); 1862 w->need_flush_to_write_buffer = false; 1863 spin_unlock(&c->journal.lock); 1864 1865 start = end = vstruct_last(jset); 1866 1867 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1868 1869 struct jset_entry_datetime *d = 1870 container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); 1871 d->entry.type = BCH_JSET_ENTRY_datetime; 1872 d->seconds = cpu_to_le64(ktime_get_real_seconds()); 1873 1874 bch2_journal_super_entries_add_common(c, &end, seq); 1875 u64s = (u64 *) end - (u64 *) start; 1876 1877 WARN_ON(u64s > j->entry_u64s_reserved); 1878 1879 le32_add_cpu(&jset->u64s, u64s); 1880 1881 sectors = vstruct_sectors(jset, c->block_bits); 1882 bytes = vstruct_bytes(jset); 1883 1884 if (sectors > w->sectors) { 1885 bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 1886 vstruct_bytes(jset), w->sectors << 9, 1887 u64s, w->u64s_reserved, j->entry_u64s_reserved); 1888 return -EINVAL; 1889 } 1890 1891 jset->magic = cpu_to_le64(jset_magic(c)); 1892 jset->version = cpu_to_le32(c->sb.version); 1893 1894 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 1895 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1896 1897 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1898 j->last_empty_seq = seq; 1899 1900 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1901 validate_before_checksum = true; 1902 1903 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 1904 validate_before_checksum = true; 1905 1906 if (validate_before_checksum && 1907 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1908 return ret; 1909 1910 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), 1911 jset->encrypted_start, 1912 vstruct_end(jset) - (void *) jset->encrypted_start); 1913 if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) 1914 return ret; 1915 1916 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), 1917 journal_nonce(jset), jset); 1918 1919 if (!validate_before_checksum && 1920 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1921 return ret; 1922 1923 memset((void *) jset + bytes, 0, (sectors << 9) - bytes); 1924 return 0; 1925 } 1926 1927 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) 1928 { 1929 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1930 int error = bch2_journal_error(j); 1931 1932 /* 1933 * If the journal is in an error state - we did an emergency shutdown - 1934 * we prefer to continue doing journal writes. We just mark them as 1935 * noflush so they'll never be used, but they'll still be visible by the 1936 * list_journal tool - this helps in debugging. 1937 * 1938 * There's a caveat: the first journal write after marking the 1939 * superblock dirty must always be a flush write, because on startup 1940 * from a clean shutdown we didn't necessarily read the journal and the 1941 * new journal write might overwrite whatever was in the journal 1942 * previously - we can't leave the journal without any flush writes in 1943 * it. 1944 * 1945 * So if we're in an error state, and we're still starting up, we don't 1946 * write anything at all. 1947 */ 1948 if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) 1949 return -EIO; 1950 1951 if (error || 1952 w->noflush || 1953 (!w->must_flush && 1954 time_before(jiffies, j->last_flush_write + 1955 msecs_to_jiffies(c->opts.journal_flush_delay)) && 1956 test_bit(JOURNAL_may_skip_flush, &j->flags))) { 1957 w->noflush = true; 1958 SET_JSET_NO_FLUSH(w->data, true); 1959 w->data->last_seq = 0; 1960 w->last_seq = 0; 1961 1962 j->nr_noflush_writes++; 1963 } else { 1964 w->must_flush = true; 1965 j->last_flush_write = jiffies; 1966 j->nr_flush_writes++; 1967 clear_bit(JOURNAL_need_flush_write, &j->flags); 1968 } 1969 1970 return 0; 1971 } 1972 1973 CLOSURE_CALLBACK(bch2_journal_write) 1974 { 1975 closure_type(w, struct journal_buf, io); 1976 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1977 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1978 struct bch_replicas_padded replicas; 1979 unsigned nr_rw_members = 0; 1980 int ret; 1981 1982 for_each_rw_member(c, ca) 1983 nr_rw_members++; 1984 1985 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 1986 BUG_ON(!w->write_started); 1987 BUG_ON(w->write_allocated); 1988 BUG_ON(w->write_done); 1989 1990 j->write_start_time = local_clock(); 1991 1992 spin_lock(&j->lock); 1993 if (nr_rw_members > 1) 1994 w->separate_flush = true; 1995 1996 ret = bch2_journal_write_pick_flush(j, w); 1997 spin_unlock(&j->lock); 1998 if (ret) 1999 goto err; 2000 2001 mutex_lock(&j->buf_lock); 2002 journal_buf_realloc(j, w); 2003 2004 ret = bch2_journal_write_prep(j, w); 2005 mutex_unlock(&j->buf_lock); 2006 if (ret) 2007 goto err; 2008 2009 j->entry_bytes_written += vstruct_bytes(w->data); 2010 2011 while (1) { 2012 spin_lock(&j->lock); 2013 ret = journal_write_alloc(j, w); 2014 if (!ret || !j->can_discard) 2015 break; 2016 2017 spin_unlock(&j->lock); 2018 bch2_journal_do_discards(j); 2019 } 2020 2021 if (ret) { 2022 struct printbuf buf = PRINTBUF; 2023 buf.atomic++; 2024 2025 prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"), 2026 le64_to_cpu(w->data->seq), 2027 bch2_err_str(ret)); 2028 __bch2_journal_debug_to_text(&buf, j); 2029 spin_unlock(&j->lock); 2030 bch2_print_string_as_lines(KERN_ERR, buf.buf); 2031 printbuf_exit(&buf); 2032 goto err; 2033 } 2034 2035 /* 2036 * write is allocated, no longer need to account for it in 2037 * bch2_journal_space_available(): 2038 */ 2039 w->sectors = 0; 2040 w->write_allocated = true; 2041 2042 /* 2043 * journal entry has been compacted and allocated, recalculate space 2044 * available: 2045 */ 2046 bch2_journal_space_available(j); 2047 bch2_journal_do_writes(j); 2048 spin_unlock(&j->lock); 2049 2050 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); 2051 2052 if (c->opts.nochanges) 2053 goto no_io; 2054 2055 /* 2056 * Mark journal replicas before we submit the write to guarantee 2057 * recovery will find the journal entries after a crash. 2058 */ 2059 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 2060 w->devs_written); 2061 ret = bch2_mark_replicas(c, &replicas.e); 2062 if (ret) 2063 goto err; 2064 2065 if (!JSET_NO_FLUSH(w->data)) 2066 continue_at(cl, journal_write_preflush, j->wq); 2067 else 2068 continue_at(cl, journal_write_submit, j->wq); 2069 return; 2070 no_io: 2071 continue_at(cl, journal_write_done, j->wq); 2072 return; 2073 err: 2074 bch2_fatal_error(c); 2075 continue_at(cl, journal_write_done, j->wq); 2076 } 2077