1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "btree_io.h" 6 #include "btree_update_interior.h" 7 #include "btree_write_buffer.h" 8 #include "buckets.h" 9 #include "checksum.h" 10 #include "disk_groups.h" 11 #include "error.h" 12 #include "journal.h" 13 #include "journal_io.h" 14 #include "journal_reclaim.h" 15 #include "journal_seq_blacklist.h" 16 #include "replicas.h" 17 #include "sb-clean.h" 18 #include "trace.h" 19 20 void bch2_journal_pos_from_member_info_set(struct bch_fs *c) 21 { 22 lockdep_assert_held(&c->sb_lock); 23 24 for_each_member_device(c, ca) { 25 struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); 26 27 m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); 28 m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); 29 } 30 } 31 32 void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) 33 { 34 mutex_lock(&c->sb_lock); 35 for_each_member_device(c, ca) { 36 struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); 37 38 unsigned idx = le32_to_cpu(m.last_journal_bucket); 39 if (idx < ca->journal.nr) 40 ca->journal.cur_idx = idx; 41 unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); 42 if (offset <= ca->mi.bucket_size) 43 ca->journal.sectors_free = ca->mi.bucket_size - offset; 44 } 45 mutex_unlock(&c->sb_lock); 46 } 47 48 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 49 struct journal_replay *j) 50 { 51 darray_for_each(j->ptrs, i) { 52 if (i != j->ptrs.data) 53 prt_printf(out, " "); 54 prt_printf(out, "%u:%u:%u (sector %llu)", 55 i->dev, i->bucket, i->bucket_offset, i->sector); 56 } 57 } 58 59 static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, 60 struct journal_replay *j) 61 { 62 prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); 63 64 bch2_journal_ptrs_to_text(out, c, j); 65 66 for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { 67 struct jset_entry_datetime *datetime = 68 container_of(entry, struct jset_entry_datetime, entry); 69 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 70 break; 71 } 72 } 73 74 static struct nonce journal_nonce(const struct jset *jset) 75 { 76 return (struct nonce) {{ 77 [0] = 0, 78 [1] = ((__le32 *) &jset->seq)[0], 79 [2] = ((__le32 *) &jset->seq)[1], 80 [3] = BCH_NONCE_JOURNAL, 81 }}; 82 } 83 84 static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) 85 { 86 if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { 87 *csum = (struct bch_csum) {}; 88 return false; 89 } 90 91 *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); 92 return !bch2_crc_cmp(j->csum, *csum); 93 } 94 95 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) 96 { 97 return (seq - c->journal_entries_base_seq) & (~0U >> 1); 98 } 99 100 static void __journal_replay_free(struct bch_fs *c, 101 struct journal_replay *i) 102 { 103 struct journal_replay **p = 104 genradix_ptr(&c->journal_entries, 105 journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); 106 107 BUG_ON(*p != i); 108 *p = NULL; 109 kvfree(i); 110 } 111 112 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) 113 { 114 if (blacklisted) 115 i->ignore_blacklisted = true; 116 else 117 i->ignore_not_dirty = true; 118 119 if (!c->opts.read_entire_journal) 120 __journal_replay_free(c, i); 121 } 122 123 struct journal_list { 124 struct closure cl; 125 u64 last_seq; 126 struct mutex lock; 127 int ret; 128 }; 129 130 #define JOURNAL_ENTRY_ADD_OK 0 131 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 132 133 /* 134 * Given a journal entry we just read, add it to the list of journal entries to 135 * be replayed: 136 */ 137 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, 138 struct journal_ptr entry_ptr, 139 struct journal_list *jlist, struct jset *j) 140 { 141 struct genradix_iter iter; 142 struct journal_replay **_i, *i, *dup; 143 size_t bytes = vstruct_bytes(j); 144 u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; 145 struct printbuf buf = PRINTBUF; 146 int ret = JOURNAL_ENTRY_ADD_OK; 147 148 if (!c->journal.oldest_seq_found_ondisk || 149 le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) 150 c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); 151 152 /* Is this entry older than the range we need? */ 153 if (!c->opts.read_entire_journal && 154 le64_to_cpu(j->seq) < jlist->last_seq) 155 return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; 156 157 /* 158 * genradixes are indexed by a ulong, not a u64, so we can't index them 159 * by sequence number directly: Assume instead that they will all fall 160 * within the range of +-2billion of the filrst one we find. 161 */ 162 if (!c->journal_entries_base_seq) 163 c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); 164 165 /* Drop entries we don't need anymore */ 166 if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { 167 genradix_for_each_from(&c->journal_entries, iter, _i, 168 journal_entry_radix_idx(c, jlist->last_seq)) { 169 i = *_i; 170 171 if (journal_replay_ignore(i)) 172 continue; 173 174 if (le64_to_cpu(i->j.seq) >= last_seq) 175 break; 176 177 journal_replay_free(c, i, false); 178 } 179 } 180 181 jlist->last_seq = max(jlist->last_seq, last_seq); 182 183 _i = genradix_ptr_alloc(&c->journal_entries, 184 journal_entry_radix_idx(c, le64_to_cpu(j->seq)), 185 GFP_KERNEL); 186 if (!_i) 187 return -BCH_ERR_ENOMEM_journal_entry_add; 188 189 /* 190 * Duplicate journal entries? If so we want the one that didn't have a 191 * checksum error: 192 */ 193 dup = *_i; 194 if (dup) { 195 bool identical = bytes == vstruct_bytes(&dup->j) && 196 !memcmp(j, &dup->j, bytes); 197 bool not_identical = !identical && 198 entry_ptr.csum_good && 199 dup->csum_good; 200 201 bool same_device = false; 202 darray_for_each(dup->ptrs, ptr) 203 if (ptr->dev == ca->dev_idx) 204 same_device = true; 205 206 ret = darray_push(&dup->ptrs, entry_ptr); 207 if (ret) 208 goto out; 209 210 bch2_journal_replay_to_text(&buf, c, dup); 211 212 fsck_err_on(same_device, 213 c, journal_entry_dup_same_device, 214 "duplicate journal entry on same device\n %s", 215 buf.buf); 216 217 fsck_err_on(not_identical, 218 c, journal_entry_replicas_data_mismatch, 219 "found duplicate but non identical journal entries\n %s", 220 buf.buf); 221 222 if (entry_ptr.csum_good && !identical) 223 goto replace; 224 225 goto out; 226 } 227 replace: 228 i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 229 if (!i) 230 return -BCH_ERR_ENOMEM_journal_entry_add; 231 232 darray_init(&i->ptrs); 233 i->csum_good = entry_ptr.csum_good; 234 i->ignore_blacklisted = false; 235 i->ignore_not_dirty = false; 236 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 237 238 if (dup) { 239 /* The first ptr should represent the jset we kept: */ 240 darray_for_each(dup->ptrs, ptr) 241 darray_push(&i->ptrs, *ptr); 242 __journal_replay_free(c, dup); 243 } else { 244 darray_push(&i->ptrs, entry_ptr); 245 } 246 247 *_i = i; 248 out: 249 fsck_err: 250 printbuf_exit(&buf); 251 return ret; 252 } 253 254 /* this fills in a range with empty jset_entries: */ 255 static void journal_entry_null_range(void *start, void *end) 256 { 257 struct jset_entry *entry; 258 259 for (entry = start; entry != end; entry = vstruct_next(entry)) 260 memset(entry, 0, sizeof(*entry)); 261 } 262 263 #define JOURNAL_ENTRY_REREAD 5 264 #define JOURNAL_ENTRY_NONE 6 265 #define JOURNAL_ENTRY_BAD 7 266 267 static void journal_entry_err_msg(struct printbuf *out, 268 u32 version, 269 struct jset *jset, 270 struct jset_entry *entry) 271 { 272 prt_str(out, "invalid journal entry, version="); 273 bch2_version_to_text(out, version); 274 275 if (entry) { 276 prt_str(out, " type="); 277 bch2_prt_jset_entry_type(out, entry->type); 278 } 279 280 if (!jset) { 281 prt_printf(out, " in superblock"); 282 } else { 283 284 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); 285 286 if (entry) 287 prt_printf(out, " offset=%zi/%u", 288 (u64 *) entry - jset->_data, 289 le32_to_cpu(jset->u64s)); 290 } 291 292 prt_str(out, ": "); 293 } 294 295 #define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ 296 ({ \ 297 struct printbuf _buf = PRINTBUF; \ 298 \ 299 journal_entry_err_msg(&_buf, version, jset, entry); \ 300 prt_printf(&_buf, msg, ##__VA_ARGS__); \ 301 \ 302 switch (flags & BCH_VALIDATE_write) { \ 303 case READ: \ 304 mustfix_fsck_err(c, _err, "%s", _buf.buf); \ 305 break; \ 306 case WRITE: \ 307 bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ 308 bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ 309 if (bch2_fs_inconsistent(c)) { \ 310 ret = -BCH_ERR_fsck_errors_not_fixed; \ 311 goto fsck_err; \ 312 } \ 313 break; \ 314 } \ 315 \ 316 printbuf_exit(&_buf); \ 317 true; \ 318 }) 319 320 #define journal_entry_err_on(cond, ...) \ 321 ((cond) ? journal_entry_err(__VA_ARGS__) : false) 322 323 #define FSCK_DELETED_KEY 5 324 325 static int journal_validate_key(struct bch_fs *c, 326 struct jset *jset, 327 struct jset_entry *entry, 328 unsigned level, enum btree_id btree_id, 329 struct bkey_i *k, 330 unsigned version, int big_endian, 331 enum bch_validate_flags flags) 332 { 333 int write = flags & BCH_VALIDATE_write; 334 void *next = vstruct_next(entry); 335 int ret = 0; 336 337 if (journal_entry_err_on(!k->k.u64s, 338 c, version, jset, entry, 339 journal_entry_bkey_u64s_0, 340 "k->u64s 0")) { 341 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 342 journal_entry_null_range(vstruct_next(entry), next); 343 return FSCK_DELETED_KEY; 344 } 345 346 if (journal_entry_err_on((void *) bkey_next(k) > 347 (void *) vstruct_next(entry), 348 c, version, jset, entry, 349 journal_entry_bkey_past_end, 350 "extends past end of journal entry")) { 351 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 352 journal_entry_null_range(vstruct_next(entry), next); 353 return FSCK_DELETED_KEY; 354 } 355 356 if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, 357 c, version, jset, entry, 358 journal_entry_bkey_bad_format, 359 "bad format %u", k->k.format)) { 360 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 361 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 362 journal_entry_null_range(vstruct_next(entry), next); 363 return FSCK_DELETED_KEY; 364 } 365 366 if (!write) 367 bch2_bkey_compat(level, btree_id, version, big_endian, 368 write, NULL, bkey_to_packed(k)); 369 370 ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), 371 __btree_node_type(level, btree_id), write); 372 if (ret == -BCH_ERR_fsck_delete_bkey) { 373 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 374 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 375 journal_entry_null_range(vstruct_next(entry), next); 376 return FSCK_DELETED_KEY; 377 } 378 if (ret) 379 goto fsck_err; 380 381 if (write) 382 bch2_bkey_compat(level, btree_id, version, big_endian, 383 write, NULL, bkey_to_packed(k)); 384 fsck_err: 385 return ret; 386 } 387 388 static int journal_entry_btree_keys_validate(struct bch_fs *c, 389 struct jset *jset, 390 struct jset_entry *entry, 391 unsigned version, int big_endian, 392 enum bch_validate_flags flags) 393 { 394 struct bkey_i *k = entry->start; 395 396 while (k != vstruct_last(entry)) { 397 int ret = journal_validate_key(c, jset, entry, 398 entry->level, 399 entry->btree_id, 400 k, version, big_endian, 401 flags|BCH_VALIDATE_journal); 402 if (ret == FSCK_DELETED_KEY) 403 continue; 404 else if (ret) 405 return ret; 406 407 k = bkey_next(k); 408 } 409 410 return 0; 411 } 412 413 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 414 struct jset_entry *entry) 415 { 416 bool first = true; 417 418 jset_entry_for_each_key(entry, k) { 419 if (!first) { 420 prt_newline(out); 421 bch2_prt_jset_entry_type(out, entry->type); 422 prt_str(out, ": "); 423 } 424 prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); 425 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); 426 first = false; 427 } 428 } 429 430 static int journal_entry_btree_root_validate(struct bch_fs *c, 431 struct jset *jset, 432 struct jset_entry *entry, 433 unsigned version, int big_endian, 434 enum bch_validate_flags flags) 435 { 436 struct bkey_i *k = entry->start; 437 int ret = 0; 438 439 if (journal_entry_err_on(!entry->u64s || 440 le16_to_cpu(entry->u64s) != k->k.u64s, 441 c, version, jset, entry, 442 journal_entry_btree_root_bad_size, 443 "invalid btree root journal entry: wrong number of keys")) { 444 void *next = vstruct_next(entry); 445 /* 446 * we don't want to null out this jset_entry, 447 * just the contents, so that later we can tell 448 * we were _supposed_ to have a btree root 449 */ 450 entry->u64s = 0; 451 journal_entry_null_range(vstruct_next(entry), next); 452 return 0; 453 } 454 455 ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k, 456 version, big_endian, flags); 457 if (ret == FSCK_DELETED_KEY) 458 ret = 0; 459 fsck_err: 460 return ret; 461 } 462 463 static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, 464 struct jset_entry *entry) 465 { 466 journal_entry_btree_keys_to_text(out, c, entry); 467 } 468 469 static int journal_entry_prio_ptrs_validate(struct bch_fs *c, 470 struct jset *jset, 471 struct jset_entry *entry, 472 unsigned version, int big_endian, 473 enum bch_validate_flags flags) 474 { 475 /* obsolete, don't care: */ 476 return 0; 477 } 478 479 static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 480 struct jset_entry *entry) 481 { 482 } 483 484 static int journal_entry_blacklist_validate(struct bch_fs *c, 485 struct jset *jset, 486 struct jset_entry *entry, 487 unsigned version, int big_endian, 488 enum bch_validate_flags flags) 489 { 490 int ret = 0; 491 492 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, 493 c, version, jset, entry, 494 journal_entry_blacklist_bad_size, 495 "invalid journal seq blacklist entry: bad size")) { 496 journal_entry_null_range(entry, vstruct_next(entry)); 497 } 498 fsck_err: 499 return ret; 500 } 501 502 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, 503 struct jset_entry *entry) 504 { 505 struct jset_entry_blacklist *bl = 506 container_of(entry, struct jset_entry_blacklist, entry); 507 508 prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); 509 } 510 511 static int journal_entry_blacklist_v2_validate(struct bch_fs *c, 512 struct jset *jset, 513 struct jset_entry *entry, 514 unsigned version, int big_endian, 515 enum bch_validate_flags flags) 516 { 517 struct jset_entry_blacklist_v2 *bl_entry; 518 int ret = 0; 519 520 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, 521 c, version, jset, entry, 522 journal_entry_blacklist_v2_bad_size, 523 "invalid journal seq blacklist entry: bad size")) { 524 journal_entry_null_range(entry, vstruct_next(entry)); 525 goto out; 526 } 527 528 bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); 529 530 if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > 531 le64_to_cpu(bl_entry->end), 532 c, version, jset, entry, 533 journal_entry_blacklist_v2_start_past_end, 534 "invalid journal seq blacklist entry: start > end")) { 535 journal_entry_null_range(entry, vstruct_next(entry)); 536 } 537 out: 538 fsck_err: 539 return ret; 540 } 541 542 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, 543 struct jset_entry *entry) 544 { 545 struct jset_entry_blacklist_v2 *bl = 546 container_of(entry, struct jset_entry_blacklist_v2, entry); 547 548 prt_printf(out, "start=%llu end=%llu", 549 le64_to_cpu(bl->start), 550 le64_to_cpu(bl->end)); 551 } 552 553 static int journal_entry_usage_validate(struct bch_fs *c, 554 struct jset *jset, 555 struct jset_entry *entry, 556 unsigned version, int big_endian, 557 enum bch_validate_flags flags) 558 { 559 struct jset_entry_usage *u = 560 container_of(entry, struct jset_entry_usage, entry); 561 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 562 int ret = 0; 563 564 if (journal_entry_err_on(bytes < sizeof(*u), 565 c, version, jset, entry, 566 journal_entry_usage_bad_size, 567 "invalid journal entry usage: bad size")) { 568 journal_entry_null_range(entry, vstruct_next(entry)); 569 return ret; 570 } 571 572 fsck_err: 573 return ret; 574 } 575 576 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, 577 struct jset_entry *entry) 578 { 579 struct jset_entry_usage *u = 580 container_of(entry, struct jset_entry_usage, entry); 581 582 prt_str(out, "type="); 583 bch2_prt_fs_usage_type(out, u->entry.btree_id); 584 prt_printf(out, " v=%llu", le64_to_cpu(u->v)); 585 } 586 587 static int journal_entry_data_usage_validate(struct bch_fs *c, 588 struct jset *jset, 589 struct jset_entry *entry, 590 unsigned version, int big_endian, 591 enum bch_validate_flags flags) 592 { 593 struct jset_entry_data_usage *u = 594 container_of(entry, struct jset_entry_data_usage, entry); 595 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 596 struct printbuf err = PRINTBUF; 597 int ret = 0; 598 599 if (journal_entry_err_on(bytes < sizeof(*u) || 600 bytes < sizeof(*u) + u->r.nr_devs, 601 c, version, jset, entry, 602 journal_entry_data_usage_bad_size, 603 "invalid journal entry usage: bad size")) { 604 journal_entry_null_range(entry, vstruct_next(entry)); 605 goto out; 606 } 607 608 if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err), 609 c, version, jset, entry, 610 journal_entry_data_usage_bad_size, 611 "invalid journal entry usage: %s", err.buf)) { 612 journal_entry_null_range(entry, vstruct_next(entry)); 613 goto out; 614 } 615 out: 616 fsck_err: 617 printbuf_exit(&err); 618 return ret; 619 } 620 621 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, 622 struct jset_entry *entry) 623 { 624 struct jset_entry_data_usage *u = 625 container_of(entry, struct jset_entry_data_usage, entry); 626 627 bch2_replicas_entry_to_text(out, &u->r); 628 prt_printf(out, "=%llu", le64_to_cpu(u->v)); 629 } 630 631 static int journal_entry_clock_validate(struct bch_fs *c, 632 struct jset *jset, 633 struct jset_entry *entry, 634 unsigned version, int big_endian, 635 enum bch_validate_flags flags) 636 { 637 struct jset_entry_clock *clock = 638 container_of(entry, struct jset_entry_clock, entry); 639 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 640 int ret = 0; 641 642 if (journal_entry_err_on(bytes != sizeof(*clock), 643 c, version, jset, entry, 644 journal_entry_clock_bad_size, 645 "bad size")) { 646 journal_entry_null_range(entry, vstruct_next(entry)); 647 return ret; 648 } 649 650 if (journal_entry_err_on(clock->rw > 1, 651 c, version, jset, entry, 652 journal_entry_clock_bad_rw, 653 "bad rw")) { 654 journal_entry_null_range(entry, vstruct_next(entry)); 655 return ret; 656 } 657 658 fsck_err: 659 return ret; 660 } 661 662 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, 663 struct jset_entry *entry) 664 { 665 struct jset_entry_clock *clock = 666 container_of(entry, struct jset_entry_clock, entry); 667 668 prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); 669 } 670 671 static int journal_entry_dev_usage_validate(struct bch_fs *c, 672 struct jset *jset, 673 struct jset_entry *entry, 674 unsigned version, int big_endian, 675 enum bch_validate_flags flags) 676 { 677 struct jset_entry_dev_usage *u = 678 container_of(entry, struct jset_entry_dev_usage, entry); 679 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 680 unsigned expected = sizeof(*u); 681 int ret = 0; 682 683 if (journal_entry_err_on(bytes < expected, 684 c, version, jset, entry, 685 journal_entry_dev_usage_bad_size, 686 "bad size (%u < %u)", 687 bytes, expected)) { 688 journal_entry_null_range(entry, vstruct_next(entry)); 689 return ret; 690 } 691 692 if (journal_entry_err_on(u->pad, 693 c, version, jset, entry, 694 journal_entry_dev_usage_bad_pad, 695 "bad pad")) { 696 journal_entry_null_range(entry, vstruct_next(entry)); 697 return ret; 698 } 699 700 fsck_err: 701 return ret; 702 } 703 704 static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, 705 struct jset_entry *entry) 706 { 707 struct jset_entry_dev_usage *u = 708 container_of(entry, struct jset_entry_dev_usage, entry); 709 unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); 710 711 if (vstruct_bytes(entry) < sizeof(*u)) 712 return; 713 714 prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); 715 716 printbuf_indent_add(out, 2); 717 for (i = 0; i < nr_types; i++) { 718 prt_newline(out); 719 bch2_prt_data_type(out, i); 720 prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", 721 le64_to_cpu(u->d[i].buckets), 722 le64_to_cpu(u->d[i].sectors), 723 le64_to_cpu(u->d[i].fragmented)); 724 } 725 printbuf_indent_sub(out, 2); 726 } 727 728 static int journal_entry_log_validate(struct bch_fs *c, 729 struct jset *jset, 730 struct jset_entry *entry, 731 unsigned version, int big_endian, 732 enum bch_validate_flags flags) 733 { 734 return 0; 735 } 736 737 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 738 struct jset_entry *entry) 739 { 740 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 741 unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); 742 743 prt_printf(out, "%.*s", bytes, l->d); 744 } 745 746 static int journal_entry_overwrite_validate(struct bch_fs *c, 747 struct jset *jset, 748 struct jset_entry *entry, 749 unsigned version, int big_endian, 750 enum bch_validate_flags flags) 751 { 752 return journal_entry_btree_keys_validate(c, jset, entry, 753 version, big_endian, READ); 754 } 755 756 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 757 struct jset_entry *entry) 758 { 759 journal_entry_btree_keys_to_text(out, c, entry); 760 } 761 762 static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, 763 struct jset *jset, 764 struct jset_entry *entry, 765 unsigned version, int big_endian, 766 enum bch_validate_flags flags) 767 { 768 return journal_entry_btree_keys_validate(c, jset, entry, 769 version, big_endian, READ); 770 } 771 772 static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, 773 struct jset_entry *entry) 774 { 775 journal_entry_btree_keys_to_text(out, c, entry); 776 } 777 778 static int journal_entry_datetime_validate(struct bch_fs *c, 779 struct jset *jset, 780 struct jset_entry *entry, 781 unsigned version, int big_endian, 782 enum bch_validate_flags flags) 783 { 784 unsigned bytes = vstruct_bytes(entry); 785 unsigned expected = 16; 786 int ret = 0; 787 788 if (journal_entry_err_on(vstruct_bytes(entry) < expected, 789 c, version, jset, entry, 790 journal_entry_dev_usage_bad_size, 791 "bad size (%u < %u)", 792 bytes, expected)) { 793 journal_entry_null_range(entry, vstruct_next(entry)); 794 return ret; 795 } 796 fsck_err: 797 return ret; 798 } 799 800 static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, 801 struct jset_entry *entry) 802 { 803 struct jset_entry_datetime *datetime = 804 container_of(entry, struct jset_entry_datetime, entry); 805 806 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 807 } 808 809 struct jset_entry_ops { 810 int (*validate)(struct bch_fs *, struct jset *, 811 struct jset_entry *, unsigned, int, 812 enum bch_validate_flags); 813 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 814 }; 815 816 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 817 #define x(f, nr) \ 818 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 819 .validate = journal_entry_##f##_validate, \ 820 .to_text = journal_entry_##f##_to_text, \ 821 }, 822 BCH_JSET_ENTRY_TYPES() 823 #undef x 824 }; 825 826 int bch2_journal_entry_validate(struct bch_fs *c, 827 struct jset *jset, 828 struct jset_entry *entry, 829 unsigned version, int big_endian, 830 enum bch_validate_flags flags) 831 { 832 return entry->type < BCH_JSET_ENTRY_NR 833 ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, 834 version, big_endian, flags) 835 : 0; 836 } 837 838 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, 839 struct jset_entry *entry) 840 { 841 bch2_prt_jset_entry_type(out, entry->type); 842 843 if (entry->type < BCH_JSET_ENTRY_NR) { 844 prt_str(out, ": "); 845 bch2_jset_entry_ops[entry->type].to_text(out, c, entry); 846 } 847 } 848 849 static int jset_validate_entries(struct bch_fs *c, struct jset *jset, 850 enum bch_validate_flags flags) 851 { 852 unsigned version = le32_to_cpu(jset->version); 853 int ret = 0; 854 855 vstruct_for_each(jset, entry) { 856 if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), 857 c, version, jset, entry, 858 journal_entry_past_jset_end, 859 "journal entry extends past end of jset")) { 860 jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); 861 break; 862 } 863 864 ret = bch2_journal_entry_validate(c, jset, entry, 865 version, JSET_BIG_ENDIAN(jset), flags); 866 if (ret) 867 break; 868 } 869 fsck_err: 870 return ret; 871 } 872 873 static int jset_validate(struct bch_fs *c, 874 struct bch_dev *ca, 875 struct jset *jset, u64 sector, 876 enum bch_validate_flags flags) 877 { 878 unsigned version; 879 int ret = 0; 880 881 if (le64_to_cpu(jset->magic) != jset_magic(c)) 882 return JOURNAL_ENTRY_NONE; 883 884 version = le32_to_cpu(jset->version); 885 if (journal_entry_err_on(!bch2_version_compatible(version), 886 c, version, jset, NULL, 887 jset_unsupported_version, 888 "%s sector %llu seq %llu: incompatible journal entry version %u.%u", 889 ca ? ca->name : c->name, 890 sector, le64_to_cpu(jset->seq), 891 BCH_VERSION_MAJOR(version), 892 BCH_VERSION_MINOR(version))) { 893 /* don't try to continue: */ 894 return -EINVAL; 895 } 896 897 if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), 898 c, version, jset, NULL, 899 jset_unknown_csum, 900 "%s sector %llu seq %llu: journal entry with unknown csum type %llu", 901 ca ? ca->name : c->name, 902 sector, le64_to_cpu(jset->seq), 903 JSET_CSUM_TYPE(jset))) 904 ret = JOURNAL_ENTRY_BAD; 905 906 /* last_seq is ignored when JSET_NO_FLUSH is true */ 907 if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && 908 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), 909 c, version, jset, NULL, 910 jset_last_seq_newer_than_seq, 911 "invalid journal entry: last_seq > seq (%llu > %llu)", 912 le64_to_cpu(jset->last_seq), 913 le64_to_cpu(jset->seq))) { 914 jset->last_seq = jset->seq; 915 return JOURNAL_ENTRY_BAD; 916 } 917 918 ret = jset_validate_entries(c, jset, flags); 919 fsck_err: 920 return ret; 921 } 922 923 static int jset_validate_early(struct bch_fs *c, 924 struct bch_dev *ca, 925 struct jset *jset, u64 sector, 926 unsigned bucket_sectors_left, 927 unsigned sectors_read) 928 { 929 size_t bytes = vstruct_bytes(jset); 930 unsigned version; 931 enum bch_validate_flags flags = BCH_VALIDATE_journal; 932 int ret = 0; 933 934 if (le64_to_cpu(jset->magic) != jset_magic(c)) 935 return JOURNAL_ENTRY_NONE; 936 937 version = le32_to_cpu(jset->version); 938 if (journal_entry_err_on(!bch2_version_compatible(version), 939 c, version, jset, NULL, 940 jset_unsupported_version, 941 "%s sector %llu seq %llu: unknown journal entry version %u.%u", 942 ca ? ca->name : c->name, 943 sector, le64_to_cpu(jset->seq), 944 BCH_VERSION_MAJOR(version), 945 BCH_VERSION_MINOR(version))) { 946 /* don't try to continue: */ 947 return -EINVAL; 948 } 949 950 if (bytes > (sectors_read << 9) && 951 sectors_read < bucket_sectors_left) 952 return JOURNAL_ENTRY_REREAD; 953 954 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 955 c, version, jset, NULL, 956 jset_past_bucket_end, 957 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 958 ca ? ca->name : c->name, 959 sector, le64_to_cpu(jset->seq), bytes)) 960 le32_add_cpu(&jset->u64s, 961 -((bytes - (bucket_sectors_left << 9)) / 8)); 962 fsck_err: 963 return ret; 964 } 965 966 struct journal_read_buf { 967 void *data; 968 size_t size; 969 }; 970 971 static int journal_read_buf_realloc(struct journal_read_buf *b, 972 size_t new_size) 973 { 974 void *n; 975 976 /* the bios are sized for this many pages, max: */ 977 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 978 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 979 980 new_size = roundup_pow_of_two(new_size); 981 n = kvmalloc(new_size, GFP_KERNEL); 982 if (!n) 983 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 984 985 kvfree(b->data); 986 b->data = n; 987 b->size = new_size; 988 return 0; 989 } 990 991 static int journal_read_bucket(struct bch_dev *ca, 992 struct journal_read_buf *buf, 993 struct journal_list *jlist, 994 unsigned bucket) 995 { 996 struct bch_fs *c = ca->fs; 997 struct journal_device *ja = &ca->journal; 998 struct jset *j = NULL; 999 unsigned sectors, sectors_read = 0; 1000 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 1001 end = offset + ca->mi.bucket_size; 1002 bool saw_bad = false, csum_good; 1003 struct printbuf err = PRINTBUF; 1004 int ret = 0; 1005 1006 pr_debug("reading %u", bucket); 1007 1008 while (offset < end) { 1009 if (!sectors_read) { 1010 struct bio *bio; 1011 unsigned nr_bvecs; 1012 reread: 1013 sectors_read = min_t(unsigned, 1014 end - offset, buf->size >> 9); 1015 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 1016 1017 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 1018 if (!bio) 1019 return -BCH_ERR_ENOMEM_journal_read_bucket; 1020 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 1021 1022 bio->bi_iter.bi_sector = offset; 1023 bch2_bio_map(bio, buf->data, sectors_read << 9); 1024 1025 ret = submit_bio_wait(bio); 1026 kfree(bio); 1027 1028 if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, 1029 "journal read error: sector %llu", 1030 offset) || 1031 bch2_meta_read_fault("journal")) { 1032 /* 1033 * We don't error out of the recovery process 1034 * here, since the relevant journal entry may be 1035 * found on a different device, and missing or 1036 * no journal entries will be handled later 1037 */ 1038 goto out; 1039 } 1040 1041 j = buf->data; 1042 } 1043 1044 ret = jset_validate_early(c, ca, j, offset, 1045 end - offset, sectors_read); 1046 switch (ret) { 1047 case 0: 1048 sectors = vstruct_sectors(j, c->block_bits); 1049 break; 1050 case JOURNAL_ENTRY_REREAD: 1051 if (vstruct_bytes(j) > buf->size) { 1052 ret = journal_read_buf_realloc(buf, 1053 vstruct_bytes(j)); 1054 if (ret) 1055 goto err; 1056 } 1057 goto reread; 1058 case JOURNAL_ENTRY_NONE: 1059 if (!saw_bad) 1060 goto out; 1061 /* 1062 * On checksum error we don't really trust the size 1063 * field of the journal entry we read, so try reading 1064 * again at next block boundary: 1065 */ 1066 sectors = block_sectors(c); 1067 goto next_block; 1068 default: 1069 goto err; 1070 } 1071 1072 if (le64_to_cpu(j->seq) > ja->highest_seq_found) { 1073 ja->highest_seq_found = le64_to_cpu(j->seq); 1074 ja->cur_idx = bucket; 1075 ja->sectors_free = ca->mi.bucket_size - 1076 bucket_remainder(ca, offset) - sectors; 1077 } 1078 1079 /* 1080 * This happens sometimes if we don't have discards on - 1081 * when we've partially overwritten a bucket with new 1082 * journal entries. We don't need the rest of the 1083 * bucket: 1084 */ 1085 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 1086 goto out; 1087 1088 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 1089 1090 enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); 1091 struct bch_csum csum; 1092 csum_good = jset_csum_good(c, j, &csum); 1093 1094 if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, 1095 "%s", 1096 (printbuf_reset(&err), 1097 prt_str(&err, "journal "), 1098 bch2_csum_err_msg(&err, csum_type, j->csum, csum), 1099 err.buf))) 1100 saw_bad = true; 1101 1102 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 1103 j->encrypted_start, 1104 vstruct_end(j) - (void *) j->encrypted_start); 1105 bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); 1106 1107 mutex_lock(&jlist->lock); 1108 ret = journal_entry_add(c, ca, (struct journal_ptr) { 1109 .csum_good = csum_good, 1110 .dev = ca->dev_idx, 1111 .bucket = bucket, 1112 .bucket_offset = offset - 1113 bucket_to_sector(ca, ja->buckets[bucket]), 1114 .sector = offset, 1115 }, jlist, j); 1116 mutex_unlock(&jlist->lock); 1117 1118 switch (ret) { 1119 case JOURNAL_ENTRY_ADD_OK: 1120 break; 1121 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 1122 break; 1123 default: 1124 goto err; 1125 } 1126 next_block: 1127 pr_debug("next"); 1128 offset += sectors; 1129 sectors_read -= sectors; 1130 j = ((void *) j) + (sectors << 9); 1131 } 1132 1133 out: 1134 ret = 0; 1135 err: 1136 printbuf_exit(&err); 1137 return ret; 1138 } 1139 1140 static CLOSURE_CALLBACK(bch2_journal_read_device) 1141 { 1142 closure_type(ja, struct journal_device, read); 1143 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1144 struct bch_fs *c = ca->fs; 1145 struct journal_list *jlist = 1146 container_of(cl->parent, struct journal_list, cl); 1147 struct journal_read_buf buf = { NULL, 0 }; 1148 unsigned i; 1149 int ret = 0; 1150 1151 if (!ja->nr) 1152 goto out; 1153 1154 ret = journal_read_buf_realloc(&buf, PAGE_SIZE); 1155 if (ret) 1156 goto err; 1157 1158 pr_debug("%u journal buckets", ja->nr); 1159 1160 for (i = 0; i < ja->nr; i++) { 1161 ret = journal_read_bucket(ca, &buf, jlist, i); 1162 if (ret) 1163 goto err; 1164 } 1165 1166 /* 1167 * Set dirty_idx to indicate the entire journal is full and needs to be 1168 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1169 * pinned when it first runs: 1170 */ 1171 ja->discard_idx = ja->dirty_idx_ondisk = 1172 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1173 out: 1174 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1175 kvfree(buf.data); 1176 percpu_ref_put(&ca->io_ref); 1177 closure_return(cl); 1178 return; 1179 err: 1180 mutex_lock(&jlist->lock); 1181 jlist->ret = ret; 1182 mutex_unlock(&jlist->lock); 1183 goto out; 1184 } 1185 1186 int bch2_journal_read(struct bch_fs *c, 1187 u64 *last_seq, 1188 u64 *blacklist_seq, 1189 u64 *start_seq) 1190 { 1191 struct journal_list jlist; 1192 struct journal_replay *i, **_i, *prev = NULL; 1193 struct genradix_iter radix_iter; 1194 struct printbuf buf = PRINTBUF; 1195 bool degraded = false, last_write_torn = false; 1196 u64 seq; 1197 int ret = 0; 1198 1199 closure_init_stack(&jlist.cl); 1200 mutex_init(&jlist.lock); 1201 jlist.last_seq = 0; 1202 jlist.ret = 0; 1203 1204 for_each_member_device(c, ca) { 1205 if (!c->opts.fsck && 1206 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1207 continue; 1208 1209 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1210 ca->mi.state == BCH_MEMBER_STATE_ro) && 1211 percpu_ref_tryget(&ca->io_ref)) 1212 closure_call(&ca->journal.read, 1213 bch2_journal_read_device, 1214 system_unbound_wq, 1215 &jlist.cl); 1216 else 1217 degraded = true; 1218 } 1219 1220 closure_sync(&jlist.cl); 1221 1222 if (jlist.ret) 1223 return jlist.ret; 1224 1225 *last_seq = 0; 1226 *start_seq = 0; 1227 *blacklist_seq = 0; 1228 1229 /* 1230 * Find most recent flush entry, and ignore newer non flush entries - 1231 * those entries will be blacklisted: 1232 */ 1233 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1234 enum bch_validate_flags flags = BCH_VALIDATE_journal; 1235 1236 i = *_i; 1237 1238 if (journal_replay_ignore(i)) 1239 continue; 1240 1241 if (!*start_seq) 1242 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1243 1244 if (JSET_NO_FLUSH(&i->j)) { 1245 i->ignore_blacklisted = true; 1246 continue; 1247 } 1248 1249 if (!last_write_torn && !i->csum_good) { 1250 last_write_torn = true; 1251 i->ignore_blacklisted = true; 1252 continue; 1253 } 1254 1255 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1256 c, le32_to_cpu(i->j.version), &i->j, NULL, 1257 jset_last_seq_newer_than_seq, 1258 "invalid journal entry: last_seq > seq (%llu > %llu)", 1259 le64_to_cpu(i->j.last_seq), 1260 le64_to_cpu(i->j.seq))) 1261 i->j.last_seq = i->j.seq; 1262 1263 *last_seq = le64_to_cpu(i->j.last_seq); 1264 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1265 break; 1266 } 1267 1268 if (!*start_seq) { 1269 bch_info(c, "journal read done, but no entries found"); 1270 return 0; 1271 } 1272 1273 if (!*last_seq) { 1274 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, 1275 "journal read done, but no entries found after dropping non-flushes"); 1276 return 0; 1277 } 1278 1279 bch_info(c, "journal read done, replaying entries %llu-%llu", 1280 *last_seq, *blacklist_seq - 1); 1281 1282 if (*start_seq != *blacklist_seq) 1283 bch_info(c, "dropped unflushed entries %llu-%llu", 1284 *blacklist_seq, *start_seq - 1); 1285 1286 /* Drop blacklisted entries and entries older than last_seq: */ 1287 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1288 i = *_i; 1289 1290 if (journal_replay_ignore(i)) 1291 continue; 1292 1293 seq = le64_to_cpu(i->j.seq); 1294 if (seq < *last_seq) { 1295 journal_replay_free(c, i, false); 1296 continue; 1297 } 1298 1299 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1300 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1301 jset_seq_blacklisted, 1302 "found blacklisted journal entry %llu", seq); 1303 i->ignore_blacklisted = true; 1304 } 1305 } 1306 1307 /* Check for missing entries: */ 1308 seq = *last_seq; 1309 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1310 i = *_i; 1311 1312 if (journal_replay_ignore(i)) 1313 continue; 1314 1315 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1316 1317 while (seq < le64_to_cpu(i->j.seq)) { 1318 u64 missing_start, missing_end; 1319 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 1320 1321 while (seq < le64_to_cpu(i->j.seq) && 1322 bch2_journal_seq_is_blacklisted(c, seq, false)) 1323 seq++; 1324 1325 if (seq == le64_to_cpu(i->j.seq)) 1326 break; 1327 1328 missing_start = seq; 1329 1330 while (seq < le64_to_cpu(i->j.seq) && 1331 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1332 seq++; 1333 1334 if (prev) { 1335 bch2_journal_ptrs_to_text(&buf1, c, prev); 1336 prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1337 } else 1338 prt_printf(&buf1, "(none)"); 1339 bch2_journal_ptrs_to_text(&buf2, c, i); 1340 1341 missing_end = seq - 1; 1342 fsck_err(c, journal_entries_missing, 1343 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" 1344 " prev at %s\n" 1345 " next at %s, continue?", 1346 missing_start, missing_end, 1347 *last_seq, *blacklist_seq - 1, 1348 buf1.buf, buf2.buf); 1349 1350 printbuf_exit(&buf1); 1351 printbuf_exit(&buf2); 1352 } 1353 1354 prev = i; 1355 seq++; 1356 } 1357 1358 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1359 struct bch_replicas_padded replicas = { 1360 .e.data_type = BCH_DATA_journal, 1361 .e.nr_devs = 0, 1362 .e.nr_required = 1, 1363 }; 1364 1365 i = *_i; 1366 if (journal_replay_ignore(i)) 1367 continue; 1368 1369 darray_for_each(i->ptrs, ptr) { 1370 struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); 1371 1372 if (!ptr->csum_good) 1373 bch_err_dev_offset(ca, ptr->sector, 1374 "invalid journal checksum, seq %llu%s", 1375 le64_to_cpu(i->j.seq), 1376 i->csum_good ? " (had good copy on another device)" : ""); 1377 } 1378 1379 ret = jset_validate(c, 1380 bch2_dev_have_ref(c, i->ptrs.data[0].dev), 1381 &i->j, 1382 i->ptrs.data[0].sector, 1383 READ); 1384 if (ret) 1385 goto err; 1386 1387 darray_for_each(i->ptrs, ptr) 1388 replicas_entry_add_dev(&replicas.e, ptr->dev); 1389 1390 bch2_replicas_entry_sort(&replicas.e); 1391 1392 printbuf_reset(&buf); 1393 bch2_replicas_entry_to_text(&buf, &replicas.e); 1394 1395 if (!degraded && 1396 !bch2_replicas_marked(c, &replicas.e) && 1397 (le64_to_cpu(i->j.seq) == *last_seq || 1398 fsck_err(c, journal_entry_replicas_not_marked, 1399 "superblock not marked as containing replicas for journal entry %llu\n %s", 1400 le64_to_cpu(i->j.seq), buf.buf))) { 1401 ret = bch2_mark_replicas(c, &replicas.e); 1402 if (ret) 1403 goto err; 1404 } 1405 } 1406 err: 1407 fsck_err: 1408 printbuf_exit(&buf); 1409 return ret; 1410 } 1411 1412 /* journal write: */ 1413 1414 static void __journal_write_alloc(struct journal *j, 1415 struct journal_buf *w, 1416 struct dev_alloc_list *devs_sorted, 1417 unsigned sectors, 1418 unsigned *replicas, 1419 unsigned replicas_want) 1420 { 1421 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1422 struct journal_device *ja; 1423 struct bch_dev *ca; 1424 unsigned i; 1425 1426 if (*replicas >= replicas_want) 1427 return; 1428 1429 for (i = 0; i < devs_sorted->nr; i++) { 1430 ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); 1431 if (!ca) 1432 continue; 1433 1434 ja = &ca->journal; 1435 1436 /* 1437 * Check that we can use this device, and aren't already using 1438 * it: 1439 */ 1440 if (!ca->mi.durability || 1441 ca->mi.state != BCH_MEMBER_STATE_rw || 1442 !ja->nr || 1443 bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1444 sectors > ja->sectors_free) 1445 continue; 1446 1447 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1448 1449 bch2_bkey_append_ptr(&w->key, 1450 (struct bch_extent_ptr) { 1451 .offset = bucket_to_sector(ca, 1452 ja->buckets[ja->cur_idx]) + 1453 ca->mi.bucket_size - 1454 ja->sectors_free, 1455 .dev = ca->dev_idx, 1456 }); 1457 1458 ja->sectors_free -= sectors; 1459 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1460 1461 *replicas += ca->mi.durability; 1462 1463 if (*replicas >= replicas_want) 1464 break; 1465 } 1466 } 1467 1468 /** 1469 * journal_write_alloc - decide where to write next journal entry 1470 * 1471 * @j: journal object 1472 * @w: journal buf (entry to be written) 1473 * 1474 * Returns: 0 on success, or -EROFS on failure 1475 */ 1476 static int journal_write_alloc(struct journal *j, struct journal_buf *w) 1477 { 1478 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1479 struct bch_devs_mask devs; 1480 struct journal_device *ja; 1481 struct bch_dev *ca; 1482 struct dev_alloc_list devs_sorted; 1483 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1484 unsigned target = c->opts.metadata_target ?: 1485 c->opts.foreground_target; 1486 unsigned i, replicas = 0, replicas_want = 1487 READ_ONCE(c->opts.metadata_replicas); 1488 unsigned replicas_need = min_t(unsigned, replicas_want, 1489 READ_ONCE(c->opts.metadata_replicas_required)); 1490 1491 rcu_read_lock(); 1492 retry: 1493 devs = target_rw_devs(c, BCH_DATA_journal, target); 1494 1495 devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); 1496 1497 __journal_write_alloc(j, w, &devs_sorted, 1498 sectors, &replicas, replicas_want); 1499 1500 if (replicas >= replicas_want) 1501 goto done; 1502 1503 for (i = 0; i < devs_sorted.nr; i++) { 1504 ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); 1505 if (!ca) 1506 continue; 1507 1508 ja = &ca->journal; 1509 1510 if (sectors > ja->sectors_free && 1511 sectors <= ca->mi.bucket_size && 1512 bch2_journal_dev_buckets_available(j, ja, 1513 journal_space_discarded)) { 1514 ja->cur_idx = (ja->cur_idx + 1) % ja->nr; 1515 ja->sectors_free = ca->mi.bucket_size; 1516 1517 /* 1518 * ja->bucket_seq[ja->cur_idx] must always have 1519 * something sensible: 1520 */ 1521 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1522 } 1523 } 1524 1525 __journal_write_alloc(j, w, &devs_sorted, 1526 sectors, &replicas, replicas_want); 1527 1528 if (replicas < replicas_want && target) { 1529 /* Retry from all devices: */ 1530 target = 0; 1531 goto retry; 1532 } 1533 done: 1534 rcu_read_unlock(); 1535 1536 BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); 1537 1538 return replicas >= replicas_need ? 0 : -EROFS; 1539 } 1540 1541 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1542 { 1543 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1544 1545 /* we aren't holding j->lock: */ 1546 unsigned new_size = READ_ONCE(j->buf_size_want); 1547 void *new_buf; 1548 1549 if (buf->buf_size >= new_size) 1550 return; 1551 1552 size_t btree_write_buffer_size = new_size / 64; 1553 1554 if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) 1555 return; 1556 1557 new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1558 if (!new_buf) 1559 return; 1560 1561 memcpy(new_buf, buf->data, buf->buf_size); 1562 1563 spin_lock(&j->lock); 1564 swap(buf->data, new_buf); 1565 swap(buf->buf_size, new_size); 1566 spin_unlock(&j->lock); 1567 1568 kvfree(new_buf); 1569 } 1570 1571 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) 1572 { 1573 return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); 1574 } 1575 1576 static CLOSURE_CALLBACK(journal_write_done) 1577 { 1578 closure_type(w, struct journal_buf, io); 1579 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1580 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1581 struct bch_replicas_padded replicas; 1582 union journal_res_state old, new; 1583 u64 seq = le64_to_cpu(w->data->seq); 1584 int err = 0; 1585 1586 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) 1587 ? j->flush_write_time 1588 : j->noflush_write_time, j->write_start_time); 1589 1590 if (!w->devs_written.nr) { 1591 bch_err(c, "unable to write journal to sufficient devices"); 1592 err = -EIO; 1593 } else { 1594 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1595 w->devs_written); 1596 if (bch2_mark_replicas(c, &replicas.e)) 1597 err = -EIO; 1598 } 1599 1600 if (err) 1601 bch2_fatal_error(c); 1602 1603 closure_debug_destroy(cl); 1604 1605 spin_lock(&j->lock); 1606 if (seq >= j->pin.front) 1607 journal_seq_pin(j, seq)->devs = w->devs_written; 1608 if (err && (!j->err_seq || seq < j->err_seq)) 1609 j->err_seq = seq; 1610 w->write_done = true; 1611 1612 bool completed = false; 1613 1614 for (seq = journal_last_unwritten_seq(j); 1615 seq <= journal_cur_seq(j); 1616 seq++) { 1617 w = j->buf + (seq & JOURNAL_BUF_MASK); 1618 if (!w->write_done) 1619 break; 1620 1621 if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { 1622 j->flushed_seq_ondisk = seq; 1623 j->last_seq_ondisk = w->last_seq; 1624 1625 bch2_do_discards(c); 1626 closure_wake_up(&c->freelist_wait); 1627 bch2_reset_alloc_cursors(c); 1628 } 1629 1630 j->seq_ondisk = seq; 1631 1632 /* 1633 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1634 * more buckets: 1635 * 1636 * Must come before signaling write completion, for 1637 * bch2_fs_journal_stop(): 1638 */ 1639 if (j->watermark != BCH_WATERMARK_stripe) 1640 journal_reclaim_kick(&c->journal); 1641 1642 old.v = atomic64_read(&j->reservations.counter); 1643 do { 1644 new.v = old.v; 1645 BUG_ON(journal_state_count(new, new.unwritten_idx)); 1646 BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); 1647 1648 new.unwritten_idx++; 1649 } while (!atomic64_try_cmpxchg(&j->reservations.counter, 1650 &old.v, new.v)); 1651 1652 closure_wake_up(&w->wait); 1653 completed = true; 1654 } 1655 1656 if (completed) { 1657 bch2_journal_reclaim_fast(j); 1658 bch2_journal_space_available(j); 1659 1660 track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); 1661 1662 journal_wake(j); 1663 } 1664 1665 if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1666 new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1667 struct journal_buf *buf = journal_cur_buf(j); 1668 long delta = buf->expires - jiffies; 1669 1670 /* 1671 * We don't close a journal entry to write it while there's 1672 * previous entries still in flight - the current journal entry 1673 * might want to be written now: 1674 */ 1675 mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); 1676 } 1677 1678 /* 1679 * We don't typically trigger journal writes from her - the next journal 1680 * write will be triggered immediately after the previous one is 1681 * allocated, in bch2_journal_write() - but the journal write error path 1682 * is special: 1683 */ 1684 bch2_journal_do_writes(j); 1685 spin_unlock(&j->lock); 1686 } 1687 1688 static void journal_write_endio(struct bio *bio) 1689 { 1690 struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); 1691 struct bch_dev *ca = jbio->ca; 1692 struct journal *j = &ca->fs->journal; 1693 struct journal_buf *w = j->buf + jbio->buf_idx; 1694 1695 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, 1696 "error writing journal entry %llu: %s", 1697 le64_to_cpu(w->data->seq), 1698 bch2_blk_status_to_str(bio->bi_status)) || 1699 bch2_meta_write_fault("journal")) { 1700 unsigned long flags; 1701 1702 spin_lock_irqsave(&j->err_lock, flags); 1703 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); 1704 spin_unlock_irqrestore(&j->err_lock, flags); 1705 } 1706 1707 closure_put(&w->io); 1708 percpu_ref_put(&ca->io_ref); 1709 } 1710 1711 static CLOSURE_CALLBACK(journal_write_submit) 1712 { 1713 closure_type(w, struct journal_buf, io); 1714 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1715 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1716 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1717 1718 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { 1719 struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); 1720 if (!ca) { 1721 /* XXX: fix this */ 1722 bch_err(c, "missing device for journal write\n"); 1723 continue; 1724 } 1725 1726 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], 1727 sectors); 1728 1729 struct journal_device *ja = &ca->journal; 1730 struct bio *bio = &ja->bio[w->idx]->bio; 1731 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1732 bio->bi_iter.bi_sector = ptr->offset; 1733 bio->bi_end_io = journal_write_endio; 1734 bio->bi_private = ca; 1735 1736 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1737 ca->prev_journal_sector = bio->bi_iter.bi_sector; 1738 1739 if (!JSET_NO_FLUSH(w->data)) 1740 bio->bi_opf |= REQ_FUA; 1741 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) 1742 bio->bi_opf |= REQ_PREFLUSH; 1743 1744 bch2_bio_map(bio, w->data, sectors << 9); 1745 1746 trace_and_count(c, journal_write, bio); 1747 closure_bio_submit(bio, cl); 1748 1749 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1750 } 1751 1752 continue_at(cl, journal_write_done, j->wq); 1753 } 1754 1755 static CLOSURE_CALLBACK(journal_write_preflush) 1756 { 1757 closure_type(w, struct journal_buf, io); 1758 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1759 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1760 1761 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1762 spin_lock(&j->lock); 1763 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1764 closure_wait(&j->async_wait, cl); 1765 spin_unlock(&j->lock); 1766 continue_at(cl, journal_write_preflush, j->wq); 1767 return; 1768 } 1769 spin_unlock(&j->lock); 1770 } 1771 1772 if (w->separate_flush) { 1773 for_each_rw_member(c, ca) { 1774 percpu_ref_get(&ca->io_ref); 1775 1776 struct journal_device *ja = &ca->journal; 1777 struct bio *bio = &ja->bio[w->idx]->bio; 1778 bio_reset(bio, ca->disk_sb.bdev, 1779 REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); 1780 bio->bi_end_io = journal_write_endio; 1781 bio->bi_private = ca; 1782 closure_bio_submit(bio, cl); 1783 } 1784 1785 continue_at(cl, journal_write_submit, j->wq); 1786 } else { 1787 /* 1788 * no need to punt to another work item if we're not waiting on 1789 * preflushes 1790 */ 1791 journal_write_submit(&cl->work); 1792 } 1793 } 1794 1795 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) 1796 { 1797 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1798 struct jset_entry *start, *end; 1799 struct jset *jset = w->data; 1800 struct journal_keys_to_wb wb = { NULL }; 1801 unsigned sectors, bytes, u64s; 1802 unsigned long btree_roots_have = 0; 1803 bool validate_before_checksum = false; 1804 u64 seq = le64_to_cpu(jset->seq); 1805 int ret; 1806 1807 /* 1808 * Simple compaction, dropping empty jset_entries (from journal 1809 * reservations that weren't fully used) and merging jset_entries that 1810 * can be. 1811 * 1812 * If we wanted to be really fancy here, we could sort all the keys in 1813 * the jset and drop keys that were overwritten - probably not worth it: 1814 */ 1815 vstruct_for_each(jset, i) { 1816 unsigned u64s = le16_to_cpu(i->u64s); 1817 1818 /* Empty entry: */ 1819 if (!u64s) 1820 continue; 1821 1822 /* 1823 * New btree roots are set by journalling them; when the journal 1824 * entry gets written we have to propagate them to 1825 * c->btree_roots 1826 * 1827 * But, every journal entry we write has to contain all the 1828 * btree roots (at least for now); so after we copy btree roots 1829 * to c->btree_roots we have to get any missing btree roots and 1830 * add them to this journal entry: 1831 */ 1832 switch (i->type) { 1833 case BCH_JSET_ENTRY_btree_root: 1834 bch2_journal_entry_to_btree_root(c, i); 1835 __set_bit(i->btree_id, &btree_roots_have); 1836 break; 1837 case BCH_JSET_ENTRY_write_buffer_keys: 1838 EBUG_ON(!w->need_flush_to_write_buffer); 1839 1840 if (!wb.wb) 1841 bch2_journal_keys_to_write_buffer_start(c, &wb, seq); 1842 1843 jset_entry_for_each_key(i, k) { 1844 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); 1845 if (ret) { 1846 bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", 1847 bch2_err_str(ret)); 1848 bch2_journal_keys_to_write_buffer_end(c, &wb); 1849 return ret; 1850 } 1851 } 1852 i->type = BCH_JSET_ENTRY_btree_keys; 1853 break; 1854 } 1855 } 1856 1857 if (wb.wb) { 1858 ret = bch2_journal_keys_to_write_buffer_end(c, &wb); 1859 if (ret) { 1860 bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", 1861 bch2_err_str(ret)); 1862 return ret; 1863 } 1864 } 1865 1866 spin_lock(&c->journal.lock); 1867 w->need_flush_to_write_buffer = false; 1868 spin_unlock(&c->journal.lock); 1869 1870 start = end = vstruct_last(jset); 1871 1872 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1873 1874 struct jset_entry_datetime *d = 1875 container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); 1876 d->entry.type = BCH_JSET_ENTRY_datetime; 1877 d->seconds = cpu_to_le64(ktime_get_real_seconds()); 1878 1879 bch2_journal_super_entries_add_common(c, &end, seq); 1880 u64s = (u64 *) end - (u64 *) start; 1881 1882 WARN_ON(u64s > j->entry_u64s_reserved); 1883 1884 le32_add_cpu(&jset->u64s, u64s); 1885 1886 sectors = vstruct_sectors(jset, c->block_bits); 1887 bytes = vstruct_bytes(jset); 1888 1889 if (sectors > w->sectors) { 1890 bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 1891 vstruct_bytes(jset), w->sectors << 9, 1892 u64s, w->u64s_reserved, j->entry_u64s_reserved); 1893 return -EINVAL; 1894 } 1895 1896 jset->magic = cpu_to_le64(jset_magic(c)); 1897 jset->version = cpu_to_le32(c->sb.version); 1898 1899 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 1900 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1901 1902 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1903 j->last_empty_seq = seq; 1904 1905 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1906 validate_before_checksum = true; 1907 1908 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 1909 validate_before_checksum = true; 1910 1911 if (validate_before_checksum && 1912 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1913 return ret; 1914 1915 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), 1916 jset->encrypted_start, 1917 vstruct_end(jset) - (void *) jset->encrypted_start); 1918 if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) 1919 return ret; 1920 1921 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), 1922 journal_nonce(jset), jset); 1923 1924 if (!validate_before_checksum && 1925 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1926 return ret; 1927 1928 memset((void *) jset + bytes, 0, (sectors << 9) - bytes); 1929 return 0; 1930 } 1931 1932 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) 1933 { 1934 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1935 int error = bch2_journal_error(j); 1936 1937 /* 1938 * If the journal is in an error state - we did an emergency shutdown - 1939 * we prefer to continue doing journal writes. We just mark them as 1940 * noflush so they'll never be used, but they'll still be visible by the 1941 * list_journal tool - this helps in debugging. 1942 * 1943 * There's a caveat: the first journal write after marking the 1944 * superblock dirty must always be a flush write, because on startup 1945 * from a clean shutdown we didn't necessarily read the journal and the 1946 * new journal write might overwrite whatever was in the journal 1947 * previously - we can't leave the journal without any flush writes in 1948 * it. 1949 * 1950 * So if we're in an error state, and we're still starting up, we don't 1951 * write anything at all. 1952 */ 1953 if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) 1954 return -EIO; 1955 1956 if (error || 1957 w->noflush || 1958 (!w->must_flush && 1959 time_before(jiffies, j->last_flush_write + 1960 msecs_to_jiffies(c->opts.journal_flush_delay)) && 1961 test_bit(JOURNAL_may_skip_flush, &j->flags))) { 1962 w->noflush = true; 1963 SET_JSET_NO_FLUSH(w->data, true); 1964 w->data->last_seq = 0; 1965 w->last_seq = 0; 1966 1967 j->nr_noflush_writes++; 1968 } else { 1969 w->must_flush = true; 1970 j->last_flush_write = jiffies; 1971 j->nr_flush_writes++; 1972 clear_bit(JOURNAL_need_flush_write, &j->flags); 1973 } 1974 1975 return 0; 1976 } 1977 1978 CLOSURE_CALLBACK(bch2_journal_write) 1979 { 1980 closure_type(w, struct journal_buf, io); 1981 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1982 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1983 struct bch_replicas_padded replicas; 1984 unsigned nr_rw_members = 0; 1985 int ret; 1986 1987 for_each_rw_member(c, ca) 1988 nr_rw_members++; 1989 1990 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 1991 BUG_ON(!w->write_started); 1992 BUG_ON(w->write_allocated); 1993 BUG_ON(w->write_done); 1994 1995 j->write_start_time = local_clock(); 1996 1997 spin_lock(&j->lock); 1998 if (nr_rw_members > 1) 1999 w->separate_flush = true; 2000 2001 ret = bch2_journal_write_pick_flush(j, w); 2002 spin_unlock(&j->lock); 2003 if (ret) 2004 goto err; 2005 2006 mutex_lock(&j->buf_lock); 2007 journal_buf_realloc(j, w); 2008 2009 ret = bch2_journal_write_prep(j, w); 2010 mutex_unlock(&j->buf_lock); 2011 if (ret) 2012 goto err; 2013 2014 j->entry_bytes_written += vstruct_bytes(w->data); 2015 2016 while (1) { 2017 spin_lock(&j->lock); 2018 ret = journal_write_alloc(j, w); 2019 if (!ret || !j->can_discard) 2020 break; 2021 2022 spin_unlock(&j->lock); 2023 bch2_journal_do_discards(j); 2024 } 2025 2026 if (ret) { 2027 struct printbuf buf = PRINTBUF; 2028 buf.atomic++; 2029 2030 prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"), 2031 le64_to_cpu(w->data->seq), 2032 bch2_err_str(ret)); 2033 __bch2_journal_debug_to_text(&buf, j); 2034 spin_unlock(&j->lock); 2035 bch2_print_string_as_lines(KERN_ERR, buf.buf); 2036 printbuf_exit(&buf); 2037 goto err; 2038 } 2039 2040 /* 2041 * write is allocated, no longer need to account for it in 2042 * bch2_journal_space_available(): 2043 */ 2044 w->sectors = 0; 2045 w->write_allocated = true; 2046 2047 /* 2048 * journal entry has been compacted and allocated, recalculate space 2049 * available: 2050 */ 2051 bch2_journal_space_available(j); 2052 bch2_journal_do_writes(j); 2053 spin_unlock(&j->lock); 2054 2055 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); 2056 2057 if (c->opts.nochanges) 2058 goto no_io; 2059 2060 /* 2061 * Mark journal replicas before we submit the write to guarantee 2062 * recovery will find the journal entries after a crash. 2063 */ 2064 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 2065 w->devs_written); 2066 ret = bch2_mark_replicas(c, &replicas.e); 2067 if (ret) 2068 goto err; 2069 2070 if (!JSET_NO_FLUSH(w->data)) 2071 continue_at(cl, journal_write_preflush, j->wq); 2072 else 2073 continue_at(cl, journal_write_submit, j->wq); 2074 return; 2075 no_io: 2076 continue_at(cl, journal_write_done, j->wq); 2077 return; 2078 err: 2079 bch2_fatal_error(c); 2080 continue_at(cl, journal_write_done, j->wq); 2081 } 2082