1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "btree_io.h" 6 #include "btree_update_interior.h" 7 #include "btree_write_buffer.h" 8 #include "buckets.h" 9 #include "checksum.h" 10 #include "disk_groups.h" 11 #include "error.h" 12 #include "journal.h" 13 #include "journal_io.h" 14 #include "journal_reclaim.h" 15 #include "journal_seq_blacklist.h" 16 #include "replicas.h" 17 #include "sb-clean.h" 18 #include "trace.h" 19 20 void bch2_journal_pos_from_member_info_set(struct bch_fs *c) 21 { 22 lockdep_assert_held(&c->sb_lock); 23 24 for_each_member_device(c, ca) { 25 struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); 26 27 m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); 28 m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); 29 } 30 } 31 32 void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) 33 { 34 mutex_lock(&c->sb_lock); 35 for_each_member_device(c, ca) { 36 struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); 37 38 unsigned idx = le32_to_cpu(m.last_journal_bucket); 39 if (idx < ca->journal.nr) 40 ca->journal.cur_idx = idx; 41 unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); 42 if (offset <= ca->mi.bucket_size) 43 ca->journal.sectors_free = ca->mi.bucket_size - offset; 44 } 45 mutex_unlock(&c->sb_lock); 46 } 47 48 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 49 struct journal_replay *j) 50 { 51 darray_for_each(j->ptrs, i) { 52 if (i != j->ptrs.data) 53 prt_printf(out, " "); 54 prt_printf(out, "%u:%u:%u (sector %llu)", 55 i->dev, i->bucket, i->bucket_offset, i->sector); 56 } 57 } 58 59 static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, 60 struct journal_replay *j) 61 { 62 prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); 63 64 bch2_journal_ptrs_to_text(out, c, j); 65 66 for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { 67 struct jset_entry_datetime *datetime = 68 container_of(entry, struct jset_entry_datetime, entry); 69 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 70 break; 71 } 72 } 73 74 static struct nonce journal_nonce(const struct jset *jset) 75 { 76 return (struct nonce) {{ 77 [0] = 0, 78 [1] = ((__le32 *) &jset->seq)[0], 79 [2] = ((__le32 *) &jset->seq)[1], 80 [3] = BCH_NONCE_JOURNAL, 81 }}; 82 } 83 84 static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) 85 { 86 if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { 87 *csum = (struct bch_csum) {}; 88 return false; 89 } 90 91 *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); 92 return !bch2_crc_cmp(j->csum, *csum); 93 } 94 95 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) 96 { 97 return (seq - c->journal_entries_base_seq) & (~0U >> 1); 98 } 99 100 static void __journal_replay_free(struct bch_fs *c, 101 struct journal_replay *i) 102 { 103 struct journal_replay **p = 104 genradix_ptr(&c->journal_entries, 105 journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); 106 107 BUG_ON(*p != i); 108 *p = NULL; 109 kvfree(i); 110 } 111 112 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) 113 { 114 if (blacklisted) 115 i->ignore_blacklisted = true; 116 else 117 i->ignore_not_dirty = true; 118 119 if (!c->opts.read_entire_journal) 120 __journal_replay_free(c, i); 121 } 122 123 struct journal_list { 124 struct closure cl; 125 u64 last_seq; 126 struct mutex lock; 127 int ret; 128 }; 129 130 #define JOURNAL_ENTRY_ADD_OK 0 131 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 132 133 /* 134 * Given a journal entry we just read, add it to the list of journal entries to 135 * be replayed: 136 */ 137 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, 138 struct journal_ptr entry_ptr, 139 struct journal_list *jlist, struct jset *j) 140 { 141 struct genradix_iter iter; 142 struct journal_replay **_i, *i, *dup; 143 size_t bytes = vstruct_bytes(j); 144 u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; 145 struct printbuf buf = PRINTBUF; 146 int ret = JOURNAL_ENTRY_ADD_OK; 147 148 if (!c->journal.oldest_seq_found_ondisk || 149 le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) 150 c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); 151 152 /* Is this entry older than the range we need? */ 153 if (!c->opts.read_entire_journal && 154 le64_to_cpu(j->seq) < jlist->last_seq) 155 return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; 156 157 /* 158 * genradixes are indexed by a ulong, not a u64, so we can't index them 159 * by sequence number directly: Assume instead that they will all fall 160 * within the range of +-2billion of the filrst one we find. 161 */ 162 if (!c->journal_entries_base_seq) 163 c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); 164 165 /* Drop entries we don't need anymore */ 166 if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { 167 genradix_for_each_from(&c->journal_entries, iter, _i, 168 journal_entry_radix_idx(c, jlist->last_seq)) { 169 i = *_i; 170 171 if (journal_replay_ignore(i)) 172 continue; 173 174 if (le64_to_cpu(i->j.seq) >= last_seq) 175 break; 176 177 journal_replay_free(c, i, false); 178 } 179 } 180 181 jlist->last_seq = max(jlist->last_seq, last_seq); 182 183 _i = genradix_ptr_alloc(&c->journal_entries, 184 journal_entry_radix_idx(c, le64_to_cpu(j->seq)), 185 GFP_KERNEL); 186 if (!_i) 187 return -BCH_ERR_ENOMEM_journal_entry_add; 188 189 /* 190 * Duplicate journal entries? If so we want the one that didn't have a 191 * checksum error: 192 */ 193 dup = *_i; 194 if (dup) { 195 bool identical = bytes == vstruct_bytes(&dup->j) && 196 !memcmp(j, &dup->j, bytes); 197 bool not_identical = !identical && 198 entry_ptr.csum_good && 199 dup->csum_good; 200 201 bool same_device = false; 202 darray_for_each(dup->ptrs, ptr) 203 if (ptr->dev == ca->dev_idx) 204 same_device = true; 205 206 ret = darray_push(&dup->ptrs, entry_ptr); 207 if (ret) 208 goto out; 209 210 bch2_journal_replay_to_text(&buf, c, dup); 211 212 fsck_err_on(same_device, 213 c, journal_entry_dup_same_device, 214 "duplicate journal entry on same device\n %s", 215 buf.buf); 216 217 fsck_err_on(not_identical, 218 c, journal_entry_replicas_data_mismatch, 219 "found duplicate but non identical journal entries\n %s", 220 buf.buf); 221 222 if (entry_ptr.csum_good && !identical) 223 goto replace; 224 225 goto out; 226 } 227 replace: 228 i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 229 if (!i) 230 return -BCH_ERR_ENOMEM_journal_entry_add; 231 232 darray_init(&i->ptrs); 233 i->csum_good = entry_ptr.csum_good; 234 i->ignore_blacklisted = false; 235 i->ignore_not_dirty = false; 236 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 237 238 if (dup) { 239 /* The first ptr should represent the jset we kept: */ 240 darray_for_each(dup->ptrs, ptr) 241 darray_push(&i->ptrs, *ptr); 242 __journal_replay_free(c, dup); 243 } else { 244 darray_push(&i->ptrs, entry_ptr); 245 } 246 247 *_i = i; 248 out: 249 fsck_err: 250 printbuf_exit(&buf); 251 return ret; 252 } 253 254 /* this fills in a range with empty jset_entries: */ 255 static void journal_entry_null_range(void *start, void *end) 256 { 257 struct jset_entry *entry; 258 259 for (entry = start; entry != end; entry = vstruct_next(entry)) 260 memset(entry, 0, sizeof(*entry)); 261 } 262 263 #define JOURNAL_ENTRY_REREAD 5 264 #define JOURNAL_ENTRY_NONE 6 265 #define JOURNAL_ENTRY_BAD 7 266 267 static void journal_entry_err_msg(struct printbuf *out, 268 u32 version, 269 struct jset *jset, 270 struct jset_entry *entry) 271 { 272 prt_str(out, "invalid journal entry, version="); 273 bch2_version_to_text(out, version); 274 275 if (entry) { 276 prt_str(out, " type="); 277 bch2_prt_jset_entry_type(out, entry->type); 278 } 279 280 if (!jset) { 281 prt_printf(out, " in superblock"); 282 } else { 283 284 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); 285 286 if (entry) 287 prt_printf(out, " offset=%zi/%u", 288 (u64 *) entry - jset->_data, 289 le32_to_cpu(jset->u64s)); 290 } 291 292 prt_str(out, ": "); 293 } 294 295 #define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ 296 ({ \ 297 struct printbuf _buf = PRINTBUF; \ 298 \ 299 journal_entry_err_msg(&_buf, version, jset, entry); \ 300 prt_printf(&_buf, msg, ##__VA_ARGS__); \ 301 \ 302 switch (flags & BCH_VALIDATE_write) { \ 303 case READ: \ 304 mustfix_fsck_err(c, _err, "%s", _buf.buf); \ 305 break; \ 306 case WRITE: \ 307 bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ 308 bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ 309 if (bch2_fs_inconsistent(c)) { \ 310 ret = -BCH_ERR_fsck_errors_not_fixed; \ 311 goto fsck_err; \ 312 } \ 313 break; \ 314 } \ 315 \ 316 printbuf_exit(&_buf); \ 317 true; \ 318 }) 319 320 #define journal_entry_err_on(cond, ...) \ 321 ((cond) ? journal_entry_err(__VA_ARGS__) : false) 322 323 #define FSCK_DELETED_KEY 5 324 325 static int journal_validate_key(struct bch_fs *c, 326 struct jset *jset, 327 struct jset_entry *entry, 328 unsigned level, enum btree_id btree_id, 329 struct bkey_i *k, 330 unsigned version, int big_endian, 331 enum bch_validate_flags flags) 332 { 333 int write = flags & BCH_VALIDATE_write; 334 void *next = vstruct_next(entry); 335 struct printbuf buf = PRINTBUF; 336 int ret = 0; 337 338 if (journal_entry_err_on(!k->k.u64s, 339 c, version, jset, entry, 340 journal_entry_bkey_u64s_0, 341 "k->u64s 0")) { 342 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 343 journal_entry_null_range(vstruct_next(entry), next); 344 return FSCK_DELETED_KEY; 345 } 346 347 if (journal_entry_err_on((void *) bkey_next(k) > 348 (void *) vstruct_next(entry), 349 c, version, jset, entry, 350 journal_entry_bkey_past_end, 351 "extends past end of journal entry")) { 352 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 353 journal_entry_null_range(vstruct_next(entry), next); 354 return FSCK_DELETED_KEY; 355 } 356 357 if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, 358 c, version, jset, entry, 359 journal_entry_bkey_bad_format, 360 "bad format %u", k->k.format)) { 361 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 362 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 363 journal_entry_null_range(vstruct_next(entry), next); 364 return FSCK_DELETED_KEY; 365 } 366 367 if (!write) 368 bch2_bkey_compat(level, btree_id, version, big_endian, 369 write, NULL, bkey_to_packed(k)); 370 371 if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), 372 __btree_node_type(level, btree_id), write, &buf)) { 373 printbuf_reset(&buf); 374 journal_entry_err_msg(&buf, version, jset, entry); 375 prt_newline(&buf); 376 printbuf_indent_add(&buf, 2); 377 378 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); 379 prt_newline(&buf); 380 bch2_bkey_invalid(c, bkey_i_to_s_c(k), 381 __btree_node_type(level, btree_id), write, &buf); 382 383 mustfix_fsck_err(c, journal_entry_bkey_invalid, 384 "%s", buf.buf); 385 386 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 387 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 388 journal_entry_null_range(vstruct_next(entry), next); 389 390 printbuf_exit(&buf); 391 return FSCK_DELETED_KEY; 392 } 393 394 if (write) 395 bch2_bkey_compat(level, btree_id, version, big_endian, 396 write, NULL, bkey_to_packed(k)); 397 fsck_err: 398 printbuf_exit(&buf); 399 return ret; 400 } 401 402 static int journal_entry_btree_keys_validate(struct bch_fs *c, 403 struct jset *jset, 404 struct jset_entry *entry, 405 unsigned version, int big_endian, 406 enum bch_validate_flags flags) 407 { 408 struct bkey_i *k = entry->start; 409 410 while (k != vstruct_last(entry)) { 411 int ret = journal_validate_key(c, jset, entry, 412 entry->level, 413 entry->btree_id, 414 k, version, big_endian, 415 flags|BCH_VALIDATE_journal); 416 if (ret == FSCK_DELETED_KEY) 417 continue; 418 419 k = bkey_next(k); 420 } 421 422 return 0; 423 } 424 425 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 426 struct jset_entry *entry) 427 { 428 bool first = true; 429 430 jset_entry_for_each_key(entry, k) { 431 if (!first) { 432 prt_newline(out); 433 bch2_prt_jset_entry_type(out, entry->type); 434 prt_str(out, ": "); 435 } 436 prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); 437 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); 438 first = false; 439 } 440 } 441 442 static int journal_entry_btree_root_validate(struct bch_fs *c, 443 struct jset *jset, 444 struct jset_entry *entry, 445 unsigned version, int big_endian, 446 enum bch_validate_flags flags) 447 { 448 struct bkey_i *k = entry->start; 449 int ret = 0; 450 451 if (journal_entry_err_on(!entry->u64s || 452 le16_to_cpu(entry->u64s) != k->k.u64s, 453 c, version, jset, entry, 454 journal_entry_btree_root_bad_size, 455 "invalid btree root journal entry: wrong number of keys")) { 456 void *next = vstruct_next(entry); 457 /* 458 * we don't want to null out this jset_entry, 459 * just the contents, so that later we can tell 460 * we were _supposed_ to have a btree root 461 */ 462 entry->u64s = 0; 463 journal_entry_null_range(vstruct_next(entry), next); 464 return 0; 465 } 466 467 ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k, 468 version, big_endian, flags); 469 if (ret == FSCK_DELETED_KEY) 470 ret = 0; 471 fsck_err: 472 return ret; 473 } 474 475 static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, 476 struct jset_entry *entry) 477 { 478 journal_entry_btree_keys_to_text(out, c, entry); 479 } 480 481 static int journal_entry_prio_ptrs_validate(struct bch_fs *c, 482 struct jset *jset, 483 struct jset_entry *entry, 484 unsigned version, int big_endian, 485 enum bch_validate_flags flags) 486 { 487 /* obsolete, don't care: */ 488 return 0; 489 } 490 491 static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 492 struct jset_entry *entry) 493 { 494 } 495 496 static int journal_entry_blacklist_validate(struct bch_fs *c, 497 struct jset *jset, 498 struct jset_entry *entry, 499 unsigned version, int big_endian, 500 enum bch_validate_flags flags) 501 { 502 int ret = 0; 503 504 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, 505 c, version, jset, entry, 506 journal_entry_blacklist_bad_size, 507 "invalid journal seq blacklist entry: bad size")) { 508 journal_entry_null_range(entry, vstruct_next(entry)); 509 } 510 fsck_err: 511 return ret; 512 } 513 514 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, 515 struct jset_entry *entry) 516 { 517 struct jset_entry_blacklist *bl = 518 container_of(entry, struct jset_entry_blacklist, entry); 519 520 prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); 521 } 522 523 static int journal_entry_blacklist_v2_validate(struct bch_fs *c, 524 struct jset *jset, 525 struct jset_entry *entry, 526 unsigned version, int big_endian, 527 enum bch_validate_flags flags) 528 { 529 struct jset_entry_blacklist_v2 *bl_entry; 530 int ret = 0; 531 532 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, 533 c, version, jset, entry, 534 journal_entry_blacklist_v2_bad_size, 535 "invalid journal seq blacklist entry: bad size")) { 536 journal_entry_null_range(entry, vstruct_next(entry)); 537 goto out; 538 } 539 540 bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); 541 542 if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > 543 le64_to_cpu(bl_entry->end), 544 c, version, jset, entry, 545 journal_entry_blacklist_v2_start_past_end, 546 "invalid journal seq blacklist entry: start > end")) { 547 journal_entry_null_range(entry, vstruct_next(entry)); 548 } 549 out: 550 fsck_err: 551 return ret; 552 } 553 554 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, 555 struct jset_entry *entry) 556 { 557 struct jset_entry_blacklist_v2 *bl = 558 container_of(entry, struct jset_entry_blacklist_v2, entry); 559 560 prt_printf(out, "start=%llu end=%llu", 561 le64_to_cpu(bl->start), 562 le64_to_cpu(bl->end)); 563 } 564 565 static int journal_entry_usage_validate(struct bch_fs *c, 566 struct jset *jset, 567 struct jset_entry *entry, 568 unsigned version, int big_endian, 569 enum bch_validate_flags flags) 570 { 571 struct jset_entry_usage *u = 572 container_of(entry, struct jset_entry_usage, entry); 573 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 574 int ret = 0; 575 576 if (journal_entry_err_on(bytes < sizeof(*u), 577 c, version, jset, entry, 578 journal_entry_usage_bad_size, 579 "invalid journal entry usage: bad size")) { 580 journal_entry_null_range(entry, vstruct_next(entry)); 581 return ret; 582 } 583 584 fsck_err: 585 return ret; 586 } 587 588 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, 589 struct jset_entry *entry) 590 { 591 struct jset_entry_usage *u = 592 container_of(entry, struct jset_entry_usage, entry); 593 594 prt_str(out, "type="); 595 bch2_prt_fs_usage_type(out, u->entry.btree_id); 596 prt_printf(out, " v=%llu", le64_to_cpu(u->v)); 597 } 598 599 static int journal_entry_data_usage_validate(struct bch_fs *c, 600 struct jset *jset, 601 struct jset_entry *entry, 602 unsigned version, int big_endian, 603 enum bch_validate_flags flags) 604 { 605 struct jset_entry_data_usage *u = 606 container_of(entry, struct jset_entry_data_usage, entry); 607 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 608 struct printbuf err = PRINTBUF; 609 int ret = 0; 610 611 if (journal_entry_err_on(bytes < sizeof(*u) || 612 bytes < sizeof(*u) + u->r.nr_devs, 613 c, version, jset, entry, 614 journal_entry_data_usage_bad_size, 615 "invalid journal entry usage: bad size")) { 616 journal_entry_null_range(entry, vstruct_next(entry)); 617 goto out; 618 } 619 620 if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err), 621 c, version, jset, entry, 622 journal_entry_data_usage_bad_size, 623 "invalid journal entry usage: %s", err.buf)) { 624 journal_entry_null_range(entry, vstruct_next(entry)); 625 goto out; 626 } 627 out: 628 fsck_err: 629 printbuf_exit(&err); 630 return ret; 631 } 632 633 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, 634 struct jset_entry *entry) 635 { 636 struct jset_entry_data_usage *u = 637 container_of(entry, struct jset_entry_data_usage, entry); 638 639 bch2_replicas_entry_to_text(out, &u->r); 640 prt_printf(out, "=%llu", le64_to_cpu(u->v)); 641 } 642 643 static int journal_entry_clock_validate(struct bch_fs *c, 644 struct jset *jset, 645 struct jset_entry *entry, 646 unsigned version, int big_endian, 647 enum bch_validate_flags flags) 648 { 649 struct jset_entry_clock *clock = 650 container_of(entry, struct jset_entry_clock, entry); 651 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 652 int ret = 0; 653 654 if (journal_entry_err_on(bytes != sizeof(*clock), 655 c, version, jset, entry, 656 journal_entry_clock_bad_size, 657 "bad size")) { 658 journal_entry_null_range(entry, vstruct_next(entry)); 659 return ret; 660 } 661 662 if (journal_entry_err_on(clock->rw > 1, 663 c, version, jset, entry, 664 journal_entry_clock_bad_rw, 665 "bad rw")) { 666 journal_entry_null_range(entry, vstruct_next(entry)); 667 return ret; 668 } 669 670 fsck_err: 671 return ret; 672 } 673 674 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, 675 struct jset_entry *entry) 676 { 677 struct jset_entry_clock *clock = 678 container_of(entry, struct jset_entry_clock, entry); 679 680 prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); 681 } 682 683 static int journal_entry_dev_usage_validate(struct bch_fs *c, 684 struct jset *jset, 685 struct jset_entry *entry, 686 unsigned version, int big_endian, 687 enum bch_validate_flags flags) 688 { 689 struct jset_entry_dev_usage *u = 690 container_of(entry, struct jset_entry_dev_usage, entry); 691 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 692 unsigned expected = sizeof(*u); 693 int ret = 0; 694 695 if (journal_entry_err_on(bytes < expected, 696 c, version, jset, entry, 697 journal_entry_dev_usage_bad_size, 698 "bad size (%u < %u)", 699 bytes, expected)) { 700 journal_entry_null_range(entry, vstruct_next(entry)); 701 return ret; 702 } 703 704 if (journal_entry_err_on(u->pad, 705 c, version, jset, entry, 706 journal_entry_dev_usage_bad_pad, 707 "bad pad")) { 708 journal_entry_null_range(entry, vstruct_next(entry)); 709 return ret; 710 } 711 712 fsck_err: 713 return ret; 714 } 715 716 static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, 717 struct jset_entry *entry) 718 { 719 struct jset_entry_dev_usage *u = 720 container_of(entry, struct jset_entry_dev_usage, entry); 721 unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); 722 723 prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); 724 725 for (i = 0; i < nr_types; i++) { 726 bch2_prt_data_type(out, i); 727 prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", 728 le64_to_cpu(u->d[i].buckets), 729 le64_to_cpu(u->d[i].sectors), 730 le64_to_cpu(u->d[i].fragmented)); 731 } 732 } 733 734 static int journal_entry_log_validate(struct bch_fs *c, 735 struct jset *jset, 736 struct jset_entry *entry, 737 unsigned version, int big_endian, 738 enum bch_validate_flags flags) 739 { 740 return 0; 741 } 742 743 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 744 struct jset_entry *entry) 745 { 746 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 747 unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); 748 749 prt_printf(out, "%.*s", bytes, l->d); 750 } 751 752 static int journal_entry_overwrite_validate(struct bch_fs *c, 753 struct jset *jset, 754 struct jset_entry *entry, 755 unsigned version, int big_endian, 756 enum bch_validate_flags flags) 757 { 758 return journal_entry_btree_keys_validate(c, jset, entry, 759 version, big_endian, READ); 760 } 761 762 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 763 struct jset_entry *entry) 764 { 765 journal_entry_btree_keys_to_text(out, c, entry); 766 } 767 768 static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, 769 struct jset *jset, 770 struct jset_entry *entry, 771 unsigned version, int big_endian, 772 enum bch_validate_flags flags) 773 { 774 return journal_entry_btree_keys_validate(c, jset, entry, 775 version, big_endian, READ); 776 } 777 778 static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, 779 struct jset_entry *entry) 780 { 781 journal_entry_btree_keys_to_text(out, c, entry); 782 } 783 784 static int journal_entry_datetime_validate(struct bch_fs *c, 785 struct jset *jset, 786 struct jset_entry *entry, 787 unsigned version, int big_endian, 788 enum bch_validate_flags flags) 789 { 790 unsigned bytes = vstruct_bytes(entry); 791 unsigned expected = 16; 792 int ret = 0; 793 794 if (journal_entry_err_on(vstruct_bytes(entry) < expected, 795 c, version, jset, entry, 796 journal_entry_dev_usage_bad_size, 797 "bad size (%u < %u)", 798 bytes, expected)) { 799 journal_entry_null_range(entry, vstruct_next(entry)); 800 return ret; 801 } 802 fsck_err: 803 return ret; 804 } 805 806 static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, 807 struct jset_entry *entry) 808 { 809 struct jset_entry_datetime *datetime = 810 container_of(entry, struct jset_entry_datetime, entry); 811 812 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 813 } 814 815 struct jset_entry_ops { 816 int (*validate)(struct bch_fs *, struct jset *, 817 struct jset_entry *, unsigned, int, 818 enum bch_validate_flags); 819 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 820 }; 821 822 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 823 #define x(f, nr) \ 824 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 825 .validate = journal_entry_##f##_validate, \ 826 .to_text = journal_entry_##f##_to_text, \ 827 }, 828 BCH_JSET_ENTRY_TYPES() 829 #undef x 830 }; 831 832 int bch2_journal_entry_validate(struct bch_fs *c, 833 struct jset *jset, 834 struct jset_entry *entry, 835 unsigned version, int big_endian, 836 enum bch_validate_flags flags) 837 { 838 return entry->type < BCH_JSET_ENTRY_NR 839 ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, 840 version, big_endian, flags) 841 : 0; 842 } 843 844 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, 845 struct jset_entry *entry) 846 { 847 bch2_prt_jset_entry_type(out, entry->type); 848 849 if (entry->type < BCH_JSET_ENTRY_NR) { 850 prt_str(out, ": "); 851 bch2_jset_entry_ops[entry->type].to_text(out, c, entry); 852 } 853 } 854 855 static int jset_validate_entries(struct bch_fs *c, struct jset *jset, 856 enum bch_validate_flags flags) 857 { 858 unsigned version = le32_to_cpu(jset->version); 859 int ret = 0; 860 861 vstruct_for_each(jset, entry) { 862 if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), 863 c, version, jset, entry, 864 journal_entry_past_jset_end, 865 "journal entry extends past end of jset")) { 866 jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); 867 break; 868 } 869 870 ret = bch2_journal_entry_validate(c, jset, entry, 871 version, JSET_BIG_ENDIAN(jset), flags); 872 if (ret) 873 break; 874 } 875 fsck_err: 876 return ret; 877 } 878 879 static int jset_validate(struct bch_fs *c, 880 struct bch_dev *ca, 881 struct jset *jset, u64 sector, 882 enum bch_validate_flags flags) 883 { 884 unsigned version; 885 int ret = 0; 886 887 if (le64_to_cpu(jset->magic) != jset_magic(c)) 888 return JOURNAL_ENTRY_NONE; 889 890 version = le32_to_cpu(jset->version); 891 if (journal_entry_err_on(!bch2_version_compatible(version), 892 c, version, jset, NULL, 893 jset_unsupported_version, 894 "%s sector %llu seq %llu: incompatible journal entry version %u.%u", 895 ca ? ca->name : c->name, 896 sector, le64_to_cpu(jset->seq), 897 BCH_VERSION_MAJOR(version), 898 BCH_VERSION_MINOR(version))) { 899 /* don't try to continue: */ 900 return -EINVAL; 901 } 902 903 if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), 904 c, version, jset, NULL, 905 jset_unknown_csum, 906 "%s sector %llu seq %llu: journal entry with unknown csum type %llu", 907 ca ? ca->name : c->name, 908 sector, le64_to_cpu(jset->seq), 909 JSET_CSUM_TYPE(jset))) 910 ret = JOURNAL_ENTRY_BAD; 911 912 /* last_seq is ignored when JSET_NO_FLUSH is true */ 913 if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && 914 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), 915 c, version, jset, NULL, 916 jset_last_seq_newer_than_seq, 917 "invalid journal entry: last_seq > seq (%llu > %llu)", 918 le64_to_cpu(jset->last_seq), 919 le64_to_cpu(jset->seq))) { 920 jset->last_seq = jset->seq; 921 return JOURNAL_ENTRY_BAD; 922 } 923 924 ret = jset_validate_entries(c, jset, flags); 925 fsck_err: 926 return ret; 927 } 928 929 static int jset_validate_early(struct bch_fs *c, 930 struct bch_dev *ca, 931 struct jset *jset, u64 sector, 932 unsigned bucket_sectors_left, 933 unsigned sectors_read) 934 { 935 size_t bytes = vstruct_bytes(jset); 936 unsigned version; 937 enum bch_validate_flags flags = BCH_VALIDATE_journal; 938 int ret = 0; 939 940 if (le64_to_cpu(jset->magic) != jset_magic(c)) 941 return JOURNAL_ENTRY_NONE; 942 943 version = le32_to_cpu(jset->version); 944 if (journal_entry_err_on(!bch2_version_compatible(version), 945 c, version, jset, NULL, 946 jset_unsupported_version, 947 "%s sector %llu seq %llu: unknown journal entry version %u.%u", 948 ca ? ca->name : c->name, 949 sector, le64_to_cpu(jset->seq), 950 BCH_VERSION_MAJOR(version), 951 BCH_VERSION_MINOR(version))) { 952 /* don't try to continue: */ 953 return -EINVAL; 954 } 955 956 if (bytes > (sectors_read << 9) && 957 sectors_read < bucket_sectors_left) 958 return JOURNAL_ENTRY_REREAD; 959 960 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 961 c, version, jset, NULL, 962 jset_past_bucket_end, 963 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 964 ca ? ca->name : c->name, 965 sector, le64_to_cpu(jset->seq), bytes)) 966 le32_add_cpu(&jset->u64s, 967 -((bytes - (bucket_sectors_left << 9)) / 8)); 968 fsck_err: 969 return ret; 970 } 971 972 struct journal_read_buf { 973 void *data; 974 size_t size; 975 }; 976 977 static int journal_read_buf_realloc(struct journal_read_buf *b, 978 size_t new_size) 979 { 980 void *n; 981 982 /* the bios are sized for this many pages, max: */ 983 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 984 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 985 986 new_size = roundup_pow_of_two(new_size); 987 n = kvmalloc(new_size, GFP_KERNEL); 988 if (!n) 989 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 990 991 kvfree(b->data); 992 b->data = n; 993 b->size = new_size; 994 return 0; 995 } 996 997 static int journal_read_bucket(struct bch_dev *ca, 998 struct journal_read_buf *buf, 999 struct journal_list *jlist, 1000 unsigned bucket) 1001 { 1002 struct bch_fs *c = ca->fs; 1003 struct journal_device *ja = &ca->journal; 1004 struct jset *j = NULL; 1005 unsigned sectors, sectors_read = 0; 1006 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 1007 end = offset + ca->mi.bucket_size; 1008 bool saw_bad = false, csum_good; 1009 struct printbuf err = PRINTBUF; 1010 int ret = 0; 1011 1012 pr_debug("reading %u", bucket); 1013 1014 while (offset < end) { 1015 if (!sectors_read) { 1016 struct bio *bio; 1017 unsigned nr_bvecs; 1018 reread: 1019 sectors_read = min_t(unsigned, 1020 end - offset, buf->size >> 9); 1021 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 1022 1023 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 1024 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 1025 1026 bio->bi_iter.bi_sector = offset; 1027 bch2_bio_map(bio, buf->data, sectors_read << 9); 1028 1029 ret = submit_bio_wait(bio); 1030 kfree(bio); 1031 1032 if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, 1033 "journal read error: sector %llu", 1034 offset) || 1035 bch2_meta_read_fault("journal")) { 1036 /* 1037 * We don't error out of the recovery process 1038 * here, since the relevant journal entry may be 1039 * found on a different device, and missing or 1040 * no journal entries will be handled later 1041 */ 1042 goto out; 1043 } 1044 1045 j = buf->data; 1046 } 1047 1048 ret = jset_validate_early(c, ca, j, offset, 1049 end - offset, sectors_read); 1050 switch (ret) { 1051 case 0: 1052 sectors = vstruct_sectors(j, c->block_bits); 1053 break; 1054 case JOURNAL_ENTRY_REREAD: 1055 if (vstruct_bytes(j) > buf->size) { 1056 ret = journal_read_buf_realloc(buf, 1057 vstruct_bytes(j)); 1058 if (ret) 1059 goto err; 1060 } 1061 goto reread; 1062 case JOURNAL_ENTRY_NONE: 1063 if (!saw_bad) 1064 goto out; 1065 /* 1066 * On checksum error we don't really trust the size 1067 * field of the journal entry we read, so try reading 1068 * again at next block boundary: 1069 */ 1070 sectors = block_sectors(c); 1071 goto next_block; 1072 default: 1073 goto err; 1074 } 1075 1076 if (le64_to_cpu(j->seq) > ja->highest_seq_found) { 1077 ja->highest_seq_found = le64_to_cpu(j->seq); 1078 ja->cur_idx = bucket; 1079 ja->sectors_free = ca->mi.bucket_size - 1080 bucket_remainder(ca, offset) - sectors; 1081 } 1082 1083 /* 1084 * This happens sometimes if we don't have discards on - 1085 * when we've partially overwritten a bucket with new 1086 * journal entries. We don't need the rest of the 1087 * bucket: 1088 */ 1089 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 1090 goto out; 1091 1092 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 1093 1094 enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); 1095 struct bch_csum csum; 1096 csum_good = jset_csum_good(c, j, &csum); 1097 1098 if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, 1099 "%s", 1100 (printbuf_reset(&err), 1101 prt_str(&err, "journal "), 1102 bch2_csum_err_msg(&err, csum_type, j->csum, csum), 1103 err.buf))) 1104 saw_bad = true; 1105 1106 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 1107 j->encrypted_start, 1108 vstruct_end(j) - (void *) j->encrypted_start); 1109 bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); 1110 1111 mutex_lock(&jlist->lock); 1112 ret = journal_entry_add(c, ca, (struct journal_ptr) { 1113 .csum_good = csum_good, 1114 .dev = ca->dev_idx, 1115 .bucket = bucket, 1116 .bucket_offset = offset - 1117 bucket_to_sector(ca, ja->buckets[bucket]), 1118 .sector = offset, 1119 }, jlist, j); 1120 mutex_unlock(&jlist->lock); 1121 1122 switch (ret) { 1123 case JOURNAL_ENTRY_ADD_OK: 1124 break; 1125 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 1126 break; 1127 default: 1128 goto err; 1129 } 1130 next_block: 1131 pr_debug("next"); 1132 offset += sectors; 1133 sectors_read -= sectors; 1134 j = ((void *) j) + (sectors << 9); 1135 } 1136 1137 out: 1138 ret = 0; 1139 err: 1140 printbuf_exit(&err); 1141 return ret; 1142 } 1143 1144 static CLOSURE_CALLBACK(bch2_journal_read_device) 1145 { 1146 closure_type(ja, struct journal_device, read); 1147 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1148 struct bch_fs *c = ca->fs; 1149 struct journal_list *jlist = 1150 container_of(cl->parent, struct journal_list, cl); 1151 struct journal_read_buf buf = { NULL, 0 }; 1152 unsigned i; 1153 int ret = 0; 1154 1155 if (!ja->nr) 1156 goto out; 1157 1158 ret = journal_read_buf_realloc(&buf, PAGE_SIZE); 1159 if (ret) 1160 goto err; 1161 1162 pr_debug("%u journal buckets", ja->nr); 1163 1164 for (i = 0; i < ja->nr; i++) { 1165 ret = journal_read_bucket(ca, &buf, jlist, i); 1166 if (ret) 1167 goto err; 1168 } 1169 1170 /* 1171 * Set dirty_idx to indicate the entire journal is full and needs to be 1172 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1173 * pinned when it first runs: 1174 */ 1175 ja->discard_idx = ja->dirty_idx_ondisk = 1176 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1177 out: 1178 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1179 kvfree(buf.data); 1180 percpu_ref_put(&ca->io_ref); 1181 closure_return(cl); 1182 return; 1183 err: 1184 mutex_lock(&jlist->lock); 1185 jlist->ret = ret; 1186 mutex_unlock(&jlist->lock); 1187 goto out; 1188 } 1189 1190 int bch2_journal_read(struct bch_fs *c, 1191 u64 *last_seq, 1192 u64 *blacklist_seq, 1193 u64 *start_seq) 1194 { 1195 struct journal_list jlist; 1196 struct journal_replay *i, **_i, *prev = NULL; 1197 struct genradix_iter radix_iter; 1198 struct printbuf buf = PRINTBUF; 1199 bool degraded = false, last_write_torn = false; 1200 u64 seq; 1201 int ret = 0; 1202 1203 closure_init_stack(&jlist.cl); 1204 mutex_init(&jlist.lock); 1205 jlist.last_seq = 0; 1206 jlist.ret = 0; 1207 1208 for_each_member_device(c, ca) { 1209 if (!c->opts.fsck && 1210 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1211 continue; 1212 1213 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1214 ca->mi.state == BCH_MEMBER_STATE_ro) && 1215 percpu_ref_tryget(&ca->io_ref)) 1216 closure_call(&ca->journal.read, 1217 bch2_journal_read_device, 1218 system_unbound_wq, 1219 &jlist.cl); 1220 else 1221 degraded = true; 1222 } 1223 1224 closure_sync(&jlist.cl); 1225 1226 if (jlist.ret) 1227 return jlist.ret; 1228 1229 *last_seq = 0; 1230 *start_seq = 0; 1231 *blacklist_seq = 0; 1232 1233 /* 1234 * Find most recent flush entry, and ignore newer non flush entries - 1235 * those entries will be blacklisted: 1236 */ 1237 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1238 enum bch_validate_flags flags = BCH_VALIDATE_journal; 1239 1240 i = *_i; 1241 1242 if (journal_replay_ignore(i)) 1243 continue; 1244 1245 if (!*start_seq) 1246 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1247 1248 if (JSET_NO_FLUSH(&i->j)) { 1249 i->ignore_blacklisted = true; 1250 continue; 1251 } 1252 1253 if (!last_write_torn && !i->csum_good) { 1254 last_write_torn = true; 1255 i->ignore_blacklisted = true; 1256 continue; 1257 } 1258 1259 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1260 c, le32_to_cpu(i->j.version), &i->j, NULL, 1261 jset_last_seq_newer_than_seq, 1262 "invalid journal entry: last_seq > seq (%llu > %llu)", 1263 le64_to_cpu(i->j.last_seq), 1264 le64_to_cpu(i->j.seq))) 1265 i->j.last_seq = i->j.seq; 1266 1267 *last_seq = le64_to_cpu(i->j.last_seq); 1268 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1269 break; 1270 } 1271 1272 if (!*start_seq) { 1273 bch_info(c, "journal read done, but no entries found"); 1274 return 0; 1275 } 1276 1277 if (!*last_seq) { 1278 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, 1279 "journal read done, but no entries found after dropping non-flushes"); 1280 return 0; 1281 } 1282 1283 bch_info(c, "journal read done, replaying entries %llu-%llu", 1284 *last_seq, *blacklist_seq - 1); 1285 1286 if (*start_seq != *blacklist_seq) 1287 bch_info(c, "dropped unflushed entries %llu-%llu", 1288 *blacklist_seq, *start_seq - 1); 1289 1290 /* Drop blacklisted entries and entries older than last_seq: */ 1291 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1292 i = *_i; 1293 1294 if (journal_replay_ignore(i)) 1295 continue; 1296 1297 seq = le64_to_cpu(i->j.seq); 1298 if (seq < *last_seq) { 1299 journal_replay_free(c, i, false); 1300 continue; 1301 } 1302 1303 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1304 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1305 jset_seq_blacklisted, 1306 "found blacklisted journal entry %llu", seq); 1307 i->ignore_blacklisted = true; 1308 } 1309 } 1310 1311 /* Check for missing entries: */ 1312 seq = *last_seq; 1313 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1314 i = *_i; 1315 1316 if (journal_replay_ignore(i)) 1317 continue; 1318 1319 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1320 1321 while (seq < le64_to_cpu(i->j.seq)) { 1322 u64 missing_start, missing_end; 1323 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 1324 1325 while (seq < le64_to_cpu(i->j.seq) && 1326 bch2_journal_seq_is_blacklisted(c, seq, false)) 1327 seq++; 1328 1329 if (seq == le64_to_cpu(i->j.seq)) 1330 break; 1331 1332 missing_start = seq; 1333 1334 while (seq < le64_to_cpu(i->j.seq) && 1335 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1336 seq++; 1337 1338 if (prev) { 1339 bch2_journal_ptrs_to_text(&buf1, c, prev); 1340 prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1341 } else 1342 prt_printf(&buf1, "(none)"); 1343 bch2_journal_ptrs_to_text(&buf2, c, i); 1344 1345 missing_end = seq - 1; 1346 fsck_err(c, journal_entries_missing, 1347 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" 1348 " prev at %s\n" 1349 " next at %s, continue?", 1350 missing_start, missing_end, 1351 *last_seq, *blacklist_seq - 1, 1352 buf1.buf, buf2.buf); 1353 1354 printbuf_exit(&buf1); 1355 printbuf_exit(&buf2); 1356 } 1357 1358 prev = i; 1359 seq++; 1360 } 1361 1362 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1363 struct bch_replicas_padded replicas = { 1364 .e.data_type = BCH_DATA_journal, 1365 .e.nr_required = 1, 1366 }; 1367 1368 i = *_i; 1369 if (journal_replay_ignore(i)) 1370 continue; 1371 1372 darray_for_each(i->ptrs, ptr) { 1373 struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); 1374 1375 if (!ptr->csum_good) 1376 bch_err_dev_offset(ca, ptr->sector, 1377 "invalid journal checksum, seq %llu%s", 1378 le64_to_cpu(i->j.seq), 1379 i->csum_good ? " (had good copy on another device)" : ""); 1380 } 1381 1382 ret = jset_validate(c, 1383 bch2_dev_have_ref(c, i->ptrs.data[0].dev), 1384 &i->j, 1385 i->ptrs.data[0].sector, 1386 READ); 1387 if (ret) 1388 goto err; 1389 1390 darray_for_each(i->ptrs, ptr) 1391 replicas.e.devs[replicas.e.nr_devs++] = ptr->dev; 1392 1393 bch2_replicas_entry_sort(&replicas.e); 1394 1395 printbuf_reset(&buf); 1396 bch2_replicas_entry_to_text(&buf, &replicas.e); 1397 1398 if (!degraded && 1399 !bch2_replicas_marked(c, &replicas.e) && 1400 (le64_to_cpu(i->j.seq) == *last_seq || 1401 fsck_err(c, journal_entry_replicas_not_marked, 1402 "superblock not marked as containing replicas for journal entry %llu\n %s", 1403 le64_to_cpu(i->j.seq), buf.buf))) { 1404 ret = bch2_mark_replicas(c, &replicas.e); 1405 if (ret) 1406 goto err; 1407 } 1408 } 1409 err: 1410 fsck_err: 1411 printbuf_exit(&buf); 1412 return ret; 1413 } 1414 1415 /* journal write: */ 1416 1417 static void __journal_write_alloc(struct journal *j, 1418 struct journal_buf *w, 1419 struct dev_alloc_list *devs_sorted, 1420 unsigned sectors, 1421 unsigned *replicas, 1422 unsigned replicas_want) 1423 { 1424 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1425 struct journal_device *ja; 1426 struct bch_dev *ca; 1427 unsigned i; 1428 1429 if (*replicas >= replicas_want) 1430 return; 1431 1432 for (i = 0; i < devs_sorted->nr; i++) { 1433 ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); 1434 if (!ca) 1435 continue; 1436 1437 ja = &ca->journal; 1438 1439 /* 1440 * Check that we can use this device, and aren't already using 1441 * it: 1442 */ 1443 if (!ca->mi.durability || 1444 ca->mi.state != BCH_MEMBER_STATE_rw || 1445 !ja->nr || 1446 bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1447 sectors > ja->sectors_free) 1448 continue; 1449 1450 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1451 1452 bch2_bkey_append_ptr(&w->key, 1453 (struct bch_extent_ptr) { 1454 .offset = bucket_to_sector(ca, 1455 ja->buckets[ja->cur_idx]) + 1456 ca->mi.bucket_size - 1457 ja->sectors_free, 1458 .dev = ca->dev_idx, 1459 }); 1460 1461 ja->sectors_free -= sectors; 1462 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1463 1464 *replicas += ca->mi.durability; 1465 1466 if (*replicas >= replicas_want) 1467 break; 1468 } 1469 } 1470 1471 /** 1472 * journal_write_alloc - decide where to write next journal entry 1473 * 1474 * @j: journal object 1475 * @w: journal buf (entry to be written) 1476 * 1477 * Returns: 0 on success, or -EROFS on failure 1478 */ 1479 static int journal_write_alloc(struct journal *j, struct journal_buf *w) 1480 { 1481 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1482 struct bch_devs_mask devs; 1483 struct journal_device *ja; 1484 struct bch_dev *ca; 1485 struct dev_alloc_list devs_sorted; 1486 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1487 unsigned target = c->opts.metadata_target ?: 1488 c->opts.foreground_target; 1489 unsigned i, replicas = 0, replicas_want = 1490 READ_ONCE(c->opts.metadata_replicas); 1491 unsigned replicas_need = min_t(unsigned, replicas_want, 1492 READ_ONCE(c->opts.metadata_replicas_required)); 1493 1494 rcu_read_lock(); 1495 retry: 1496 devs = target_rw_devs(c, BCH_DATA_journal, target); 1497 1498 devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); 1499 1500 __journal_write_alloc(j, w, &devs_sorted, 1501 sectors, &replicas, replicas_want); 1502 1503 if (replicas >= replicas_want) 1504 goto done; 1505 1506 for (i = 0; i < devs_sorted.nr; i++) { 1507 ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); 1508 if (!ca) 1509 continue; 1510 1511 ja = &ca->journal; 1512 1513 if (sectors > ja->sectors_free && 1514 sectors <= ca->mi.bucket_size && 1515 bch2_journal_dev_buckets_available(j, ja, 1516 journal_space_discarded)) { 1517 ja->cur_idx = (ja->cur_idx + 1) % ja->nr; 1518 ja->sectors_free = ca->mi.bucket_size; 1519 1520 /* 1521 * ja->bucket_seq[ja->cur_idx] must always have 1522 * something sensible: 1523 */ 1524 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1525 } 1526 } 1527 1528 __journal_write_alloc(j, w, &devs_sorted, 1529 sectors, &replicas, replicas_want); 1530 1531 if (replicas < replicas_want && target) { 1532 /* Retry from all devices: */ 1533 target = 0; 1534 goto retry; 1535 } 1536 done: 1537 rcu_read_unlock(); 1538 1539 BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); 1540 1541 return replicas >= replicas_need ? 0 : -EROFS; 1542 } 1543 1544 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1545 { 1546 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1547 1548 /* we aren't holding j->lock: */ 1549 unsigned new_size = READ_ONCE(j->buf_size_want); 1550 void *new_buf; 1551 1552 if (buf->buf_size >= new_size) 1553 return; 1554 1555 size_t btree_write_buffer_size = new_size / 64; 1556 1557 if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) 1558 return; 1559 1560 new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1561 if (!new_buf) 1562 return; 1563 1564 memcpy(new_buf, buf->data, buf->buf_size); 1565 1566 spin_lock(&j->lock); 1567 swap(buf->data, new_buf); 1568 swap(buf->buf_size, new_size); 1569 spin_unlock(&j->lock); 1570 1571 kvfree(new_buf); 1572 } 1573 1574 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) 1575 { 1576 return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); 1577 } 1578 1579 static CLOSURE_CALLBACK(journal_write_done) 1580 { 1581 closure_type(w, struct journal_buf, io); 1582 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1583 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1584 struct bch_replicas_padded replicas; 1585 union journal_res_state old, new; 1586 u64 v, seq = le64_to_cpu(w->data->seq); 1587 int err = 0; 1588 1589 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) 1590 ? j->flush_write_time 1591 : j->noflush_write_time, j->write_start_time); 1592 1593 if (!w->devs_written.nr) { 1594 bch_err(c, "unable to write journal to sufficient devices"); 1595 err = -EIO; 1596 } else { 1597 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1598 w->devs_written); 1599 if (bch2_mark_replicas(c, &replicas.e)) 1600 err = -EIO; 1601 } 1602 1603 if (err) 1604 bch2_fatal_error(c); 1605 1606 closure_debug_destroy(cl); 1607 1608 spin_lock(&j->lock); 1609 if (seq >= j->pin.front) 1610 journal_seq_pin(j, seq)->devs = w->devs_written; 1611 if (err && (!j->err_seq || seq < j->err_seq)) 1612 j->err_seq = seq; 1613 w->write_done = true; 1614 1615 bool completed = false; 1616 1617 for (seq = journal_last_unwritten_seq(j); 1618 seq <= journal_cur_seq(j); 1619 seq++) { 1620 w = j->buf + (seq & JOURNAL_BUF_MASK); 1621 if (!w->write_done) 1622 break; 1623 1624 if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { 1625 j->flushed_seq_ondisk = seq; 1626 j->last_seq_ondisk = w->last_seq; 1627 1628 bch2_do_discards(c); 1629 closure_wake_up(&c->freelist_wait); 1630 bch2_reset_alloc_cursors(c); 1631 } 1632 1633 j->seq_ondisk = seq; 1634 1635 /* 1636 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1637 * more buckets: 1638 * 1639 * Must come before signaling write completion, for 1640 * bch2_fs_journal_stop(): 1641 */ 1642 if (j->watermark != BCH_WATERMARK_stripe) 1643 journal_reclaim_kick(&c->journal); 1644 1645 v = atomic64_read(&j->reservations.counter); 1646 do { 1647 old.v = new.v = v; 1648 BUG_ON(journal_state_count(new, new.unwritten_idx)); 1649 BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); 1650 1651 new.unwritten_idx++; 1652 } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); 1653 1654 closure_wake_up(&w->wait); 1655 completed = true; 1656 } 1657 1658 if (completed) { 1659 bch2_journal_reclaim_fast(j); 1660 bch2_journal_space_available(j); 1661 1662 track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); 1663 1664 journal_wake(j); 1665 } 1666 1667 if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1668 new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1669 struct journal_buf *buf = journal_cur_buf(j); 1670 long delta = buf->expires - jiffies; 1671 1672 /* 1673 * We don't close a journal entry to write it while there's 1674 * previous entries still in flight - the current journal entry 1675 * might want to be written now: 1676 */ 1677 mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); 1678 } 1679 1680 spin_unlock(&j->lock); 1681 } 1682 1683 static void journal_write_endio(struct bio *bio) 1684 { 1685 struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); 1686 struct bch_dev *ca = jbio->ca; 1687 struct journal *j = &ca->fs->journal; 1688 struct journal_buf *w = j->buf + jbio->buf_idx; 1689 1690 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, 1691 "error writing journal entry %llu: %s", 1692 le64_to_cpu(w->data->seq), 1693 bch2_blk_status_to_str(bio->bi_status)) || 1694 bch2_meta_write_fault("journal")) { 1695 unsigned long flags; 1696 1697 spin_lock_irqsave(&j->err_lock, flags); 1698 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); 1699 spin_unlock_irqrestore(&j->err_lock, flags); 1700 } 1701 1702 closure_put(&w->io); 1703 percpu_ref_put(&ca->io_ref); 1704 } 1705 1706 static CLOSURE_CALLBACK(journal_write_submit) 1707 { 1708 closure_type(w, struct journal_buf, io); 1709 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1710 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1711 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1712 1713 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { 1714 struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); 1715 if (!ca) { 1716 /* XXX: fix this */ 1717 bch_err(c, "missing device for journal write\n"); 1718 continue; 1719 } 1720 1721 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], 1722 sectors); 1723 1724 struct journal_device *ja = &ca->journal; 1725 struct bio *bio = &ja->bio[w->idx]->bio; 1726 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1727 bio->bi_iter.bi_sector = ptr->offset; 1728 bio->bi_end_io = journal_write_endio; 1729 bio->bi_private = ca; 1730 1731 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1732 ca->prev_journal_sector = bio->bi_iter.bi_sector; 1733 1734 if (!JSET_NO_FLUSH(w->data)) 1735 bio->bi_opf |= REQ_FUA; 1736 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) 1737 bio->bi_opf |= REQ_PREFLUSH; 1738 1739 bch2_bio_map(bio, w->data, sectors << 9); 1740 1741 trace_and_count(c, journal_write, bio); 1742 closure_bio_submit(bio, cl); 1743 1744 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1745 } 1746 1747 continue_at(cl, journal_write_done, j->wq); 1748 } 1749 1750 static CLOSURE_CALLBACK(journal_write_preflush) 1751 { 1752 closure_type(w, struct journal_buf, io); 1753 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1754 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1755 1756 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1757 spin_lock(&j->lock); 1758 closure_wait(&j->async_wait, cl); 1759 spin_unlock(&j->lock); 1760 1761 continue_at(cl, journal_write_preflush, j->wq); 1762 return; 1763 } 1764 1765 if (w->separate_flush) { 1766 for_each_rw_member(c, ca) { 1767 percpu_ref_get(&ca->io_ref); 1768 1769 struct journal_device *ja = &ca->journal; 1770 struct bio *bio = &ja->bio[w->idx]->bio; 1771 bio_reset(bio, ca->disk_sb.bdev, 1772 REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); 1773 bio->bi_end_io = journal_write_endio; 1774 bio->bi_private = ca; 1775 closure_bio_submit(bio, cl); 1776 } 1777 1778 continue_at(cl, journal_write_submit, j->wq); 1779 } else { 1780 /* 1781 * no need to punt to another work item if we're not waiting on 1782 * preflushes 1783 */ 1784 journal_write_submit(&cl->work); 1785 } 1786 } 1787 1788 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) 1789 { 1790 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1791 struct jset_entry *start, *end; 1792 struct jset *jset = w->data; 1793 struct journal_keys_to_wb wb = { NULL }; 1794 unsigned sectors, bytes, u64s; 1795 unsigned long btree_roots_have = 0; 1796 bool validate_before_checksum = false; 1797 u64 seq = le64_to_cpu(jset->seq); 1798 int ret; 1799 1800 /* 1801 * Simple compaction, dropping empty jset_entries (from journal 1802 * reservations that weren't fully used) and merging jset_entries that 1803 * can be. 1804 * 1805 * If we wanted to be really fancy here, we could sort all the keys in 1806 * the jset and drop keys that were overwritten - probably not worth it: 1807 */ 1808 vstruct_for_each(jset, i) { 1809 unsigned u64s = le16_to_cpu(i->u64s); 1810 1811 /* Empty entry: */ 1812 if (!u64s) 1813 continue; 1814 1815 /* 1816 * New btree roots are set by journalling them; when the journal 1817 * entry gets written we have to propagate them to 1818 * c->btree_roots 1819 * 1820 * But, every journal entry we write has to contain all the 1821 * btree roots (at least for now); so after we copy btree roots 1822 * to c->btree_roots we have to get any missing btree roots and 1823 * add them to this journal entry: 1824 */ 1825 switch (i->type) { 1826 case BCH_JSET_ENTRY_btree_root: 1827 bch2_journal_entry_to_btree_root(c, i); 1828 __set_bit(i->btree_id, &btree_roots_have); 1829 break; 1830 case BCH_JSET_ENTRY_write_buffer_keys: 1831 EBUG_ON(!w->need_flush_to_write_buffer); 1832 1833 if (!wb.wb) 1834 bch2_journal_keys_to_write_buffer_start(c, &wb, seq); 1835 1836 jset_entry_for_each_key(i, k) { 1837 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); 1838 if (ret) { 1839 bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", 1840 bch2_err_str(ret)); 1841 bch2_journal_keys_to_write_buffer_end(c, &wb); 1842 return ret; 1843 } 1844 } 1845 i->type = BCH_JSET_ENTRY_btree_keys; 1846 break; 1847 } 1848 } 1849 1850 if (wb.wb) 1851 bch2_journal_keys_to_write_buffer_end(c, &wb); 1852 1853 spin_lock(&c->journal.lock); 1854 w->need_flush_to_write_buffer = false; 1855 spin_unlock(&c->journal.lock); 1856 1857 start = end = vstruct_last(jset); 1858 1859 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1860 1861 struct jset_entry_datetime *d = 1862 container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); 1863 d->entry.type = BCH_JSET_ENTRY_datetime; 1864 d->seconds = cpu_to_le64(ktime_get_real_seconds()); 1865 1866 bch2_journal_super_entries_add_common(c, &end, seq); 1867 u64s = (u64 *) end - (u64 *) start; 1868 1869 WARN_ON(u64s > j->entry_u64s_reserved); 1870 1871 le32_add_cpu(&jset->u64s, u64s); 1872 1873 sectors = vstruct_sectors(jset, c->block_bits); 1874 bytes = vstruct_bytes(jset); 1875 1876 if (sectors > w->sectors) { 1877 bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 1878 vstruct_bytes(jset), w->sectors << 9, 1879 u64s, w->u64s_reserved, j->entry_u64s_reserved); 1880 return -EINVAL; 1881 } 1882 1883 jset->magic = cpu_to_le64(jset_magic(c)); 1884 jset->version = cpu_to_le32(c->sb.version); 1885 1886 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 1887 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1888 1889 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1890 j->last_empty_seq = seq; 1891 1892 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1893 validate_before_checksum = true; 1894 1895 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 1896 validate_before_checksum = true; 1897 1898 if (validate_before_checksum && 1899 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1900 return ret; 1901 1902 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), 1903 jset->encrypted_start, 1904 vstruct_end(jset) - (void *) jset->encrypted_start); 1905 if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) 1906 return ret; 1907 1908 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), 1909 journal_nonce(jset), jset); 1910 1911 if (!validate_before_checksum && 1912 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1913 return ret; 1914 1915 memset((void *) jset + bytes, 0, (sectors << 9) - bytes); 1916 return 0; 1917 } 1918 1919 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) 1920 { 1921 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1922 int error = bch2_journal_error(j); 1923 1924 /* 1925 * If the journal is in an error state - we did an emergency shutdown - 1926 * we prefer to continue doing journal writes. We just mark them as 1927 * noflush so they'll never be used, but they'll still be visible by the 1928 * list_journal tool - this helps in debugging. 1929 * 1930 * There's a caveat: the first journal write after marking the 1931 * superblock dirty must always be a flush write, because on startup 1932 * from a clean shutdown we didn't necessarily read the journal and the 1933 * new journal write might overwrite whatever was in the journal 1934 * previously - we can't leave the journal without any flush writes in 1935 * it. 1936 * 1937 * So if we're in an error state, and we're still starting up, we don't 1938 * write anything at all. 1939 */ 1940 if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) 1941 return -EIO; 1942 1943 if (error || 1944 w->noflush || 1945 (!w->must_flush && 1946 (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && 1947 test_bit(JOURNAL_may_skip_flush, &j->flags))) { 1948 w->noflush = true; 1949 SET_JSET_NO_FLUSH(w->data, true); 1950 w->data->last_seq = 0; 1951 w->last_seq = 0; 1952 1953 j->nr_noflush_writes++; 1954 } else { 1955 w->must_flush = true; 1956 j->last_flush_write = jiffies; 1957 j->nr_flush_writes++; 1958 clear_bit(JOURNAL_need_flush_write, &j->flags); 1959 } 1960 1961 return 0; 1962 } 1963 1964 CLOSURE_CALLBACK(bch2_journal_write) 1965 { 1966 closure_type(w, struct journal_buf, io); 1967 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1968 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1969 struct bch_replicas_padded replicas; 1970 struct printbuf journal_debug_buf = PRINTBUF; 1971 unsigned nr_rw_members = 0; 1972 int ret; 1973 1974 for_each_rw_member(c, ca) 1975 nr_rw_members++; 1976 1977 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 1978 BUG_ON(!w->write_started); 1979 BUG_ON(w->write_allocated); 1980 BUG_ON(w->write_done); 1981 1982 j->write_start_time = local_clock(); 1983 1984 spin_lock(&j->lock); 1985 if (nr_rw_members > 1) 1986 w->separate_flush = true; 1987 1988 ret = bch2_journal_write_pick_flush(j, w); 1989 spin_unlock(&j->lock); 1990 if (ret) 1991 goto err; 1992 1993 mutex_lock(&j->buf_lock); 1994 journal_buf_realloc(j, w); 1995 1996 ret = bch2_journal_write_prep(j, w); 1997 mutex_unlock(&j->buf_lock); 1998 if (ret) 1999 goto err; 2000 2001 j->entry_bytes_written += vstruct_bytes(w->data); 2002 2003 while (1) { 2004 spin_lock(&j->lock); 2005 ret = journal_write_alloc(j, w); 2006 if (!ret || !j->can_discard) 2007 break; 2008 2009 spin_unlock(&j->lock); 2010 bch2_journal_do_discards(j); 2011 } 2012 2013 if (ret) { 2014 __bch2_journal_debug_to_text(&journal_debug_buf, j); 2015 spin_unlock(&j->lock); 2016 bch_err(c, "Unable to allocate journal write:\n%s", 2017 journal_debug_buf.buf); 2018 printbuf_exit(&journal_debug_buf); 2019 goto err; 2020 } 2021 2022 /* 2023 * write is allocated, no longer need to account for it in 2024 * bch2_journal_space_available(): 2025 */ 2026 w->sectors = 0; 2027 w->write_allocated = true; 2028 2029 /* 2030 * journal entry has been compacted and allocated, recalculate space 2031 * available: 2032 */ 2033 bch2_journal_space_available(j); 2034 bch2_journal_do_writes(j); 2035 spin_unlock(&j->lock); 2036 2037 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); 2038 2039 if (c->opts.nochanges) 2040 goto no_io; 2041 2042 /* 2043 * Mark journal replicas before we submit the write to guarantee 2044 * recovery will find the journal entries after a crash. 2045 */ 2046 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 2047 w->devs_written); 2048 ret = bch2_mark_replicas(c, &replicas.e); 2049 if (ret) 2050 goto err; 2051 2052 if (!JSET_NO_FLUSH(w->data)) 2053 continue_at(cl, journal_write_preflush, j->wq); 2054 else 2055 continue_at(cl, journal_write_submit, j->wq); 2056 return; 2057 no_io: 2058 continue_at(cl, journal_write_done, j->wq); 2059 return; 2060 err: 2061 bch2_fatal_error(c); 2062 continue_at(cl, journal_write_done, j->wq); 2063 } 2064