1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "btree_io.h" 6 #include "btree_update_interior.h" 7 #include "btree_write_buffer.h" 8 #include "buckets.h" 9 #include "checksum.h" 10 #include "disk_groups.h" 11 #include "error.h" 12 #include "journal.h" 13 #include "journal_io.h" 14 #include "journal_reclaim.h" 15 #include "journal_seq_blacklist.h" 16 #include "replicas.h" 17 #include "sb-clean.h" 18 #include "trace.h" 19 20 void bch2_journal_pos_from_member_info_set(struct bch_fs *c) 21 { 22 lockdep_assert_held(&c->sb_lock); 23 24 for_each_member_device(c, ca) { 25 struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); 26 27 m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); 28 m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); 29 } 30 } 31 32 void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) 33 { 34 mutex_lock(&c->sb_lock); 35 for_each_member_device(c, ca) { 36 struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); 37 38 unsigned idx = le32_to_cpu(m.last_journal_bucket); 39 if (idx < ca->journal.nr) 40 ca->journal.cur_idx = idx; 41 unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); 42 if (offset <= ca->mi.bucket_size) 43 ca->journal.sectors_free = ca->mi.bucket_size - offset; 44 } 45 mutex_unlock(&c->sb_lock); 46 } 47 48 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 49 struct journal_replay *j) 50 { 51 darray_for_each(j->ptrs, i) { 52 if (i != j->ptrs.data) 53 prt_printf(out, " "); 54 prt_printf(out, "%u:%u:%u (sector %llu)", 55 i->dev, i->bucket, i->bucket_offset, i->sector); 56 } 57 } 58 59 static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, 60 struct journal_replay *j) 61 { 62 prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); 63 64 bch2_journal_ptrs_to_text(out, c, j); 65 66 for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { 67 struct jset_entry_datetime *datetime = 68 container_of(entry, struct jset_entry_datetime, entry); 69 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 70 break; 71 } 72 } 73 74 static struct nonce journal_nonce(const struct jset *jset) 75 { 76 return (struct nonce) {{ 77 [0] = 0, 78 [1] = ((__le32 *) &jset->seq)[0], 79 [2] = ((__le32 *) &jset->seq)[1], 80 [3] = BCH_NONCE_JOURNAL, 81 }}; 82 } 83 84 static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) 85 { 86 if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { 87 *csum = (struct bch_csum) {}; 88 return false; 89 } 90 91 *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); 92 return !bch2_crc_cmp(j->csum, *csum); 93 } 94 95 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) 96 { 97 return (seq - c->journal_entries_base_seq) & (~0U >> 1); 98 } 99 100 static void __journal_replay_free(struct bch_fs *c, 101 struct journal_replay *i) 102 { 103 struct journal_replay **p = 104 genradix_ptr(&c->journal_entries, 105 journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); 106 107 BUG_ON(*p != i); 108 *p = NULL; 109 kvfree(i); 110 } 111 112 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) 113 { 114 if (blacklisted) 115 i->ignore_blacklisted = true; 116 else 117 i->ignore_not_dirty = true; 118 119 if (!c->opts.read_entire_journal) 120 __journal_replay_free(c, i); 121 } 122 123 struct journal_list { 124 struct closure cl; 125 u64 last_seq; 126 struct mutex lock; 127 int ret; 128 }; 129 130 #define JOURNAL_ENTRY_ADD_OK 0 131 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 132 133 /* 134 * Given a journal entry we just read, add it to the list of journal entries to 135 * be replayed: 136 */ 137 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, 138 struct journal_ptr entry_ptr, 139 struct journal_list *jlist, struct jset *j) 140 { 141 struct genradix_iter iter; 142 struct journal_replay **_i, *i, *dup; 143 size_t bytes = vstruct_bytes(j); 144 u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; 145 struct printbuf buf = PRINTBUF; 146 int ret = JOURNAL_ENTRY_ADD_OK; 147 148 if (!c->journal.oldest_seq_found_ondisk || 149 le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) 150 c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); 151 152 /* Is this entry older than the range we need? */ 153 if (!c->opts.read_entire_journal && 154 le64_to_cpu(j->seq) < jlist->last_seq) 155 return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; 156 157 /* 158 * genradixes are indexed by a ulong, not a u64, so we can't index them 159 * by sequence number directly: Assume instead that they will all fall 160 * within the range of +-2billion of the filrst one we find. 161 */ 162 if (!c->journal_entries_base_seq) 163 c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); 164 165 /* Drop entries we don't need anymore */ 166 if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { 167 genradix_for_each_from(&c->journal_entries, iter, _i, 168 journal_entry_radix_idx(c, jlist->last_seq)) { 169 i = *_i; 170 171 if (journal_replay_ignore(i)) 172 continue; 173 174 if (le64_to_cpu(i->j.seq) >= last_seq) 175 break; 176 177 journal_replay_free(c, i, false); 178 } 179 } 180 181 jlist->last_seq = max(jlist->last_seq, last_seq); 182 183 _i = genradix_ptr_alloc(&c->journal_entries, 184 journal_entry_radix_idx(c, le64_to_cpu(j->seq)), 185 GFP_KERNEL); 186 if (!_i) 187 return -BCH_ERR_ENOMEM_journal_entry_add; 188 189 /* 190 * Duplicate journal entries? If so we want the one that didn't have a 191 * checksum error: 192 */ 193 dup = *_i; 194 if (dup) { 195 bool identical = bytes == vstruct_bytes(&dup->j) && 196 !memcmp(j, &dup->j, bytes); 197 bool not_identical = !identical && 198 entry_ptr.csum_good && 199 dup->csum_good; 200 201 bool same_device = false; 202 darray_for_each(dup->ptrs, ptr) 203 if (ptr->dev == ca->dev_idx) 204 same_device = true; 205 206 ret = darray_push(&dup->ptrs, entry_ptr); 207 if (ret) 208 goto out; 209 210 bch2_journal_replay_to_text(&buf, c, dup); 211 212 fsck_err_on(same_device, 213 c, journal_entry_dup_same_device, 214 "duplicate journal entry on same device\n %s", 215 buf.buf); 216 217 fsck_err_on(not_identical, 218 c, journal_entry_replicas_data_mismatch, 219 "found duplicate but non identical journal entries\n %s", 220 buf.buf); 221 222 if (entry_ptr.csum_good && !identical) 223 goto replace; 224 225 goto out; 226 } 227 replace: 228 i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 229 if (!i) 230 return -BCH_ERR_ENOMEM_journal_entry_add; 231 232 darray_init(&i->ptrs); 233 i->csum_good = entry_ptr.csum_good; 234 i->ignore_blacklisted = false; 235 i->ignore_not_dirty = false; 236 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 237 238 if (dup) { 239 /* The first ptr should represent the jset we kept: */ 240 darray_for_each(dup->ptrs, ptr) 241 darray_push(&i->ptrs, *ptr); 242 __journal_replay_free(c, dup); 243 } else { 244 darray_push(&i->ptrs, entry_ptr); 245 } 246 247 *_i = i; 248 out: 249 fsck_err: 250 printbuf_exit(&buf); 251 return ret; 252 } 253 254 /* this fills in a range with empty jset_entries: */ 255 static void journal_entry_null_range(void *start, void *end) 256 { 257 struct jset_entry *entry; 258 259 for (entry = start; entry != end; entry = vstruct_next(entry)) 260 memset(entry, 0, sizeof(*entry)); 261 } 262 263 #define JOURNAL_ENTRY_REREAD 5 264 #define JOURNAL_ENTRY_NONE 6 265 #define JOURNAL_ENTRY_BAD 7 266 267 static void journal_entry_err_msg(struct printbuf *out, 268 u32 version, 269 struct jset *jset, 270 struct jset_entry *entry) 271 { 272 prt_str(out, "invalid journal entry, version="); 273 bch2_version_to_text(out, version); 274 275 if (entry) { 276 prt_str(out, " type="); 277 bch2_prt_jset_entry_type(out, entry->type); 278 } 279 280 if (!jset) { 281 prt_printf(out, " in superblock"); 282 } else { 283 284 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); 285 286 if (entry) 287 prt_printf(out, " offset=%zi/%u", 288 (u64 *) entry - jset->_data, 289 le32_to_cpu(jset->u64s)); 290 } 291 292 prt_str(out, ": "); 293 } 294 295 #define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ 296 ({ \ 297 struct printbuf _buf = PRINTBUF; \ 298 \ 299 journal_entry_err_msg(&_buf, version, jset, entry); \ 300 prt_printf(&_buf, msg, ##__VA_ARGS__); \ 301 \ 302 switch (flags & BCH_VALIDATE_write) { \ 303 case READ: \ 304 mustfix_fsck_err(c, _err, "%s", _buf.buf); \ 305 break; \ 306 case WRITE: \ 307 bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ 308 bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ 309 if (bch2_fs_inconsistent(c)) { \ 310 ret = -BCH_ERR_fsck_errors_not_fixed; \ 311 goto fsck_err; \ 312 } \ 313 break; \ 314 } \ 315 \ 316 printbuf_exit(&_buf); \ 317 true; \ 318 }) 319 320 #define journal_entry_err_on(cond, ...) \ 321 ((cond) ? journal_entry_err(__VA_ARGS__) : false) 322 323 #define FSCK_DELETED_KEY 5 324 325 static int journal_validate_key(struct bch_fs *c, 326 struct jset *jset, 327 struct jset_entry *entry, 328 unsigned level, enum btree_id btree_id, 329 struct bkey_i *k, 330 unsigned version, int big_endian, 331 enum bch_validate_flags flags) 332 { 333 int write = flags & BCH_VALIDATE_write; 334 void *next = vstruct_next(entry); 335 struct printbuf buf = PRINTBUF; 336 int ret = 0; 337 338 if (journal_entry_err_on(!k->k.u64s, 339 c, version, jset, entry, 340 journal_entry_bkey_u64s_0, 341 "k->u64s 0")) { 342 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 343 journal_entry_null_range(vstruct_next(entry), next); 344 return FSCK_DELETED_KEY; 345 } 346 347 if (journal_entry_err_on((void *) bkey_next(k) > 348 (void *) vstruct_next(entry), 349 c, version, jset, entry, 350 journal_entry_bkey_past_end, 351 "extends past end of journal entry")) { 352 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 353 journal_entry_null_range(vstruct_next(entry), next); 354 return FSCK_DELETED_KEY; 355 } 356 357 if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, 358 c, version, jset, entry, 359 journal_entry_bkey_bad_format, 360 "bad format %u", k->k.format)) { 361 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 362 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 363 journal_entry_null_range(vstruct_next(entry), next); 364 return FSCK_DELETED_KEY; 365 } 366 367 if (!write) 368 bch2_bkey_compat(level, btree_id, version, big_endian, 369 write, NULL, bkey_to_packed(k)); 370 371 if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), 372 __btree_node_type(level, btree_id), write, &buf)) { 373 printbuf_reset(&buf); 374 journal_entry_err_msg(&buf, version, jset, entry); 375 prt_newline(&buf); 376 printbuf_indent_add(&buf, 2); 377 378 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); 379 prt_newline(&buf); 380 bch2_bkey_invalid(c, bkey_i_to_s_c(k), 381 __btree_node_type(level, btree_id), write, &buf); 382 383 mustfix_fsck_err(c, journal_entry_bkey_invalid, 384 "%s", buf.buf); 385 386 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 387 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 388 journal_entry_null_range(vstruct_next(entry), next); 389 390 printbuf_exit(&buf); 391 return FSCK_DELETED_KEY; 392 } 393 394 if (write) 395 bch2_bkey_compat(level, btree_id, version, big_endian, 396 write, NULL, bkey_to_packed(k)); 397 fsck_err: 398 printbuf_exit(&buf); 399 return ret; 400 } 401 402 static int journal_entry_btree_keys_validate(struct bch_fs *c, 403 struct jset *jset, 404 struct jset_entry *entry, 405 unsigned version, int big_endian, 406 enum bch_validate_flags flags) 407 { 408 struct bkey_i *k = entry->start; 409 410 while (k != vstruct_last(entry)) { 411 int ret = journal_validate_key(c, jset, entry, 412 entry->level, 413 entry->btree_id, 414 k, version, big_endian, 415 flags|BCH_VALIDATE_journal); 416 if (ret == FSCK_DELETED_KEY) 417 continue; 418 else if (ret) 419 return ret; 420 421 k = bkey_next(k); 422 } 423 424 return 0; 425 } 426 427 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 428 struct jset_entry *entry) 429 { 430 bool first = true; 431 432 jset_entry_for_each_key(entry, k) { 433 if (!first) { 434 prt_newline(out); 435 bch2_prt_jset_entry_type(out, entry->type); 436 prt_str(out, ": "); 437 } 438 prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); 439 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); 440 first = false; 441 } 442 } 443 444 static int journal_entry_btree_root_validate(struct bch_fs *c, 445 struct jset *jset, 446 struct jset_entry *entry, 447 unsigned version, int big_endian, 448 enum bch_validate_flags flags) 449 { 450 struct bkey_i *k = entry->start; 451 int ret = 0; 452 453 if (journal_entry_err_on(!entry->u64s || 454 le16_to_cpu(entry->u64s) != k->k.u64s, 455 c, version, jset, entry, 456 journal_entry_btree_root_bad_size, 457 "invalid btree root journal entry: wrong number of keys")) { 458 void *next = vstruct_next(entry); 459 /* 460 * we don't want to null out this jset_entry, 461 * just the contents, so that later we can tell 462 * we were _supposed_ to have a btree root 463 */ 464 entry->u64s = 0; 465 journal_entry_null_range(vstruct_next(entry), next); 466 return 0; 467 } 468 469 ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k, 470 version, big_endian, flags); 471 if (ret == FSCK_DELETED_KEY) 472 ret = 0; 473 fsck_err: 474 return ret; 475 } 476 477 static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, 478 struct jset_entry *entry) 479 { 480 journal_entry_btree_keys_to_text(out, c, entry); 481 } 482 483 static int journal_entry_prio_ptrs_validate(struct bch_fs *c, 484 struct jset *jset, 485 struct jset_entry *entry, 486 unsigned version, int big_endian, 487 enum bch_validate_flags flags) 488 { 489 /* obsolete, don't care: */ 490 return 0; 491 } 492 493 static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 494 struct jset_entry *entry) 495 { 496 } 497 498 static int journal_entry_blacklist_validate(struct bch_fs *c, 499 struct jset *jset, 500 struct jset_entry *entry, 501 unsigned version, int big_endian, 502 enum bch_validate_flags flags) 503 { 504 int ret = 0; 505 506 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, 507 c, version, jset, entry, 508 journal_entry_blacklist_bad_size, 509 "invalid journal seq blacklist entry: bad size")) { 510 journal_entry_null_range(entry, vstruct_next(entry)); 511 } 512 fsck_err: 513 return ret; 514 } 515 516 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, 517 struct jset_entry *entry) 518 { 519 struct jset_entry_blacklist *bl = 520 container_of(entry, struct jset_entry_blacklist, entry); 521 522 prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); 523 } 524 525 static int journal_entry_blacklist_v2_validate(struct bch_fs *c, 526 struct jset *jset, 527 struct jset_entry *entry, 528 unsigned version, int big_endian, 529 enum bch_validate_flags flags) 530 { 531 struct jset_entry_blacklist_v2 *bl_entry; 532 int ret = 0; 533 534 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, 535 c, version, jset, entry, 536 journal_entry_blacklist_v2_bad_size, 537 "invalid journal seq blacklist entry: bad size")) { 538 journal_entry_null_range(entry, vstruct_next(entry)); 539 goto out; 540 } 541 542 bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); 543 544 if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > 545 le64_to_cpu(bl_entry->end), 546 c, version, jset, entry, 547 journal_entry_blacklist_v2_start_past_end, 548 "invalid journal seq blacklist entry: start > end")) { 549 journal_entry_null_range(entry, vstruct_next(entry)); 550 } 551 out: 552 fsck_err: 553 return ret; 554 } 555 556 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, 557 struct jset_entry *entry) 558 { 559 struct jset_entry_blacklist_v2 *bl = 560 container_of(entry, struct jset_entry_blacklist_v2, entry); 561 562 prt_printf(out, "start=%llu end=%llu", 563 le64_to_cpu(bl->start), 564 le64_to_cpu(bl->end)); 565 } 566 567 static int journal_entry_usage_validate(struct bch_fs *c, 568 struct jset *jset, 569 struct jset_entry *entry, 570 unsigned version, int big_endian, 571 enum bch_validate_flags flags) 572 { 573 struct jset_entry_usage *u = 574 container_of(entry, struct jset_entry_usage, entry); 575 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 576 int ret = 0; 577 578 if (journal_entry_err_on(bytes < sizeof(*u), 579 c, version, jset, entry, 580 journal_entry_usage_bad_size, 581 "invalid journal entry usage: bad size")) { 582 journal_entry_null_range(entry, vstruct_next(entry)); 583 return ret; 584 } 585 586 fsck_err: 587 return ret; 588 } 589 590 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, 591 struct jset_entry *entry) 592 { 593 struct jset_entry_usage *u = 594 container_of(entry, struct jset_entry_usage, entry); 595 596 prt_str(out, "type="); 597 bch2_prt_fs_usage_type(out, u->entry.btree_id); 598 prt_printf(out, " v=%llu", le64_to_cpu(u->v)); 599 } 600 601 static int journal_entry_data_usage_validate(struct bch_fs *c, 602 struct jset *jset, 603 struct jset_entry *entry, 604 unsigned version, int big_endian, 605 enum bch_validate_flags flags) 606 { 607 struct jset_entry_data_usage *u = 608 container_of(entry, struct jset_entry_data_usage, entry); 609 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 610 struct printbuf err = PRINTBUF; 611 int ret = 0; 612 613 if (journal_entry_err_on(bytes < sizeof(*u) || 614 bytes < sizeof(*u) + u->r.nr_devs, 615 c, version, jset, entry, 616 journal_entry_data_usage_bad_size, 617 "invalid journal entry usage: bad size")) { 618 journal_entry_null_range(entry, vstruct_next(entry)); 619 goto out; 620 } 621 622 if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c->disk_sb.sb, &err), 623 c, version, jset, entry, 624 journal_entry_data_usage_bad_size, 625 "invalid journal entry usage: %s", err.buf)) { 626 journal_entry_null_range(entry, vstruct_next(entry)); 627 goto out; 628 } 629 out: 630 fsck_err: 631 printbuf_exit(&err); 632 return ret; 633 } 634 635 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, 636 struct jset_entry *entry) 637 { 638 struct jset_entry_data_usage *u = 639 container_of(entry, struct jset_entry_data_usage, entry); 640 641 bch2_replicas_entry_to_text(out, &u->r); 642 prt_printf(out, "=%llu", le64_to_cpu(u->v)); 643 } 644 645 static int journal_entry_clock_validate(struct bch_fs *c, 646 struct jset *jset, 647 struct jset_entry *entry, 648 unsigned version, int big_endian, 649 enum bch_validate_flags flags) 650 { 651 struct jset_entry_clock *clock = 652 container_of(entry, struct jset_entry_clock, entry); 653 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 654 int ret = 0; 655 656 if (journal_entry_err_on(bytes != sizeof(*clock), 657 c, version, jset, entry, 658 journal_entry_clock_bad_size, 659 "bad size")) { 660 journal_entry_null_range(entry, vstruct_next(entry)); 661 return ret; 662 } 663 664 if (journal_entry_err_on(clock->rw > 1, 665 c, version, jset, entry, 666 journal_entry_clock_bad_rw, 667 "bad rw")) { 668 journal_entry_null_range(entry, vstruct_next(entry)); 669 return ret; 670 } 671 672 fsck_err: 673 return ret; 674 } 675 676 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, 677 struct jset_entry *entry) 678 { 679 struct jset_entry_clock *clock = 680 container_of(entry, struct jset_entry_clock, entry); 681 682 prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); 683 } 684 685 static int journal_entry_dev_usage_validate(struct bch_fs *c, 686 struct jset *jset, 687 struct jset_entry *entry, 688 unsigned version, int big_endian, 689 enum bch_validate_flags flags) 690 { 691 struct jset_entry_dev_usage *u = 692 container_of(entry, struct jset_entry_dev_usage, entry); 693 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 694 unsigned expected = sizeof(*u); 695 int ret = 0; 696 697 if (journal_entry_err_on(bytes < expected, 698 c, version, jset, entry, 699 journal_entry_dev_usage_bad_size, 700 "bad size (%u < %u)", 701 bytes, expected)) { 702 journal_entry_null_range(entry, vstruct_next(entry)); 703 return ret; 704 } 705 706 if (journal_entry_err_on(u->pad, 707 c, version, jset, entry, 708 journal_entry_dev_usage_bad_pad, 709 "bad pad")) { 710 journal_entry_null_range(entry, vstruct_next(entry)); 711 return ret; 712 } 713 714 fsck_err: 715 return ret; 716 } 717 718 static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, 719 struct jset_entry *entry) 720 { 721 struct jset_entry_dev_usage *u = 722 container_of(entry, struct jset_entry_dev_usage, entry); 723 unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); 724 725 prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); 726 727 printbuf_indent_add(out, 2); 728 for (i = 0; i < nr_types; i++) { 729 prt_newline(out); 730 bch2_prt_data_type(out, i); 731 prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", 732 le64_to_cpu(u->d[i].buckets), 733 le64_to_cpu(u->d[i].sectors), 734 le64_to_cpu(u->d[i].fragmented)); 735 } 736 printbuf_indent_sub(out, 2); 737 } 738 739 static int journal_entry_log_validate(struct bch_fs *c, 740 struct jset *jset, 741 struct jset_entry *entry, 742 unsigned version, int big_endian, 743 enum bch_validate_flags flags) 744 { 745 return 0; 746 } 747 748 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 749 struct jset_entry *entry) 750 { 751 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 752 unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); 753 754 prt_printf(out, "%.*s", bytes, l->d); 755 } 756 757 static int journal_entry_overwrite_validate(struct bch_fs *c, 758 struct jset *jset, 759 struct jset_entry *entry, 760 unsigned version, int big_endian, 761 enum bch_validate_flags flags) 762 { 763 return journal_entry_btree_keys_validate(c, jset, entry, 764 version, big_endian, READ); 765 } 766 767 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 768 struct jset_entry *entry) 769 { 770 journal_entry_btree_keys_to_text(out, c, entry); 771 } 772 773 static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, 774 struct jset *jset, 775 struct jset_entry *entry, 776 unsigned version, int big_endian, 777 enum bch_validate_flags flags) 778 { 779 return journal_entry_btree_keys_validate(c, jset, entry, 780 version, big_endian, READ); 781 } 782 783 static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, 784 struct jset_entry *entry) 785 { 786 journal_entry_btree_keys_to_text(out, c, entry); 787 } 788 789 static int journal_entry_datetime_validate(struct bch_fs *c, 790 struct jset *jset, 791 struct jset_entry *entry, 792 unsigned version, int big_endian, 793 enum bch_validate_flags flags) 794 { 795 unsigned bytes = vstruct_bytes(entry); 796 unsigned expected = 16; 797 int ret = 0; 798 799 if (journal_entry_err_on(vstruct_bytes(entry) < expected, 800 c, version, jset, entry, 801 journal_entry_dev_usage_bad_size, 802 "bad size (%u < %u)", 803 bytes, expected)) { 804 journal_entry_null_range(entry, vstruct_next(entry)); 805 return ret; 806 } 807 fsck_err: 808 return ret; 809 } 810 811 static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, 812 struct jset_entry *entry) 813 { 814 struct jset_entry_datetime *datetime = 815 container_of(entry, struct jset_entry_datetime, entry); 816 817 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 818 } 819 820 struct jset_entry_ops { 821 int (*validate)(struct bch_fs *, struct jset *, 822 struct jset_entry *, unsigned, int, 823 enum bch_validate_flags); 824 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 825 }; 826 827 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 828 #define x(f, nr) \ 829 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 830 .validate = journal_entry_##f##_validate, \ 831 .to_text = journal_entry_##f##_to_text, \ 832 }, 833 BCH_JSET_ENTRY_TYPES() 834 #undef x 835 }; 836 837 int bch2_journal_entry_validate(struct bch_fs *c, 838 struct jset *jset, 839 struct jset_entry *entry, 840 unsigned version, int big_endian, 841 enum bch_validate_flags flags) 842 { 843 return entry->type < BCH_JSET_ENTRY_NR 844 ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, 845 version, big_endian, flags) 846 : 0; 847 } 848 849 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, 850 struct jset_entry *entry) 851 { 852 bch2_prt_jset_entry_type(out, entry->type); 853 854 if (entry->type < BCH_JSET_ENTRY_NR) { 855 prt_str(out, ": "); 856 bch2_jset_entry_ops[entry->type].to_text(out, c, entry); 857 } 858 } 859 860 static int jset_validate_entries(struct bch_fs *c, struct jset *jset, 861 enum bch_validate_flags flags) 862 { 863 unsigned version = le32_to_cpu(jset->version); 864 int ret = 0; 865 866 vstruct_for_each(jset, entry) { 867 if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), 868 c, version, jset, entry, 869 journal_entry_past_jset_end, 870 "journal entry extends past end of jset")) { 871 jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); 872 break; 873 } 874 875 ret = bch2_journal_entry_validate(c, jset, entry, 876 version, JSET_BIG_ENDIAN(jset), flags); 877 if (ret) 878 break; 879 } 880 fsck_err: 881 return ret; 882 } 883 884 static int jset_validate(struct bch_fs *c, 885 struct bch_dev *ca, 886 struct jset *jset, u64 sector, 887 enum bch_validate_flags flags) 888 { 889 unsigned version; 890 int ret = 0; 891 892 if (le64_to_cpu(jset->magic) != jset_magic(c)) 893 return JOURNAL_ENTRY_NONE; 894 895 version = le32_to_cpu(jset->version); 896 if (journal_entry_err_on(!bch2_version_compatible(version), 897 c, version, jset, NULL, 898 jset_unsupported_version, 899 "%s sector %llu seq %llu: incompatible journal entry version %u.%u", 900 ca ? ca->name : c->name, 901 sector, le64_to_cpu(jset->seq), 902 BCH_VERSION_MAJOR(version), 903 BCH_VERSION_MINOR(version))) { 904 /* don't try to continue: */ 905 return -EINVAL; 906 } 907 908 if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), 909 c, version, jset, NULL, 910 jset_unknown_csum, 911 "%s sector %llu seq %llu: journal entry with unknown csum type %llu", 912 ca ? ca->name : c->name, 913 sector, le64_to_cpu(jset->seq), 914 JSET_CSUM_TYPE(jset))) 915 ret = JOURNAL_ENTRY_BAD; 916 917 /* last_seq is ignored when JSET_NO_FLUSH is true */ 918 if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && 919 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), 920 c, version, jset, NULL, 921 jset_last_seq_newer_than_seq, 922 "invalid journal entry: last_seq > seq (%llu > %llu)", 923 le64_to_cpu(jset->last_seq), 924 le64_to_cpu(jset->seq))) { 925 jset->last_seq = jset->seq; 926 return JOURNAL_ENTRY_BAD; 927 } 928 929 ret = jset_validate_entries(c, jset, flags); 930 fsck_err: 931 return ret; 932 } 933 934 static int jset_validate_early(struct bch_fs *c, 935 struct bch_dev *ca, 936 struct jset *jset, u64 sector, 937 unsigned bucket_sectors_left, 938 unsigned sectors_read) 939 { 940 size_t bytes = vstruct_bytes(jset); 941 unsigned version; 942 enum bch_validate_flags flags = BCH_VALIDATE_journal; 943 int ret = 0; 944 945 if (le64_to_cpu(jset->magic) != jset_magic(c)) 946 return JOURNAL_ENTRY_NONE; 947 948 version = le32_to_cpu(jset->version); 949 if (journal_entry_err_on(!bch2_version_compatible(version), 950 c, version, jset, NULL, 951 jset_unsupported_version, 952 "%s sector %llu seq %llu: unknown journal entry version %u.%u", 953 ca ? ca->name : c->name, 954 sector, le64_to_cpu(jset->seq), 955 BCH_VERSION_MAJOR(version), 956 BCH_VERSION_MINOR(version))) { 957 /* don't try to continue: */ 958 return -EINVAL; 959 } 960 961 if (bytes > (sectors_read << 9) && 962 sectors_read < bucket_sectors_left) 963 return JOURNAL_ENTRY_REREAD; 964 965 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 966 c, version, jset, NULL, 967 jset_past_bucket_end, 968 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 969 ca ? ca->name : c->name, 970 sector, le64_to_cpu(jset->seq), bytes)) 971 le32_add_cpu(&jset->u64s, 972 -((bytes - (bucket_sectors_left << 9)) / 8)); 973 fsck_err: 974 return ret; 975 } 976 977 struct journal_read_buf { 978 void *data; 979 size_t size; 980 }; 981 982 static int journal_read_buf_realloc(struct journal_read_buf *b, 983 size_t new_size) 984 { 985 void *n; 986 987 /* the bios are sized for this many pages, max: */ 988 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 989 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 990 991 new_size = roundup_pow_of_two(new_size); 992 n = kvmalloc(new_size, GFP_KERNEL); 993 if (!n) 994 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 995 996 kvfree(b->data); 997 b->data = n; 998 b->size = new_size; 999 return 0; 1000 } 1001 1002 static int journal_read_bucket(struct bch_dev *ca, 1003 struct journal_read_buf *buf, 1004 struct journal_list *jlist, 1005 unsigned bucket) 1006 { 1007 struct bch_fs *c = ca->fs; 1008 struct journal_device *ja = &ca->journal; 1009 struct jset *j = NULL; 1010 unsigned sectors, sectors_read = 0; 1011 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 1012 end = offset + ca->mi.bucket_size; 1013 bool saw_bad = false, csum_good; 1014 struct printbuf err = PRINTBUF; 1015 int ret = 0; 1016 1017 pr_debug("reading %u", bucket); 1018 1019 while (offset < end) { 1020 if (!sectors_read) { 1021 struct bio *bio; 1022 unsigned nr_bvecs; 1023 reread: 1024 sectors_read = min_t(unsigned, 1025 end - offset, buf->size >> 9); 1026 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 1027 1028 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 1029 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 1030 1031 bio->bi_iter.bi_sector = offset; 1032 bch2_bio_map(bio, buf->data, sectors_read << 9); 1033 1034 ret = submit_bio_wait(bio); 1035 kfree(bio); 1036 1037 if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, 1038 "journal read error: sector %llu", 1039 offset) || 1040 bch2_meta_read_fault("journal")) { 1041 /* 1042 * We don't error out of the recovery process 1043 * here, since the relevant journal entry may be 1044 * found on a different device, and missing or 1045 * no journal entries will be handled later 1046 */ 1047 goto out; 1048 } 1049 1050 j = buf->data; 1051 } 1052 1053 ret = jset_validate_early(c, ca, j, offset, 1054 end - offset, sectors_read); 1055 switch (ret) { 1056 case 0: 1057 sectors = vstruct_sectors(j, c->block_bits); 1058 break; 1059 case JOURNAL_ENTRY_REREAD: 1060 if (vstruct_bytes(j) > buf->size) { 1061 ret = journal_read_buf_realloc(buf, 1062 vstruct_bytes(j)); 1063 if (ret) 1064 goto err; 1065 } 1066 goto reread; 1067 case JOURNAL_ENTRY_NONE: 1068 if (!saw_bad) 1069 goto out; 1070 /* 1071 * On checksum error we don't really trust the size 1072 * field of the journal entry we read, so try reading 1073 * again at next block boundary: 1074 */ 1075 sectors = block_sectors(c); 1076 goto next_block; 1077 default: 1078 goto err; 1079 } 1080 1081 if (le64_to_cpu(j->seq) > ja->highest_seq_found) { 1082 ja->highest_seq_found = le64_to_cpu(j->seq); 1083 ja->cur_idx = bucket; 1084 ja->sectors_free = ca->mi.bucket_size - 1085 bucket_remainder(ca, offset) - sectors; 1086 } 1087 1088 /* 1089 * This happens sometimes if we don't have discards on - 1090 * when we've partially overwritten a bucket with new 1091 * journal entries. We don't need the rest of the 1092 * bucket: 1093 */ 1094 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 1095 goto out; 1096 1097 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 1098 1099 enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); 1100 struct bch_csum csum; 1101 csum_good = jset_csum_good(c, j, &csum); 1102 1103 if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, 1104 "%s", 1105 (printbuf_reset(&err), 1106 prt_str(&err, "journal "), 1107 bch2_csum_err_msg(&err, csum_type, j->csum, csum), 1108 err.buf))) 1109 saw_bad = true; 1110 1111 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 1112 j->encrypted_start, 1113 vstruct_end(j) - (void *) j->encrypted_start); 1114 bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); 1115 1116 mutex_lock(&jlist->lock); 1117 ret = journal_entry_add(c, ca, (struct journal_ptr) { 1118 .csum_good = csum_good, 1119 .dev = ca->dev_idx, 1120 .bucket = bucket, 1121 .bucket_offset = offset - 1122 bucket_to_sector(ca, ja->buckets[bucket]), 1123 .sector = offset, 1124 }, jlist, j); 1125 mutex_unlock(&jlist->lock); 1126 1127 switch (ret) { 1128 case JOURNAL_ENTRY_ADD_OK: 1129 break; 1130 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 1131 break; 1132 default: 1133 goto err; 1134 } 1135 next_block: 1136 pr_debug("next"); 1137 offset += sectors; 1138 sectors_read -= sectors; 1139 j = ((void *) j) + (sectors << 9); 1140 } 1141 1142 out: 1143 ret = 0; 1144 err: 1145 printbuf_exit(&err); 1146 return ret; 1147 } 1148 1149 static CLOSURE_CALLBACK(bch2_journal_read_device) 1150 { 1151 closure_type(ja, struct journal_device, read); 1152 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1153 struct bch_fs *c = ca->fs; 1154 struct journal_list *jlist = 1155 container_of(cl->parent, struct journal_list, cl); 1156 struct journal_read_buf buf = { NULL, 0 }; 1157 unsigned i; 1158 int ret = 0; 1159 1160 if (!ja->nr) 1161 goto out; 1162 1163 ret = journal_read_buf_realloc(&buf, PAGE_SIZE); 1164 if (ret) 1165 goto err; 1166 1167 pr_debug("%u journal buckets", ja->nr); 1168 1169 for (i = 0; i < ja->nr; i++) { 1170 ret = journal_read_bucket(ca, &buf, jlist, i); 1171 if (ret) 1172 goto err; 1173 } 1174 1175 /* 1176 * Set dirty_idx to indicate the entire journal is full and needs to be 1177 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1178 * pinned when it first runs: 1179 */ 1180 ja->discard_idx = ja->dirty_idx_ondisk = 1181 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1182 out: 1183 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1184 kvfree(buf.data); 1185 percpu_ref_put(&ca->io_ref); 1186 closure_return(cl); 1187 return; 1188 err: 1189 mutex_lock(&jlist->lock); 1190 jlist->ret = ret; 1191 mutex_unlock(&jlist->lock); 1192 goto out; 1193 } 1194 1195 int bch2_journal_read(struct bch_fs *c, 1196 u64 *last_seq, 1197 u64 *blacklist_seq, 1198 u64 *start_seq) 1199 { 1200 struct journal_list jlist; 1201 struct journal_replay *i, **_i, *prev = NULL; 1202 struct genradix_iter radix_iter; 1203 struct printbuf buf = PRINTBUF; 1204 bool degraded = false, last_write_torn = false; 1205 u64 seq; 1206 int ret = 0; 1207 1208 closure_init_stack(&jlist.cl); 1209 mutex_init(&jlist.lock); 1210 jlist.last_seq = 0; 1211 jlist.ret = 0; 1212 1213 for_each_member_device(c, ca) { 1214 if (!c->opts.fsck && 1215 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1216 continue; 1217 1218 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1219 ca->mi.state == BCH_MEMBER_STATE_ro) && 1220 percpu_ref_tryget(&ca->io_ref)) 1221 closure_call(&ca->journal.read, 1222 bch2_journal_read_device, 1223 system_unbound_wq, 1224 &jlist.cl); 1225 else 1226 degraded = true; 1227 } 1228 1229 closure_sync(&jlist.cl); 1230 1231 if (jlist.ret) 1232 return jlist.ret; 1233 1234 *last_seq = 0; 1235 *start_seq = 0; 1236 *blacklist_seq = 0; 1237 1238 /* 1239 * Find most recent flush entry, and ignore newer non flush entries - 1240 * those entries will be blacklisted: 1241 */ 1242 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1243 enum bch_validate_flags flags = BCH_VALIDATE_journal; 1244 1245 i = *_i; 1246 1247 if (journal_replay_ignore(i)) 1248 continue; 1249 1250 if (!*start_seq) 1251 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1252 1253 if (JSET_NO_FLUSH(&i->j)) { 1254 i->ignore_blacklisted = true; 1255 continue; 1256 } 1257 1258 if (!last_write_torn && !i->csum_good) { 1259 last_write_torn = true; 1260 i->ignore_blacklisted = true; 1261 continue; 1262 } 1263 1264 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1265 c, le32_to_cpu(i->j.version), &i->j, NULL, 1266 jset_last_seq_newer_than_seq, 1267 "invalid journal entry: last_seq > seq (%llu > %llu)", 1268 le64_to_cpu(i->j.last_seq), 1269 le64_to_cpu(i->j.seq))) 1270 i->j.last_seq = i->j.seq; 1271 1272 *last_seq = le64_to_cpu(i->j.last_seq); 1273 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1274 break; 1275 } 1276 1277 if (!*start_seq) { 1278 bch_info(c, "journal read done, but no entries found"); 1279 return 0; 1280 } 1281 1282 if (!*last_seq) { 1283 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, 1284 "journal read done, but no entries found after dropping non-flushes"); 1285 return 0; 1286 } 1287 1288 bch_info(c, "journal read done, replaying entries %llu-%llu", 1289 *last_seq, *blacklist_seq - 1); 1290 1291 if (*start_seq != *blacklist_seq) 1292 bch_info(c, "dropped unflushed entries %llu-%llu", 1293 *blacklist_seq, *start_seq - 1); 1294 1295 /* Drop blacklisted entries and entries older than last_seq: */ 1296 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1297 i = *_i; 1298 1299 if (journal_replay_ignore(i)) 1300 continue; 1301 1302 seq = le64_to_cpu(i->j.seq); 1303 if (seq < *last_seq) { 1304 journal_replay_free(c, i, false); 1305 continue; 1306 } 1307 1308 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1309 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1310 jset_seq_blacklisted, 1311 "found blacklisted journal entry %llu", seq); 1312 i->ignore_blacklisted = true; 1313 } 1314 } 1315 1316 /* Check for missing entries: */ 1317 seq = *last_seq; 1318 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1319 i = *_i; 1320 1321 if (journal_replay_ignore(i)) 1322 continue; 1323 1324 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1325 1326 while (seq < le64_to_cpu(i->j.seq)) { 1327 u64 missing_start, missing_end; 1328 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 1329 1330 while (seq < le64_to_cpu(i->j.seq) && 1331 bch2_journal_seq_is_blacklisted(c, seq, false)) 1332 seq++; 1333 1334 if (seq == le64_to_cpu(i->j.seq)) 1335 break; 1336 1337 missing_start = seq; 1338 1339 while (seq < le64_to_cpu(i->j.seq) && 1340 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1341 seq++; 1342 1343 if (prev) { 1344 bch2_journal_ptrs_to_text(&buf1, c, prev); 1345 prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1346 } else 1347 prt_printf(&buf1, "(none)"); 1348 bch2_journal_ptrs_to_text(&buf2, c, i); 1349 1350 missing_end = seq - 1; 1351 fsck_err(c, journal_entries_missing, 1352 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" 1353 " prev at %s\n" 1354 " next at %s, continue?", 1355 missing_start, missing_end, 1356 *last_seq, *blacklist_seq - 1, 1357 buf1.buf, buf2.buf); 1358 1359 printbuf_exit(&buf1); 1360 printbuf_exit(&buf2); 1361 } 1362 1363 prev = i; 1364 seq++; 1365 } 1366 1367 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1368 struct bch_replicas_padded replicas = { 1369 .e.data_type = BCH_DATA_journal, 1370 .e.nr_required = 1, 1371 }; 1372 1373 i = *_i; 1374 if (journal_replay_ignore(i)) 1375 continue; 1376 1377 darray_for_each(i->ptrs, ptr) { 1378 struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); 1379 1380 if (!ptr->csum_good) 1381 bch_err_dev_offset(ca, ptr->sector, 1382 "invalid journal checksum, seq %llu%s", 1383 le64_to_cpu(i->j.seq), 1384 i->csum_good ? " (had good copy on another device)" : ""); 1385 } 1386 1387 ret = jset_validate(c, 1388 bch2_dev_have_ref(c, i->ptrs.data[0].dev), 1389 &i->j, 1390 i->ptrs.data[0].sector, 1391 READ); 1392 if (ret) 1393 goto err; 1394 1395 darray_for_each(i->ptrs, ptr) 1396 replicas.e.devs[replicas.e.nr_devs++] = ptr->dev; 1397 1398 bch2_replicas_entry_sort(&replicas.e); 1399 1400 printbuf_reset(&buf); 1401 bch2_replicas_entry_to_text(&buf, &replicas.e); 1402 1403 if (!degraded && 1404 !bch2_replicas_marked(c, &replicas.e) && 1405 (le64_to_cpu(i->j.seq) == *last_seq || 1406 fsck_err(c, journal_entry_replicas_not_marked, 1407 "superblock not marked as containing replicas for journal entry %llu\n %s", 1408 le64_to_cpu(i->j.seq), buf.buf))) { 1409 ret = bch2_mark_replicas(c, &replicas.e); 1410 if (ret) 1411 goto err; 1412 } 1413 } 1414 err: 1415 fsck_err: 1416 printbuf_exit(&buf); 1417 return ret; 1418 } 1419 1420 /* journal write: */ 1421 1422 static void __journal_write_alloc(struct journal *j, 1423 struct journal_buf *w, 1424 struct dev_alloc_list *devs_sorted, 1425 unsigned sectors, 1426 unsigned *replicas, 1427 unsigned replicas_want) 1428 { 1429 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1430 struct journal_device *ja; 1431 struct bch_dev *ca; 1432 unsigned i; 1433 1434 if (*replicas >= replicas_want) 1435 return; 1436 1437 for (i = 0; i < devs_sorted->nr; i++) { 1438 ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); 1439 if (!ca) 1440 continue; 1441 1442 ja = &ca->journal; 1443 1444 /* 1445 * Check that we can use this device, and aren't already using 1446 * it: 1447 */ 1448 if (!ca->mi.durability || 1449 ca->mi.state != BCH_MEMBER_STATE_rw || 1450 !ja->nr || 1451 bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1452 sectors > ja->sectors_free) 1453 continue; 1454 1455 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1456 1457 bch2_bkey_append_ptr(&w->key, 1458 (struct bch_extent_ptr) { 1459 .offset = bucket_to_sector(ca, 1460 ja->buckets[ja->cur_idx]) + 1461 ca->mi.bucket_size - 1462 ja->sectors_free, 1463 .dev = ca->dev_idx, 1464 }); 1465 1466 ja->sectors_free -= sectors; 1467 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1468 1469 *replicas += ca->mi.durability; 1470 1471 if (*replicas >= replicas_want) 1472 break; 1473 } 1474 } 1475 1476 /** 1477 * journal_write_alloc - decide where to write next journal entry 1478 * 1479 * @j: journal object 1480 * @w: journal buf (entry to be written) 1481 * 1482 * Returns: 0 on success, or -EROFS on failure 1483 */ 1484 static int journal_write_alloc(struct journal *j, struct journal_buf *w) 1485 { 1486 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1487 struct bch_devs_mask devs; 1488 struct journal_device *ja; 1489 struct bch_dev *ca; 1490 struct dev_alloc_list devs_sorted; 1491 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1492 unsigned target = c->opts.metadata_target ?: 1493 c->opts.foreground_target; 1494 unsigned i, replicas = 0, replicas_want = 1495 READ_ONCE(c->opts.metadata_replicas); 1496 unsigned replicas_need = min_t(unsigned, replicas_want, 1497 READ_ONCE(c->opts.metadata_replicas_required)); 1498 1499 rcu_read_lock(); 1500 retry: 1501 devs = target_rw_devs(c, BCH_DATA_journal, target); 1502 1503 devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); 1504 1505 __journal_write_alloc(j, w, &devs_sorted, 1506 sectors, &replicas, replicas_want); 1507 1508 if (replicas >= replicas_want) 1509 goto done; 1510 1511 for (i = 0; i < devs_sorted.nr; i++) { 1512 ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); 1513 if (!ca) 1514 continue; 1515 1516 ja = &ca->journal; 1517 1518 if (sectors > ja->sectors_free && 1519 sectors <= ca->mi.bucket_size && 1520 bch2_journal_dev_buckets_available(j, ja, 1521 journal_space_discarded)) { 1522 ja->cur_idx = (ja->cur_idx + 1) % ja->nr; 1523 ja->sectors_free = ca->mi.bucket_size; 1524 1525 /* 1526 * ja->bucket_seq[ja->cur_idx] must always have 1527 * something sensible: 1528 */ 1529 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1530 } 1531 } 1532 1533 __journal_write_alloc(j, w, &devs_sorted, 1534 sectors, &replicas, replicas_want); 1535 1536 if (replicas < replicas_want && target) { 1537 /* Retry from all devices: */ 1538 target = 0; 1539 goto retry; 1540 } 1541 done: 1542 rcu_read_unlock(); 1543 1544 BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); 1545 1546 return replicas >= replicas_need ? 0 : -EROFS; 1547 } 1548 1549 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1550 { 1551 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1552 1553 /* we aren't holding j->lock: */ 1554 unsigned new_size = READ_ONCE(j->buf_size_want); 1555 void *new_buf; 1556 1557 if (buf->buf_size >= new_size) 1558 return; 1559 1560 size_t btree_write_buffer_size = new_size / 64; 1561 1562 if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) 1563 return; 1564 1565 new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1566 if (!new_buf) 1567 return; 1568 1569 memcpy(new_buf, buf->data, buf->buf_size); 1570 1571 spin_lock(&j->lock); 1572 swap(buf->data, new_buf); 1573 swap(buf->buf_size, new_size); 1574 spin_unlock(&j->lock); 1575 1576 kvfree(new_buf); 1577 } 1578 1579 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) 1580 { 1581 return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); 1582 } 1583 1584 static CLOSURE_CALLBACK(journal_write_done) 1585 { 1586 closure_type(w, struct journal_buf, io); 1587 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1588 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1589 struct bch_replicas_padded replicas; 1590 union journal_res_state old, new; 1591 u64 seq = le64_to_cpu(w->data->seq); 1592 int err = 0; 1593 1594 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) 1595 ? j->flush_write_time 1596 : j->noflush_write_time, j->write_start_time); 1597 1598 if (!w->devs_written.nr) { 1599 bch_err(c, "unable to write journal to sufficient devices"); 1600 err = -EIO; 1601 } else { 1602 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1603 w->devs_written); 1604 if (bch2_mark_replicas(c, &replicas.e)) 1605 err = -EIO; 1606 } 1607 1608 if (err) 1609 bch2_fatal_error(c); 1610 1611 closure_debug_destroy(cl); 1612 1613 spin_lock(&j->lock); 1614 if (seq >= j->pin.front) 1615 journal_seq_pin(j, seq)->devs = w->devs_written; 1616 if (err && (!j->err_seq || seq < j->err_seq)) 1617 j->err_seq = seq; 1618 w->write_done = true; 1619 1620 bool completed = false; 1621 1622 for (seq = journal_last_unwritten_seq(j); 1623 seq <= journal_cur_seq(j); 1624 seq++) { 1625 w = j->buf + (seq & JOURNAL_BUF_MASK); 1626 if (!w->write_done) 1627 break; 1628 1629 if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { 1630 j->flushed_seq_ondisk = seq; 1631 j->last_seq_ondisk = w->last_seq; 1632 1633 bch2_do_discards(c); 1634 closure_wake_up(&c->freelist_wait); 1635 bch2_reset_alloc_cursors(c); 1636 } 1637 1638 j->seq_ondisk = seq; 1639 1640 /* 1641 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1642 * more buckets: 1643 * 1644 * Must come before signaling write completion, for 1645 * bch2_fs_journal_stop(): 1646 */ 1647 if (j->watermark != BCH_WATERMARK_stripe) 1648 journal_reclaim_kick(&c->journal); 1649 1650 old.v = atomic64_read(&j->reservations.counter); 1651 do { 1652 new.v = old.v; 1653 BUG_ON(journal_state_count(new, new.unwritten_idx)); 1654 BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); 1655 1656 new.unwritten_idx++; 1657 } while (!atomic64_try_cmpxchg(&j->reservations.counter, 1658 &old.v, new.v)); 1659 1660 closure_wake_up(&w->wait); 1661 completed = true; 1662 } 1663 1664 if (completed) { 1665 bch2_journal_reclaim_fast(j); 1666 bch2_journal_space_available(j); 1667 1668 track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); 1669 1670 journal_wake(j); 1671 } 1672 1673 if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1674 new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1675 struct journal_buf *buf = journal_cur_buf(j); 1676 long delta = buf->expires - jiffies; 1677 1678 /* 1679 * We don't close a journal entry to write it while there's 1680 * previous entries still in flight - the current journal entry 1681 * might want to be written now: 1682 */ 1683 mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); 1684 } 1685 1686 /* 1687 * We don't typically trigger journal writes from her - the next journal 1688 * write will be triggered immediately after the previous one is 1689 * allocated, in bch2_journal_write() - but the journal write error path 1690 * is special: 1691 */ 1692 bch2_journal_do_writes(j); 1693 spin_unlock(&j->lock); 1694 } 1695 1696 static void journal_write_endio(struct bio *bio) 1697 { 1698 struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); 1699 struct bch_dev *ca = jbio->ca; 1700 struct journal *j = &ca->fs->journal; 1701 struct journal_buf *w = j->buf + jbio->buf_idx; 1702 1703 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, 1704 "error writing journal entry %llu: %s", 1705 le64_to_cpu(w->data->seq), 1706 bch2_blk_status_to_str(bio->bi_status)) || 1707 bch2_meta_write_fault("journal")) { 1708 unsigned long flags; 1709 1710 spin_lock_irqsave(&j->err_lock, flags); 1711 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); 1712 spin_unlock_irqrestore(&j->err_lock, flags); 1713 } 1714 1715 closure_put(&w->io); 1716 percpu_ref_put(&ca->io_ref); 1717 } 1718 1719 static CLOSURE_CALLBACK(journal_write_submit) 1720 { 1721 closure_type(w, struct journal_buf, io); 1722 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1723 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1724 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1725 1726 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { 1727 struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); 1728 if (!ca) { 1729 /* XXX: fix this */ 1730 bch_err(c, "missing device for journal write\n"); 1731 continue; 1732 } 1733 1734 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], 1735 sectors); 1736 1737 struct journal_device *ja = &ca->journal; 1738 struct bio *bio = &ja->bio[w->idx]->bio; 1739 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1740 bio->bi_iter.bi_sector = ptr->offset; 1741 bio->bi_end_io = journal_write_endio; 1742 bio->bi_private = ca; 1743 1744 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1745 ca->prev_journal_sector = bio->bi_iter.bi_sector; 1746 1747 if (!JSET_NO_FLUSH(w->data)) 1748 bio->bi_opf |= REQ_FUA; 1749 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) 1750 bio->bi_opf |= REQ_PREFLUSH; 1751 1752 bch2_bio_map(bio, w->data, sectors << 9); 1753 1754 trace_and_count(c, journal_write, bio); 1755 closure_bio_submit(bio, cl); 1756 1757 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1758 } 1759 1760 continue_at(cl, journal_write_done, j->wq); 1761 } 1762 1763 static CLOSURE_CALLBACK(journal_write_preflush) 1764 { 1765 closure_type(w, struct journal_buf, io); 1766 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1767 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1768 1769 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1770 spin_lock(&j->lock); 1771 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1772 closure_wait(&j->async_wait, cl); 1773 spin_unlock(&j->lock); 1774 continue_at(cl, journal_write_preflush, j->wq); 1775 return; 1776 } 1777 spin_unlock(&j->lock); 1778 } 1779 1780 if (w->separate_flush) { 1781 for_each_rw_member(c, ca) { 1782 percpu_ref_get(&ca->io_ref); 1783 1784 struct journal_device *ja = &ca->journal; 1785 struct bio *bio = &ja->bio[w->idx]->bio; 1786 bio_reset(bio, ca->disk_sb.bdev, 1787 REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); 1788 bio->bi_end_io = journal_write_endio; 1789 bio->bi_private = ca; 1790 closure_bio_submit(bio, cl); 1791 } 1792 1793 continue_at(cl, journal_write_submit, j->wq); 1794 } else { 1795 /* 1796 * no need to punt to another work item if we're not waiting on 1797 * preflushes 1798 */ 1799 journal_write_submit(&cl->work); 1800 } 1801 } 1802 1803 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) 1804 { 1805 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1806 struct jset_entry *start, *end; 1807 struct jset *jset = w->data; 1808 struct journal_keys_to_wb wb = { NULL }; 1809 unsigned sectors, bytes, u64s; 1810 unsigned long btree_roots_have = 0; 1811 bool validate_before_checksum = false; 1812 u64 seq = le64_to_cpu(jset->seq); 1813 int ret; 1814 1815 /* 1816 * Simple compaction, dropping empty jset_entries (from journal 1817 * reservations that weren't fully used) and merging jset_entries that 1818 * can be. 1819 * 1820 * If we wanted to be really fancy here, we could sort all the keys in 1821 * the jset and drop keys that were overwritten - probably not worth it: 1822 */ 1823 vstruct_for_each(jset, i) { 1824 unsigned u64s = le16_to_cpu(i->u64s); 1825 1826 /* Empty entry: */ 1827 if (!u64s) 1828 continue; 1829 1830 /* 1831 * New btree roots are set by journalling them; when the journal 1832 * entry gets written we have to propagate them to 1833 * c->btree_roots 1834 * 1835 * But, every journal entry we write has to contain all the 1836 * btree roots (at least for now); so after we copy btree roots 1837 * to c->btree_roots we have to get any missing btree roots and 1838 * add them to this journal entry: 1839 */ 1840 switch (i->type) { 1841 case BCH_JSET_ENTRY_btree_root: 1842 bch2_journal_entry_to_btree_root(c, i); 1843 __set_bit(i->btree_id, &btree_roots_have); 1844 break; 1845 case BCH_JSET_ENTRY_write_buffer_keys: 1846 EBUG_ON(!w->need_flush_to_write_buffer); 1847 1848 if (!wb.wb) 1849 bch2_journal_keys_to_write_buffer_start(c, &wb, seq); 1850 1851 jset_entry_for_each_key(i, k) { 1852 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); 1853 if (ret) { 1854 bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", 1855 bch2_err_str(ret)); 1856 bch2_journal_keys_to_write_buffer_end(c, &wb); 1857 return ret; 1858 } 1859 } 1860 i->type = BCH_JSET_ENTRY_btree_keys; 1861 break; 1862 } 1863 } 1864 1865 if (wb.wb) { 1866 ret = bch2_journal_keys_to_write_buffer_end(c, &wb); 1867 if (ret) { 1868 bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", 1869 bch2_err_str(ret)); 1870 return ret; 1871 } 1872 } 1873 1874 spin_lock(&c->journal.lock); 1875 w->need_flush_to_write_buffer = false; 1876 spin_unlock(&c->journal.lock); 1877 1878 start = end = vstruct_last(jset); 1879 1880 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1881 1882 struct jset_entry_datetime *d = 1883 container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); 1884 d->entry.type = BCH_JSET_ENTRY_datetime; 1885 d->seconds = cpu_to_le64(ktime_get_real_seconds()); 1886 1887 bch2_journal_super_entries_add_common(c, &end, seq); 1888 u64s = (u64 *) end - (u64 *) start; 1889 1890 WARN_ON(u64s > j->entry_u64s_reserved); 1891 1892 le32_add_cpu(&jset->u64s, u64s); 1893 1894 sectors = vstruct_sectors(jset, c->block_bits); 1895 bytes = vstruct_bytes(jset); 1896 1897 if (sectors > w->sectors) { 1898 bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 1899 vstruct_bytes(jset), w->sectors << 9, 1900 u64s, w->u64s_reserved, j->entry_u64s_reserved); 1901 return -EINVAL; 1902 } 1903 1904 jset->magic = cpu_to_le64(jset_magic(c)); 1905 jset->version = cpu_to_le32(c->sb.version); 1906 1907 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 1908 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1909 1910 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1911 j->last_empty_seq = seq; 1912 1913 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1914 validate_before_checksum = true; 1915 1916 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 1917 validate_before_checksum = true; 1918 1919 if (validate_before_checksum && 1920 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1921 return ret; 1922 1923 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), 1924 jset->encrypted_start, 1925 vstruct_end(jset) - (void *) jset->encrypted_start); 1926 if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) 1927 return ret; 1928 1929 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), 1930 journal_nonce(jset), jset); 1931 1932 if (!validate_before_checksum && 1933 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1934 return ret; 1935 1936 memset((void *) jset + bytes, 0, (sectors << 9) - bytes); 1937 return 0; 1938 } 1939 1940 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) 1941 { 1942 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1943 int error = bch2_journal_error(j); 1944 1945 /* 1946 * If the journal is in an error state - we did an emergency shutdown - 1947 * we prefer to continue doing journal writes. We just mark them as 1948 * noflush so they'll never be used, but they'll still be visible by the 1949 * list_journal tool - this helps in debugging. 1950 * 1951 * There's a caveat: the first journal write after marking the 1952 * superblock dirty must always be a flush write, because on startup 1953 * from a clean shutdown we didn't necessarily read the journal and the 1954 * new journal write might overwrite whatever was in the journal 1955 * previously - we can't leave the journal without any flush writes in 1956 * it. 1957 * 1958 * So if we're in an error state, and we're still starting up, we don't 1959 * write anything at all. 1960 */ 1961 if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) 1962 return -EIO; 1963 1964 if (error || 1965 w->noflush || 1966 (!w->must_flush && 1967 (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && 1968 test_bit(JOURNAL_may_skip_flush, &j->flags))) { 1969 w->noflush = true; 1970 SET_JSET_NO_FLUSH(w->data, true); 1971 w->data->last_seq = 0; 1972 w->last_seq = 0; 1973 1974 j->nr_noflush_writes++; 1975 } else { 1976 w->must_flush = true; 1977 j->last_flush_write = jiffies; 1978 j->nr_flush_writes++; 1979 clear_bit(JOURNAL_need_flush_write, &j->flags); 1980 } 1981 1982 return 0; 1983 } 1984 1985 CLOSURE_CALLBACK(bch2_journal_write) 1986 { 1987 closure_type(w, struct journal_buf, io); 1988 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1989 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1990 struct bch_replicas_padded replicas; 1991 unsigned nr_rw_members = 0; 1992 int ret; 1993 1994 for_each_rw_member(c, ca) 1995 nr_rw_members++; 1996 1997 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 1998 BUG_ON(!w->write_started); 1999 BUG_ON(w->write_allocated); 2000 BUG_ON(w->write_done); 2001 2002 j->write_start_time = local_clock(); 2003 2004 spin_lock(&j->lock); 2005 if (nr_rw_members > 1) 2006 w->separate_flush = true; 2007 2008 ret = bch2_journal_write_pick_flush(j, w); 2009 spin_unlock(&j->lock); 2010 if (ret) 2011 goto err; 2012 2013 mutex_lock(&j->buf_lock); 2014 journal_buf_realloc(j, w); 2015 2016 ret = bch2_journal_write_prep(j, w); 2017 mutex_unlock(&j->buf_lock); 2018 if (ret) 2019 goto err; 2020 2021 j->entry_bytes_written += vstruct_bytes(w->data); 2022 2023 while (1) { 2024 spin_lock(&j->lock); 2025 ret = journal_write_alloc(j, w); 2026 if (!ret || !j->can_discard) 2027 break; 2028 2029 spin_unlock(&j->lock); 2030 bch2_journal_do_discards(j); 2031 } 2032 2033 if (ret) { 2034 struct printbuf buf = PRINTBUF; 2035 buf.atomic++; 2036 2037 prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"), 2038 le64_to_cpu(w->data->seq), 2039 bch2_err_str(ret)); 2040 __bch2_journal_debug_to_text(&buf, j); 2041 spin_unlock(&j->lock); 2042 bch2_print_string_as_lines(KERN_ERR, buf.buf); 2043 printbuf_exit(&buf); 2044 goto err; 2045 } 2046 2047 /* 2048 * write is allocated, no longer need to account for it in 2049 * bch2_journal_space_available(): 2050 */ 2051 w->sectors = 0; 2052 w->write_allocated = true; 2053 2054 /* 2055 * journal entry has been compacted and allocated, recalculate space 2056 * available: 2057 */ 2058 bch2_journal_space_available(j); 2059 bch2_journal_do_writes(j); 2060 spin_unlock(&j->lock); 2061 2062 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); 2063 2064 if (c->opts.nochanges) 2065 goto no_io; 2066 2067 /* 2068 * Mark journal replicas before we submit the write to guarantee 2069 * recovery will find the journal entries after a crash. 2070 */ 2071 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 2072 w->devs_written); 2073 ret = bch2_mark_replicas(c, &replicas.e); 2074 if (ret) 2075 goto err; 2076 2077 if (!JSET_NO_FLUSH(w->data)) 2078 continue_at(cl, journal_write_preflush, j->wq); 2079 else 2080 continue_at(cl, journal_write_submit, j->wq); 2081 return; 2082 no_io: 2083 continue_at(cl, journal_write_done, j->wq); 2084 return; 2085 err: 2086 bch2_fatal_error(c); 2087 continue_at(cl, journal_write_done, j->wq); 2088 } 2089