// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "sb-clean.h"
#include "trace.h"

#include <linux/string_choices.h>

void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
{
	lockdep_assert_held(&c->sb_lock);

	for_each_member_device(c, ca) {
		struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);

		m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx);
		m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free);
	}
}

void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
{
	mutex_lock(&c->sb_lock);
	for_each_member_device(c, ca) {
		struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);

		unsigned idx = le32_to_cpu(m.last_journal_bucket);
		if (idx < ca->journal.nr)
			ca->journal.cur_idx = idx;
		unsigned offset = le32_to_cpu(m.last_journal_bucket_offset);
		if (offset <= ca->mi.bucket_size)
			ca->journal.sectors_free = ca->mi.bucket_size - offset;
	}
	mutex_unlock(&c->sb_lock);
}

void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
			       struct journal_replay *j)
{
	darray_for_each(j->ptrs, i) {
		if (i != j->ptrs.data)
			prt_printf(out, " ");
		prt_printf(out, "%u:%u:%u (sector %llu)",
			   i->dev, i->bucket, i->bucket_offset, i->sector);
	}
}

static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
					struct journal_replay *j)
{
	prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));

	bch2_journal_ptrs_to_text(out, c, j);

	for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
		struct jset_entry_datetime *datetime =
			container_of(entry, struct jset_entry_datetime, entry);
		bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
		break;
	}
}

static struct nonce journal_nonce(const struct jset *jset)
{
	return (struct nonce) {{
		[0] = 0,
		[1] = ((__le32 *) &jset->seq)[0],
		[2] = ((__le32 *) &jset->seq)[1],
		[3] = BCH_NONCE_JOURNAL,
	}};
}

static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
{
	if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
		*csum = (struct bch_csum) {};
		return false;
	}

	*csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
	return !bch2_crc_cmp(j->csum, *csum);
}

static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
{
	return (seq - c->journal_entries_base_seq) & (~0U >> 1);
}

static void __journal_replay_free(struct bch_fs *c,
				  struct journal_replay *i)
{
	struct journal_replay **p =
		genradix_ptr(&c->journal_entries,
			     journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));

	BUG_ON(*p != i);
	*p = NULL;
	kvfree(i);
}

static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
{
	if (blacklisted)
		i->ignore_blacklisted = true;
	else
		i->ignore_not_dirty = true;

	if (!c->opts.read_entire_journal)
		__journal_replay_free(c, i);
}

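/*
 * State shared by the per-device journal read closures: @last_seq tracks
 * the oldest sequence number we still need, @lock serializes
 * journal_entry_add() across devices, and @ret reports the first error
 * back to bch2_journal_read().
 */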
struct journal_list {
	struct closure		cl;
	u64			last_seq;
	struct mutex		lock;
	int			ret;
};

#define JOURNAL_ENTRY_ADD_OK		0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
			     struct journal_ptr entry_ptr,
			     struct journal_list *jlist, struct jset *j)
{
	struct genradix_iter iter;
	struct journal_replay **_i, *i, *dup;
	size_t bytes = vstruct_bytes(j);
	u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
	struct printbuf buf = PRINTBUF;
	int ret = JOURNAL_ENTRY_ADD_OK;

	if (!c->journal.oldest_seq_found_ondisk ||
	    le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
		c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);

	/* Is this entry older than the range we need? */
	if (!c->opts.read_entire_journal &&
	    le64_to_cpu(j->seq) < jlist->last_seq)
		return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;

	/*
	 * genradixes are indexed by a ulong, not a u64, so we can't index them
	 * by sequence number directly: Assume instead that they will all fall
	 * within the range of +-2 billion of the first one we find.
	 */
	if (!c->journal_entries_base_seq)
		c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);

	/* Drop entries we don't need anymore */
	if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
		genradix_for_each_from(&c->journal_entries, iter, _i,
				       journal_entry_radix_idx(c, jlist->last_seq)) {
			i = *_i;

			if (journal_replay_ignore(i))
				continue;

			if (le64_to_cpu(i->j.seq) >= last_seq)
				break;

			journal_replay_free(c, i, false);
		}
	}

	jlist->last_seq = max(jlist->last_seq, last_seq);

	_i = genradix_ptr_alloc(&c->journal_entries,
				journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
				GFP_KERNEL);
	if (!_i)
		return -BCH_ERR_ENOMEM_journal_entry_add;

	/*
	 * Duplicate journal entries? If so we want the one that didn't have a
	 * checksum error:
	 */
	dup = *_i;
	if (dup) {
		bool identical = bytes == vstruct_bytes(&dup->j) &&
			!memcmp(j, &dup->j, bytes);
		bool not_identical = !identical &&
			entry_ptr.csum_good &&
			dup->csum_good;

		bool same_device = false;
		darray_for_each(dup->ptrs, ptr)
			if (ptr->dev == ca->dev_idx)
				same_device = true;

		ret = darray_push(&dup->ptrs, entry_ptr);
		if (ret)
			goto out;

		bch2_journal_replay_to_text(&buf, c, dup);

		fsck_err_on(same_device,
			    c, journal_entry_dup_same_device,
			    "duplicate journal entry on same device\n %s",
			    buf.buf);

		fsck_err_on(not_identical,
			    c, journal_entry_replicas_data_mismatch,
			    "found duplicate but non identical journal entries\n %s",
			    buf.buf);

		if (entry_ptr.csum_good && !identical)
			goto replace;

		goto out;
	}
replace:
	i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
	if (!i)
		return -BCH_ERR_ENOMEM_journal_entry_add;

	darray_init(&i->ptrs);
	i->csum_good		= entry_ptr.csum_good;
	i->ignore_blacklisted	= false;
	i->ignore_not_dirty	= false;
	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");

	if (dup) {
		/* The first ptr should represent the jset we kept: */
		darray_for_each(dup->ptrs, ptr)
			darray_push(&i->ptrs, *ptr);
		__journal_replay_free(c, dup);
	} else {
		darray_push(&i->ptrs, entry_ptr);
	}

	*_i = i;
out:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
	struct jset_entry *entry;

	for (entry = start; entry != end; entry = vstruct_next(entry))
		memset(entry, 0, sizeof(*entry));
}

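/*
 * Non-error status codes returned while scanning the journal: REREAD means
 * the entry claims to be bigger than what we've read so far, so read again
 * with a bigger buffer; NONE means no journal entry at this offset (bad
 * magic); BAD means the entry failed validation and should be skipped.
 */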
#define JOURNAL_ENTRY_REREAD	5
#define JOURNAL_ENTRY_NONE	6
#define JOURNAL_ENTRY_BAD	7

static void journal_entry_err_msg(struct printbuf *out,
				  u32 version,
				  struct jset *jset,
				  struct jset_entry *entry)
{
	prt_str(out, "invalid journal entry, version=");
	bch2_version_to_text(out, version);

	if (entry) {
		prt_str(out, " type=");
		bch2_prt_jset_entry_type(out, entry->type);
	}

	if (!jset) {
		prt_printf(out, " in superblock");
	} else {
		prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));

		if (entry)
			prt_printf(out, " offset=%zi/%u",
				   (u64 *) entry - jset->_data,
				   le32_to_cpu(jset->u64s));
	}

	prt_str(out, ": ");
}

#define journal_entry_err(c, version, jset, entry, _err, msg, ...)	\
({									\
	struct printbuf _buf = PRINTBUF;				\
									\
	journal_entry_err_msg(&_buf, version, jset, entry);		\
	prt_printf(&_buf, msg, ##__VA_ARGS__);				\
									\
	switch (from.flags & BCH_VALIDATE_write) {			\
	case READ:							\
		mustfix_fsck_err(c, _err, "%s", _buf.buf);		\
		break;							\
	case WRITE:							\
		bch2_sb_error_count(c, BCH_FSCK_ERR_##_err);		\
		bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\
		if (bch2_fs_inconsistent(c)) {				\
			ret = -BCH_ERR_fsck_errors_not_fixed;		\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
									\
	printbuf_exit(&_buf);						\
	true;								\
})

#define journal_entry_err_on(cond, ...)					\
	((cond) ? journal_entry_err(__VA_ARGS__) : false)

#define FSCK_DELETED_KEY 5

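/*
 * Note: on read, journal_entry_err() prompts/repairs via mustfix_fsck_err();
 * at write time the same condition means we're about to write corrupt
 * metadata, so it's counted and the filesystem is marked inconsistent.
 * Invalid keys are deleted by shrinking entry->u64s past them and nulling
 * out the tail of the entry, hence callers that see FSCK_DELETED_KEY must
 * not advance to the next key.
 */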
static int journal_validate_key(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				struct bkey_i *k,
				struct bkey_validate_context from,
				unsigned version, int big_endian)
{
	enum bch_validate_flags flags = from.flags;
	int write = flags & BCH_VALIDATE_write;
	void *next = vstruct_next(entry);
	int ret = 0;

	if (journal_entry_err_on(!k->k.u64s,
				 c, version, jset, entry,
				 journal_entry_bkey_u64s_0,
				 "k->u64s 0")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on((void *) bkey_next(k) >
				 (void *) vstruct_next(entry),
				 c, version, jset, entry,
				 journal_entry_bkey_past_end,
				 "extends past end of journal entry")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
				 c, version, jset, entry,
				 journal_entry_bkey_bad_format,
				 "bad format %u", k->k.format)) {
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (!write)
		bch2_bkey_compat(from.level, from.btree, version, big_endian,
				 write, NULL, bkey_to_packed(k));

	ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from);
	if (ret == -BCH_ERR_fsck_delete_bkey) {
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}
	if (ret)
		goto fsck_err;

	if (write)
		bch2_bkey_compat(from.level, from.btree, version, big_endian,
				 write, NULL, bkey_to_packed(k));
fsck_err:
	return ret;
}

static int journal_entry_btree_keys_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian,
					     struct bkey_validate_context from)
{
	struct bkey_i *k = entry->start;

	from.level	= entry->level;
	from.btree	= entry->btree_id;

	while (k != vstruct_last(entry)) {
		int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
		if (ret == FSCK_DELETED_KEY)
			continue;
		else if (ret)
			return ret;

		k = bkey_next(k);
	}

	return 0;
}

static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	bool first = true;

	jset_entry_for_each_key(entry, k) {
		if (!first) {
			prt_newline(out);
			bch2_prt_jset_entry_type(out, entry->type);
			prt_str(out, ": ");
		}
		bch2_btree_id_level_to_text(out, entry->btree_id, entry->level);
		prt_char(out, ' ');
		bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
		first = false;
	}
}

static int journal_entry_btree_root_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian,
					     struct bkey_validate_context from)
{
	struct bkey_i *k = entry->start;
	int ret = 0;

	from.root	= true;
	from.level	= entry->level + 1;
	from.btree	= entry->btree_id;

	if (journal_entry_err_on(!entry->u64s ||
				 le16_to_cpu(entry->u64s) != k->k.u64s,
				 c, version, jset, entry,
				 journal_entry_btree_root_bad_size,
				 "invalid btree root journal entry: wrong number of keys")) {
		void *next = vstruct_next(entry);
		/*
		 * we don't want to null out this jset_entry,
		 * just the contents, so that later we can tell
		 * we were _supposed_ to have a btree root
		 */
		entry->u64s = 0;
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
	if (ret == FSCK_DELETED_KEY)
		ret = 0;
fsck_err:
	return ret;
}

static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    struct bkey_validate_context from)
{
	/* obsolete, don't care: */
	return 0;
}

static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
}

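/*
 * Blacklist entries record journal sequence numbers (a single seq in v1, a
 * [start, end] range in v2) that may still exist on disk but must not be
 * replayed - roughly, sequences that were never properly flushed before an
 * unclean shutdown.
 */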
static int journal_entry_blacklist_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    struct bkey_validate_context from)
{
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
				 c, version, jset, entry,
				 journal_entry_blacklist_bad_size,
				 "invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
fsck_err:
	return ret;
}

static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_blacklist *bl =
		container_of(entry, struct jset_entry_blacklist, entry);

	prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
}

static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
					       struct jset *jset,
					       struct jset_entry *entry,
					       unsigned version, int big_endian,
					       struct bkey_validate_context from)
{
	struct jset_entry_blacklist_v2 *bl_entry;
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_bad_size,
				 "invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
				 le64_to_cpu(bl_entry->end),
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_start_past_end,
				 "invalid journal seq blacklist entry: start > end")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
out:
fsck_err:
	return ret;
}

static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
					       struct jset_entry *entry)
{
	struct jset_entry_blacklist_v2 *bl =
		container_of(entry, struct jset_entry_blacklist_v2, entry);

	prt_printf(out, "start=%llu end=%llu",
		   le64_to_cpu(bl->start),
		   le64_to_cpu(bl->end));
}

static int journal_entry_usage_validate(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					unsigned version, int big_endian,
					struct bkey_validate_context from)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u),
				 c, version, jset, entry,
				 journal_entry_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);

	prt_str(out, "type=");
	bch2_prt_fs_usage_type(out, u->entry.btree_id);
	prt_printf(out, " v=%llu", le64_to_cpu(u->v));
}

static int journal_entry_data_usage_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian,
					     struct bkey_validate_context from)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	struct printbuf err = PRINTBUF;
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u) ||
				 bytes < sizeof(*u) + u->r.nr_devs,
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: %s", err.buf)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}
out:
fsck_err:
	printbuf_exit(&err);
	return ret;
}

static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);

	bch2_replicas_entry_to_text(out, &u->r);
	prt_printf(out, "=%llu", le64_to_cpu(u->v));
}

static int journal_entry_clock_validate(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					unsigned version, int big_endian,
					struct bkey_validate_context from)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes != sizeof(*clock),
				 c, version, jset, entry,
				 journal_entry_clock_bad_size,
				 "bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(clock->rw > 1,
				 c, version, jset, entry,
				 journal_entry_clock_bad_rw,
				 "bad rw")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);

	prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time));
}

static int journal_entry_dev_usage_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    struct bkey_validate_context from)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	unsigned expected = sizeof(*u);
	int ret = 0;

	if (journal_entry_err_on(bytes < expected,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_size,
				 "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(u->pad,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_pad,
				 "bad pad")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);

	if (vstruct_bytes(entry) < sizeof(*u))
		return;

	prt_printf(out, "dev=%u", le32_to_cpu(u->dev));

	printbuf_indent_add(out, 2);
	for (i = 0; i < nr_types; i++) {
		prt_newline(out);
		bch2_prt_data_type(out, i);
		prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
			   le64_to_cpu(u->d[i].buckets),
			   le64_to_cpu(u->d[i].sectors),
			   le64_to_cpu(u->d[i].fragmented));
	}
	printbuf_indent_sub(out, 2);
}

static int journal_entry_log_validate(struct bch_fs *c,
				      struct jset *jset,
				      struct jset_entry *entry,
				      unsigned version, int big_endian,
				      struct bkey_validate_context from)
{
	return 0;
}

static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
				      struct jset_entry *entry)
{
	struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);

	prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d);
}

static int journal_entry_overwrite_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    struct bkey_validate_context from)
{
	from.flags = 0;
	return journal_entry_btree_keys_validate(c, jset, entry,
						 version, big_endian, from);
}

static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
						    struct jset *jset,
						    struct jset_entry *entry,
						    unsigned version, int big_endian,
						    struct bkey_validate_context from)
{
	return journal_entry_btree_keys_validate(c, jset, entry,
						 version, big_endian, from);
}

static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
						    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_datetime_validate(struct bch_fs *c,
					   struct jset *jset,
					   struct jset_entry *entry,
					   unsigned version, int big_endian,
					   struct bkey_validate_context from)
{
	unsigned bytes = vstruct_bytes(entry);
	unsigned expected = 16;
	int ret = 0;

	if (journal_entry_err_on(vstruct_bytes(entry) < expected,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_size,
				 "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}
fsck_err:
	return ret;
}

static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
					   struct jset_entry *entry)
{
	struct jset_entry_datetime *datetime =
		container_of(entry, struct jset_entry_datetime, entry);

	bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
}

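/*
 * Dispatch table for jset entry types: BCH_JSET_ENTRY_TYPES() is an x-macro
 * listing every entry type, so each type picks up the
 * journal_entry_<type>_validate()/_to_text() pair defined above.
 */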
struct jset_entry_ops {
	int (*validate)(struct bch_fs *, struct jset *,
			struct jset_entry *, unsigned, int,
			struct bkey_validate_context);
	void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};

static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)						\
	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
		.validate	= journal_entry_##f##_validate,	\
		.to_text	= journal_entry_##f##_to_text,	\
	},
	BCH_JSET_ENTRY_TYPES()
#undef x
};

int bch2_journal_entry_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	return entry->type < BCH_JSET_ENTRY_NR
		? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
							    version, big_endian, from)
		: 0;
}

void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
				struct jset_entry *entry)
{
	bch2_prt_jset_entry_type(out, entry->type);

	if (entry->type < BCH_JSET_ENTRY_NR) {
		prt_str(out, ": ");
		bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
	}
}

static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
				 enum bch_validate_flags flags)
{
	struct bkey_validate_context from = {
		.flags		= flags,
		.from		= BKEY_VALIDATE_journal,
		.journal_seq	= le64_to_cpu(jset->seq),
	};

	unsigned version = le32_to_cpu(jset->version);
	int ret = 0;

	vstruct_for_each(jset, entry) {
		from.journal_offset = (u64 *) entry - jset->_data;

		if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
					 c, version, jset, entry,
					 journal_entry_past_jset_end,
					 "journal entry extends past end of jset")) {
			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
			break;
		}

		ret = bch2_journal_entry_validate(c, jset, entry, version,
						  JSET_BIG_ENDIAN(jset), from);
		if (ret)
			break;
	}
fsck_err:
	return ret;
}

static int jset_validate(struct bch_fs *c,
			 struct bch_dev *ca,
			 struct jset *jset, u64 sector,
			 enum bch_validate_flags flags)
{
	struct bkey_validate_context from = {
		.flags		= flags,
		.from		= BKEY_VALIDATE_journal,
		.journal_seq	= le64_to_cpu(jset->seq),
	};
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	unsigned version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version),
				 c, version, jset, NULL,
				 jset_unsupported_version,
				 "%s sector %llu seq %llu: incompatible journal entry version %u.%u",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq),
				 BCH_VERSION_MAJOR(version),
				 BCH_VERSION_MINOR(version))) {
		/* don't try to continue: */
		return -EINVAL;
	}

	if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
				 c, version, jset, NULL,
				 jset_unknown_csum,
				 "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq),
				 JSET_CSUM_TYPE(jset)))
		ret = JOURNAL_ENTRY_BAD;

	/* last_seq is ignored when JSET_NO_FLUSH is true */
	if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
				 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
				 c, version, jset, NULL,
				 jset_last_seq_newer_than_seq,
				 "invalid journal entry: last_seq > seq (%llu > %llu)",
				 le64_to_cpu(jset->last_seq),
				 le64_to_cpu(jset->seq))) {
		jset->last_seq = jset->seq;
		return JOURNAL_ENTRY_BAD;
	}

	ret = jset_validate_entries(c, jset, flags);
fsck_err:
	return ret;
}

static int jset_validate_early(struct bch_fs *c,
			       struct bch_dev *ca,
			       struct jset *jset, u64 sector,
			       unsigned bucket_sectors_left,
			       unsigned sectors_read)
{
	struct bkey_validate_context from = {
		.from		= BKEY_VALIDATE_journal,
		.journal_seq	= le64_to_cpu(jset->seq),
	};
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	unsigned version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version),
				 c, version, jset, NULL,
				 jset_unsupported_version,
				 "%s sector %llu seq %llu: unknown journal entry version %u.%u",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq),
				 BCH_VERSION_MAJOR(version),
				 BCH_VERSION_MINOR(version))) {
		/* don't try to continue: */
		return -EINVAL;
	}

	size_t bytes = vstruct_bytes(jset);
	if (bytes > (sectors_read << 9) &&
	    sectors_read < bucket_sectors_left)
		return JOURNAL_ENTRY_REREAD;

	if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
				 c, version, jset, NULL,
				 jset_past_bucket_end,
				 "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq), bytes))
		le32_add_cpu(&jset->u64s,
			     -((bytes - (bucket_sectors_left << 9)) / 8));
fsck_err:
	return ret;
}

struct journal_read_buf {
	void		*data;
	size_t		size;
};

static int journal_read_buf_realloc(struct journal_read_buf *b,
				    size_t new_size)
{
	void *n;

	/* the bios are sized for this many pages, max: */
	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;

	new_size = roundup_pow_of_two(new_size);
	n = kvmalloc(new_size, GFP_KERNEL);
	if (!n)
		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;

	kvfree(b->data);
	b->data = n;
	b->size = new_size;
	return 0;
}

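/*
 * Scan one journal bucket: read as much as the buffer holds, walk the
 * journal entries within and feed each to journal_entry_add(). On a short
 * read we loop back (growing the buffer if needed); after a checksum error
 * we keep scanning at the next block boundary, since the entry's size field
 * can't be trusted.
 */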
static int journal_read_bucket(struct bch_dev *ca,
			       struct journal_read_buf *buf,
			       struct journal_list *jlist,
			       unsigned bucket)
{
	struct bch_fs *c = ca->fs;
	struct journal_device *ja = &ca->journal;
	struct jset *j = NULL;
	unsigned sectors, sectors_read = 0;
	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
	    end = offset + ca->mi.bucket_size;
	bool saw_bad = false, csum_good;
	struct printbuf err = PRINTBUF;
	int ret = 0;

	pr_debug("reading %u", bucket);

	while (offset < end) {
		if (!sectors_read) {
			struct bio *bio;
			unsigned nr_bvecs;
reread:
			sectors_read = min_t(unsigned,
					     end - offset, buf->size >> 9);
			nr_bvecs = buf_pages(buf->data, sectors_read << 9);

			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
			if (!bio)
				return -BCH_ERR_ENOMEM_journal_read_bucket;
			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);

			bio->bi_iter.bi_sector = offset;
			bch2_bio_map(bio, buf->data, sectors_read << 9);

			ret = submit_bio_wait(bio);
			kfree(bio);

			if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read,
					       "journal read error: sector %llu",
					       offset) ||
			    bch2_meta_read_fault("journal")) {
				/*
				 * We don't error out of the recovery process
				 * here, since the relevant journal entry may be
				 * found on a different device, and missing or
				 * no journal entries will be handled later
				 */
				goto out;
			}

			j = buf->data;
		}

		ret = jset_validate_early(c, ca, j, offset,
					  end - offset, sectors_read);
		switch (ret) {
		case 0:
			sectors = vstruct_sectors(j, c->block_bits);
			break;
		case JOURNAL_ENTRY_REREAD:
			if (vstruct_bytes(j) > buf->size) {
				ret = journal_read_buf_realloc(buf,
							       vstruct_bytes(j));
				if (ret)
					goto err;
			}
			goto reread;
		case JOURNAL_ENTRY_NONE:
			if (!saw_bad)
				goto out;
			/*
			 * On checksum error we don't really trust the size
			 * field of the journal entry we read, so try reading
			 * again at next block boundary:
			 */
			sectors = block_sectors(c);
			goto next_block;
		default:
			goto err;
		}

		if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
			ja->highest_seq_found = le64_to_cpu(j->seq);
			ja->cur_idx = bucket;
			ja->sectors_free = ca->mi.bucket_size -
				bucket_remainder(ca, offset) - sectors;
		}

		/*
		 * This happens sometimes if we don't have discards on -
		 * when we've partially overwritten a bucket with new
		 * journal entries. We don't need the rest of the
		 * bucket:
		 */
		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
			goto out;

		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

		enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
		struct bch_csum csum;
		csum_good = jset_csum_good(c, j, &csum);

		if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
				       "%s",
				       (printbuf_reset(&err),
					prt_str(&err, "journal "),
					bch2_csum_err_msg(&err, csum_type, j->csum, csum),
					err.buf)))
			saw_bad = true;

		ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
				   j->encrypted_start,
				   vstruct_end(j) - (void *) j->encrypted_start);
		bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));

		mutex_lock(&jlist->lock);
		ret = journal_entry_add(c, ca, (struct journal_ptr) {
					.csum_good	= csum_good,
					.dev		= ca->dev_idx,
					.bucket		= bucket,
					.bucket_offset	= offset -
						bucket_to_sector(ca, ja->buckets[bucket]),
					.sector		= offset,
					}, jlist, j);
		mutex_unlock(&jlist->lock);

		switch (ret) {
		case JOURNAL_ENTRY_ADD_OK:
			break;
		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
			break;
		default:
			goto err;
		}
next_block:
		pr_debug("next");
		offset		+= sectors;
		sectors_read	-= sectors;
		j = ((void *) j) + (sectors << 9);
	}

out:
	ret = 0;
err:
	printbuf_exit(&err);
	return ret;
}

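/*
 * Per-device read path, run out of the journal_list's parent closure: scan
 * every journal bucket on this device, reporting errors via jlist->ret.
 */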
static CLOSURE_CALLBACK(bch2_journal_read_device)
{
	closure_type(ja, struct journal_device, read);
	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
	struct bch_fs *c = ca->fs;
	struct journal_list *jlist =
		container_of(cl->parent, struct journal_list, cl);
	struct journal_read_buf buf = { NULL, 0 };
	unsigned i;
	int ret = 0;

	if (!ja->nr)
		goto out;

	ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
	if (ret)
		goto err;

	pr_debug("%u journal buckets", ja->nr);

	for (i = 0; i < ja->nr; i++) {
		ret = journal_read_bucket(ca, &buf, jlist, i);
		if (ret)
			goto err;
	}

	/*
	 * Set dirty_idx to indicate the entire journal is full and needs to be
	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
	 * pinned when it first runs:
	 */
	ja->discard_idx = ja->dirty_idx_ondisk =
		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
	kvfree(buf.data);
	percpu_ref_put(&ca->io_ref);
	closure_return(cl);
	return;
err:
	mutex_lock(&jlist->lock);
	jlist->ret = ret;
	mutex_unlock(&jlist->lock);
	goto out;
}

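/*
 * Read the journal from all devices and determine the range of entries to
 * replay: *last_seq is the oldest sequence number still needed, *start_seq
 * is one past the newest entry found, and entries in
 * [*blacklist_seq, *start_seq) were unflushed and will be blacklisted
 * rather than replayed.
 */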
int bch2_journal_read(struct bch_fs *c,
		      u64 *last_seq,
		      u64 *blacklist_seq,
		      u64 *start_seq)
{
	struct journal_list jlist;
	struct journal_replay *i, **_i, *prev = NULL;
	struct genradix_iter radix_iter;
	struct printbuf buf = PRINTBUF;
	bool degraded = false, last_write_torn = false;
	u64 seq;
	int ret = 0;

	closure_init_stack(&jlist.cl);
	mutex_init(&jlist.lock);
	jlist.last_seq = 0;
	jlist.ret = 0;

	for_each_member_device(c, ca) {
		if (!c->opts.fsck &&
		    !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
			continue;

		if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
		     ca->mi.state == BCH_MEMBER_STATE_ro) &&
		    percpu_ref_tryget(&ca->io_ref))
			closure_call(&ca->journal.read,
				     bch2_journal_read_device,
				     system_unbound_wq,
				     &jlist.cl);
		else
			degraded = true;
	}

	closure_sync(&jlist.cl);

	if (jlist.ret)
		return jlist.ret;

	*last_seq	= 0;
	*start_seq	= 0;
	*blacklist_seq	= 0;

	/*
	 * Find most recent flush entry, and ignore newer non flush entries -
	 * those entries will be blacklisted:
	 */
	genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		if (!*start_seq)
			*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;

		if (JSET_NO_FLUSH(&i->j)) {
			i->ignore_blacklisted = true;
			continue;
		}

		if (!last_write_torn && !i->csum_good) {
			last_write_torn = true;
			i->ignore_blacklisted = true;
			continue;
		}

		struct bkey_validate_context from = {
			.from		= BKEY_VALIDATE_journal,
			.journal_seq	= le64_to_cpu(i->j.seq),
		};
		if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
					 c, le32_to_cpu(i->j.version), &i->j, NULL,
					 jset_last_seq_newer_than_seq,
					 "invalid journal entry: last_seq > seq (%llu > %llu)",
					 le64_to_cpu(i->j.last_seq),
					 le64_to_cpu(i->j.seq)))
			i->j.last_seq = i->j.seq;

		*last_seq	= le64_to_cpu(i->j.last_seq);
		*blacklist_seq	= le64_to_cpu(i->j.seq) + 1;
		break;
	}

	if (!*start_seq) {
		bch_info(c, "journal read done, but no entries found");
		return 0;
	}

	if (!*last_seq) {
		fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
			 "journal read done, but no entries found after dropping non-flushes");
		return 0;
	}

	bch_info(c, "journal read done, replaying entries %llu-%llu",
		 *last_seq, *blacklist_seq - 1);

	if (*start_seq != *blacklist_seq)
		bch_info(c, "dropped unflushed entries %llu-%llu",
			 *blacklist_seq, *start_seq - 1);

	/* Drop blacklisted entries and entries older than last_seq: */
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		seq = le64_to_cpu(i->j.seq);
		if (seq < *last_seq) {
			journal_replay_free(c, i, false);
			continue;
		}

		if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
			fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
				    jset_seq_blacklisted,
				    "found blacklisted journal entry %llu", seq);
			i->ignore_blacklisted = true;
		}
	}

	/* Check for missing entries: */
	seq = *last_seq;
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		BUG_ON(seq > le64_to_cpu(i->j.seq));

		while (seq < le64_to_cpu(i->j.seq)) {
			u64 missing_start, missing_end;
			struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

			while (seq < le64_to_cpu(i->j.seq) &&
			       bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (seq == le64_to_cpu(i->j.seq))
				break;

			missing_start = seq;

			while (seq < le64_to_cpu(i->j.seq) &&
			       !bch2_journal_seq_is_blacklisted(c, seq, false))
				seq++;

			if (prev) {
				bch2_journal_ptrs_to_text(&buf1, c, prev);
				prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
			} else
				prt_printf(&buf1, "(none)");
			bch2_journal_ptrs_to_text(&buf2, c, i);

			missing_end = seq - 1;
			fsck_err(c, journal_entries_missing,
				 "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
				 " prev at %s\n"
				 " next at %s, continue?",
				 missing_start, missing_end,
				 *last_seq, *blacklist_seq - 1,
				 buf1.buf, buf2.buf);

			printbuf_exit(&buf1);
			printbuf_exit(&buf2);
		}

		prev = i;
		seq++;
	}

	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		struct bch_replicas_padded replicas = {
			.e.data_type = BCH_DATA_journal,
			.e.nr_devs = 0,
			.e.nr_required = 1,
		};

		i = *_i;
		if (journal_replay_ignore(i))
			continue;

		darray_for_each(i->ptrs, ptr) {
			struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);

			if (!ptr->csum_good)
				bch_err_dev_offset(ca, ptr->sector,
						   "invalid journal checksum, seq %llu%s",
						   le64_to_cpu(i->j.seq),
						   i->csum_good ? " (had good copy on another device)" : "");
		}

		ret = jset_validate(c,
				    bch2_dev_have_ref(c, i->ptrs.data[0].dev),
				    &i->j,
				    i->ptrs.data[0].sector,
				    READ);
		if (ret)
			goto err;

		darray_for_each(i->ptrs, ptr)
			replicas_entry_add_dev(&replicas.e, ptr->dev);

		bch2_replicas_entry_sort(&replicas.e);

		printbuf_reset(&buf);
		bch2_replicas_entry_to_text(&buf, &replicas.e);

		if (!degraded &&
		    !bch2_replicas_marked(c, &replicas.e) &&
		    (le64_to_cpu(i->j.seq) == *last_seq ||
		     fsck_err(c, journal_entry_replicas_not_marked,
			      "superblock not marked as containing replicas for journal entry %llu\n %s",
			      le64_to_cpu(i->j.seq), buf.buf))) {
			ret = bch2_mark_replicas(c, &replicas.e);
			if (ret)
				goto err;
		}
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/* journal write: */

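/*
 * If the current journal bucket on a device doesn't have room for this
 * write but another bucket is available, advance to it so that
 * __journal_write_alloc() can be retried.
 */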
static void journal_advance_devs_to_next_bucket(struct journal *j,
						struct dev_alloc_list *devs,
						unsigned sectors, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);

	darray_for_each(*devs, i) {
		struct bch_dev *ca = rcu_dereference(c->devs[*i]);
		if (!ca)
			continue;

		struct journal_device *ja = &ca->journal;

		if (sectors > ja->sectors_free &&
		    sectors <= ca->mi.bucket_size &&
		    bch2_journal_dev_buckets_available(j, ja,
						       journal_space_discarded)) {
			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
			ja->sectors_free = ca->mi.bucket_size;

			/*
			 * ja->bucket_seq[ja->cur_idx] must always have
			 * something sensible:
			 */
			ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
		}
	}
}

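/*
 * Add pointers to the journal write for devices in @devs, in stripe order,
 * until @replicas_want worth of durability has been accumulated; devices
 * without room in their current bucket, or already present in the key, are
 * skipped.
 */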
static void __journal_write_alloc(struct journal *j,
				  struct journal_buf *w,
				  struct dev_alloc_list *devs,
				  unsigned sectors,
				  unsigned *replicas,
				  unsigned replicas_want)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);

	darray_for_each(*devs, i) {
		struct bch_dev *ca = rcu_dereference(c->devs[*i]);
		if (!ca)
			continue;

		struct journal_device *ja = &ca->journal;

		/*
		 * Check that we can use this device, and aren't already using
		 * it:
		 */
		if (!ca->mi.durability ||
		    ca->mi.state != BCH_MEMBER_STATE_rw ||
		    !ja->nr ||
		    bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
		    sectors > ja->sectors_free)
			continue;

		bch2_dev_stripe_increment(ca, &j->wp.stripe);

		bch2_bkey_append_ptr(&w->key,
			(struct bch_extent_ptr) {
				.offset	= bucket_to_sector(ca,
						ja->buckets[ja->cur_idx]) +
					ca->mi.bucket_size -
					ja->sectors_free,
				.dev	= ca->dev_idx,
			});

		ja->sectors_free -= sectors;
		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);

		*replicas += ca->mi.durability;

		if (*replicas >= replicas_want)
			break;
	}
}

/**
 * journal_write_alloc - decide where to write next journal entry
 *
 * @j:		journal object
 * @w:		journal buf (entry to be written)
 *
 * Returns: 0 on success, or -BCH_ERR_insufficient_journal_devices on failure
 */
static int journal_write_alloc(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_devs_mask devs;
	struct dev_alloc_list devs_sorted;
	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
	unsigned target = c->opts.metadata_target ?:
		c->opts.foreground_target;
	unsigned replicas = 0, replicas_want =
		READ_ONCE(c->opts.metadata_replicas);
	unsigned replicas_need = min_t(unsigned, replicas_want,
				       READ_ONCE(c->opts.metadata_replicas_required));
	bool advance_done = false;

	rcu_read_lock();

	/* We might run more than once if we have to stop and do discards: */
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key));
	bkey_for_each_ptr(ptrs, p) {
		struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev);
		if (ca)
			replicas += ca->mi.durability;
	}

retry_target:
	devs = target_rw_devs(c, BCH_DATA_journal, target);
	devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
retry_alloc:
	__journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want);

	if (likely(replicas >= replicas_want))
		goto done;

	if (!advance_done) {
		journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq);
		advance_done = true;
		goto retry_alloc;
	}

	if (replicas < replicas_want && target) {
		/* Retry from all devices: */
		target = 0;
		advance_done = false;
		goto retry_target;
	}
done:
	rcu_read_unlock();

	BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);

	return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
}

static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);

	/* we aren't holding j->lock: */
	unsigned new_size = READ_ONCE(j->buf_size_want);
	void *new_buf;

	if (buf->buf_size >= new_size)
		return;

	size_t btree_write_buffer_size = new_size / 64;

	if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
		return;

	new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
	if (!new_buf)
		return;

	memcpy(new_buf, buf->data, buf->buf_size);

	spin_lock(&j->lock);
	swap(buf->data,		new_buf);
	swap(buf->buf_size,	new_size);
	spin_unlock(&j->lock);

	kvfree(new_buf);
}

static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
{
	return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
}

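/*
 * Journal writes complete out of order; on-disk state (seq_ondisk,
 * flushed_seq_ondisk, the unwritten_idx ring) is only advanced below, in
 * sequence number order, once every preceding buf has also finished.
 */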
static CLOSURE_CALLBACK(journal_write_done)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_replicas_padded replicas;
	union journal_res_state old, new;
	u64 seq = le64_to_cpu(w->data->seq);
	int err = 0;

	bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
			       ? j->flush_write_time
			       : j->noflush_write_time, j->write_start_time);

	if (!w->devs_written.nr) {
		bch_err(c, "unable to write journal to sufficient devices");
		err = -EIO;
	} else {
		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
					 w->devs_written);
		if (bch2_mark_replicas(c, &replicas.e))
			err = -EIO;
	}

	if (err)
		bch2_fatal_error(c);

	closure_debug_destroy(cl);

	spin_lock(&j->lock);
	if (seq >= j->pin.front)
		journal_seq_pin(j, seq)->devs = w->devs_written;
	if (err && (!j->err_seq || seq < j->err_seq))
		j->err_seq	= seq;
	w->write_done = true;

	bool completed = false;

	for (seq = journal_last_unwritten_seq(j);
	     seq <= journal_cur_seq(j);
	     seq++) {
		w = j->buf + (seq & JOURNAL_BUF_MASK);
		if (!w->write_done)
			break;

		if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
			j->flushed_seq_ondisk = seq;
			j->last_seq_ondisk = w->last_seq;

			bch2_do_discards(c);
			closure_wake_up(&c->freelist_wait);
			bch2_reset_alloc_cursors(c);
		}

		j->seq_ondisk = seq;

		/*
		 * Updating last_seq_ondisk may let bch2_journal_reclaim_work()
		 * discard more buckets:
		 *
		 * Must come before signaling write completion, for
		 * bch2_fs_journal_stop():
		 */
		if (j->watermark != BCH_WATERMARK_stripe)
			journal_reclaim_kick(&c->journal);

		old.v = atomic64_read(&j->reservations.counter);
		do {
			new.v = old.v;
			BUG_ON(journal_state_count(new, new.unwritten_idx));
			BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));

			new.unwritten_idx++;
		} while (!atomic64_try_cmpxchg(&j->reservations.counter,
					       &old.v, new.v));

		closure_wake_up(&w->wait);
		completed = true;
	}

	if (completed) {
		bch2_journal_reclaim_fast(j);
		bch2_journal_space_available(j);

		track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);

		journal_wake(j);
	}

	if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
	    new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
		struct journal_buf *buf = journal_cur_buf(j);
		long delta = buf->expires - jiffies;

		/*
		 * We don't close a journal entry to write it while there's
		 * previous entries still in flight - the current journal entry
		 * might want to be written now:
		 */
		mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
	}

	/*
	 * We don't typically trigger journal writes from here - the next
	 * journal write will be triggered immediately after the previous one
	 * is allocated, in bch2_journal_write() - but the journal write error
	 * path is special:
	 */
	bch2_journal_do_writes(j);
	spin_unlock(&j->lock);
}

static void journal_write_endio(struct bio *bio)
{
	struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
	struct bch_dev *ca = jbio->ca;
	struct journal *j = &ca->fs->journal;
	struct journal_buf *w = j->buf + jbio->buf_idx;

	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
			       "error writing journal entry %llu: %s",
			       le64_to_cpu(w->data->seq),
			       bch2_blk_status_to_str(bio->bi_status)) ||
	    bch2_meta_write_fault("journal")) {
		unsigned long flags;

		spin_lock_irqsave(&j->err_lock, flags);
		bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
		spin_unlock_irqrestore(&j->err_lock, flags);
	}

	closure_put(&w->io);
	percpu_ref_put(&ca->io_ref);
}

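/*
 * Issue the write to every device the key points to: flush writes get
 * REQ_FUA, plus REQ_PREFLUSH when we're not issuing separate flushes
 * beforehand.
 */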
static CLOSURE_CALLBACK(journal_write_submit)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	unsigned sectors = vstruct_sectors(w->data, c->block_bits);

	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
		struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE);
		if (!ca) {
			/* XXX: fix this */
			bch_err(c, "missing device for journal write\n");
			continue;
		}

		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
			     sectors);

		struct journal_device *ja = &ca->journal;
		struct bio *bio = &ja->bio[w->idx]->bio;
		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
		bio->bi_iter.bi_sector	= ptr->offset;
		bio->bi_end_io		= journal_write_endio;
		bio->bi_private		= ca;

		BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
		ca->prev_journal_sector = bio->bi_iter.bi_sector;

		if (!JSET_NO_FLUSH(w->data))
			bio->bi_opf |= REQ_FUA;
		if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
			bio->bi_opf |= REQ_PREFLUSH;

		bch2_bio_map(bio, w->data, sectors << 9);

		trace_and_count(c, journal_write, bio);
		closure_bio_submit(bio, cl);

		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
	}

	continue_at(cl, journal_write_done, j->wq);
}

static CLOSURE_CALLBACK(journal_write_preflush)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);

	if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
		spin_lock(&j->lock);
		if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
			closure_wait(&j->async_wait, cl);
			spin_unlock(&j->lock);
			continue_at(cl, journal_write_preflush, j->wq);
			return;
		}
		spin_unlock(&j->lock);
	}

	if (w->separate_flush) {
		for_each_rw_member(c, ca) {
			percpu_ref_get(&ca->io_ref);

			struct journal_device *ja = &ca->journal;
			struct bio *bio = &ja->bio[w->idx]->bio;
			bio_reset(bio, ca->disk_sb.bdev,
				  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
			bio->bi_end_io		= journal_write_endio;
			bio->bi_private		= ca;
			closure_bio_submit(bio, cl);
		}

		continue_at(cl, journal_write_submit, j->wq);
	} else {
		/*
		 * no need to punt to another work item if we're not waiting on
		 * preflushes
		 */
		journal_write_submit(&cl->work);
	}
}

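/*
 * Finalize the in-memory journal entry before it's written: compact out
 * empty jset entries, move write buffer keys into the btree write buffer,
 * append btree roots, datetime and superblock bookkeeping entries, then
 * checksum and (if enabled) encrypt.
 */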
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct jset_entry *start, *end;
	struct jset *jset = w->data;
	struct journal_keys_to_wb wb = { NULL };
	unsigned sectors, bytes, u64s;
	unsigned long btree_roots_have = 0;
	bool validate_before_checksum = false;
	u64 seq = le64_to_cpu(jset->seq);
	int ret;

	/*
	 * Simple compaction, dropping empty jset_entries (from journal
	 * reservations that weren't fully used) and merging jset_entries that
	 * can be.
	 *
	 * If we wanted to be really fancy here, we could sort all the keys in
	 * the jset and drop keys that were overwritten - probably not worth it:
	 */
	vstruct_for_each(jset, i) {
		unsigned u64s = le16_to_cpu(i->u64s);

		/* Empty entry: */
		if (!u64s)
			continue;

		/*
		 * New btree roots are set by journalling them; when the journal
		 * entry gets written we have to propagate them to
		 * c->btree_roots
		 *
		 * But, every journal entry we write has to contain all the
		 * btree roots (at least for now); so after we copy btree roots
		 * to c->btree_roots we have to get any missing btree roots and
		 * add them to this journal entry:
		 */
		switch (i->type) {
		case BCH_JSET_ENTRY_btree_root:
			bch2_journal_entry_to_btree_root(c, i);
			__set_bit(i->btree_id, &btree_roots_have);
			break;
		case BCH_JSET_ENTRY_write_buffer_keys:
			EBUG_ON(!w->need_flush_to_write_buffer);

			if (!wb.wb)
				bch2_journal_keys_to_write_buffer_start(c, &wb, seq);

			jset_entry_for_each_key(i, k) {
				ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
				if (ret) {
					bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
							    bch2_err_str(ret));
					bch2_journal_keys_to_write_buffer_end(c, &wb);
					return ret;
				}
			}
			i->type = BCH_JSET_ENTRY_btree_keys;
			break;
		}
	}

	if (wb.wb) {
		ret = bch2_journal_keys_to_write_buffer_end(c, &wb);
		if (ret) {
			bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s",
					    bch2_err_str(ret));
			return ret;
		}
	}

	spin_lock(&c->journal.lock);
	w->need_flush_to_write_buffer = false;
	spin_unlock(&c->journal.lock);

	start = end = vstruct_last(jset);

	end	= bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);

	struct jset_entry_datetime *d =
		container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
	d->entry.type	= BCH_JSET_ENTRY_datetime;
	d->seconds	= cpu_to_le64(ktime_get_real_seconds());

	bch2_journal_super_entries_add_common(c, &end, seq);
	u64s	= (u64 *) end - (u64 *) start;

	WARN_ON(u64s > j->entry_u64s_reserved);

	le32_add_cpu(&jset->u64s, u64s);

	sectors = vstruct_sectors(jset, c->block_bits);
	bytes	= vstruct_bytes(jset);

	if (sectors > w->sectors) {
		bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
				    vstruct_bytes(jset), w->sectors << 9,
				    u64s, w->u64s_reserved, j->entry_u64s_reserved);
		return -EINVAL;
	}

	jset->magic		= cpu_to_le64(jset_magic(c));
	jset->version		= cpu_to_le32(c->sb.version);

	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));

	if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
		j->last_empty_seq = seq;

	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
		validate_before_checksum = true;

	if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
		validate_before_checksum = true;

	if (validate_before_checksum &&
	    (ret = jset_validate(c, NULL, jset, 0, WRITE)))
		return ret;

	ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
			   jset->encrypted_start,
			   vstruct_end(jset) - (void *) jset->encrypted_start);
	if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret)))
		return ret;

	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
				  journal_nonce(jset), jset);

	if (!validate_before_checksum &&
	    (ret = jset_validate(c, NULL, jset, 0, WRITE)))
		return ret;

	memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
	return 0;
}

static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	int error = bch2_journal_error(j);

	/*
	 * If the journal is in an error state - we did an emergency shutdown -
	 * we prefer to continue doing journal writes. We just mark them as
	 * noflush so they'll never be used, but they'll still be visible to the
	 * list_journal tool - this helps in debugging.
	 *
	 * There's a caveat: the first journal write after marking the
	 * superblock dirty must always be a flush write, because on startup
	 * from a clean shutdown we didn't necessarily read the journal and the
	 * new journal write might overwrite whatever was in the journal
	 * previously - we can't leave the journal without any flush writes in
	 * it.
	 *
	 * So if we're in an error state, and we're still starting up, we don't
	 * write anything at all.
	 */
	if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
		return -EIO;

	if (error ||
	    w->noflush ||
	    (!w->must_flush &&
	     time_before(jiffies, j->last_flush_write +
			 msecs_to_jiffies(c->opts.journal_flush_delay)) &&
	     test_bit(JOURNAL_may_skip_flush, &j->flags))) {
		w->noflush = true;
		SET_JSET_NO_FLUSH(w->data, true);
		w->data->last_seq	= 0;
		w->last_seq		= 0;

		j->nr_noflush_writes++;
	} else {
		w->must_flush = true;
		j->last_flush_write = jiffies;
		j->nr_flush_writes++;
		clear_bit(JOURNAL_need_flush_write, &j->flags);
	}

	return 0;
}

CLOSURE_CALLBACK(bch2_journal_write)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_replicas_padded replicas;
	unsigned nr_rw_members = 0;
	int ret;

	for_each_rw_member(c, ca)
		nr_rw_members++;

	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
	BUG_ON(!w->write_started);
	BUG_ON(w->write_allocated);
	BUG_ON(w->write_done);

	j->write_start_time = local_clock();

	spin_lock(&j->lock);
	if (nr_rw_members > 1)
		w->separate_flush = true;

	ret = bch2_journal_write_pick_flush(j, w);
	spin_unlock(&j->lock);
	if (ret)
		goto err;

	mutex_lock(&j->buf_lock);
	journal_buf_realloc(j, w);

	ret = bch2_journal_write_prep(j, w);
	mutex_unlock(&j->buf_lock);
	if (ret)
		goto err;

	j->entry_bytes_written += vstruct_bytes(w->data);

	while (1) {
		spin_lock(&j->lock);
		ret = journal_write_alloc(j, w);
		if (!ret || !j->can_discard)
			break;

		spin_unlock(&j->lock);
		bch2_journal_do_discards(j);
	}

	if (ret && !bch2_journal_error(j)) {
		struct printbuf buf = PRINTBUF;
		buf.atomic++;

		prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
			   le64_to_cpu(w->data->seq),
			   vstruct_sectors(w->data, c->block_bits),
			   bch2_err_str(ret));
		__bch2_journal_debug_to_text(&buf, j);
		spin_unlock(&j->lock);
		bch2_print_string_as_lines(KERN_ERR, buf.buf);
		printbuf_exit(&buf);
	}
	if (ret)
		goto err;

	/*
	 * write is allocated, no longer need to account for it in
	 * bch2_journal_space_available():
	 */
	w->sectors = 0;
	w->write_allocated = true;

	/*
	 * journal entry has been compacted and allocated, recalculate space
	 * available:
	 */
	bch2_journal_space_available(j);
	bch2_journal_do_writes(j);
	spin_unlock(&j->lock);

	w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));

	if (c->opts.nochanges)
		goto no_io;

	/*
	 * Mark journal replicas before we submit the write to guarantee
	 * recovery will find the journal entries after a crash.
	 */
	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
				 w->devs_written);
	ret = bch2_mark_replicas(c, &replicas.e);
	if (ret)
		goto err;

	if (!JSET_NO_FLUSH(w->data))
		continue_at(cl, journal_write_preflush, j->wq);
	else
		continue_at(cl, journal_write_submit, j->wq);
	return;
no_io:
	continue_at(cl, journal_write_done, j->wq);
	return;
err:
	bch2_fatal_error(c);
	continue_at(cl, journal_write_done, j->wq);
}