1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "btree_io.h" 6 #include "btree_update_interior.h" 7 #include "btree_write_buffer.h" 8 #include "buckets.h" 9 #include "checksum.h" 10 #include "disk_groups.h" 11 #include "error.h" 12 #include "journal.h" 13 #include "journal_io.h" 14 #include "journal_reclaim.h" 15 #include "journal_seq_blacklist.h" 16 #include "replicas.h" 17 #include "sb-clean.h" 18 #include "trace.h" 19 20 #include <linux/ioprio.h> 21 #include <linux/string_choices.h> 22 23 void bch2_journal_pos_from_member_info_set(struct bch_fs *c) 24 { 25 lockdep_assert_held(&c->sb_lock); 26 27 for_each_member_device(c, ca) { 28 struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); 29 30 m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); 31 m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); 32 } 33 } 34 35 void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) 36 { 37 mutex_lock(&c->sb_lock); 38 for_each_member_device(c, ca) { 39 struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); 40 41 unsigned idx = le32_to_cpu(m.last_journal_bucket); 42 if (idx < ca->journal.nr) 43 ca->journal.cur_idx = idx; 44 unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); 45 if (offset <= ca->mi.bucket_size) 46 ca->journal.sectors_free = ca->mi.bucket_size - offset; 47 } 48 mutex_unlock(&c->sb_lock); 49 } 50 51 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 52 struct journal_replay *j) 53 { 54 darray_for_each(j->ptrs, i) { 55 if (i != j->ptrs.data) 56 prt_printf(out, " "); 57 prt_printf(out, "%u:%u:%u (sector %llu)", 58 i->dev, i->bucket, i->bucket_offset, i->sector); 59 } 60 } 61 62 static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, 63 struct journal_replay *j) 64 { 65 prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); 66 67 
bch2_journal_ptrs_to_text(out, c, j); 68 69 for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { 70 struct jset_entry_datetime *datetime = 71 container_of(entry, struct jset_entry_datetime, entry); 72 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 73 break; 74 } 75 } 76 77 static struct nonce journal_nonce(const struct jset *jset) 78 { 79 return (struct nonce) {{ 80 [0] = 0, 81 [1] = ((__le32 *) &jset->seq)[0], 82 [2] = ((__le32 *) &jset->seq)[1], 83 [3] = BCH_NONCE_JOURNAL, 84 }}; 85 } 86 87 static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) 88 { 89 if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { 90 *csum = (struct bch_csum) {}; 91 return false; 92 } 93 94 *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); 95 return !bch2_crc_cmp(j->csum, *csum); 96 } 97 98 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) 99 { 100 return (seq - c->journal_entries_base_seq) & (~0U >> 1); 101 } 102 103 static void __journal_replay_free(struct bch_fs *c, 104 struct journal_replay *i) 105 { 106 struct journal_replay **p = 107 genradix_ptr(&c->journal_entries, 108 journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); 109 110 BUG_ON(*p != i); 111 *p = NULL; 112 kvfree(i); 113 } 114 115 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) 116 { 117 if (blacklisted) 118 i->ignore_blacklisted = true; 119 else 120 i->ignore_not_dirty = true; 121 122 if (!c->opts.read_entire_journal) 123 __journal_replay_free(c, i); 124 } 125 126 struct journal_list { 127 struct closure cl; 128 u64 last_seq; 129 struct mutex lock; 130 int ret; 131 }; 132 133 #define JOURNAL_ENTRY_ADD_OK 0 134 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 135 136 /* 137 * Given a journal entry we just read, add it to the list of journal entries to 138 * be replayed: 139 */ 140 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, 141 struct journal_ptr entry_ptr, 142 struct 
journal_list *jlist, struct jset *j) 143 { 144 struct genradix_iter iter; 145 struct journal_replay **_i, *i, *dup; 146 size_t bytes = vstruct_bytes(j); 147 u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; 148 struct printbuf buf = PRINTBUF; 149 int ret = JOURNAL_ENTRY_ADD_OK; 150 151 if (!c->journal.oldest_seq_found_ondisk || 152 le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) 153 c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); 154 155 /* Is this entry older than the range we need? */ 156 if (!c->opts.read_entire_journal && 157 le64_to_cpu(j->seq) < jlist->last_seq) 158 return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; 159 160 /* 161 * genradixes are indexed by a ulong, not a u64, so we can't index them 162 * by sequence number directly: Assume instead that they will all fall 163 * within the range of +-2billion of the filrst one we find. 164 */ 165 if (!c->journal_entries_base_seq) 166 c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); 167 168 /* Drop entries we don't need anymore */ 169 if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { 170 genradix_for_each_from(&c->journal_entries, iter, _i, 171 journal_entry_radix_idx(c, jlist->last_seq)) { 172 i = *_i; 173 174 if (journal_replay_ignore(i)) 175 continue; 176 177 if (le64_to_cpu(i->j.seq) >= last_seq) 178 break; 179 180 journal_replay_free(c, i, false); 181 } 182 } 183 184 jlist->last_seq = max(jlist->last_seq, last_seq); 185 186 _i = genradix_ptr_alloc(&c->journal_entries, 187 journal_entry_radix_idx(c, le64_to_cpu(j->seq)), 188 GFP_KERNEL); 189 if (!_i) 190 return -BCH_ERR_ENOMEM_journal_entry_add; 191 192 /* 193 * Duplicate journal entries? 
If so we want the one that didn't have a 194 * checksum error: 195 */ 196 dup = *_i; 197 if (dup) { 198 bool identical = bytes == vstruct_bytes(&dup->j) && 199 !memcmp(j, &dup->j, bytes); 200 bool not_identical = !identical && 201 entry_ptr.csum_good && 202 dup->csum_good; 203 204 bool same_device = false; 205 darray_for_each(dup->ptrs, ptr) 206 if (ptr->dev == ca->dev_idx) 207 same_device = true; 208 209 ret = darray_push(&dup->ptrs, entry_ptr); 210 if (ret) 211 goto out; 212 213 bch2_journal_replay_to_text(&buf, c, dup); 214 215 fsck_err_on(same_device, 216 c, journal_entry_dup_same_device, 217 "duplicate journal entry on same device\n %s", 218 buf.buf); 219 220 fsck_err_on(not_identical, 221 c, journal_entry_replicas_data_mismatch, 222 "found duplicate but non identical journal entries\n %s", 223 buf.buf); 224 225 if (entry_ptr.csum_good && !identical) 226 goto replace; 227 228 goto out; 229 } 230 replace: 231 i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 232 if (!i) 233 return -BCH_ERR_ENOMEM_journal_entry_add; 234 235 darray_init(&i->ptrs); 236 i->csum_good = entry_ptr.csum_good; 237 i->ignore_blacklisted = false; 238 i->ignore_not_dirty = false; 239 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 240 241 if (dup) { 242 /* The first ptr should represent the jset we kept: */ 243 darray_for_each(dup->ptrs, ptr) 244 darray_push(&i->ptrs, *ptr); 245 __journal_replay_free(c, dup); 246 } else { 247 darray_push(&i->ptrs, entry_ptr); 248 } 249 250 *_i = i; 251 out: 252 fsck_err: 253 printbuf_exit(&buf); 254 return ret; 255 } 256 257 /* this fills in a range with empty jset_entries: */ 258 static void journal_entry_null_range(void *start, void *end) 259 { 260 struct jset_entry *entry; 261 262 for (entry = start; entry != end; entry = vstruct_next(entry)) 263 memset(entry, 0, sizeof(*entry)); 264 } 265 266 #define JOURNAL_ENTRY_REREAD 5 267 #define JOURNAL_ENTRY_NONE 6 268 #define JOURNAL_ENTRY_BAD 7 269 270 static void 
journal_entry_err_msg(struct printbuf *out, 271 u32 version, 272 struct jset *jset, 273 struct jset_entry *entry) 274 { 275 prt_str(out, "invalid journal entry, version="); 276 bch2_version_to_text(out, version); 277 278 if (entry) { 279 prt_str(out, " type="); 280 bch2_prt_jset_entry_type(out, entry->type); 281 } 282 283 if (!jset) { 284 prt_printf(out, " in superblock"); 285 } else { 286 287 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); 288 289 if (entry) 290 prt_printf(out, " offset=%zi/%u", 291 (u64 *) entry - jset->_data, 292 le32_to_cpu(jset->u64s)); 293 } 294 295 prt_str(out, ": "); 296 } 297 298 #define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ 299 ({ \ 300 struct printbuf _buf = PRINTBUF; \ 301 \ 302 journal_entry_err_msg(&_buf, version, jset, entry); \ 303 prt_printf(&_buf, msg, ##__VA_ARGS__); \ 304 \ 305 switch (from.flags & BCH_VALIDATE_write) { \ 306 case READ: \ 307 mustfix_fsck_err(c, _err, "%s", _buf.buf); \ 308 break; \ 309 case WRITE: \ 310 bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ 311 bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ 312 if (bch2_fs_inconsistent(c)) { \ 313 ret = -BCH_ERR_fsck_errors_not_fixed; \ 314 goto fsck_err; \ 315 } \ 316 break; \ 317 } \ 318 \ 319 printbuf_exit(&_buf); \ 320 true; \ 321 }) 322 323 #define journal_entry_err_on(cond, ...) \ 324 ((cond) ? 
journal_entry_err(__VA_ARGS__) : false) 325 326 #define FSCK_DELETED_KEY 5 327 328 static int journal_validate_key(struct bch_fs *c, 329 struct jset *jset, 330 struct jset_entry *entry, 331 struct bkey_i *k, 332 struct bkey_validate_context from, 333 unsigned version, int big_endian) 334 { 335 enum bch_validate_flags flags = from.flags; 336 int write = flags & BCH_VALIDATE_write; 337 void *next = vstruct_next(entry); 338 int ret = 0; 339 340 if (journal_entry_err_on(!k->k.u64s, 341 c, version, jset, entry, 342 journal_entry_bkey_u64s_0, 343 "k->u64s 0")) { 344 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 345 journal_entry_null_range(vstruct_next(entry), next); 346 return FSCK_DELETED_KEY; 347 } 348 349 if (journal_entry_err_on((void *) bkey_next(k) > 350 (void *) vstruct_next(entry), 351 c, version, jset, entry, 352 journal_entry_bkey_past_end, 353 "extends past end of journal entry")) { 354 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 355 journal_entry_null_range(vstruct_next(entry), next); 356 return FSCK_DELETED_KEY; 357 } 358 359 if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, 360 c, version, jset, entry, 361 journal_entry_bkey_bad_format, 362 "bad format %u", k->k.format)) { 363 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 364 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 365 journal_entry_null_range(vstruct_next(entry), next); 366 return FSCK_DELETED_KEY; 367 } 368 369 if (!write) 370 bch2_bkey_compat(from.level, from.btree, version, big_endian, 371 write, NULL, bkey_to_packed(k)); 372 373 ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from); 374 if (ret == -BCH_ERR_fsck_delete_bkey) { 375 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 376 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 377 journal_entry_null_range(vstruct_next(entry), next); 378 return FSCK_DELETED_KEY; 379 } 380 if (ret) 381 goto fsck_err; 382 383 if (write) 384 bch2_bkey_compat(from.level, from.btree, version, big_endian, 385 write, NULL, 
bkey_to_packed(k)); 386 fsck_err: 387 return ret; 388 } 389 390 static int journal_entry_btree_keys_validate(struct bch_fs *c, 391 struct jset *jset, 392 struct jset_entry *entry, 393 unsigned version, int big_endian, 394 struct bkey_validate_context from) 395 { 396 struct bkey_i *k = entry->start; 397 398 from.level = entry->level; 399 from.btree = entry->btree_id; 400 401 while (k != vstruct_last(entry)) { 402 int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); 403 if (ret == FSCK_DELETED_KEY) 404 continue; 405 else if (ret) 406 return ret; 407 408 k = bkey_next(k); 409 } 410 411 return 0; 412 } 413 414 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 415 struct jset_entry *entry) 416 { 417 bool first = true; 418 419 jset_entry_for_each_key(entry, k) { 420 if (!first) { 421 prt_newline(out); 422 bch2_prt_jset_entry_type(out, entry->type); 423 prt_str(out, ": "); 424 } 425 bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); 426 prt_char(out, ' '); 427 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); 428 first = false; 429 } 430 } 431 432 static int journal_entry_btree_root_validate(struct bch_fs *c, 433 struct jset *jset, 434 struct jset_entry *entry, 435 unsigned version, int big_endian, 436 struct bkey_validate_context from) 437 { 438 struct bkey_i *k = entry->start; 439 int ret = 0; 440 441 from.root = true; 442 from.level = entry->level + 1; 443 from.btree = entry->btree_id; 444 445 if (journal_entry_err_on(!entry->u64s || 446 le16_to_cpu(entry->u64s) != k->k.u64s, 447 c, version, jset, entry, 448 journal_entry_btree_root_bad_size, 449 "invalid btree root journal entry: wrong number of keys")) { 450 void *next = vstruct_next(entry); 451 /* 452 * we don't want to null out this jset_entry, 453 * just the contents, so that later we can tell 454 * we were _supposed_ to have a btree root 455 */ 456 entry->u64s = 0; 457 journal_entry_null_range(vstruct_next(entry), next); 458 return 0; 459 } 
ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
	/* a deleted root key is not treated as an error here */
	if (ret == FSCK_DELETED_KEY)
		ret = 0;
fsck_err:
	return ret;
}

/* A btree root entry is printed the same way as a btree_keys entry */
static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	/* obsolete, don't care: */
	return 0;
}

/* Obsolete entry type: nothing is printed */
static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
}

/*
 * A (v1) blacklist entry must be exactly one u64 long; an entry of any other
 * size is reported and zeroed out.
 */
static int journal_entry_blacklist_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
				 c, version, jset, entry,
				 journal_entry_blacklist_bad_size,
		"invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
	/* label is the jump target of journal_entry_err_on() */
fsck_err:
	return ret;
}

static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_blacklist *bl =
		container_of(entry, struct jset_entry_blacklist, entry);

	prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
}

/*
 * A v2 blacklist entry must be exactly two u64s long and have
 * start <= end; otherwise it is reported and zeroed out.
 */
static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	struct jset_entry_blacklist_v2 *bl_entry;
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_bad_size,
		"invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		/* size is wrong, so start/end must not be looked at */
		goto out;
	}

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
				 le64_to_cpu(bl_entry->end),
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_start_past_end,
		"invalid journal seq blacklist entry: start > end")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
out:
fsck_err:
	return ret;
}

static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
					       struct jset_entry *entry)
{
	struct jset_entry_blacklist_v2 *bl =
		container_of(entry, struct jset_entry_blacklist_v2, entry);

	prt_printf(out, "start=%llu end=%llu",
	       le64_to_cpu(bl->start),
	       le64_to_cpu(bl->end));
}

/*
 * A usage entry must be at least sizeof(struct jset_entry_usage) bytes;
 * a shorter one is reported and zeroed out.
 */
static int journal_entry_usage_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);
	/* total size of the entry in bytes, as computed by jset_u64s() */
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u),
				 c, version, jset, entry,
				 journal_entry_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);

	prt_str(out, "type=");
	/* the usage type is carried in the entry's btree_id field */
	bch2_prt_fs_usage_type(out, u->entry.btree_id);
	prt_printf(out, " v=%llu", le64_to_cpu(u->v));
}

static int journal_entry_data_usage_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	/*
	 * A data_usage entry must be large enough for its fixed part plus
	 * r.nr_devs device bytes, and its replicas entry must pass
	 * bch2_replicas_entry_validate(); otherwise it is reported and zeroed.
	 */
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	struct printbuf err = PRINTBUF;
	int ret = 0;

	/* the first comparison guards the read of u->r.nr_devs in the second */
	if (journal_entry_err_on(bytes < sizeof(*u) ||
				 bytes < sizeof(*u) + u->r.nr_devs,
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: %s", err.buf)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}
out:
fsck_err:
	/* all exits funnel through here so that @err is always freed */
	printbuf_exit(&err);
	return ret;
}

static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);

	bch2_replicas_entry_to_text(out, &u->r);
	prt_printf(out, "=%llu", le64_to_cpu(u->v));
}

/*
 * A clock entry must be exactly sizeof(struct jset_entry_clock) bytes and
 * have rw equal to 0 or 1; otherwise it is reported and zeroed out.
 */
static int journal_entry_clock_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes != sizeof(*clock),
				 c, version, jset, entry,
				 journal_entry_clock_bad_size,
				 "bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(clock->rw > 1,
				 c, version, jset, entry,
				 journal_entry_clock_bad_rw,
				 "bad rw")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);

	prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time));
}

/*
 * A dev_usage entry must be at least sizeof(struct jset_entry_dev_usage)
 * bytes and have a zero pad field; otherwise it is reported and zeroed out.
 */
static int journal_entry_dev_usage_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	unsigned expected = sizeof(*u);
	int ret = 0;

	if (journal_entry_err_on(bytes < expected,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_size,
				 "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(u->pad,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_pad,
				 "bad pad")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

/*
 * Print the device index followed by one indented line per data type with
 * its bucket, sector and fragmented counts.
 */
static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);

	/* too short to contain the fixed part: print nothing */
	if (vstruct_bytes(entry) < sizeof(*u))
		return;

	prt_printf(out, "dev=%u", le32_to_cpu(u->dev));

	printbuf_indent_add(out, 2);
	for (i = 0; i < nr_types; i++) {
		prt_newline(out);
		bch2_prt_data_type(out, i);
		prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
		       le64_to_cpu(u->d[i].buckets),
le64_to_cpu(u->d[i].sectors), 728 le64_to_cpu(u->d[i].fragmented)); 729 } 730 printbuf_indent_sub(out, 2); 731 } 732 733 static int journal_entry_log_validate(struct bch_fs *c, 734 struct jset *jset, 735 struct jset_entry *entry, 736 unsigned version, int big_endian, 737 struct bkey_validate_context from) 738 { 739 return 0; 740 } 741 742 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 743 struct jset_entry *entry) 744 { 745 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 746 747 prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d); 748 } 749 750 static int journal_entry_overwrite_validate(struct bch_fs *c, 751 struct jset *jset, 752 struct jset_entry *entry, 753 unsigned version, int big_endian, 754 struct bkey_validate_context from) 755 { 756 from.flags = 0; 757 return journal_entry_btree_keys_validate(c, jset, entry, 758 version, big_endian, from); 759 } 760 761 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 762 struct jset_entry *entry) 763 { 764 journal_entry_btree_keys_to_text(out, c, entry); 765 } 766 767 static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, 768 struct jset *jset, 769 struct jset_entry *entry, 770 unsigned version, int big_endian, 771 struct bkey_validate_context from) 772 { 773 return journal_entry_btree_keys_validate(c, jset, entry, 774 version, big_endian, from); 775 } 776 777 static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, 778 struct jset_entry *entry) 779 { 780 journal_entry_btree_keys_to_text(out, c, entry); 781 } 782 783 static int journal_entry_datetime_validate(struct bch_fs *c, 784 struct jset *jset, 785 struct jset_entry *entry, 786 unsigned version, int big_endian, 787 struct bkey_validate_context from) 788 { 789 unsigned bytes = vstruct_bytes(entry); 790 unsigned expected = 16; 791 int ret = 0; 792 793 if (journal_entry_err_on(vstruct_bytes(entry) < expected, 794 c, 
version, jset, entry, 795 journal_entry_dev_usage_bad_size, 796 "bad size (%u < %u)", 797 bytes, expected)) { 798 journal_entry_null_range(entry, vstruct_next(entry)); 799 return ret; 800 } 801 fsck_err: 802 return ret; 803 } 804 805 static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, 806 struct jset_entry *entry) 807 { 808 struct jset_entry_datetime *datetime = 809 container_of(entry, struct jset_entry_datetime, entry); 810 811 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 812 } 813 814 struct jset_entry_ops { 815 int (*validate)(struct bch_fs *, struct jset *, 816 struct jset_entry *, unsigned, int, 817 struct bkey_validate_context); 818 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 819 }; 820 821 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 822 #define x(f, nr) \ 823 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 824 .validate = journal_entry_##f##_validate, \ 825 .to_text = journal_entry_##f##_to_text, \ 826 }, 827 BCH_JSET_ENTRY_TYPES() 828 #undef x 829 }; 830 831 int bch2_journal_entry_validate(struct bch_fs *c, 832 struct jset *jset, 833 struct jset_entry *entry, 834 unsigned version, int big_endian, 835 struct bkey_validate_context from) 836 { 837 return entry->type < BCH_JSET_ENTRY_NR 838 ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
				version, big_endian, from)
		: 0;
}

/*
 * Print the entry's type name and, for types below BCH_JSET_ENTRY_NR,
 * ": " followed by the type specific text.
 */
void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
				struct jset_entry *entry)
{
	bch2_prt_jset_entry_type(out, entry->type);

	if (entry->type < BCH_JSET_ENTRY_NR) {
		prt_str(out, ": ");
		bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
	}
}

/*
 * Validate every entry in @jset, stopping at the first error. If an entry
 * claims to extend past the end of the jset, the jset is truncated so that it
 * ends just before that entry.
 */
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
				 enum bch_validate_flags flags)
{
	struct bkey_validate_context from = {
		.flags		= flags,
		.from		= BKEY_VALIDATE_journal,
		.journal_seq	= le64_to_cpu(jset->seq),
	};

	unsigned version = le32_to_cpu(jset->version);
	int ret = 0;

	vstruct_for_each(jset, entry) {
		/* offset of this entry within the jset, in u64s */
		from.journal_offset = (u64 *) entry - jset->_data;

		if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
				c, version, jset, entry,
				journal_entry_past_jset_end,
				"journal entry extends past end of jset")) {
			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
			break;
		}

		ret = bch2_journal_entry_validate(c, jset, entry, version,
						  JSET_BIG_ENDIAN(jset), from);
		if (ret)
			break;
	}
fsck_err:
	return ret;
}

/*
 * Validate a jset header and then all of its entries.
 *
 * Returns JOURNAL_ENTRY_NONE if the magic number does not match, -EINVAL for
 * an incompatible version, JOURNAL_ENTRY_BAD if last_seq > seq on a flush
 * entry (last_seq is then clamped to seq), and otherwise the result of
 * jset_validate_entries().
 *
 * @ca may be NULL, in which case the filesystem name is used in messages.
 */
static int jset_validate(struct bch_fs *c,
			 struct bch_dev *ca,
			 struct jset *jset, u64 sector,
			 enum bch_validate_flags flags)
{
	/* "from" is referenced by name inside journal_entry_err_on() */
	struct bkey_validate_context from = {
		.flags		= flags,
		.from		= BKEY_VALIDATE_journal,
		.journal_seq	= le64_to_cpu(jset->seq),
	};
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	unsigned version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version),
			c, version, jset, NULL,
			jset_unsupported_version,
			"%s sector %llu seq %llu: incompatible journal entry version %u.%u",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq),
			BCH_VERSION_MAJOR(version),
			BCH_VERSION_MINOR(version))) {
		/* don't try to continue: */
		return -EINVAL;
	}

	/*
	 * NOTE(review): the JOURNAL_ENTRY_BAD assigned here is overwritten by
	 * the jset_validate_entries() call below unless we return earlier -
	 * confirm whether that is intended.
	 */
	if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
			c, version, jset, NULL,
			jset_unknown_csum,
			"%s sector %llu seq %llu: journal entry with unknown csum type %llu",
			ca ? ca->name : c->name,
			sector, le64_to_cpu(jset->seq),
			JSET_CSUM_TYPE(jset)))
		ret = JOURNAL_ENTRY_BAD;

	/* last_seq is ignored when JSET_NO_FLUSH is true */
	if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
				 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
				 c, version, jset, NULL,
				 jset_last_seq_newer_than_seq,
				 "invalid journal entry: last_seq > seq (%llu > %llu)",
				 le64_to_cpu(jset->last_seq),
				 le64_to_cpu(jset->seq))) {
		jset->last_seq = jset->seq;
		return JOURNAL_ENTRY_BAD;
	}

	ret = jset_validate_entries(c, jset, flags);
fsck_err:
	return ret;
}

/*
 * First-pass checks on a jset that has just been read from disk.
 *
 * @bucket_sectors_left is the number of sectors from the jset to the end of
 * its bucket, @sectors_read the number of sectors of it currently in the read
 * buffer.
 *
 * Returns 0 if the jset can be processed, JOURNAL_ENTRY_NONE if the magic
 * number does not match, JOURNAL_ENTRY_REREAD if the jset is bigger than what
 * was read and more of the bucket is available, or a negative error.
 */
static int jset_validate_early(struct bch_fs *c,
			 struct bch_dev *ca,
			 struct jset *jset, u64 sector,
			 unsigned bucket_sectors_left,
			 unsigned sectors_read)
{
	/* "from" is referenced by name inside journal_entry_err_on() */
	struct bkey_validate_context from = {
		.from		= BKEY_VALIDATE_journal,
		.journal_seq	= le64_to_cpu(jset->seq),
	};
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	unsigned version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version),
			c, version, jset, NULL,
			jset_unsupported_version,
			"%s sector %llu seq %llu: unknown journal entry version %u.%u",
			ca ?
ca->name : c->name, 961 sector, le64_to_cpu(jset->seq), 962 BCH_VERSION_MAJOR(version), 963 BCH_VERSION_MINOR(version))) { 964 /* don't try to continue: */ 965 return -EINVAL; 966 } 967 968 size_t bytes = vstruct_bytes(jset); 969 if (bytes > (sectors_read << 9) && 970 sectors_read < bucket_sectors_left) 971 return JOURNAL_ENTRY_REREAD; 972 973 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 974 c, version, jset, NULL, 975 jset_past_bucket_end, 976 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 977 ca ? ca->name : c->name, 978 sector, le64_to_cpu(jset->seq), bytes)) 979 le32_add_cpu(&jset->u64s, 980 -((bytes - (bucket_sectors_left << 9)) / 8)); 981 fsck_err: 982 return ret; 983 } 984 985 struct journal_read_buf { 986 void *data; 987 size_t size; 988 }; 989 990 static int journal_read_buf_realloc(struct journal_read_buf *b, 991 size_t new_size) 992 { 993 void *n; 994 995 /* the bios are sized for this many pages, max: */ 996 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 997 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 998 999 new_size = roundup_pow_of_two(new_size); 1000 n = kvmalloc(new_size, GFP_KERNEL); 1001 if (!n) 1002 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 1003 1004 kvfree(b->data); 1005 b->data = n; 1006 b->size = new_size; 1007 return 0; 1008 } 1009 1010 static int journal_read_bucket(struct bch_dev *ca, 1011 struct journal_read_buf *buf, 1012 struct journal_list *jlist, 1013 unsigned bucket) 1014 { 1015 struct bch_fs *c = ca->fs; 1016 struct journal_device *ja = &ca->journal; 1017 struct jset *j = NULL; 1018 unsigned sectors, sectors_read = 0; 1019 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 1020 end = offset + ca->mi.bucket_size; 1021 bool saw_bad = false, csum_good; 1022 struct printbuf err = PRINTBUF; 1023 int ret = 0; 1024 1025 pr_debug("reading %u", bucket); 1026 1027 while (offset < end) { 1028 if (!sectors_read) { 1029 struct bio *bio; 1030 unsigned nr_bvecs; 1031 reread: 1032 sectors_read = min_t(unsigned, 
1033 end - offset, buf->size >> 9); 1034 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 1035 1036 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 1037 if (!bio) 1038 return -BCH_ERR_ENOMEM_journal_read_bucket; 1039 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 1040 1041 bio->bi_iter.bi_sector = offset; 1042 bch2_bio_map(bio, buf->data, sectors_read << 9); 1043 1044 ret = submit_bio_wait(bio); 1045 kfree(bio); 1046 1047 if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, 1048 "journal read error: sector %llu", 1049 offset) || 1050 bch2_meta_read_fault("journal")) { 1051 /* 1052 * We don't error out of the recovery process 1053 * here, since the relevant journal entry may be 1054 * found on a different device, and missing or 1055 * no journal entries will be handled later 1056 */ 1057 goto out; 1058 } 1059 1060 j = buf->data; 1061 } 1062 1063 ret = jset_validate_early(c, ca, j, offset, 1064 end - offset, sectors_read); 1065 switch (ret) { 1066 case 0: 1067 sectors = vstruct_sectors(j, c->block_bits); 1068 break; 1069 case JOURNAL_ENTRY_REREAD: 1070 if (vstruct_bytes(j) > buf->size) { 1071 ret = journal_read_buf_realloc(buf, 1072 vstruct_bytes(j)); 1073 if (ret) 1074 goto err; 1075 } 1076 goto reread; 1077 case JOURNAL_ENTRY_NONE: 1078 if (!saw_bad) 1079 goto out; 1080 /* 1081 * On checksum error we don't really trust the size 1082 * field of the journal entry we read, so try reading 1083 * again at next block boundary: 1084 */ 1085 sectors = block_sectors(c); 1086 goto next_block; 1087 default: 1088 goto err; 1089 } 1090 1091 if (le64_to_cpu(j->seq) > ja->highest_seq_found) { 1092 ja->highest_seq_found = le64_to_cpu(j->seq); 1093 ja->cur_idx = bucket; 1094 ja->sectors_free = ca->mi.bucket_size - 1095 bucket_remainder(ca, offset) - sectors; 1096 } 1097 1098 /* 1099 * This happens sometimes if we don't have discards on - 1100 * when we've partially overwritten a bucket with new 1101 * journal entries. 
We don't need the rest of the 1102 * bucket: 1103 */ 1104 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 1105 goto out; 1106 1107 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 1108 1109 enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); 1110 struct bch_csum csum; 1111 csum_good = jset_csum_good(c, j, &csum); 1112 1113 if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, 1114 "%s", 1115 (printbuf_reset(&err), 1116 prt_str(&err, "journal "), 1117 bch2_csum_err_msg(&err, csum_type, j->csum, csum), 1118 err.buf))) 1119 saw_bad = true; 1120 1121 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 1122 j->encrypted_start, 1123 vstruct_end(j) - (void *) j->encrypted_start); 1124 bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); 1125 1126 mutex_lock(&jlist->lock); 1127 ret = journal_entry_add(c, ca, (struct journal_ptr) { 1128 .csum_good = csum_good, 1129 .dev = ca->dev_idx, 1130 .bucket = bucket, 1131 .bucket_offset = offset - 1132 bucket_to_sector(ca, ja->buckets[bucket]), 1133 .sector = offset, 1134 }, jlist, j); 1135 mutex_unlock(&jlist->lock); 1136 1137 switch (ret) { 1138 case JOURNAL_ENTRY_ADD_OK: 1139 break; 1140 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 1141 break; 1142 default: 1143 goto err; 1144 } 1145 next_block: 1146 pr_debug("next"); 1147 offset += sectors; 1148 sectors_read -= sectors; 1149 j = ((void *) j) + (sectors << 9); 1150 } 1151 1152 out: 1153 ret = 0; 1154 err: 1155 printbuf_exit(&err); 1156 return ret; 1157 } 1158 1159 static CLOSURE_CALLBACK(bch2_journal_read_device) 1160 { 1161 closure_type(ja, struct journal_device, read); 1162 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1163 struct bch_fs *c = ca->fs; 1164 struct journal_list *jlist = 1165 container_of(cl->parent, struct journal_list, cl); 1166 struct journal_read_buf buf = { NULL, 0 }; 1167 unsigned i; 1168 int ret = 0; 1169 1170 if (!ja->nr) 1171 goto out; 1172 1173 ret = journal_read_buf_realloc(&buf, PAGE_SIZE); 1174 if 
(ret) 1175 goto err; 1176 1177 pr_debug("%u journal buckets", ja->nr); 1178 1179 for (i = 0; i < ja->nr; i++) { 1180 ret = journal_read_bucket(ca, &buf, jlist, i); 1181 if (ret) 1182 goto err; 1183 } 1184 1185 /* 1186 * Set dirty_idx to indicate the entire journal is full and needs to be 1187 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1188 * pinned when it first runs: 1189 */ 1190 ja->discard_idx = ja->dirty_idx_ondisk = 1191 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1192 out: 1193 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1194 kvfree(buf.data); 1195 percpu_ref_put(&ca->io_ref); 1196 closure_return(cl); 1197 return; 1198 err: 1199 mutex_lock(&jlist->lock); 1200 jlist->ret = ret; 1201 mutex_unlock(&jlist->lock); 1202 goto out; 1203 } 1204 1205 int bch2_journal_read(struct bch_fs *c, 1206 u64 *last_seq, 1207 u64 *blacklist_seq, 1208 u64 *start_seq) 1209 { 1210 struct journal_list jlist; 1211 struct journal_replay *i, **_i, *prev = NULL; 1212 struct genradix_iter radix_iter; 1213 struct printbuf buf = PRINTBUF; 1214 bool degraded = false, last_write_torn = false; 1215 u64 seq; 1216 int ret = 0; 1217 1218 closure_init_stack(&jlist.cl); 1219 mutex_init(&jlist.lock); 1220 jlist.last_seq = 0; 1221 jlist.ret = 0; 1222 1223 for_each_member_device(c, ca) { 1224 if (!c->opts.fsck && 1225 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1226 continue; 1227 1228 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1229 ca->mi.state == BCH_MEMBER_STATE_ro) && 1230 percpu_ref_tryget(&ca->io_ref)) 1231 closure_call(&ca->journal.read, 1232 bch2_journal_read_device, 1233 system_unbound_wq, 1234 &jlist.cl); 1235 else 1236 degraded = true; 1237 } 1238 1239 closure_sync(&jlist.cl); 1240 1241 if (jlist.ret) 1242 return jlist.ret; 1243 1244 *last_seq = 0; 1245 *start_seq = 0; 1246 *blacklist_seq = 0; 1247 1248 /* 1249 * Find most recent flush entry, and ignore newer non flush entries - 1250 * those entries will be blacklisted: 1251 
*/ 1252 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1253 i = *_i; 1254 1255 if (journal_replay_ignore(i)) 1256 continue; 1257 1258 if (!*start_seq) 1259 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1260 1261 if (JSET_NO_FLUSH(&i->j)) { 1262 i->ignore_blacklisted = true; 1263 continue; 1264 } 1265 1266 if (!last_write_torn && !i->csum_good) { 1267 last_write_torn = true; 1268 i->ignore_blacklisted = true; 1269 continue; 1270 } 1271 1272 struct bkey_validate_context from = { 1273 .from = BKEY_VALIDATE_journal, 1274 .journal_seq = le64_to_cpu(i->j.seq), 1275 }; 1276 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1277 c, le32_to_cpu(i->j.version), &i->j, NULL, 1278 jset_last_seq_newer_than_seq, 1279 "invalid journal entry: last_seq > seq (%llu > %llu)", 1280 le64_to_cpu(i->j.last_seq), 1281 le64_to_cpu(i->j.seq))) 1282 i->j.last_seq = i->j.seq; 1283 1284 *last_seq = le64_to_cpu(i->j.last_seq); 1285 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1286 break; 1287 } 1288 1289 if (!*start_seq) { 1290 bch_info(c, "journal read done, but no entries found"); 1291 return 0; 1292 } 1293 1294 if (!*last_seq) { 1295 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, 1296 "journal read done, but no entries found after dropping non-flushes"); 1297 return 0; 1298 } 1299 1300 bch_info(c, "journal read done, replaying entries %llu-%llu", 1301 *last_seq, *blacklist_seq - 1); 1302 1303 if (*start_seq != *blacklist_seq) 1304 bch_info(c, "dropped unflushed entries %llu-%llu", 1305 *blacklist_seq, *start_seq - 1); 1306 1307 /* Drop blacklisted entries and entries older than last_seq: */ 1308 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1309 i = *_i; 1310 1311 if (journal_replay_ignore(i)) 1312 continue; 1313 1314 seq = le64_to_cpu(i->j.seq); 1315 if (seq < *last_seq) { 1316 journal_replay_free(c, i, false); 1317 continue; 1318 } 1319 1320 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1321 
fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1322 jset_seq_blacklisted, 1323 "found blacklisted journal entry %llu", seq); 1324 i->ignore_blacklisted = true; 1325 } 1326 } 1327 1328 /* Check for missing entries: */ 1329 seq = *last_seq; 1330 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1331 i = *_i; 1332 1333 if (journal_replay_ignore(i)) 1334 continue; 1335 1336 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1337 1338 while (seq < le64_to_cpu(i->j.seq)) { 1339 u64 missing_start, missing_end; 1340 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 1341 1342 while (seq < le64_to_cpu(i->j.seq) && 1343 bch2_journal_seq_is_blacklisted(c, seq, false)) 1344 seq++; 1345 1346 if (seq == le64_to_cpu(i->j.seq)) 1347 break; 1348 1349 missing_start = seq; 1350 1351 while (seq < le64_to_cpu(i->j.seq) && 1352 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1353 seq++; 1354 1355 if (prev) { 1356 bch2_journal_ptrs_to_text(&buf1, c, prev); 1357 prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1358 } else 1359 prt_printf(&buf1, "(none)"); 1360 bch2_journal_ptrs_to_text(&buf2, c, i); 1361 1362 missing_end = seq - 1; 1363 fsck_err(c, journal_entries_missing, 1364 "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" 1365 " prev at %s\n" 1366 " next at %s, continue?", 1367 missing_start, missing_end, 1368 *last_seq, *blacklist_seq - 1, 1369 buf1.buf, buf2.buf); 1370 1371 printbuf_exit(&buf1); 1372 printbuf_exit(&buf2); 1373 } 1374 1375 prev = i; 1376 seq++; 1377 } 1378 1379 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1380 struct bch_replicas_padded replicas = { 1381 .e.data_type = BCH_DATA_journal, 1382 .e.nr_devs = 0, 1383 .e.nr_required = 1, 1384 }; 1385 1386 i = *_i; 1387 if (journal_replay_ignore(i)) 1388 continue; 1389 1390 darray_for_each(i->ptrs, ptr) { 1391 struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); 1392 1393 if (!ptr->csum_good) 1394 bch_err_dev_offset(ca, ptr->sector, 1395 "invalid journal checksum, seq %llu%s", 1396 le64_to_cpu(i->j.seq), 1397 i->csum_good ? " (had good copy on another device)" : ""); 1398 } 1399 1400 ret = jset_validate(c, 1401 bch2_dev_have_ref(c, i->ptrs.data[0].dev), 1402 &i->j, 1403 i->ptrs.data[0].sector, 1404 READ); 1405 if (ret) 1406 goto err; 1407 1408 darray_for_each(i->ptrs, ptr) 1409 replicas_entry_add_dev(&replicas.e, ptr->dev); 1410 1411 bch2_replicas_entry_sort(&replicas.e); 1412 1413 printbuf_reset(&buf); 1414 bch2_replicas_entry_to_text(&buf, &replicas.e); 1415 1416 if (!degraded && 1417 !bch2_replicas_marked(c, &replicas.e) && 1418 (le64_to_cpu(i->j.seq) == *last_seq || 1419 fsck_err(c, journal_entry_replicas_not_marked, 1420 "superblock not marked as containing replicas for journal entry %llu\n %s", 1421 le64_to_cpu(i->j.seq), buf.buf))) { 1422 ret = bch2_mark_replicas(c, &replicas.e); 1423 if (ret) 1424 goto err; 1425 } 1426 } 1427 err: 1428 fsck_err: 1429 printbuf_exit(&buf); 1430 return ret; 1431 } 1432 1433 /* journal write: */ 1434 1435 static void journal_advance_devs_to_next_bucket(struct journal *j, 1436 struct dev_alloc_list *devs, 1437 unsigned sectors, u64 seq) 1438 { 1439 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1440 1441 darray_for_each(*devs, i) { 1442 
struct bch_dev *ca = rcu_dereference(c->devs[*i]); 1443 if (!ca) 1444 continue; 1445 1446 struct journal_device *ja = &ca->journal; 1447 1448 if (sectors > ja->sectors_free && 1449 sectors <= ca->mi.bucket_size && 1450 bch2_journal_dev_buckets_available(j, ja, 1451 journal_space_discarded)) { 1452 ja->cur_idx = (ja->cur_idx + 1) % ja->nr; 1453 ja->sectors_free = ca->mi.bucket_size; 1454 1455 /* 1456 * ja->bucket_seq[ja->cur_idx] must always have 1457 * something sensible: 1458 */ 1459 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); 1460 } 1461 } 1462 } 1463 1464 static void __journal_write_alloc(struct journal *j, 1465 struct journal_buf *w, 1466 struct dev_alloc_list *devs, 1467 unsigned sectors, 1468 unsigned *replicas, 1469 unsigned replicas_want) 1470 { 1471 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1472 1473 darray_for_each(*devs, i) { 1474 struct bch_dev *ca = rcu_dereference(c->devs[*i]); 1475 if (!ca) 1476 continue; 1477 1478 struct journal_device *ja = &ca->journal; 1479 1480 /* 1481 * Check that we can use this device, and aren't already using 1482 * it: 1483 */ 1484 if (!ca->mi.durability || 1485 ca->mi.state != BCH_MEMBER_STATE_rw || 1486 !ja->nr || 1487 bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1488 sectors > ja->sectors_free) 1489 continue; 1490 1491 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1492 1493 bch2_bkey_append_ptr(&w->key, 1494 (struct bch_extent_ptr) { 1495 .offset = bucket_to_sector(ca, 1496 ja->buckets[ja->cur_idx]) + 1497 ca->mi.bucket_size - 1498 ja->sectors_free, 1499 .dev = ca->dev_idx, 1500 }); 1501 1502 ja->sectors_free -= sectors; 1503 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1504 1505 *replicas += ca->mi.durability; 1506 1507 if (*replicas >= replicas_want) 1508 break; 1509 } 1510 } 1511 1512 /** 1513 * journal_write_alloc - decide where to write next journal entry 1514 * 1515 * @j: journal object 1516 * @w: journal buf (entry to be written) 1517 * 1518 * Returns: 0 on 
success, or -EROFS on failure 1519 */ 1520 static int journal_write_alloc(struct journal *j, struct journal_buf *w) 1521 { 1522 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1523 struct bch_devs_mask devs; 1524 struct dev_alloc_list devs_sorted; 1525 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1526 unsigned target = c->opts.metadata_target ?: 1527 c->opts.foreground_target; 1528 unsigned replicas = 0, replicas_want = 1529 READ_ONCE(c->opts.metadata_replicas); 1530 unsigned replicas_need = min_t(unsigned, replicas_want, 1531 READ_ONCE(c->opts.metadata_replicas_required)); 1532 bool advance_done = false; 1533 1534 rcu_read_lock(); 1535 1536 /* We might run more than once if we have to stop and do discards: */ 1537 struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); 1538 bkey_for_each_ptr(ptrs, p) { 1539 struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); 1540 if (ca) 1541 replicas += ca->mi.durability; 1542 } 1543 1544 retry_target: 1545 devs = target_rw_devs(c, BCH_DATA_journal, target); 1546 devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); 1547 retry_alloc: 1548 __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); 1549 1550 if (likely(replicas >= replicas_want)) 1551 goto done; 1552 1553 if (!advance_done) { 1554 journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); 1555 advance_done = true; 1556 goto retry_alloc; 1557 } 1558 1559 if (replicas < replicas_want && target) { 1560 /* Retry from all devices: */ 1561 target = 0; 1562 advance_done = false; 1563 goto retry_target; 1564 } 1565 done: 1566 rcu_read_unlock(); 1567 1568 BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); 1569 1570 return replicas >= replicas_need ? 
0 : -BCH_ERR_insufficient_journal_devices; 1571 } 1572 1573 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1574 { 1575 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1576 1577 /* we aren't holding j->lock: */ 1578 unsigned new_size = READ_ONCE(j->buf_size_want); 1579 void *new_buf; 1580 1581 if (buf->buf_size >= new_size) 1582 return; 1583 1584 size_t btree_write_buffer_size = new_size / 64; 1585 1586 if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) 1587 return; 1588 1589 new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1590 if (!new_buf) 1591 return; 1592 1593 memcpy(new_buf, buf->data, buf->buf_size); 1594 1595 spin_lock(&j->lock); 1596 swap(buf->data, new_buf); 1597 swap(buf->buf_size, new_size); 1598 spin_unlock(&j->lock); 1599 1600 kvfree(new_buf); 1601 } 1602 1603 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) 1604 { 1605 return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); 1606 } 1607 1608 static CLOSURE_CALLBACK(journal_write_done) 1609 { 1610 closure_type(w, struct journal_buf, io); 1611 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1612 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1613 struct bch_replicas_padded replicas; 1614 union journal_res_state old, new; 1615 u64 seq = le64_to_cpu(w->data->seq); 1616 int err = 0; 1617 1618 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) 1619 ? 
j->flush_write_time 1620 : j->noflush_write_time, j->write_start_time); 1621 1622 if (!w->devs_written.nr) { 1623 bch_err(c, "unable to write journal to sufficient devices"); 1624 err = -EIO; 1625 } else { 1626 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1627 w->devs_written); 1628 if (bch2_mark_replicas(c, &replicas.e)) 1629 err = -EIO; 1630 } 1631 1632 if (err) 1633 bch2_fatal_error(c); 1634 1635 closure_debug_destroy(cl); 1636 1637 spin_lock(&j->lock); 1638 if (seq >= j->pin.front) 1639 journal_seq_pin(j, seq)->devs = w->devs_written; 1640 if (err && (!j->err_seq || seq < j->err_seq)) 1641 j->err_seq = seq; 1642 w->write_done = true; 1643 1644 bool completed = false; 1645 1646 for (seq = journal_last_unwritten_seq(j); 1647 seq <= journal_cur_seq(j); 1648 seq++) { 1649 w = j->buf + (seq & JOURNAL_BUF_MASK); 1650 if (!w->write_done) 1651 break; 1652 1653 if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { 1654 j->flushed_seq_ondisk = seq; 1655 j->last_seq_ondisk = w->last_seq; 1656 1657 bch2_do_discards(c); 1658 closure_wake_up(&c->freelist_wait); 1659 bch2_reset_alloc_cursors(c); 1660 } 1661 1662 j->seq_ondisk = seq; 1663 1664 /* 1665 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1666 * more buckets: 1667 * 1668 * Must come before signaling write completion, for 1669 * bch2_fs_journal_stop(): 1670 */ 1671 if (j->watermark != BCH_WATERMARK_stripe) 1672 journal_reclaim_kick(&c->journal); 1673 1674 old.v = atomic64_read(&j->reservations.counter); 1675 do { 1676 new.v = old.v; 1677 BUG_ON(journal_state_count(new, new.unwritten_idx)); 1678 BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); 1679 1680 new.unwritten_idx++; 1681 } while (!atomic64_try_cmpxchg(&j->reservations.counter, 1682 &old.v, new.v)); 1683 1684 closure_wake_up(&w->wait); 1685 completed = true; 1686 } 1687 1688 if (completed) { 1689 bch2_journal_reclaim_fast(j); 1690 bch2_journal_space_available(j); 1691 1692 
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); 1693 1694 journal_wake(j); 1695 } 1696 1697 if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1698 new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1699 struct journal_buf *buf = journal_cur_buf(j); 1700 long delta = buf->expires - jiffies; 1701 1702 /* 1703 * We don't close a journal entry to write it while there's 1704 * previous entries still in flight - the current journal entry 1705 * might want to be written now: 1706 */ 1707 mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); 1708 } 1709 1710 /* 1711 * We don't typically trigger journal writes from her - the next journal 1712 * write will be triggered immediately after the previous one is 1713 * allocated, in bch2_journal_write() - but the journal write error path 1714 * is special: 1715 */ 1716 bch2_journal_do_writes(j); 1717 spin_unlock(&j->lock); 1718 } 1719 1720 static void journal_write_endio(struct bio *bio) 1721 { 1722 struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); 1723 struct bch_dev *ca = jbio->ca; 1724 struct journal *j = &ca->fs->journal; 1725 struct journal_buf *w = j->buf + jbio->buf_idx; 1726 1727 if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, 1728 "error writing journal entry %llu: %s", 1729 le64_to_cpu(w->data->seq), 1730 bch2_blk_status_to_str(bio->bi_status)) || 1731 bch2_meta_write_fault("journal")) { 1732 unsigned long flags; 1733 1734 spin_lock_irqsave(&j->err_lock, flags); 1735 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); 1736 spin_unlock_irqrestore(&j->err_lock, flags); 1737 } 1738 1739 closure_put(&w->io); 1740 percpu_ref_put(&ca->io_ref); 1741 } 1742 1743 static CLOSURE_CALLBACK(journal_write_submit) 1744 { 1745 closure_type(w, struct journal_buf, io); 1746 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1747 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1748 unsigned sectors = vstruct_sectors(w->data, 
c->block_bits); 1749 1750 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { 1751 struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); 1752 if (!ca) { 1753 /* XXX: fix this */ 1754 bch_err(c, "missing device for journal write\n"); 1755 continue; 1756 } 1757 1758 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], 1759 sectors); 1760 1761 struct journal_device *ja = &ca->journal; 1762 struct bio *bio = &ja->bio[w->idx]->bio; 1763 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1764 bio->bi_iter.bi_sector = ptr->offset; 1765 bio->bi_end_io = journal_write_endio; 1766 bio->bi_private = ca; 1767 bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); 1768 1769 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1770 ca->prev_journal_sector = bio->bi_iter.bi_sector; 1771 1772 if (!JSET_NO_FLUSH(w->data)) 1773 bio->bi_opf |= REQ_FUA; 1774 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) 1775 bio->bi_opf |= REQ_PREFLUSH; 1776 1777 bch2_bio_map(bio, w->data, sectors << 9); 1778 1779 trace_and_count(c, journal_write, bio); 1780 closure_bio_submit(bio, cl); 1781 1782 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1783 } 1784 1785 continue_at(cl, journal_write_done, j->wq); 1786 } 1787 1788 static CLOSURE_CALLBACK(journal_write_preflush) 1789 { 1790 closure_type(w, struct journal_buf, io); 1791 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1792 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1793 1794 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1795 spin_lock(&j->lock); 1796 if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { 1797 closure_wait(&j->async_wait, cl); 1798 spin_unlock(&j->lock); 1799 continue_at(cl, journal_write_preflush, j->wq); 1800 return; 1801 } 1802 spin_unlock(&j->lock); 1803 } 1804 1805 if (w->separate_flush) { 1806 for_each_rw_member(c, ca) { 1807 percpu_ref_get(&ca->io_ref); 1808 1809 struct journal_device *ja = &ca->journal; 1810 struct bio *bio = 
&ja->bio[w->idx]->bio; 1811 bio_reset(bio, ca->disk_sb.bdev, 1812 REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); 1813 bio->bi_end_io = journal_write_endio; 1814 bio->bi_private = ca; 1815 closure_bio_submit(bio, cl); 1816 } 1817 1818 continue_at(cl, journal_write_submit, j->wq); 1819 } else { 1820 /* 1821 * no need to punt to another work item if we're not waiting on 1822 * preflushes 1823 */ 1824 journal_write_submit(&cl->work); 1825 } 1826 } 1827 1828 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) 1829 { 1830 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1831 struct jset_entry *start, *end; 1832 struct jset *jset = w->data; 1833 struct journal_keys_to_wb wb = { NULL }; 1834 unsigned sectors, bytes, u64s; 1835 unsigned long btree_roots_have = 0; 1836 bool validate_before_checksum = false; 1837 u64 seq = le64_to_cpu(jset->seq); 1838 int ret; 1839 1840 /* 1841 * Simple compaction, dropping empty jset_entries (from journal 1842 * reservations that weren't fully used) and merging jset_entries that 1843 * can be. 
1844 * 1845 * If we wanted to be really fancy here, we could sort all the keys in 1846 * the jset and drop keys that were overwritten - probably not worth it: 1847 */ 1848 vstruct_for_each(jset, i) { 1849 unsigned u64s = le16_to_cpu(i->u64s); 1850 1851 /* Empty entry: */ 1852 if (!u64s) 1853 continue; 1854 1855 /* 1856 * New btree roots are set by journalling them; when the journal 1857 * entry gets written we have to propagate them to 1858 * c->btree_roots 1859 * 1860 * But, every journal entry we write has to contain all the 1861 * btree roots (at least for now); so after we copy btree roots 1862 * to c->btree_roots we have to get any missing btree roots and 1863 * add them to this journal entry: 1864 */ 1865 switch (i->type) { 1866 case BCH_JSET_ENTRY_btree_root: 1867 bch2_journal_entry_to_btree_root(c, i); 1868 __set_bit(i->btree_id, &btree_roots_have); 1869 break; 1870 case BCH_JSET_ENTRY_write_buffer_keys: 1871 EBUG_ON(!w->need_flush_to_write_buffer); 1872 1873 if (!wb.wb) 1874 bch2_journal_keys_to_write_buffer_start(c, &wb, seq); 1875 1876 jset_entry_for_each_key(i, k) { 1877 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); 1878 if (ret) { 1879 bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", 1880 bch2_err_str(ret)); 1881 bch2_journal_keys_to_write_buffer_end(c, &wb); 1882 return ret; 1883 } 1884 } 1885 i->type = BCH_JSET_ENTRY_btree_keys; 1886 break; 1887 } 1888 } 1889 1890 if (wb.wb) { 1891 ret = bch2_journal_keys_to_write_buffer_end(c, &wb); 1892 if (ret) { 1893 bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", 1894 bch2_err_str(ret)); 1895 return ret; 1896 } 1897 } 1898 1899 spin_lock(&c->journal.lock); 1900 w->need_flush_to_write_buffer = false; 1901 spin_unlock(&c->journal.lock); 1902 1903 start = end = vstruct_last(jset); 1904 1905 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 1906 1907 struct jset_entry_datetime *d = 1908 container_of(jset_entry_init(&end, sizeof(*d)), 
struct jset_entry_datetime, entry); 1909 d->entry.type = BCH_JSET_ENTRY_datetime; 1910 d->seconds = cpu_to_le64(ktime_get_real_seconds()); 1911 1912 bch2_journal_super_entries_add_common(c, &end, seq); 1913 u64s = (u64 *) end - (u64 *) start; 1914 1915 WARN_ON(u64s > j->entry_u64s_reserved); 1916 1917 le32_add_cpu(&jset->u64s, u64s); 1918 1919 sectors = vstruct_sectors(jset, c->block_bits); 1920 bytes = vstruct_bytes(jset); 1921 1922 if (sectors > w->sectors) { 1923 bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 1924 vstruct_bytes(jset), w->sectors << 9, 1925 u64s, w->u64s_reserved, j->entry_u64s_reserved); 1926 return -EINVAL; 1927 } 1928 1929 jset->magic = cpu_to_le64(jset_magic(c)); 1930 jset->version = cpu_to_le32(c->sb.version); 1931 1932 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 1933 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1934 1935 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1936 j->last_empty_seq = seq; 1937 1938 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1939 validate_before_checksum = true; 1940 1941 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 1942 validate_before_checksum = true; 1943 1944 if (validate_before_checksum && 1945 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1946 return ret; 1947 1948 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), 1949 jset->encrypted_start, 1950 vstruct_end(jset) - (void *) jset->encrypted_start); 1951 if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) 1952 return ret; 1953 1954 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), 1955 journal_nonce(jset), jset); 1956 1957 if (!validate_before_checksum && 1958 (ret = jset_validate(c, NULL, jset, 0, WRITE))) 1959 return ret; 1960 1961 memset((void *) jset + bytes, 0, (sectors << 9) - bytes); 1962 return 0; 1963 } 1964 1965 static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w) 
1966 { 1967 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1968 int error = bch2_journal_error(j); 1969 1970 /* 1971 * If the journal is in an error state - we did an emergency shutdown - 1972 * we prefer to continue doing journal writes. We just mark them as 1973 * noflush so they'll never be used, but they'll still be visible by the 1974 * list_journal tool - this helps in debugging. 1975 * 1976 * There's a caveat: the first journal write after marking the 1977 * superblock dirty must always be a flush write, because on startup 1978 * from a clean shutdown we didn't necessarily read the journal and the 1979 * new journal write might overwrite whatever was in the journal 1980 * previously - we can't leave the journal without any flush writes in 1981 * it. 1982 * 1983 * So if we're in an error state, and we're still starting up, we don't 1984 * write anything at all. 1985 */ 1986 if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) 1987 return -EIO; 1988 1989 if (error || 1990 w->noflush || 1991 (!w->must_flush && 1992 time_before(jiffies, j->last_flush_write + 1993 msecs_to_jiffies(c->opts.journal_flush_delay)) && 1994 test_bit(JOURNAL_may_skip_flush, &j->flags))) { 1995 w->noflush = true; 1996 SET_JSET_NO_FLUSH(w->data, true); 1997 w->data->last_seq = 0; 1998 w->last_seq = 0; 1999 2000 j->nr_noflush_writes++; 2001 } else { 2002 w->must_flush = true; 2003 j->last_flush_write = jiffies; 2004 j->nr_flush_writes++; 2005 clear_bit(JOURNAL_need_flush_write, &j->flags); 2006 } 2007 2008 return 0; 2009 } 2010 2011 CLOSURE_CALLBACK(bch2_journal_write) 2012 { 2013 closure_type(w, struct journal_buf, io); 2014 struct journal *j = container_of(w, struct journal, buf[w->idx]); 2015 struct bch_fs *c = container_of(j, struct bch_fs, journal); 2016 struct bch_replicas_padded replicas; 2017 unsigned nr_rw_members = 0; 2018 int ret; 2019 2020 for_each_rw_member(c, ca) 2021 nr_rw_members++; 2022 2023 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 2024 
BUG_ON(!w->write_started); 2025 BUG_ON(w->write_allocated); 2026 BUG_ON(w->write_done); 2027 2028 j->write_start_time = local_clock(); 2029 2030 spin_lock(&j->lock); 2031 if (nr_rw_members > 1) 2032 w->separate_flush = true; 2033 2034 ret = bch2_journal_write_pick_flush(j, w); 2035 spin_unlock(&j->lock); 2036 if (ret) 2037 goto err; 2038 2039 mutex_lock(&j->buf_lock); 2040 journal_buf_realloc(j, w); 2041 2042 ret = bch2_journal_write_prep(j, w); 2043 mutex_unlock(&j->buf_lock); 2044 if (ret) 2045 goto err; 2046 2047 j->entry_bytes_written += vstruct_bytes(w->data); 2048 2049 while (1) { 2050 spin_lock(&j->lock); 2051 ret = journal_write_alloc(j, w); 2052 if (!ret || !j->can_discard) 2053 break; 2054 2055 spin_unlock(&j->lock); 2056 bch2_journal_do_discards(j); 2057 } 2058 2059 if (ret && !bch2_journal_error(j)) { 2060 struct printbuf buf = PRINTBUF; 2061 buf.atomic++; 2062 2063 prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), 2064 le64_to_cpu(w->data->seq), 2065 vstruct_sectors(w->data, c->block_bits), 2066 bch2_err_str(ret)); 2067 __bch2_journal_debug_to_text(&buf, j); 2068 spin_unlock(&j->lock); 2069 bch2_print_string_as_lines(KERN_ERR, buf.buf); 2070 printbuf_exit(&buf); 2071 } 2072 if (ret) 2073 goto err; 2074 2075 /* 2076 * write is allocated, no longer need to account for it in 2077 * bch2_journal_space_available(): 2078 */ 2079 w->sectors = 0; 2080 w->write_allocated = true; 2081 2082 /* 2083 * journal entry has been compacted and allocated, recalculate space 2084 * available: 2085 */ 2086 bch2_journal_space_available(j); 2087 bch2_journal_do_writes(j); 2088 spin_unlock(&j->lock); 2089 2090 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); 2091 2092 if (c->opts.nochanges) 2093 goto no_io; 2094 2095 /* 2096 * Mark journal replicas before we submit the write to guarantee 2097 * recovery will find the journal entries after a crash. 
2098 */ 2099 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 2100 w->devs_written); 2101 ret = bch2_mark_replicas(c, &replicas.e); 2102 if (ret) 2103 goto err; 2104 2105 if (!JSET_NO_FLUSH(w->data)) 2106 continue_at(cl, journal_write_preflush, j->wq); 2107 else 2108 continue_at(cl, journal_write_submit, j->wq); 2109 return; 2110 no_io: 2111 continue_at(cl, journal_write_done, j->wq); 2112 return; 2113 err: 2114 bch2_fatal_error(c); 2115 continue_at(cl, journal_write_done, j->wq); 2116 } 2117