1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "btree_io.h" 6 #include "btree_update_interior.h" 7 #include "btree_write_buffer.h" 8 #include "buckets.h" 9 #include "checksum.h" 10 #include "disk_groups.h" 11 #include "error.h" 12 #include "journal.h" 13 #include "journal_io.h" 14 #include "journal_reclaim.h" 15 #include "journal_seq_blacklist.h" 16 #include "replicas.h" 17 #include "sb-clean.h" 18 #include "trace.h" 19 20 #include <linux/ioprio.h> 21 #include <linux/string_choices.h> 22 #include <linux/sched/sysctl.h> 23 24 void bch2_journal_pos_from_member_info_set(struct bch_fs *c) 25 { 26 lockdep_assert_held(&c->sb_lock); 27 28 for_each_member_device(c, ca) { 29 struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); 30 31 m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx); 32 m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free); 33 } 34 } 35 36 void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) 37 { 38 mutex_lock(&c->sb_lock); 39 for_each_member_device(c, ca) { 40 struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); 41 42 unsigned idx = le32_to_cpu(m.last_journal_bucket); 43 if (idx < ca->journal.nr) 44 ca->journal.cur_idx = idx; 45 unsigned offset = le32_to_cpu(m.last_journal_bucket_offset); 46 if (offset <= ca->mi.bucket_size) 47 ca->journal.sectors_free = ca->mi.bucket_size - offset; 48 } 49 mutex_unlock(&c->sb_lock); 50 } 51 52 static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p) 53 { 54 struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev); 55 prt_printf(out, "%s %u:%u:%u (sector %llu)", 56 ca ? ca->name : "(invalid dev)", 57 p->dev, p->bucket, p->bucket_offset, p->sector); 58 bch2_dev_put(ca); 59 } 60 61 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) 62 { 63 darray_for_each(j->ptrs, i) { 64 if (i != j->ptrs.data) 65 prt_printf(out, " "); 66 bch2_journal_ptr_to_text(out, c, i); 67 } 68 } 69 70 static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j) 71 { 72 for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) { 73 struct jset_entry_datetime *datetime = 74 container_of(entry, struct jset_entry_datetime, entry); 75 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 76 break; 77 } 78 } 79 80 static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, 81 struct journal_replay *j) 82 { 83 prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); 84 bch2_journal_datetime_to_text(out, &j->j); 85 prt_char(out, ' '); 86 bch2_journal_ptrs_to_text(out, c, j); 87 } 88 89 static struct nonce journal_nonce(const struct jset *jset) 90 { 91 return (struct nonce) {{ 92 [0] = 0, 93 [1] = ((__le32 *) &jset->seq)[0], 94 [2] = ((__le32 *) &jset->seq)[1], 95 [3] = BCH_NONCE_JOURNAL, 96 }}; 97 } 98 99 static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum) 100 { 101 if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) { 102 *csum = (struct bch_csum) {}; 103 return false; 104 } 105 106 *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j); 107 return !bch2_crc_cmp(j->csum, *csum); 108 } 109 110 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) 111 { 112 return (seq - c->journal_entries_base_seq) & (~0U >> 1); 113 } 114 115 static void __journal_replay_free(struct bch_fs *c, 116 struct journal_replay *i) 117 { 118 struct 
journal_replay **p =
		genradix_ptr(&c->journal_entries,
			     journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));

	BUG_ON(*p != i);
	*p = NULL;
	kvfree(i);
}

static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
{
	if (blacklisted)
		i->ignore_blacklisted = true;
	else
		i->ignore_not_dirty = true;

	if (!c->opts.read_entire_journal)
		__journal_replay_free(c, i);
}

struct journal_list {
	struct closure		cl;
	u64			last_seq;
	struct mutex		lock;
	int			ret;
};

#define JOURNAL_ENTRY_ADD_OK		0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
			     struct journal_ptr entry_ptr,
			     struct journal_list *jlist, struct jset *j)
{
	struct genradix_iter iter;
	struct journal_replay **_i, *i, *dup;
	size_t bytes = vstruct_bytes(j);
	u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
	struct printbuf buf = PRINTBUF;
	int ret = JOURNAL_ENTRY_ADD_OK;

	if (!c->journal.oldest_seq_found_ondisk ||
	    le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
		c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);

	/* Is this entry older than the range we need? */
	if (!c->opts.read_entire_journal &&
	    le64_to_cpu(j->seq) < jlist->last_seq)
		return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;

	/*
	 * genradixes are indexed by a ulong, not a u64, so we can't index them
	 * by sequence number directly: Assume instead that they will all fall
	 * within the range of +-2 billion of the first one we find.
	 */
	if (!c->journal_entries_base_seq)
		c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);

	/* Drop entries we don't need anymore */
	if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
		genradix_for_each_from(&c->journal_entries, iter, _i,
				       journal_entry_radix_idx(c, jlist->last_seq)) {
			i = *_i;

			if (journal_replay_ignore(i))
				continue;

			if (le64_to_cpu(i->j.seq) >= last_seq)
				break;

			journal_replay_free(c, i, false);
		}
	}

	jlist->last_seq = max(jlist->last_seq, last_seq);

	_i = genradix_ptr_alloc(&c->journal_entries,
				journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
				GFP_KERNEL);
	if (!_i)
		return bch_err_throw(c, ENOMEM_journal_entry_add);

	/*
	 * Duplicate journal entries?
If so we want the one that didn't have a 206 * checksum error: 207 */ 208 dup = *_i; 209 if (dup) { 210 bool identical = bytes == vstruct_bytes(&dup->j) && 211 !memcmp(j, &dup->j, bytes); 212 bool not_identical = !identical && 213 entry_ptr.csum_good && 214 dup->csum_good; 215 216 bool same_device = false; 217 darray_for_each(dup->ptrs, ptr) 218 if (ptr->dev == ca->dev_idx) 219 same_device = true; 220 221 ret = darray_push(&dup->ptrs, entry_ptr); 222 if (ret) 223 goto out; 224 225 bch2_journal_replay_to_text(&buf, c, dup); 226 227 fsck_err_on(same_device, 228 c, journal_entry_dup_same_device, 229 "duplicate journal entry on same device\n%s", 230 buf.buf); 231 232 fsck_err_on(not_identical, 233 c, journal_entry_replicas_data_mismatch, 234 "found duplicate but non identical journal entries\n%s", 235 buf.buf); 236 237 if (entry_ptr.csum_good && !identical) 238 goto replace; 239 240 goto out; 241 } 242 replace: 243 i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 244 if (!i) 245 return bch_err_throw(c, ENOMEM_journal_entry_add); 246 247 darray_init(&i->ptrs); 248 i->csum_good = entry_ptr.csum_good; 249 i->ignore_blacklisted = false; 250 i->ignore_not_dirty = false; 251 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 252 253 if (dup) { 254 /* The first ptr should represent the jset we kept: */ 255 darray_for_each(dup->ptrs, ptr) 256 darray_push(&i->ptrs, *ptr); 257 __journal_replay_free(c, dup); 258 } else { 259 darray_push(&i->ptrs, entry_ptr); 260 } 261 262 *_i = i; 263 out: 264 fsck_err: 265 printbuf_exit(&buf); 266 return ret; 267 } 268 269 /* this fills in a range with empty jset_entries: */ 270 static void journal_entry_null_range(void *start, void *end) 271 { 272 struct jset_entry *entry; 273 274 for (entry = start; entry != end; entry = vstruct_next(entry)) 275 memset(entry, 0, sizeof(*entry)); 276 } 277 278 #define JOURNAL_ENTRY_REREAD 5 279 #define JOURNAL_ENTRY_NONE 6 280 #define JOURNAL_ENTRY_BAD 7 281 282 static void journal_entry_err_msg(struct printbuf *out, 283 u32 version, 284 struct jset *jset, 285 struct jset_entry *entry) 286 { 287 prt_str(out, "invalid journal entry, version="); 288 bch2_version_to_text(out, version); 289 290 if (entry) { 291 prt_str(out, " type="); 292 bch2_prt_jset_entry_type(out, entry->type); 293 } 294 295 if (!jset) { 296 prt_printf(out, " in superblock"); 297 } else { 298 299 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); 300 301 if (entry) 302 prt_printf(out, " offset=%zi/%u", 303 (u64 *) entry - jset->_data, 304 le32_to_cpu(jset->u64s)); 305 } 306 307 prt_str(out, ": "); 308 } 309 310 #define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ 311 ({ \ 312 struct printbuf _buf = PRINTBUF; \ 313 \ 314 journal_entry_err_msg(&_buf, version, jset, entry); \ 315 prt_printf(&_buf, msg, ##__VA_ARGS__); \ 316 \ 317 switch (from.flags & BCH_VALIDATE_write) { \ 318 case READ: \ 319 mustfix_fsck_err(c, _err, "%s", _buf.buf); \ 320 break; \ 321 case WRITE: \ 322 bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ 323 if (bch2_fs_inconsistent(c, \ 324 "corrupt metadata before write: %s\n", _buf.buf)) {\ 325 ret = bch_err_throw(c, fsck_errors_not_fixed); \ 326 goto fsck_err; \ 327 } \ 328 break; \ 329 } \ 330 \ 331 printbuf_exit(&_buf); \ 332 true; \ 333 }) 334 335 #define journal_entry_err_on(cond, ...) \ 336 ((cond) ? 
journal_entry_err(__VA_ARGS__) : false) 337 338 #define FSCK_DELETED_KEY 5 339 340 static int journal_validate_key(struct bch_fs *c, 341 struct jset *jset, 342 struct jset_entry *entry, 343 struct bkey_i *k, 344 struct bkey_validate_context from, 345 unsigned version, int big_endian) 346 { 347 enum bch_validate_flags flags = from.flags; 348 int write = flags & BCH_VALIDATE_write; 349 void *next = vstruct_next(entry); 350 int ret = 0; 351 352 if (journal_entry_err_on(!k->k.u64s, 353 c, version, jset, entry, 354 journal_entry_bkey_u64s_0, 355 "k->u64s 0")) { 356 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 357 journal_entry_null_range(vstruct_next(entry), next); 358 return FSCK_DELETED_KEY; 359 } 360 361 if (journal_entry_err_on((void *) bkey_next(k) > 362 (void *) vstruct_next(entry), 363 c, version, jset, entry, 364 journal_entry_bkey_past_end, 365 "extends past end of journal entry")) { 366 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 367 journal_entry_null_range(vstruct_next(entry), next); 368 return FSCK_DELETED_KEY; 369 } 370 371 if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, 372 c, version, jset, entry, 373 journal_entry_bkey_bad_format, 374 "bad format %u", k->k.format)) { 375 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 376 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 377 journal_entry_null_range(vstruct_next(entry), next); 378 return FSCK_DELETED_KEY; 379 } 380 381 if (!write) 382 bch2_bkey_compat(from.level, from.btree, version, big_endian, 383 write, NULL, bkey_to_packed(k)); 384 385 ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from); 386 if (ret == -BCH_ERR_fsck_delete_bkey) { 387 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 388 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 389 journal_entry_null_range(vstruct_next(entry), next); 390 return FSCK_DELETED_KEY; 391 } 392 if (ret) 393 goto fsck_err; 394 395 if (write) 396 bch2_bkey_compat(from.level, from.btree, version, big_endian, 397 write, NULL, bkey_to_packed(k)); 398 fsck_err: 399 return ret; 400 } 401 402 static int journal_entry_btree_keys_validate(struct bch_fs *c, 403 struct jset *jset, 404 struct jset_entry *entry, 405 unsigned version, int big_endian, 406 struct bkey_validate_context from) 407 { 408 struct bkey_i *k = entry->start; 409 410 from.level = entry->level; 411 from.btree = entry->btree_id; 412 413 while (k != vstruct_last(entry)) { 414 int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); 415 if (ret == FSCK_DELETED_KEY) 416 continue; 417 else if (ret) 418 return ret; 419 420 k = bkey_next(k); 421 } 422 423 return 0; 424 } 425 426 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 427 struct jset_entry *entry) 428 { 429 bool first = true; 430 431 jset_entry_for_each_key(entry, k) { 432 /* We may be called on entries that haven't been validated: */ 433 if (!k->k.u64s) 434 break; 435 436 if (!first) { 437 prt_newline(out); 438 bch2_prt_jset_entry_type(out, entry->type); 439 prt_str(out, ": "); 440 } 441 bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); 442 prt_char(out, ' '); 443 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); 444 first = false; 445 } 446 } 447 448 static int journal_entry_btree_root_validate(struct bch_fs *c, 449 struct jset *jset, 450 struct jset_entry *entry, 451 unsigned version, int big_endian, 452 struct bkey_validate_context from) 453 { 454 struct bkey_i *k = entry->start; 455 int ret = 0; 456 457 from.root = true; 458 from.level = entry->level + 1; 459 
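	/* validate the root pointer as a key one level above the root node */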
from.btree = entry->btree_id; 460 461 if (journal_entry_err_on(!entry->u64s || 462 le16_to_cpu(entry->u64s) != k->k.u64s, 463 c, version, jset, entry, 464 journal_entry_btree_root_bad_size, 465 "invalid btree root journal entry: wrong number of keys")) { 466 void *next = vstruct_next(entry); 467 /* 468 * we don't want to null out this jset_entry, 469 * just the contents, so that later we can tell 470 * we were _supposed_ to have a btree root 471 */ 472 entry->u64s = 0; 473 journal_entry_null_range(vstruct_next(entry), next); 474 return 0; 475 } 476 477 ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); 478 if (ret == FSCK_DELETED_KEY) 479 ret = 0; 480 fsck_err: 481 return ret; 482 } 483 484 static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, 485 struct jset_entry *entry) 486 { 487 journal_entry_btree_keys_to_text(out, c, entry); 488 } 489 490 static int journal_entry_prio_ptrs_validate(struct bch_fs *c, 491 struct jset *jset, 492 struct jset_entry *entry, 493 unsigned version, int big_endian, 494 struct bkey_validate_context from) 495 { 496 /* obsolete, don't care: */ 497 return 0; 498 } 499 500 static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 501 struct jset_entry *entry) 502 { 503 } 504 505 static int journal_entry_blacklist_validate(struct bch_fs *c, 506 struct jset *jset, 507 struct jset_entry *entry, 508 unsigned version, int big_endian, 509 struct bkey_validate_context from) 510 { 511 int ret = 0; 512 513 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, 514 c, version, jset, entry, 515 journal_entry_blacklist_bad_size, 516 "invalid journal seq blacklist entry: bad size")) { 517 journal_entry_null_range(entry, vstruct_next(entry)); 518 } 519 fsck_err: 520 return ret; 521 } 522 523 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, 524 struct jset_entry *entry) 525 { 526 struct jset_entry_blacklist *bl = 527 container_of(entry, struct jset_entry_blacklist, entry); 528 529 prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); 530 } 531 532 static int journal_entry_blacklist_v2_validate(struct bch_fs *c, 533 struct jset *jset, 534 struct jset_entry *entry, 535 unsigned version, int big_endian, 536 struct bkey_validate_context from) 537 { 538 struct jset_entry_blacklist_v2 *bl_entry; 539 int ret = 0; 540 541 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, 542 c, version, jset, entry, 543 journal_entry_blacklist_v2_bad_size, 544 "invalid journal seq blacklist entry: bad size")) { 545 journal_entry_null_range(entry, vstruct_next(entry)); 546 goto out; 547 } 548 549 bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); 550 551 if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > 552 le64_to_cpu(bl_entry->end), 553 c, version, jset, entry, 554 journal_entry_blacklist_v2_start_past_end, 555 "invalid journal seq blacklist entry: start > end")) { 556 journal_entry_null_range(entry, vstruct_next(entry)); 557 } 558 out: 559 fsck_err: 560 return ret; 561 } 562 563 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, 564 struct jset_entry *entry) 565 { 566 struct jset_entry_blacklist_v2 *bl = 567 container_of(entry, struct jset_entry_blacklist_v2, entry); 568 569 prt_printf(out, "start=%llu end=%llu", 570 le64_to_cpu(bl->start), 571 le64_to_cpu(bl->end)); 572 } 573 574 static int journal_entry_usage_validate(struct bch_fs *c, 575 struct jset *jset, 576 struct jset_entry *entry, 577 unsigned version, int 
big_endian, 578 struct bkey_validate_context from) 579 { 580 struct jset_entry_usage *u = 581 container_of(entry, struct jset_entry_usage, entry); 582 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 583 int ret = 0; 584 585 if (journal_entry_err_on(bytes < sizeof(*u), 586 c, version, jset, entry, 587 journal_entry_usage_bad_size, 588 "invalid journal entry usage: bad size")) { 589 journal_entry_null_range(entry, vstruct_next(entry)); 590 return ret; 591 } 592 593 fsck_err: 594 return ret; 595 } 596 597 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, 598 struct jset_entry *entry) 599 { 600 struct jset_entry_usage *u = 601 container_of(entry, struct jset_entry_usage, entry); 602 603 prt_str(out, "type="); 604 bch2_prt_fs_usage_type(out, u->entry.btree_id); 605 prt_printf(out, " v=%llu", le64_to_cpu(u->v)); 606 } 607 608 static int journal_entry_data_usage_validate(struct bch_fs *c, 609 struct jset *jset, 610 struct jset_entry *entry, 611 unsigned version, int big_endian, 612 struct bkey_validate_context from) 613 { 614 struct jset_entry_data_usage *u = 615 container_of(entry, struct jset_entry_data_usage, entry); 616 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 617 struct printbuf err = PRINTBUF; 618 int ret = 0; 619 620 if (journal_entry_err_on(bytes < sizeof(*u) || 621 bytes < sizeof(*u) + u->r.nr_devs, 622 c, version, jset, entry, 623 journal_entry_data_usage_bad_size, 624 "invalid journal entry usage: bad size")) { 625 journal_entry_null_range(entry, vstruct_next(entry)); 626 goto out; 627 } 628 629 if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err), 630 c, version, jset, entry, 631 journal_entry_data_usage_bad_size, 632 "invalid journal entry usage: %s", err.buf)) { 633 journal_entry_null_range(entry, vstruct_next(entry)); 634 goto out; 635 } 636 out: 637 fsck_err: 638 printbuf_exit(&err); 639 return ret; 640 } 641 642 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, 643 struct jset_entry *entry) 644 { 645 struct jset_entry_data_usage *u = 646 container_of(entry, struct jset_entry_data_usage, entry); 647 648 bch2_replicas_entry_to_text(out, &u->r); 649 prt_printf(out, "=%llu", le64_to_cpu(u->v)); 650 } 651 652 static int journal_entry_clock_validate(struct bch_fs *c, 653 struct jset *jset, 654 struct jset_entry *entry, 655 unsigned version, int big_endian, 656 struct bkey_validate_context from) 657 { 658 struct jset_entry_clock *clock = 659 container_of(entry, struct jset_entry_clock, entry); 660 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 661 int ret = 0; 662 663 if (journal_entry_err_on(bytes != sizeof(*clock), 664 c, version, jset, entry, 665 journal_entry_clock_bad_size, 666 "bad size")) { 667 journal_entry_null_range(entry, vstruct_next(entry)); 668 return ret; 669 } 670 671 if (journal_entry_err_on(clock->rw > 1, 672 c, version, jset, entry, 673 journal_entry_clock_bad_rw, 674 "bad rw")) { 675 journal_entry_null_range(entry, vstruct_next(entry)); 676 return ret; 677 } 678 679 fsck_err: 680 return ret; 681 } 682 683 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, 684 struct jset_entry *entry) 685 { 686 struct jset_entry_clock *clock = 687 container_of(entry, struct jset_entry_clock, entry); 688 689 prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time)); 690 } 691 692 static int journal_entry_dev_usage_validate(struct bch_fs *c, 693 struct jset *jset, 694 struct jset_entry 
*entry, 695 unsigned version, int big_endian, 696 struct bkey_validate_context from) 697 { 698 struct jset_entry_dev_usage *u = 699 container_of(entry, struct jset_entry_dev_usage, entry); 700 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 701 unsigned expected = sizeof(*u); 702 int ret = 0; 703 704 if (journal_entry_err_on(bytes < expected, 705 c, version, jset, entry, 706 journal_entry_dev_usage_bad_size, 707 "bad size (%u < %u)", 708 bytes, expected)) { 709 journal_entry_null_range(entry, vstruct_next(entry)); 710 return ret; 711 } 712 713 if (journal_entry_err_on(u->pad, 714 c, version, jset, entry, 715 journal_entry_dev_usage_bad_pad, 716 "bad pad")) { 717 journal_entry_null_range(entry, vstruct_next(entry)); 718 return ret; 719 } 720 721 fsck_err: 722 return ret; 723 } 724 725 static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, 726 struct jset_entry *entry) 727 { 728 struct jset_entry_dev_usage *u = 729 container_of(entry, struct jset_entry_dev_usage, entry); 730 unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); 731 732 if (vstruct_bytes(entry) < sizeof(*u)) 733 return; 734 735 prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); 736 737 printbuf_indent_add(out, 2); 738 for (i = 0; i < nr_types; i++) { 739 prt_newline(out); 740 bch2_prt_data_type(out, i); 741 prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", 742 le64_to_cpu(u->d[i].buckets), 743 le64_to_cpu(u->d[i].sectors), 744 le64_to_cpu(u->d[i].fragmented)); 745 } 746 printbuf_indent_sub(out, 2); 747 } 748 749 static int journal_entry_log_validate(struct bch_fs *c, 750 struct jset *jset, 751 struct jset_entry *entry, 752 unsigned version, int big_endian, 753 struct bkey_validate_context from) 754 { 755 return 0; 756 } 757 758 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 759 struct jset_entry *entry) 760 { 761 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 762 763 prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d); 764 } 765 766 static int journal_entry_overwrite_validate(struct bch_fs *c, 767 struct jset *jset, 768 struct jset_entry *entry, 769 unsigned version, int big_endian, 770 struct bkey_validate_context from) 771 { 772 from.flags = 0; 773 return journal_entry_btree_keys_validate(c, jset, entry, 774 version, big_endian, from); 775 } 776 777 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 778 struct jset_entry *entry) 779 { 780 journal_entry_btree_keys_to_text(out, c, entry); 781 } 782 783 static int journal_entry_log_bkey_validate(struct bch_fs *c, 784 struct jset *jset, 785 struct jset_entry *entry, 786 unsigned version, int big_endian, 787 struct bkey_validate_context from) 788 { 789 from.flags = 0; 790 return journal_entry_btree_keys_validate(c, jset, entry, 791 version, big_endian, from); 792 } 793 794 static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c, 795 struct jset_entry *entry) 796 { 797 journal_entry_btree_keys_to_text(out, c, entry); 798 } 799 800 static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, 801 struct jset *jset, 802 struct jset_entry *entry, 803 unsigned version, int big_endian, 804 struct bkey_validate_context from) 805 { 806 return journal_entry_btree_keys_validate(c, jset, entry, 807 version, big_endian, from); 808 } 809 810 static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, 811 struct jset_entry *entry) 812 { 813 
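	/* write buffer keys are formatted exactly like ordinary btree keys: */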
journal_entry_btree_keys_to_text(out, c, entry); 814 } 815 816 static int journal_entry_datetime_validate(struct bch_fs *c, 817 struct jset *jset, 818 struct jset_entry *entry, 819 unsigned version, int big_endian, 820 struct bkey_validate_context from) 821 { 822 unsigned bytes = vstruct_bytes(entry); 823 unsigned expected = 16; 824 int ret = 0; 825 826 if (journal_entry_err_on(vstruct_bytes(entry) < expected, 827 c, version, jset, entry, 828 journal_entry_dev_usage_bad_size, 829 "bad size (%u < %u)", 830 bytes, expected)) { 831 journal_entry_null_range(entry, vstruct_next(entry)); 832 return ret; 833 } 834 fsck_err: 835 return ret; 836 } 837 838 static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, 839 struct jset_entry *entry) 840 { 841 struct jset_entry_datetime *datetime = 842 container_of(entry, struct jset_entry_datetime, entry); 843 844 bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); 845 } 846 847 struct jset_entry_ops { 848 int (*validate)(struct bch_fs *, struct jset *, 849 struct jset_entry *, unsigned, int, 850 struct bkey_validate_context); 851 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 852 }; 853 854 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 855 #define x(f, nr) \ 856 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 857 .validate = journal_entry_##f##_validate, \ 858 .to_text = journal_entry_##f##_to_text, \ 859 }, 860 BCH_JSET_ENTRY_TYPES() 861 #undef x 862 }; 863 864 int bch2_journal_entry_validate(struct bch_fs *c, 865 struct jset *jset, 866 struct jset_entry *entry, 867 unsigned version, int big_endian, 868 struct bkey_validate_context from) 869 { 870 return entry->type < BCH_JSET_ENTRY_NR 871 ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, 872 version, big_endian, from) 873 : 0; 874 } 875 876 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, 877 struct jset_entry *entry) 878 { 879 bch2_prt_jset_entry_type(out, entry->type); 880 881 if (entry->type < BCH_JSET_ENTRY_NR) { 882 prt_str(out, ": "); 883 bch2_jset_entry_ops[entry->type].to_text(out, c, entry); 884 } 885 } 886 887 static int jset_validate_entries(struct bch_fs *c, struct jset *jset, 888 enum bch_validate_flags flags) 889 { 890 struct bkey_validate_context from = { 891 .flags = flags, 892 .from = BKEY_VALIDATE_journal, 893 .journal_seq = le64_to_cpu(jset->seq), 894 }; 895 896 unsigned version = le32_to_cpu(jset->version); 897 int ret = 0; 898 899 vstruct_for_each(jset, entry) { 900 from.journal_offset = (u64 *) entry - jset->_data; 901 902 if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), 903 c, version, jset, entry, 904 journal_entry_past_jset_end, 905 "journal entry extends past end of jset")) { 906 jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); 907 break; 908 } 909 910 ret = bch2_journal_entry_validate(c, jset, entry, version, 911 JSET_BIG_ENDIAN(jset), from); 912 if (ret) 913 break; 914 } 915 fsck_err: 916 return ret; 917 } 918 919 static int jset_validate(struct bch_fs *c, 920 struct bch_dev *ca, 921 struct jset *jset, u64 sector, 922 enum bch_validate_flags flags) 923 { 924 struct bkey_validate_context from = { 925 .flags = flags, 926 .from = BKEY_VALIDATE_journal, 927 .journal_seq = le64_to_cpu(jset->seq), 928 }; 929 int ret = 0; 930 931 if (le64_to_cpu(jset->magic) != jset_magic(c)) 932 return JOURNAL_ENTRY_NONE; 933 934 unsigned version = le32_to_cpu(jset->version); 935 if (journal_entry_err_on(!bch2_version_compatible(version), 936 c, version, jset, NULL, 937 
jset_unsupported_version, 938 "%s sector %llu seq %llu: incompatible journal entry version %u.%u", 939 ca ? ca->name : c->name, 940 sector, le64_to_cpu(jset->seq), 941 BCH_VERSION_MAJOR(version), 942 BCH_VERSION_MINOR(version))) { 943 /* don't try to continue: */ 944 return -EINVAL; 945 } 946 947 if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), 948 c, version, jset, NULL, 949 jset_unknown_csum, 950 "%s sector %llu seq %llu: journal entry with unknown csum type %llu", 951 ca ? ca->name : c->name, 952 sector, le64_to_cpu(jset->seq), 953 JSET_CSUM_TYPE(jset))) 954 ret = JOURNAL_ENTRY_BAD; 955 956 /* last_seq is ignored when JSET_NO_FLUSH is true */ 957 if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && 958 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), 959 c, version, jset, NULL, 960 jset_last_seq_newer_than_seq, 961 "invalid journal entry: last_seq > seq (%llu > %llu)", 962 le64_to_cpu(jset->last_seq), 963 le64_to_cpu(jset->seq))) { 964 jset->last_seq = jset->seq; 965 return JOURNAL_ENTRY_BAD; 966 } 967 968 ret = jset_validate_entries(c, jset, flags); 969 fsck_err: 970 return ret; 971 } 972 973 static int jset_validate_early(struct bch_fs *c, 974 struct bch_dev *ca, 975 struct jset *jset, u64 sector, 976 unsigned bucket_sectors_left, 977 unsigned sectors_read) 978 { 979 struct bkey_validate_context from = { 980 .from = BKEY_VALIDATE_journal, 981 .journal_seq = le64_to_cpu(jset->seq), 982 }; 983 int ret = 0; 984 985 if (le64_to_cpu(jset->magic) != jset_magic(c)) 986 return JOURNAL_ENTRY_NONE; 987 988 unsigned version = le32_to_cpu(jset->version); 989 if (journal_entry_err_on(!bch2_version_compatible(version), 990 c, version, jset, NULL, 991 jset_unsupported_version, 992 "%s sector %llu seq %llu: unknown journal entry version %u.%u", 993 ca ? ca->name : c->name, 994 sector, le64_to_cpu(jset->seq), 995 BCH_VERSION_MAJOR(version), 996 BCH_VERSION_MINOR(version))) { 997 /* don't try to continue: */ 998 return -EINVAL; 999 } 1000 1001 size_t bytes = vstruct_bytes(jset); 1002 if (bytes > (sectors_read << 9) && 1003 sectors_read < bucket_sectors_left) 1004 return JOURNAL_ENTRY_REREAD; 1005 1006 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 1007 c, version, jset, NULL, 1008 jset_past_bucket_end, 1009 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 1010 ca ? 
ca->name : c->name, 1011 sector, le64_to_cpu(jset->seq), bytes)) 1012 le32_add_cpu(&jset->u64s, 1013 -((bytes - (bucket_sectors_left << 9)) / 8)); 1014 fsck_err: 1015 return ret; 1016 } 1017 1018 struct journal_read_buf { 1019 void *data; 1020 size_t size; 1021 }; 1022 1023 static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b, 1024 size_t new_size) 1025 { 1026 void *n; 1027 1028 /* the bios are sized for this many pages, max: */ 1029 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 1030 return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); 1031 1032 new_size = roundup_pow_of_two(new_size); 1033 n = kvmalloc(new_size, GFP_KERNEL); 1034 if (!n) 1035 return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); 1036 1037 kvfree(b->data); 1038 b->data = n; 1039 b->size = new_size; 1040 return 0; 1041 } 1042 1043 static int journal_read_bucket(struct bch_dev *ca, 1044 struct journal_read_buf *buf, 1045 struct journal_list *jlist, 1046 unsigned bucket) 1047 { 1048 struct bch_fs *c = ca->fs; 1049 struct journal_device *ja = &ca->journal; 1050 struct jset *j = NULL; 1051 unsigned sectors, sectors_read = 0; 1052 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 1053 end = offset + ca->mi.bucket_size; 1054 bool saw_bad = false, csum_good; 1055 int ret = 0; 1056 1057 pr_debug("reading %u", bucket); 1058 1059 while (offset < end) { 1060 if (!sectors_read) { 1061 struct bio *bio; 1062 unsigned nr_bvecs; 1063 reread: 1064 sectors_read = min_t(unsigned, 1065 end - offset, buf->size >> 9); 1066 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 1067 1068 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 1069 if (!bio) 1070 return bch_err_throw(c, ENOMEM_journal_read_bucket); 1071 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 1072 1073 bio->bi_iter.bi_sector = offset; 1074 bch2_bio_map(bio, buf->data, sectors_read << 9); 1075 1076 u64 submit_time = local_clock(); 1077 ret = submit_bio_wait(bio); 1078 kfree(bio); 1079 1080 if (!ret && bch2_meta_read_fault("journal")) 1081 ret = bch_err_throw(c, EIO_fault_injected); 1082 1083 bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, 1084 submit_time, !ret); 1085 1086 if (ret) { 1087 bch_err_dev_ratelimited(ca, 1088 "journal read error: sector %llu", offset); 1089 /* 1090 * We don't error out of the recovery process 1091 * here, since the relevant journal entry may be 1092 * found on a different device, and missing or 1093 * no journal entries will be handled later 1094 */ 1095 return 0; 1096 } 1097 1098 j = buf->data; 1099 } 1100 1101 ret = jset_validate_early(c, ca, j, offset, 1102 end - offset, sectors_read); 1103 switch (ret) { 1104 case 0: 1105 sectors = vstruct_sectors(j, c->block_bits); 1106 break; 1107 case JOURNAL_ENTRY_REREAD: 1108 if (vstruct_bytes(j) > buf->size) { 1109 ret = journal_read_buf_realloc(c, buf, 1110 vstruct_bytes(j)); 1111 if (ret) 1112 return ret; 1113 } 1114 goto reread; 1115 case JOURNAL_ENTRY_NONE: 1116 if (!saw_bad) 1117 return 0; 1118 /* 1119 * On checksum error we don't really trust the size 1120 * field of the journal entry we read, so try reading 1121 * again at next block boundary: 1122 */ 1123 sectors = block_sectors(c); 1124 goto next_block; 1125 default: 1126 return ret; 1127 } 1128 1129 if (le64_to_cpu(j->seq) > ja->highest_seq_found) { 1130 ja->highest_seq_found = le64_to_cpu(j->seq); 1131 ja->cur_idx = bucket; 1132 ja->sectors_free = ca->mi.bucket_size - 1133 bucket_remainder(ca, offset) - sectors; 1134 } 1135 1136 /* 1137 * This happens sometimes if we don't have discards on 
- 1138 * when we've partially overwritten a bucket with new 1139 * journal entries. We don't need the rest of the 1140 * bucket: 1141 */ 1142 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 1143 return 0; 1144 1145 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 1146 1147 struct bch_csum csum; 1148 csum_good = jset_csum_good(c, j, &csum); 1149 1150 bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); 1151 1152 if (!csum_good) { 1153 /* 1154 * Don't print an error here, we'll print the error 1155 * later if we need this journal entry 1156 */ 1157 saw_bad = true; 1158 } 1159 1160 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 1161 j->encrypted_start, 1162 vstruct_end(j) - (void *) j->encrypted_start); 1163 bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); 1164 1165 mutex_lock(&jlist->lock); 1166 ret = journal_entry_add(c, ca, (struct journal_ptr) { 1167 .csum_good = csum_good, 1168 .csum = csum, 1169 .dev = ca->dev_idx, 1170 .bucket = bucket, 1171 .bucket_offset = offset - 1172 bucket_to_sector(ca, ja->buckets[bucket]), 1173 .sector = offset, 1174 }, jlist, j); 1175 mutex_unlock(&jlist->lock); 1176 1177 switch (ret) { 1178 case JOURNAL_ENTRY_ADD_OK: 1179 break; 1180 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 1181 break; 1182 default: 1183 return ret; 1184 } 1185 next_block: 1186 pr_debug("next"); 1187 offset += sectors; 1188 sectors_read -= sectors; 1189 j = ((void *) j) + (sectors << 9); 1190 } 1191 1192 return 0; 1193 } 1194 1195 static CLOSURE_CALLBACK(bch2_journal_read_device) 1196 { 1197 closure_type(ja, struct journal_device, read); 1198 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1199 struct bch_fs *c = ca->fs; 1200 struct journal_list *jlist = 1201 container_of(cl->parent, struct journal_list, cl); 1202 struct journal_read_buf buf = { NULL, 0 }; 1203 unsigned i; 1204 int ret = 0; 1205 1206 if (!ja->nr) 1207 goto out; 1208 1209 ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE); 1210 if (ret) 1211 goto err; 1212 1213 pr_debug("%u journal buckets", ja->nr); 1214 1215 for (i = 0; i < ja->nr; i++) { 1216 ret = journal_read_bucket(ca, &buf, jlist, i); 1217 if (ret) 1218 goto err; 1219 } 1220 1221 /* 1222 * Set dirty_idx to indicate the entire journal is full and needs to be 1223 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1224 * pinned when it first runs: 1225 */ 1226 ja->discard_idx = ja->dirty_idx_ondisk = 1227 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1228 out: 1229 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1230 kvfree(buf.data); 1231 enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read); 1232 closure_return(cl); 1233 return; 1234 err: 1235 mutex_lock(&jlist->lock); 1236 jlist->ret = ret; 1237 mutex_unlock(&jlist->lock); 1238 goto out; 1239 } 1240 1241 noinline_for_stack 1242 static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j) 1243 { 1244 struct printbuf buf = PRINTBUF; 1245 enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j); 1246 bool have_good = false; 1247 1248 prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq)); 1249 bch2_journal_datetime_to_text(&buf, &j->j); 1250 prt_newline(&buf); 1251 1252 darray_for_each(j->ptrs, ptr) 1253 if (!ptr->csum_good) { 1254 bch2_journal_ptr_to_text(&buf, c, ptr); 1255 prt_char(&buf, ' '); 1256 bch2_csum_to_text(&buf, csum_type, ptr->csum); 1257 prt_newline(&buf); 1258 } else { 1259 have_good = true; 1260 } 1261 1262 
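	/* the checksum recorded in the copy of the entry we kept: */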
prt_printf(&buf, "should be "); 1263 bch2_csum_to_text(&buf, csum_type, j->j.csum); 1264 1265 if (have_good) 1266 prt_printf(&buf, "\n(had good copy on another device)"); 1267 1268 bch2_print_str(c, KERN_ERR, buf.buf); 1269 printbuf_exit(&buf); 1270 } 1271 1272 noinline_for_stack 1273 static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq) 1274 { 1275 struct printbuf buf = PRINTBUF; 1276 int ret = 0; 1277 1278 struct genradix_iter radix_iter; 1279 struct journal_replay *i, **_i, *prev = NULL; 1280 u64 seq = start_seq; 1281 1282 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1283 i = *_i; 1284 1285 if (journal_replay_ignore(i)) 1286 continue; 1287 1288 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1289 1290 while (seq < le64_to_cpu(i->j.seq)) { 1291 while (seq < le64_to_cpu(i->j.seq) && 1292 bch2_journal_seq_is_blacklisted(c, seq, false)) 1293 seq++; 1294 1295 if (seq == le64_to_cpu(i->j.seq)) 1296 break; 1297 1298 u64 missing_start = seq; 1299 1300 while (seq < le64_to_cpu(i->j.seq) && 1301 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1302 seq++; 1303 1304 u64 missing_end = seq - 1; 1305 1306 printbuf_reset(&buf); 1307 prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)", 1308 missing_start, missing_end, 1309 start_seq, end_seq); 1310 1311 prt_printf(&buf, "\nprev at "); 1312 if (prev) { 1313 bch2_journal_ptrs_to_text(&buf, c, prev); 1314 prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1315 } else 1316 prt_printf(&buf, "(none)"); 1317 1318 prt_printf(&buf, "\nnext at "); 1319 bch2_journal_ptrs_to_text(&buf, c, i); 1320 prt_printf(&buf, ", continue?"); 1321 1322 fsck_err(c, journal_entries_missing, "%s", buf.buf); 1323 } 1324 1325 prev = i; 1326 seq++; 1327 } 1328 fsck_err: 1329 printbuf_exit(&buf); 1330 return ret; 1331 } 1332 1333 int bch2_journal_read(struct bch_fs *c, 1334 u64 *last_seq, 1335 u64 *blacklist_seq, 1336 u64 *start_seq) 1337 { 1338 struct journal_list jlist; 1339 struct journal_replay *i, **_i; 1340 struct genradix_iter radix_iter; 1341 struct printbuf buf = PRINTBUF; 1342 bool degraded = false, last_write_torn = false; 1343 u64 seq; 1344 int ret = 0; 1345 1346 closure_init_stack(&jlist.cl); 1347 mutex_init(&jlist.lock); 1348 jlist.last_seq = 0; 1349 jlist.ret = 0; 1350 1351 for_each_member_device(c, ca) { 1352 if (!c->opts.fsck && 1353 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1354 continue; 1355 1356 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1357 ca->mi.state == BCH_MEMBER_STATE_ro) && 1358 enumerated_ref_tryget(&ca->io_ref[READ], 1359 BCH_DEV_READ_REF_journal_read)) 1360 closure_call(&ca->journal.read, 1361 bch2_journal_read_device, 1362 system_unbound_wq, 1363 &jlist.cl); 1364 else 1365 degraded = true; 1366 } 1367 1368 while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2)) 1369 ; 1370 1371 if (jlist.ret) 1372 return jlist.ret; 1373 1374 *last_seq = 0; 1375 *start_seq = 0; 1376 *blacklist_seq = 0; 1377 1378 /* 1379 * Find most recent flush entry, and ignore newer non flush entries - 1380 * those entries will be blacklisted: 1381 */ 1382 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1383 i = *_i; 1384 1385 if (journal_replay_ignore(i)) 1386 continue; 1387 1388 if (!*start_seq) 1389 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1390 1391 if (JSET_NO_FLUSH(&i->j)) { 1392 i->ignore_blacklisted = true; 1393 continue; 1394 } 1395 1396 if (!last_write_torn && !i->csum_good) { 1397 last_write_torn = true; 1398 
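			/*
			 * The newest flush entry failing its checksum is
			 * assumed to be a write torn by a crash; skip it and
			 * keep looking for an older, intact flush entry:
			 */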
i->ignore_blacklisted = true; 1399 continue; 1400 } 1401 1402 struct bkey_validate_context from = { 1403 .from = BKEY_VALIDATE_journal, 1404 .journal_seq = le64_to_cpu(i->j.seq), 1405 }; 1406 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1407 c, le32_to_cpu(i->j.version), &i->j, NULL, 1408 jset_last_seq_newer_than_seq, 1409 "invalid journal entry: last_seq > seq (%llu > %llu)", 1410 le64_to_cpu(i->j.last_seq), 1411 le64_to_cpu(i->j.seq))) 1412 i->j.last_seq = i->j.seq; 1413 1414 *last_seq = le64_to_cpu(i->j.last_seq); 1415 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1416 break; 1417 } 1418 1419 if (!*start_seq) { 1420 bch_info(c, "journal read done, but no entries found"); 1421 return 0; 1422 } 1423 1424 if (!*last_seq) { 1425 fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes, 1426 "journal read done, but no entries found after dropping non-flushes"); 1427 return 0; 1428 } 1429 1430 printbuf_reset(&buf); 1431 prt_printf(&buf, "journal read done, replaying entries %llu-%llu", 1432 *last_seq, *blacklist_seq - 1); 1433 if (*start_seq != *blacklist_seq) 1434 prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1); 1435 bch_info(c, "%s", buf.buf); 1436 1437 /* Drop blacklisted entries and entries older than last_seq: */ 1438 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1439 i = *_i; 1440 1441 if (journal_replay_ignore(i)) 1442 continue; 1443 1444 seq = le64_to_cpu(i->j.seq); 1445 if (seq < *last_seq) { 1446 journal_replay_free(c, i, false); 1447 continue; 1448 } 1449 1450 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1451 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1452 jset_seq_blacklisted, 1453 "found blacklisted journal entry %llu", seq); 1454 i->ignore_blacklisted = true; 1455 } 1456 } 1457 1458 ret = bch2_journal_check_for_missing(c, *last_seq, *blacklist_seq - 1); 1459 if (ret) 1460 goto err; 1461 1462 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1463 union bch_replicas_padded replicas = { 1464 .e.data_type = BCH_DATA_journal, 1465 .e.nr_devs = 0, 1466 .e.nr_required = 1, 1467 }; 1468 1469 i = *_i; 1470 if (journal_replay_ignore(i)) 1471 continue; 1472 1473 /* 1474 * Don't print checksum errors until we know we're going to use 1475 * a given journal entry: 1476 */ 1477 darray_for_each(i->ptrs, ptr) 1478 if (!ptr->csum_good) { 1479 bch2_journal_print_checksum_error(c, i); 1480 break; 1481 } 1482 1483 ret = jset_validate(c, 1484 bch2_dev_have_ref(c, i->ptrs.data[0].dev), 1485 &i->j, 1486 i->ptrs.data[0].sector, 1487 READ); 1488 if (ret) 1489 goto err; 1490 1491 darray_for_each(i->ptrs, ptr) 1492 replicas_entry_add_dev(&replicas.e, ptr->dev); 1493 1494 bch2_replicas_entry_sort(&replicas.e); 1495 1496 printbuf_reset(&buf); 1497 bch2_replicas_entry_to_text(&buf, &replicas.e); 1498 1499 if (!degraded && 1500 !bch2_replicas_marked(c, &replicas.e) && 1501 (le64_to_cpu(i->j.seq) == *last_seq || 1502 fsck_err(c, journal_entry_replicas_not_marked, 1503 "superblock not marked as containing replicas for journal entry %llu\n%s", 1504 le64_to_cpu(i->j.seq), buf.buf))) { 1505 ret = bch2_mark_replicas(c, &replicas.e); 1506 if (ret) 1507 goto err; 1508 } 1509 } 1510 err: 1511 fsck_err: 1512 printbuf_exit(&buf); 1513 return ret; 1514 } 1515 1516 /* journal write: */ 1517 1518 static void journal_advance_devs_to_next_bucket(struct journal *j, 1519 struct dev_alloc_list *devs, 1520 unsigned sectors, __le64 seq) 1521 { 1522 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1523 1524 guard(rcu)(); 1525 
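	/* rcu guards the c->devs[] dereference below: */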
darray_for_each(*devs, i) { 1526 struct bch_dev *ca = rcu_dereference(c->devs[*i]); 1527 if (!ca) 1528 continue; 1529 1530 struct journal_device *ja = &ca->journal; 1531 1532 if (sectors > ja->sectors_free && 1533 sectors <= ca->mi.bucket_size && 1534 bch2_journal_dev_buckets_available(j, ja, 1535 journal_space_discarded)) { 1536 ja->cur_idx = (ja->cur_idx + 1) % ja->nr; 1537 ja->sectors_free = ca->mi.bucket_size; 1538 1539 /* 1540 * ja->bucket_seq[ja->cur_idx] must always have 1541 * something sensible: 1542 */ 1543 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); 1544 } 1545 } 1546 } 1547 1548 static void __journal_write_alloc(struct journal *j, 1549 struct journal_buf *w, 1550 struct dev_alloc_list *devs, 1551 unsigned sectors, 1552 unsigned *replicas, 1553 unsigned replicas_want) 1554 { 1555 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1556 1557 darray_for_each(*devs, i) { 1558 struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE, 1559 BCH_DEV_WRITE_REF_journal_write); 1560 if (!ca) 1561 continue; 1562 1563 struct journal_device *ja = &ca->journal; 1564 1565 /* 1566 * Check that we can use this device, and aren't already using 1567 * it: 1568 */ 1569 if (!ca->mi.durability || 1570 ca->mi.state != BCH_MEMBER_STATE_rw || 1571 !ja->nr || 1572 bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1573 sectors > ja->sectors_free) { 1574 enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); 1575 continue; 1576 } 1577 1578 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1579 1580 bch2_bkey_append_ptr(&w->key, 1581 (struct bch_extent_ptr) { 1582 .offset = bucket_to_sector(ca, 1583 ja->buckets[ja->cur_idx]) + 1584 ca->mi.bucket_size - 1585 ja->sectors_free, 1586 .dev = ca->dev_idx, 1587 }); 1588 1589 ja->sectors_free -= sectors; 1590 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1591 1592 *replicas += ca->mi.durability; 1593 1594 if (*replicas >= replicas_want) 1595 break; 1596 } 1597 } 1598 1599 static int journal_write_alloc(struct journal *j, struct journal_buf *w, 1600 unsigned *replicas) 1601 { 1602 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1603 struct bch_devs_mask devs; 1604 struct dev_alloc_list devs_sorted; 1605 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1606 unsigned target = c->opts.metadata_target ?: 1607 c->opts.foreground_target; 1608 unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas); 1609 unsigned replicas_need = min_t(unsigned, replicas_want, 1610 READ_ONCE(c->opts.metadata_replicas_required)); 1611 bool advance_done = false; 1612 1613 retry_target: 1614 devs = target_rw_devs(c, BCH_DATA_journal, target); 1615 bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted); 1616 retry_alloc: 1617 __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want); 1618 1619 if (likely(*replicas >= replicas_want)) 1620 goto done; 1621 1622 if (!advance_done) { 1623 journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); 1624 advance_done = true; 1625 goto retry_alloc; 1626 } 1627 1628 if (*replicas < replicas_want && target) { 1629 /* Retry from all devices: */ 1630 target = 0; 1631 advance_done = false; 1632 goto retry_target; 1633 } 1634 done: 1635 BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); 1636 1637 #if 0 1638 /* 1639 * XXX: we need a way to alert the user when we go degraded for any 1640 * reason 1641 */ 1642 if (*replicas < min(replicas_want, 1643 dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) { 1644 } 1645 #endif 1646 1647 return *replicas >= 
replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; 1648 } 1649 1650 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1651 { 1652 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1653 1654 /* we aren't holding j->lock: */ 1655 unsigned new_size = READ_ONCE(j->buf_size_want); 1656 void *new_buf; 1657 1658 if (buf->buf_size >= new_size) 1659 return; 1660 1661 size_t btree_write_buffer_size = new_size / 64; 1662 1663 if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) 1664 return; 1665 1666 new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1667 if (!new_buf) 1668 return; 1669 1670 memcpy(new_buf, buf->data, buf->buf_size); 1671 1672 spin_lock(&j->lock); 1673 swap(buf->data, new_buf); 1674 swap(buf->buf_size, new_size); 1675 spin_unlock(&j->lock); 1676 1677 kvfree(new_buf); 1678 } 1679 1680 static CLOSURE_CALLBACK(journal_write_done) 1681 { 1682 closure_type(w, struct journal_buf, io); 1683 struct journal *j = container_of(w, struct journal, buf[w->idx]); 1684 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1685 union bch_replicas_padded replicas; 1686 u64 seq = le64_to_cpu(w->data->seq); 1687 int err = 0; 1688 1689 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) 1690 ? j->flush_write_time 1691 : j->noflush_write_time, j->write_start_time); 1692 1693 if (!w->devs_written.nr) { 1694 err = bch_err_throw(c, journal_write_err); 1695 } else { 1696 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1697 w->devs_written); 1698 err = bch2_mark_replicas(c, &replicas.e); 1699 } 1700 1701 if (err && !bch2_journal_error(j)) { 1702 struct printbuf buf = PRINTBUF; 1703 bch2_log_msg_start(c, &buf); 1704 1705 if (err == -BCH_ERR_journal_write_err) 1706 prt_printf(&buf, "unable to write journal to sufficient devices"); 1707 else 1708 prt_printf(&buf, "journal write error marking replicas: %s", bch2_err_str(err)); 1709 1710 bch2_fs_emergency_read_only2(c, &buf); 1711 1712 bch2_print_str(c, KERN_ERR, buf.buf); 1713 printbuf_exit(&buf); 1714 } 1715 1716 closure_debug_destroy(cl); 1717 1718 spin_lock(&j->lock); 1719 if (seq >= j->pin.front) 1720 journal_seq_pin(j, seq)->devs = w->devs_written; 1721 if (err && (!j->err_seq || seq < j->err_seq)) 1722 j->err_seq = seq; 1723 w->write_done = true; 1724 1725 if (!j->free_buf || j->free_buf_size < w->buf_size) { 1726 swap(j->free_buf, w->data); 1727 swap(j->free_buf_size, w->buf_size); 1728 } 1729 1730 if (w->data) { 1731 void *buf = w->data; 1732 w->data = NULL; 1733 w->buf_size = 0; 1734 1735 spin_unlock(&j->lock); 1736 kvfree(buf); 1737 spin_lock(&j->lock); 1738 } 1739 1740 bool completed = false; 1741 bool do_discards = false; 1742 1743 for (seq = journal_last_unwritten_seq(j); 1744 seq <= journal_cur_seq(j); 1745 seq++) { 1746 w = j->buf + (seq & JOURNAL_BUF_MASK); 1747 if (!w->write_done) 1748 break; 1749 1750 if (!j->err_seq && !w->noflush) { 1751 j->flushed_seq_ondisk = seq; 1752 j->last_seq_ondisk = w->last_seq; 1753 1754 closure_wake_up(&c->freelist_wait); 1755 bch2_reset_alloc_cursors(c); 1756 } 1757 1758 j->seq_ondisk = seq; 1759 1760 /* 1761 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1762 * more buckets: 1763 * 1764 * Must come before signaling write completion, for 1765 * bch2_fs_journal_stop(): 1766 */ 1767 if (j->watermark != BCH_WATERMARK_stripe) 1768 journal_reclaim_kick(&c->journal); 1769 1770 closure_wake_up(&w->wait); 1771 completed = true; 1772 } 1773 1774 if (completed) { 1775 bch2_journal_reclaim_fast(j); 1776 bch2_journal_space_available(j); 
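		/*
		 * Journal space was just recalculated; note the end of any
		 * "blocked on max writes in flight" period and wake up anyone
		 * waiting on journal reservations or write completion:
		 */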

		track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);

		journal_wake(j);
	}

	if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
	    j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
		struct journal_buf *buf = journal_cur_buf(j);
		long delta = buf->expires - jiffies;

		/*
		 * We don't close a journal entry to write it while there's
		 * previous entries still in flight - the current journal entry
		 * might want to be written now:
		 */
		mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
	}

	/*
	 * We don't typically trigger journal writes from here - the next journal
	 * write will be triggered immediately after the previous one is
	 * allocated, in bch2_journal_write() - but the journal write error path
	 * is special:
	 */
	bch2_journal_do_writes(j);
	spin_unlock(&j->lock);

	if (do_discards)
		bch2_do_discards(c);
}

static void journal_write_endio(struct bio *bio)
{
	struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
	struct bch_dev *ca = jbio->ca;
	struct journal *j = &ca->fs->journal;
	struct journal_buf *w = j->buf + jbio->buf_idx;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
				   jbio->submit_time, !bio->bi_status);

	if (bio->bi_status) {
		bch_err_dev_ratelimited(ca,
			"error writing journal entry %llu: %s",
			le64_to_cpu(w->data->seq),
			bch2_blk_status_to_str(bio->bi_status));

		unsigned long flags;
		spin_lock_irqsave(&j->err_lock, flags);
		bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
		spin_unlock_irqrestore(&j->err_lock, flags);
	}

	closure_put(&w->io);
	enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
}

static CLOSURE_CALLBACK(journal_write_submit)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	unsigned sectors = vstruct_sectors(w->data, c->block_bits);

	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);

		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
			     sectors);

		struct journal_device *ja = &ca->journal;
		struct journal_bio *jbio = ja->bio[w->idx];
		struct bio *bio = &jbio->bio;

		jbio->submit_time = local_clock();

		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
		bio->bi_iter.bi_sector = ptr->offset;
		bio->bi_end_io = journal_write_endio;
		bio->bi_private = ca;
		bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0);

		BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
		ca->prev_journal_sector = bio->bi_iter.bi_sector;

		if (!JSET_NO_FLUSH(w->data))
			bio->bi_opf |= REQ_FUA;
		if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
			bio->bi_opf |= REQ_PREFLUSH;

		bch2_bio_map(bio, w->data, sectors << 9);

		trace_and_count(c, journal_write, bio);
		closure_bio_submit(bio, cl);

		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
	}

	continue_at(cl, journal_write_done, j->wq);
}

static CLOSURE_CALLBACK(journal_write_preflush)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);

	/*
	 * Wait for previous journal writes to complete; they won't necessarily
	 * be flushed if they're still in flight
	 */
	if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
		spin_lock(&j->lock);
		if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
			closure_wait(&j->async_wait, cl);
			spin_unlock(&j->lock);
			continue_at(cl, journal_write_preflush, j->wq);
			return;
		}
		spin_unlock(&j->lock);
	}

	if (w->separate_flush) {
		for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) {
			enumerated_ref_get(&ca->io_ref[WRITE],
					   BCH_DEV_WRITE_REF_journal_write);

			struct journal_device *ja = &ca->journal;
			struct bio *bio = &ja->bio[w->idx]->bio;
			bio_reset(bio, ca->disk_sb.bdev,
				  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
			bio->bi_end_io = journal_write_endio;
			bio->bi_private = ca;
			closure_bio_submit(bio, cl);
		}

		continue_at(cl, journal_write_submit, j->wq);
	} else {
		/*
		 * no need to punt to another work item if we're not waiting on
		 * preflushes
		 */
		journal_write_submit(&cl->work);
	}
}

static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct jset_entry *start, *end;
	struct jset *jset = w->data;
	struct journal_keys_to_wb wb = { NULL };
	unsigned u64s;
	unsigned long btree_roots_have = 0;
	u64 seq = le64_to_cpu(jset->seq);
	int ret;

	/*
	 * Simple compaction, dropping empty jset_entries (from journal
	 * reservations that weren't fully used) and merging jset_entries that
	 * can be.
1939 * 1940 * If we wanted to be really fancy here, we could sort all the keys in 1941 * the jset and drop keys that were overwritten - probably not worth it: 1942 */ 1943 vstruct_for_each(jset, i) { 1944 unsigned u64s = le16_to_cpu(i->u64s); 1945 1946 /* Empty entry: */ 1947 if (!u64s) 1948 continue; 1949 1950 /* 1951 * New btree roots are set by journalling them; when the journal 1952 * entry gets written we have to propagate them to 1953 * c->btree_roots 1954 * 1955 * But, every journal entry we write has to contain all the 1956 * btree roots (at least for now); so after we copy btree roots 1957 * to c->btree_roots we have to get any missing btree roots and 1958 * add them to this journal entry: 1959 */ 1960 switch (i->type) { 1961 case BCH_JSET_ENTRY_btree_root: 1962 bch2_journal_entry_to_btree_root(c, i); 1963 __set_bit(i->btree_id, &btree_roots_have); 1964 break; 1965 case BCH_JSET_ENTRY_write_buffer_keys: 1966 EBUG_ON(!w->need_flush_to_write_buffer); 1967 1968 if (!wb.wb) 1969 bch2_journal_keys_to_write_buffer_start(c, &wb, seq); 1970 1971 jset_entry_for_each_key(i, k) { 1972 ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); 1973 if (ret) { 1974 bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s", 1975 bch2_err_str(ret)); 1976 bch2_journal_keys_to_write_buffer_end(c, &wb); 1977 return ret; 1978 } 1979 } 1980 i->type = BCH_JSET_ENTRY_btree_keys; 1981 break; 1982 } 1983 } 1984 1985 if (wb.wb) { 1986 ret = bch2_journal_keys_to_write_buffer_end(c, &wb); 1987 if (ret) { 1988 bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s", 1989 bch2_err_str(ret)); 1990 return ret; 1991 } 1992 } 1993 1994 spin_lock(&c->journal.lock); 1995 w->need_flush_to_write_buffer = false; 1996 spin_unlock(&c->journal.lock); 1997 1998 start = end = vstruct_last(jset); 1999 2000 end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); 2001 2002 struct jset_entry_datetime *d = 2003 container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); 2004 d->entry.type = BCH_JSET_ENTRY_datetime; 2005 d->seconds = cpu_to_le64(ktime_get_real_seconds()); 2006 2007 bch2_journal_super_entries_add_common(c, &end, seq); 2008 u64s = (u64 *) end - (u64 *) start; 2009 2010 WARN_ON(u64s > j->entry_u64s_reserved); 2011 2012 le32_add_cpu(&jset->u64s, u64s); 2013 2014 unsigned sectors = vstruct_sectors(jset, c->block_bits); 2015 2016 if (sectors > w->sectors) { 2017 bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 2018 vstruct_bytes(jset), w->sectors << 9, 2019 u64s, w->u64s_reserved, j->entry_u64s_reserved); 2020 return -EINVAL; 2021 } 2022 2023 return 0; 2024 } 2025 2026 static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w) 2027 { 2028 struct bch_fs *c = container_of(j, struct bch_fs, journal); 2029 struct jset *jset = w->data; 2030 u64 seq = le64_to_cpu(jset->seq); 2031 bool validate_before_checksum = false; 2032 int ret = 0; 2033 2034 jset->magic = cpu_to_le64(jset_magic(c)); 2035 jset->version = cpu_to_le32(c->sb.version); 2036 2037 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 2038 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 2039 2040 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 2041 j->last_empty_seq = seq; 2042 2043 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 2044 validate_before_checksum = true; 2045 2046 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 2047 validate_before_checksum = true; 2048 2049 if 
static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct jset *jset = w->data;
	u64 seq = le64_to_cpu(jset->seq);
	bool validate_before_checksum = false;
	int ret = 0;

	jset->magic = cpu_to_le64(jset_magic(c));
	jset->version = cpu_to_le32(c->sb.version);

	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));

	if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
		j->last_empty_seq = seq;

	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
		validate_before_checksum = true;

	if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
		validate_before_checksum = true;

	if (validate_before_checksum &&
	    (ret = jset_validate(c, NULL, jset, 0, WRITE)))
		return ret;

	ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
			   jset->encrypted_start,
			   vstruct_end(jset) - (void *) jset->encrypted_start);
	if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret)))
		return ret;

	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
				  journal_nonce(jset), jset);

	if (!validate_before_checksum &&
	    (ret = jset_validate(c, NULL, jset, 0, WRITE)))
		return ret;

	unsigned sectors = vstruct_sectors(jset, c->block_bits);
	unsigned bytes = vstruct_bytes(jset);
	memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
	return 0;
}

static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	int error = bch2_journal_error(j);

	/*
	 * If the journal is in an error state - we did an emergency shutdown -
	 * we prefer to continue doing journal writes. We just mark them as
	 * noflush so they'll never be used, but they'll still be visible to the
	 * list_journal tool - this helps in debugging.
	 *
	 * There's a caveat: the first journal write after marking the
	 * superblock dirty must always be a flush write, because on startup
	 * from a clean shutdown we didn't necessarily read the journal and the
	 * new journal write might overwrite whatever was in the journal
	 * previously - we can't leave the journal without any flush writes in
	 * it.
	 *
	 * So if we're in an error state, and we're still starting up, we don't
	 * write anything at all.
	 */
	if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
		return error;

	if (error ||
	    w->noflush ||
	    (!w->must_flush &&
	     time_before(jiffies, j->last_flush_write +
			 msecs_to_jiffies(c->opts.journal_flush_delay)) &&
	     test_bit(JOURNAL_may_skip_flush, &j->flags))) {
		w->noflush = true;
		SET_JSET_NO_FLUSH(w->data, true);
		w->data->last_seq = 0;
		w->last_seq = 0;

		j->nr_noflush_writes++;
	} else {
		w->must_flush = true;
		j->last_flush_write = jiffies;
		j->nr_flush_writes++;
		clear_bit(JOURNAL_need_flush_write, &j->flags);
	}

	return 0;
}

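/*
 * Main journal write path: decide whether this will be a flush or noflush
 * write, prepare and checksum the buffer, allocate space for it on the
 * journal devices (discarding and retrying if allocation fails), mark the
 * journal replicas, then hand off to the preflush/submit stages above.
 */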
CLOSURE_CALLBACK(bch2_journal_write)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	union bch_replicas_padded replicas;
	unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]);
	int ret;

	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
	BUG_ON(!w->write_started);
	BUG_ON(w->write_allocated);
	BUG_ON(w->write_done);

	j->write_start_time = local_clock();

	spin_lock(&j->lock);
	if (nr_rw_members > 1)
		w->separate_flush = true;

	ret = bch2_journal_write_pick_flush(j, w);
	spin_unlock(&j->lock);

	if (unlikely(ret))
		goto err;

	mutex_lock(&j->buf_lock);
	journal_buf_realloc(j, w);

	ret = bch2_journal_write_prep(j, w);
	mutex_unlock(&j->buf_lock);

	if (unlikely(ret))
		goto err;

	unsigned replicas_allocated = 0;
	while (1) {
		ret = journal_write_alloc(j, w, &replicas_allocated);
		if (!ret || !j->can_discard)
			break;

		bch2_journal_do_discards(j);
	}

	if (unlikely(ret))
		goto err_allocate_write;

	ret = bch2_journal_write_checksum(j, w);
	if (unlikely(ret))
		goto err;

	spin_lock(&j->lock);
	/*
	 * write is allocated, no longer need to account for it in
	 * bch2_journal_space_available():
	 */
	w->sectors = 0;
	w->write_allocated = true;
	j->entry_bytes_written += vstruct_bytes(w->data);

	/*
	 * journal entry has been compacted and allocated, recalculate space
	 * available:
	 */
	bch2_journal_space_available(j);
	bch2_journal_do_writes(j);
	spin_unlock(&j->lock);

	w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));

	/*
	 * Mark journal replicas before we submit the write to guarantee
	 * recovery will find the journal entries after a crash.
	 */
	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
				 w->devs_written);
	ret = bch2_mark_replicas(c, &replicas.e);
	if (ret)
		goto err;

	if (c->opts.nochanges)
		goto no_io;

	if (!JSET_NO_FLUSH(w->data))
		continue_at(cl, journal_write_preflush, j->wq);
	else
		continue_at(cl, journal_write_submit, j->wq);
	return;
err_allocate_write:
	if (!bch2_journal_error(j)) {
		struct printbuf buf = PRINTBUF;

		bch2_journal_debug_to_text(&buf, j);
		prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
			   le64_to_cpu(w->data->seq),
			   vstruct_sectors(w->data, c->block_bits),
			   bch2_err_str(ret));
		bch2_print_str(c, KERN_ERR, buf.buf);
		printbuf_exit(&buf);
	}
err:
	bch2_fatal_error(c);
no_io:
	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
		enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
	}

	continue_at(cl, journal_write_done, j->wq);
}