// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
#include "sb-clean.h"
#include "trace.h"

#include <linux/ioprio.h>
#include <linux/string_choices.h>
#include <linux/sched/sysctl.h>

void bch2_journal_pos_from_member_info_set(struct bch_fs *c)
{
	lockdep_assert_held(&c->sb_lock);

	for_each_member_device(c, ca) {
		struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);

		m->last_journal_bucket = cpu_to_le32(ca->journal.cur_idx);
		m->last_journal_bucket_offset = cpu_to_le32(ca->mi.bucket_size - ca->journal.sectors_free);
	}
}

void bch2_journal_pos_from_member_info_resume(struct bch_fs *c)
{
	mutex_lock(&c->sb_lock);
	for_each_member_device(c, ca) {
		struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx);

		unsigned idx = le32_to_cpu(m.last_journal_bucket);
		if (idx < ca->journal.nr)
			ca->journal.cur_idx = idx;
		unsigned offset = le32_to_cpu(m.last_journal_bucket_offset);
		if (offset <= ca->mi.bucket_size)
			ca->journal.sectors_free = ca->mi.bucket_size - offset;
	}
	mutex_unlock(&c->sb_lock);
}

static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p)
{
	struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev);
	prt_printf(out, "%s %u:%u:%u (sector %llu)",
		   ca ? ca->name : "(invalid dev)",
		   p->dev, p->bucket, p->bucket_offset, p->sector);
	bch2_dev_put(ca);
}

void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j)
{
	darray_for_each(j->ptrs, i) {
		if (i != j->ptrs.data)
			prt_printf(out, " ");
		bch2_journal_ptr_to_text(out, c, i);
	}
}

static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j)
{
	for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) {
		struct jset_entry_datetime *datetime =
			container_of(entry, struct jset_entry_datetime, entry);
		bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
		break;
	}
}

static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
					struct journal_replay *j)
{
	prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
	bch2_journal_datetime_to_text(out, &j->j);
	prt_char(out, ' ');
	bch2_journal_ptrs_to_text(out, c, j);
}

static struct nonce journal_nonce(const struct jset *jset)
{
	return (struct nonce) {{
		[0] = 0,
		[1] = ((__le32 *) &jset->seq)[0],
		[2] = ((__le32 *) &jset->seq)[1],
		[3] = BCH_NONCE_JOURNAL,
	}};
}

static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
{
	if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
		*csum = (struct bch_csum) {};
		return false;
	}

	*csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
	return !bch2_crc_cmp(j->csum, *csum);
}

static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
{
	return (seq - c->journal_entries_base_seq) & (~0U >> 1);
}

static void __journal_replay_free(struct bch_fs *c,
				  struct journal_replay *i)
{
	struct journal_replay **p =
		genradix_ptr(&c->journal_entries,
			     journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));

	BUG_ON(*p != i);
	*p = NULL;
	kvfree(i);
}

static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
{
	if (blacklisted)
		i->ignore_blacklisted = true;
	else
		i->ignore_not_dirty = true;

	if (!c->opts.read_entire_journal)
		__journal_replay_free(c, i);
}

struct journal_list {
	struct closure		cl;
	u64			last_seq;
	struct mutex		lock;
	int			ret;
};

#define JOURNAL_ENTRY_ADD_OK		0
#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE	5

/*
 * Given a journal entry we just read, add it to the list of journal entries to
 * be replayed:
 */
static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
			     struct journal_ptr entry_ptr,
			     struct journal_list *jlist, struct jset *j)
{
	struct genradix_iter iter;
	struct journal_replay **_i, *i, *dup;
	size_t bytes = vstruct_bytes(j);
	u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
	struct printbuf buf = PRINTBUF;
	int ret = JOURNAL_ENTRY_ADD_OK;

	if (last_seq && c->opts.journal_rewind)
		last_seq = min(last_seq, c->opts.journal_rewind);

	if (!c->journal.oldest_seq_found_ondisk ||
	    le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk)
		c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq);

	/* Is this entry older than the range we need? */
	if (!c->opts.read_entire_journal &&
	    le64_to_cpu(j->seq) < jlist->last_seq)
		return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;

	/*
	 * genradixes are indexed by a ulong, not a u64, so we can't index them
	 * by sequence number directly: Assume instead that they will all fall
	 * within the range of +-2 billion of the first one we find.
	 */
	if (!c->journal_entries_base_seq)
		c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);

	/* Drop entries we don't need anymore */
	if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
		genradix_for_each_from(&c->journal_entries, iter, _i,
				       journal_entry_radix_idx(c, jlist->last_seq)) {
			i = *_i;

			if (journal_replay_ignore(i))
				continue;

			if (le64_to_cpu(i->j.seq) >= last_seq)
				break;

			journal_replay_free(c, i, false);
		}
	}

	jlist->last_seq = max(jlist->last_seq, last_seq);

	_i = genradix_ptr_alloc(&c->journal_entries,
				journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
				GFP_KERNEL);
	if (!_i)
		return bch_err_throw(c, ENOMEM_journal_entry_add);

	/*
	 * Duplicate journal entries? If so we want the one that didn't have a
	 * checksum error:
	 */
	dup = *_i;
	if (dup) {
		bool identical = bytes == vstruct_bytes(&dup->j) &&
			!memcmp(j, &dup->j, bytes);
		bool not_identical = !identical &&
			entry_ptr.csum_good &&
			dup->csum_good;

		bool same_device = false;
		darray_for_each(dup->ptrs, ptr)
			if (ptr->dev == ca->dev_idx)
				same_device = true;

		ret = darray_push(&dup->ptrs, entry_ptr);
		if (ret)
			goto out;

		bch2_journal_replay_to_text(&buf, c, dup);

		fsck_err_on(same_device,
			    c, journal_entry_dup_same_device,
			    "duplicate journal entry on same device\n%s",
			    buf.buf);

		fsck_err_on(not_identical,
			    c, journal_entry_replicas_data_mismatch,
			    "found duplicate but non identical journal entries\n%s",
			    buf.buf);

		if (entry_ptr.csum_good && !identical)
			goto replace;

		goto out;
	}
replace:
	i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
	if (!i)
		return bch_err_throw(c, ENOMEM_journal_entry_add);

	darray_init(&i->ptrs);
	i->csum_good		= entry_ptr.csum_good;
	i->ignore_blacklisted	= false;
	i->ignore_not_dirty	= false;
	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");

	if (dup) {
		/* The first ptr should represent the jset we kept: */
		darray_for_each(dup->ptrs, ptr)
			darray_push(&i->ptrs, *ptr);
		__journal_replay_free(c, dup);
	} else {
		darray_push(&i->ptrs, entry_ptr);
	}

	*_i = i;
out:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/* this fills in a range with empty jset_entries: */
static void journal_entry_null_range(void *start, void *end)
{
	struct jset_entry *entry;

	for (entry = start; entry != end; entry = vstruct_next(entry))
		memset(entry, 0, sizeof(*entry));
}

#define JOURNAL_ENTRY_REREAD	5
#define JOURNAL_ENTRY_NONE	6
#define JOURNAL_ENTRY_BAD	7

static void journal_entry_err_msg(struct printbuf *out,
				  u32 version,
				  struct jset *jset,
				  struct jset_entry *entry)
{
	prt_str(out, "invalid journal entry, version=");
	bch2_version_to_text(out, version);

	if (entry) {
		prt_str(out, " type=");
		bch2_prt_jset_entry_type(out, entry->type);
	}

	if (!jset) {
		prt_printf(out, " in superblock");
	} else {

		prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq));

		if (entry)
			prt_printf(out, " offset=%zi/%u",
				   (u64 *) entry - jset->_data,
				   le32_to_cpu(jset->u64s));
	}

	prt_str(out, ": ");
}

#define journal_entry_err(c, version, jset, entry, _err, msg, ...)	\
({									\
	struct printbuf _buf = PRINTBUF;				\
									\
	journal_entry_err_msg(&_buf, version, jset, entry);		\
	prt_printf(&_buf, msg, ##__VA_ARGS__);				\
									\
	switch (from.flags & BCH_VALIDATE_write) {			\
	case READ:							\
		mustfix_fsck_err(c, _err, "%s", _buf.buf);		\
		break;							\
	case WRITE:							\
		bch2_sb_error_count(c, BCH_FSCK_ERR_##_err);		\
		if (bch2_fs_inconsistent(c,				\
				"corrupt metadata before write: %s\n", _buf.buf)) {\
			ret = bch_err_throw(c, fsck_errors_not_fixed);	\
			goto fsck_err;					\
		}							\
		break;							\
	}								\
									\
	printbuf_exit(&_buf);						\
	true;								\
})

#define journal_entry_err_on(cond, ...)					\
	((cond) ? journal_entry_err(__VA_ARGS__) : false)

#define FSCK_DELETED_KEY	5

static int journal_validate_key(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				struct bkey_i *k,
				struct bkey_validate_context from,
				unsigned version, int big_endian)
{
	enum bch_validate_flags flags = from.flags;
	int write = flags & BCH_VALIDATE_write;
	void *next = vstruct_next(entry);
	int ret = 0;

	if (journal_entry_err_on(!k->k.u64s,
				 c, version, jset, entry,
				 journal_entry_bkey_u64s_0,
				 "k->u64s 0")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on((void *) bkey_next(k) >
				 (void *) vstruct_next(entry),
				 c, version, jset, entry,
				 journal_entry_bkey_past_end,
				 "extends past end of journal entry")) {
		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT,
				 c, version, jset, entry,
				 journal_entry_bkey_bad_format,
				 "bad format %u", k->k.format)) {
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}

	if (!write)
		bch2_bkey_compat(from.level, from.btree, version, big_endian,
				 write, NULL, bkey_to_packed(k));

	ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from);
	if (ret == -BCH_ERR_fsck_delete_bkey) {
		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
		memmove(k, bkey_next(k), next - (void *) bkey_next(k));
		journal_entry_null_range(vstruct_next(entry), next);
		return FSCK_DELETED_KEY;
	}
	if (ret)
		goto fsck_err;

	if (write)
		bch2_bkey_compat(from.level, from.btree, version, big_endian,
				 write, NULL, bkey_to_packed(k));
fsck_err:
	return ret;
}

static int journal_entry_btree_keys_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian,
					     struct bkey_validate_context from)
{
	struct bkey_i *k = entry->start;

	from.level = entry->level;
	from.btree = entry->btree_id;

	while (k != vstruct_last(entry)) {
		int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
		if (ret == FSCK_DELETED_KEY)
			continue;
		else if (ret)
			return ret;

		k = bkey_next(k);
	}

	return 0;
}

static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	bool first = true;

	jset_entry_for_each_key(entry, k) {
		/* We may be called on entries that haven't been validated: */
		if (!k->k.u64s)
			break;

		if (!first) {
			prt_newline(out);
			bch2_prt_jset_entry_type(out, entry->type);
			prt_str(out, ": ");
		}
		bch2_btree_id_level_to_text(out, entry->btree_id, entry->level);
		prt_char(out, ' ');
		bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
		first = false;
	}
}

static int journal_entry_btree_root_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian,
					     struct bkey_validate_context from)
{
	struct bkey_i *k = entry->start;
	int ret = 0;

	from.root	= true;
	from.level	= entry->level + 1;
	from.btree	= entry->btree_id;

	if (journal_entry_err_on(!entry->u64s ||
				 le16_to_cpu(entry->u64s) != k->k.u64s,
				 c, version, jset, entry,
				 journal_entry_btree_root_bad_size,
				 "invalid btree root journal entry: wrong number of keys")) {
		void *next = vstruct_next(entry);
		/*
		 * we don't want to null out this jset_entry,
		 * just the contents, so that later we can tell
		 * we were _supposed_ to have a btree root
		 */
		entry->u64s = 0;
		journal_entry_null_range(vstruct_next(entry), next);
		return 0;
	}

	ret = journal_validate_key(c, jset, entry, k, from, version, big_endian);
	if (ret == FSCK_DELETED_KEY)
		ret = 0;
fsck_err:
	return ret;
}

static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_prio_ptrs_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    struct bkey_validate_context from)
{
	/* obsolete, don't care: */
	return 0;
}

static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
}

static int journal_entry_blacklist_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    struct bkey_validate_context from)
{
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1,
				 c, version, jset, entry,
				 journal_entry_blacklist_bad_size,
				 "invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
fsck_err:
	return ret;
}

static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_blacklist *bl =
		container_of(entry, struct jset_entry_blacklist, entry);

	prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
}

static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
					       struct jset *jset,
					       struct jset_entry *entry,
					       unsigned version, int big_endian,
					       struct bkey_validate_context from)
{
	struct jset_entry_blacklist_v2 *bl_entry;
	int ret = 0;

	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2,
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_bad_size,
				 "invalid journal seq blacklist entry: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);

	if (journal_entry_err_on(le64_to_cpu(bl_entry->start) >
				 le64_to_cpu(bl_entry->end),
				 c, version, jset, entry,
				 journal_entry_blacklist_v2_start_past_end,
				 "invalid journal seq blacklist entry: start > end")) {
		journal_entry_null_range(entry, vstruct_next(entry));
	}
out:
fsck_err:
	return ret;
}

static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c,
					       struct jset_entry *entry)
{
	struct jset_entry_blacklist_v2 *bl =
		container_of(entry, struct jset_entry_blacklist_v2, entry);

	prt_printf(out, "start=%llu end=%llu",
		   le64_to_cpu(bl->start),
		   le64_to_cpu(bl->end));
}

static int journal_entry_usage_validate(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					unsigned version, int big_endian,
					struct bkey_validate_context from)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u),
				 c, version, jset, entry,
				 journal_entry_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_usage *u =
		container_of(entry, struct jset_entry_usage, entry);

	prt_str(out, "type=");
	bch2_prt_fs_usage_type(out, u->entry.btree_id);
	prt_printf(out, " v=%llu", le64_to_cpu(u->v));
}

static int journal_entry_data_usage_validate(struct bch_fs *c,
					     struct jset *jset,
					     struct jset_entry *entry,
					     unsigned version, int big_endian,
					     struct bkey_validate_context from)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	struct printbuf err = PRINTBUF;
	int ret = 0;

	if (journal_entry_err_on(bytes < sizeof(*u) ||
				 bytes < sizeof(*u) + u->r.nr_devs,
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}

	if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err),
				 c, version, jset, entry,
				 journal_entry_data_usage_bad_size,
				 "invalid journal entry usage: %s", err.buf)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		goto out;
	}
out:
fsck_err:
	printbuf_exit(&err);
	return ret;
}

static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c,
					     struct jset_entry *entry)
{
	struct jset_entry_data_usage *u =
		container_of(entry, struct jset_entry_data_usage, entry);

	bch2_replicas_entry_to_text(out, &u->r);
	prt_printf(out, "=%llu", le64_to_cpu(u->v));
}

static int journal_entry_clock_validate(struct bch_fs *c,
					struct jset *jset,
					struct jset_entry *entry,
					unsigned version, int big_endian,
					struct bkey_validate_context from)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	int ret = 0;

	if (journal_entry_err_on(bytes != sizeof(*clock),
				 c, version, jset, entry,
				 journal_entry_clock_bad_size,
				 "bad size")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(clock->rw > 1,
				 c, version, jset, entry,
				 journal_entry_clock_bad_rw,
				 "bad rw")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c,
					struct jset_entry *entry)
{
	struct jset_entry_clock *clock =
		container_of(entry, struct jset_entry_clock, entry);

	prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time));
}

static int journal_entry_dev_usage_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    struct bkey_validate_context from)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
	unsigned expected = sizeof(*u);
	int ret = 0;

	if (journal_entry_err_on(bytes < expected,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_size,
				 "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

	if (journal_entry_err_on(u->pad,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_pad,
				 "bad pad")) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}

fsck_err:
	return ret;
}

static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	struct jset_entry_dev_usage *u =
		container_of(entry, struct jset_entry_dev_usage, entry);
	unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);

	if (vstruct_bytes(entry) < sizeof(*u))
		return;

	prt_printf(out, "dev=%u", le32_to_cpu(u->dev));

	printbuf_indent_add(out, 2);
	for (i = 0; i < nr_types; i++) {
		prt_newline(out);
		bch2_prt_data_type(out, i);
		prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
			   le64_to_cpu(u->d[i].buckets),
			   le64_to_cpu(u->d[i].sectors),
			   le64_to_cpu(u->d[i].fragmented));
	}
	printbuf_indent_sub(out, 2);
}

static int journal_entry_log_validate(struct bch_fs *c,
				      struct jset *jset,
				      struct jset_entry *entry,
				      unsigned version, int big_endian,
				      struct bkey_validate_context from)
{
	return 0;
}

static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c,
				      struct jset_entry *entry)
{
	struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);

	prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d);
}

static int journal_entry_overwrite_validate(struct bch_fs *c,
					    struct jset *jset,
					    struct jset_entry *entry,
					    unsigned version, int big_endian,
					    struct bkey_validate_context from)
{
	from.flags = 0;
	return journal_entry_btree_keys_validate(c, jset, entry,
						 version, big_endian, from);
}

static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
					    struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_log_bkey_validate(struct bch_fs *c,
					   struct jset *jset,
					   struct jset_entry *entry,
					   unsigned version, int big_endian,
					   struct bkey_validate_context from)
{
	from.flags = 0;
	return journal_entry_btree_keys_validate(c, jset, entry,
						 version, big_endian, from);
}

static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c,
					   struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
						     struct jset *jset,
						     struct jset_entry *entry,
						     unsigned version, int big_endian,
						     struct bkey_validate_context from)
{
	return journal_entry_btree_keys_validate(c, jset, entry,
						 version, big_endian, from);
}

static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
						     struct jset_entry *entry)
{
	journal_entry_btree_keys_to_text(out, c, entry);
}

static int journal_entry_datetime_validate(struct bch_fs *c,
					   struct jset *jset,
					   struct jset_entry *entry,
					   unsigned version, int big_endian,
					   struct bkey_validate_context from)
{
	unsigned bytes = vstruct_bytes(entry);
	unsigned expected = 16;
	int ret = 0;

	if (journal_entry_err_on(vstruct_bytes(entry) < expected,
				 c, version, jset, entry,
				 journal_entry_dev_usage_bad_size,
				 "bad size (%u < %u)",
				 bytes, expected)) {
		journal_entry_null_range(entry, vstruct_next(entry));
		return ret;
	}
fsck_err:
	return ret;
}

static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
					   struct jset_entry *entry)
{
	struct jset_entry_datetime *datetime =
		container_of(entry, struct jset_entry_datetime, entry);

	bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
}

struct jset_entry_ops {
	int (*validate)(struct bch_fs *, struct jset *,
			struct jset_entry *, unsigned, int,
			struct bkey_validate_context);
	void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *);
};

static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#define x(f, nr)						\
	[BCH_JSET_ENTRY_##f]	= (struct jset_entry_ops) {	\
		.validate	= journal_entry_##f##_validate,	\
		.to_text	= journal_entry_##f##_to_text,	\
	},
	BCH_JSET_ENTRY_TYPES()
#undef x
};

int bch2_journal_entry_validate(struct bch_fs *c,
				struct jset *jset,
				struct jset_entry *entry,
				unsigned version, int big_endian,
				struct bkey_validate_context from)
{
	return entry->type < BCH_JSET_ENTRY_NR
		? bch2_jset_entry_ops[entry->type].validate(c, jset, entry,
							    version, big_endian, from)
		: 0;
}

void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
				struct jset_entry *entry)
{
	bch2_prt_jset_entry_type(out, entry->type);

	if (entry->type < BCH_JSET_ENTRY_NR) {
		prt_str(out, ": ");
		bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
	}
}

static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
				 enum bch_validate_flags flags)
{
	struct bkey_validate_context from = {
		.flags		= flags,
		.from		= BKEY_VALIDATE_journal,
		.journal_seq	= le64_to_cpu(jset->seq),
	};

	unsigned version = le32_to_cpu(jset->version);
	int ret = 0;

	vstruct_for_each(jset, entry) {
		from.journal_offset = (u64 *) entry - jset->_data;

		if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset),
					 c, version, jset, entry,
					 journal_entry_past_jset_end,
					 "journal entry extends past end of jset")) {
			jset->u64s = cpu_to_le32((u64 *) entry - jset->_data);
			break;
		}

		ret = bch2_journal_entry_validate(c, jset, entry, version,
						  JSET_BIG_ENDIAN(jset), from);
		if (ret)
			break;
	}
fsck_err:
	return ret;
}

static int jset_validate(struct bch_fs *c,
			 struct bch_dev *ca,
			 struct jset *jset, u64 sector,
			 enum bch_validate_flags flags)
{
	struct bkey_validate_context from = {
		.flags		= flags,
		.from		= BKEY_VALIDATE_journal,
		.journal_seq	= le64_to_cpu(jset->seq),
	};
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	unsigned version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version),
				 c, version, jset, NULL,
				 jset_unsupported_version,
				 "%s sector %llu seq %llu: incompatible journal entry version %u.%u",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq),
				 BCH_VERSION_MAJOR(version),
				 BCH_VERSION_MINOR(version))) {
		/* don't try to continue: */
		return -EINVAL;
	}

	if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)),
				 c, version, jset, NULL,
				 jset_unknown_csum,
				 "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq),
				 JSET_CSUM_TYPE(jset)))
		ret = JOURNAL_ENTRY_BAD;

	/* last_seq is ignored when JSET_NO_FLUSH is true */
	if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
				 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq),
				 c, version, jset, NULL,
				 jset_last_seq_newer_than_seq,
				 "invalid journal entry: last_seq > seq (%llu > %llu)",
				 le64_to_cpu(jset->last_seq),
				 le64_to_cpu(jset->seq))) {
		jset->last_seq = jset->seq;
		return JOURNAL_ENTRY_BAD;
	}

	ret = jset_validate_entries(c, jset, flags);
fsck_err:
	return ret;
}

static int jset_validate_early(struct bch_fs *c,
			       struct bch_dev *ca,
			       struct jset *jset, u64 sector,
			       unsigned bucket_sectors_left,
			       unsigned sectors_read)
{
	struct bkey_validate_context from = {
		.from		= BKEY_VALIDATE_journal,
		.journal_seq	= le64_to_cpu(jset->seq),
	};
	int ret = 0;

	if (le64_to_cpu(jset->magic) != jset_magic(c))
		return JOURNAL_ENTRY_NONE;

	unsigned version = le32_to_cpu(jset->version);
	if (journal_entry_err_on(!bch2_version_compatible(version),
				 c, version, jset, NULL,
				 jset_unsupported_version,
				 "%s sector %llu seq %llu: unknown journal entry version %u.%u",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq),
				 BCH_VERSION_MAJOR(version),
				 BCH_VERSION_MINOR(version))) {
		/* don't try to continue: */
		return -EINVAL;
	}

	size_t bytes = vstruct_bytes(jset);
	if (bytes > (sectors_read << 9) &&
	    sectors_read < bucket_sectors_left)
		return JOURNAL_ENTRY_REREAD;

	if (journal_entry_err_on(bytes > bucket_sectors_left << 9,
				 c, version, jset, NULL,
				 jset_past_bucket_end,
				 "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
				 ca ? ca->name : c->name,
				 sector, le64_to_cpu(jset->seq), bytes))
		le32_add_cpu(&jset->u64s,
			     -((bytes - (bucket_sectors_left << 9)) / 8));
fsck_err:
	return ret;
}

struct journal_read_buf {
	void		*data;
	size_t		size;
};

static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b,
				    size_t new_size)
{
	void *n;

	/* the bios are sized for this many pages, max: */
	if (new_size > JOURNAL_ENTRY_SIZE_MAX)
		return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);

	new_size = roundup_pow_of_two(new_size);
	n = kvmalloc(new_size, GFP_KERNEL);
	if (!n)
		return bch_err_throw(c, ENOMEM_journal_read_buf_realloc);

	kvfree(b->data);
	b->data = n;
	b->size = new_size;
	return 0;
}

static int journal_read_bucket(struct bch_dev *ca,
			       struct journal_read_buf *buf,
			       struct journal_list *jlist,
			       unsigned bucket)
{
	struct bch_fs *c = ca->fs;
	struct journal_device *ja = &ca->journal;
	struct jset *j = NULL;
	unsigned sectors, sectors_read = 0;
	u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
	    end = offset + ca->mi.bucket_size;
	bool saw_bad = false, csum_good;
	int ret = 0;

	pr_debug("reading %u", bucket);

	while (offset < end) {
		if (!sectors_read) {
			struct bio *bio;
			unsigned nr_bvecs;
reread:
			sectors_read = min_t(unsigned,
					     end - offset, buf->size >> 9);
			nr_bvecs = buf_pages(buf->data, sectors_read << 9);

			bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
			if (!bio)
				return bch_err_throw(c, ENOMEM_journal_read_bucket);
			bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);

			bio->bi_iter.bi_sector = offset;
			bch2_bio_map(bio, buf->data, sectors_read << 9);

			u64 submit_time = local_clock();
			ret = submit_bio_wait(bio);
			kfree(bio);

			if (!ret && bch2_meta_read_fault("journal"))
				ret = bch_err_throw(c, EIO_fault_injected);

			bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
						   submit_time, !ret);

			if (ret) {
				bch_err_dev_ratelimited(ca,
					"journal read error: sector %llu", offset);
				/*
				 * We don't error out of the recovery process
				 * here, since the relevant journal entry may be
				 * found on a different device, and missing or
				 * no journal entries will be handled later
				 */
				return 0;
			}

			j = buf->data;
		}

		ret = jset_validate_early(c, ca, j, offset,
					  end - offset, sectors_read);
		switch (ret) {
		case 0:
			sectors = vstruct_sectors(j, c->block_bits);
			break;
		case JOURNAL_ENTRY_REREAD:
			if (vstruct_bytes(j) > buf->size) {
				ret = journal_read_buf_realloc(c, buf,
							       vstruct_bytes(j));
				if (ret)
					return ret;
			}
			goto reread;
		case JOURNAL_ENTRY_NONE:
			if (!saw_bad)
				return 0;
			/*
			 * On checksum error we don't really trust the size
			 * field of the journal entry we read, so try reading
			 * again at next block boundary:
			 */
			sectors = block_sectors(c);
			goto next_block;
		default:
			return ret;
		}

		if (le64_to_cpu(j->seq) > ja->highest_seq_found) {
			ja->highest_seq_found = le64_to_cpu(j->seq);
			ja->cur_idx = bucket;
			ja->sectors_free = ca->mi.bucket_size -
				bucket_remainder(ca, offset) - sectors;
		}

		/*
		 * This happens sometimes if we don't have discards on -
		 * when we've partially overwritten a bucket with new
		 * journal entries. We don't need the rest of the
		 * bucket:
		 */
		if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
			return 0;

		ja->bucket_seq[bucket] = le64_to_cpu(j->seq);

		struct bch_csum csum;
		csum_good = jset_csum_good(c, j, &csum);

		bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good);

		if (!csum_good) {
			/*
			 * Don't print an error here, we'll print the error
			 * later if we need this journal entry
			 */
			saw_bad = true;
		}

		ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
				   j->encrypted_start,
				   vstruct_end(j) - (void *) j->encrypted_start);
		bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));

		mutex_lock(&jlist->lock);
		ret = journal_entry_add(c, ca, (struct journal_ptr) {
					.csum_good	= csum_good,
					.csum		= csum,
					.dev		= ca->dev_idx,
					.bucket		= bucket,
					.bucket_offset	= offset -
						bucket_to_sector(ca, ja->buckets[bucket]),
					.sector		= offset,
					}, jlist, j);
		mutex_unlock(&jlist->lock);

		switch (ret) {
		case JOURNAL_ENTRY_ADD_OK:
			break;
		case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
			break;
		default:
			return ret;
		}
next_block:
		pr_debug("next");
		offset		+= sectors;
		sectors_read	-= sectors;
		j = ((void *) j) + (sectors << 9);
	}

	return 0;
}

static CLOSURE_CALLBACK(bch2_journal_read_device)
{
	closure_type(ja, struct journal_device, read);
	struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
	struct bch_fs *c = ca->fs;
	struct journal_list *jlist =
		container_of(cl->parent, struct journal_list, cl);
	struct journal_read_buf buf = { NULL, 0 };
	unsigned i;
	int ret = 0;

	if (!ja->nr)
		goto out;

	ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE);
	if (ret)
		goto err;

	pr_debug("%u journal buckets", ja->nr);

	for (i = 0; i < ja->nr; i++) {
		ret = journal_read_bucket(ca, &buf, jlist, i);
		if (ret)
			goto err;
	}

	/*
	 * Set dirty_idx to indicate the entire journal is full and needs to be
	 * reclaimed - journal reclaim will immediately reclaim whatever isn't
	 * pinned when it first runs:
	 */
	ja->discard_idx = ja->dirty_idx_ondisk =
		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
	kvfree(buf.data);
	enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read);
	closure_return(cl);
	return;
err:
	mutex_lock(&jlist->lock);
	jlist->ret = ret;
	mutex_unlock(&jlist->lock);
	goto out;
}

noinline_for_stack
static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j)
{
	struct printbuf buf = PRINTBUF;
	enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j);
	bool have_good = false;

	prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq));
	bch2_journal_datetime_to_text(&buf, &j->j);
	prt_newline(&buf);

	darray_for_each(j->ptrs, ptr)
		if (!ptr->csum_good) {
			bch2_journal_ptr_to_text(&buf, c, ptr);
			prt_char(&buf, ' ');
			bch2_csum_to_text(&buf, csum_type, ptr->csum);
			prt_newline(&buf);
		} else {
			have_good = true;
		}

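	/* the expected checksum, as recorded in the journal entry itself: */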
prt_printf(&buf, "should be "); 1266 bch2_csum_to_text(&buf, csum_type, j->j.csum); 1267 1268 if (have_good) 1269 prt_printf(&buf, "\n(had good copy on another device)"); 1270 1271 bch2_print_str(c, KERN_ERR, buf.buf); 1272 printbuf_exit(&buf); 1273 } 1274 1275 noinline_for_stack 1276 static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq) 1277 { 1278 struct printbuf buf = PRINTBUF; 1279 int ret = 0; 1280 1281 struct genradix_iter radix_iter; 1282 struct journal_replay *i, **_i, *prev = NULL; 1283 u64 seq = start_seq; 1284 1285 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1286 i = *_i; 1287 1288 if (journal_replay_ignore(i)) 1289 continue; 1290 1291 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1292 1293 while (seq < le64_to_cpu(i->j.seq)) { 1294 while (seq < le64_to_cpu(i->j.seq) && 1295 bch2_journal_seq_is_blacklisted(c, seq, false)) 1296 seq++; 1297 1298 if (seq == le64_to_cpu(i->j.seq)) 1299 break; 1300 1301 u64 missing_start = seq; 1302 1303 while (seq < le64_to_cpu(i->j.seq) && 1304 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1305 seq++; 1306 1307 u64 missing_end = seq - 1; 1308 1309 printbuf_reset(&buf); 1310 prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)", 1311 missing_start, missing_end, 1312 start_seq, end_seq); 1313 1314 prt_printf(&buf, "\nprev at "); 1315 if (prev) { 1316 bch2_journal_ptrs_to_text(&buf, c, prev); 1317 prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1318 } else 1319 prt_printf(&buf, "(none)"); 1320 1321 prt_printf(&buf, "\nnext at "); 1322 bch2_journal_ptrs_to_text(&buf, c, i); 1323 prt_printf(&buf, ", continue?"); 1324 1325 fsck_err(c, journal_entries_missing, "%s", buf.buf); 1326 } 1327 1328 prev = i; 1329 seq++; 1330 } 1331 fsck_err: 1332 printbuf_exit(&buf); 1333 return ret; 1334 } 1335 1336 int bch2_journal_read(struct bch_fs *c, 1337 u64 *last_seq, 1338 u64 *blacklist_seq, 1339 u64 *start_seq) 1340 { 1341 struct journal_list jlist; 1342 struct journal_replay *i, **_i; 1343 struct genradix_iter radix_iter; 1344 struct printbuf buf = PRINTBUF; 1345 bool degraded = false, last_write_torn = false; 1346 u64 seq; 1347 int ret = 0; 1348 1349 closure_init_stack(&jlist.cl); 1350 mutex_init(&jlist.lock); 1351 jlist.last_seq = 0; 1352 jlist.ret = 0; 1353 1354 for_each_member_device(c, ca) { 1355 if (!c->opts.fsck && 1356 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1357 continue; 1358 1359 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1360 ca->mi.state == BCH_MEMBER_STATE_ro) && 1361 enumerated_ref_tryget(&ca->io_ref[READ], 1362 BCH_DEV_READ_REF_journal_read)) 1363 closure_call(&ca->journal.read, 1364 bch2_journal_read_device, 1365 system_unbound_wq, 1366 &jlist.cl); 1367 else 1368 degraded = true; 1369 } 1370 1371 while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2)) 1372 ; 1373 1374 if (jlist.ret) 1375 return jlist.ret; 1376 1377 *last_seq = 0; 1378 *start_seq = 0; 1379 *blacklist_seq = 0; 1380 1381 /* 1382 * Find most recent flush entry, and ignore newer non flush entries - 1383 * those entries will be blacklisted: 1384 */ 1385 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1386 i = *_i; 1387 1388 if (journal_replay_ignore(i)) 1389 continue; 1390 1391 if (!*start_seq) 1392 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1393 1394 if (JSET_NO_FLUSH(&i->j)) { 1395 i->ignore_blacklisted = true; 1396 continue; 1397 } 1398 1399 if (!last_write_torn && !i->csum_good) { 1400 last_write_torn = true; 1401 
			i->ignore_blacklisted = true;
			continue;
		}

		struct bkey_validate_context from = {
			.from		= BKEY_VALIDATE_journal,
			.journal_seq	= le64_to_cpu(i->j.seq),
		};
		if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
					 c, le32_to_cpu(i->j.version), &i->j, NULL,
					 jset_last_seq_newer_than_seq,
					 "invalid journal entry: last_seq > seq (%llu > %llu)",
					 le64_to_cpu(i->j.last_seq),
					 le64_to_cpu(i->j.seq)))
			i->j.last_seq = i->j.seq;

		*last_seq	= le64_to_cpu(i->j.last_seq);
		*blacklist_seq	= le64_to_cpu(i->j.seq) + 1;
		break;
	}

	if (!*start_seq) {
		bch_info(c, "journal read done, but no entries found");
		return 0;
	}

	if (!*last_seq) {
		fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
			 "journal read done, but no entries found after dropping non-flushes");
		return 0;
	}

	printbuf_reset(&buf);
	prt_printf(&buf, "journal read done, replaying entries %llu-%llu",
		   *last_seq, *blacklist_seq - 1);

	/*
	 * Drop blacklisted entries and entries older than last_seq (or the
	 * start of the journal rewind):
	 */
	u64 drop_before = *last_seq;
	if (c->opts.journal_rewind) {
		drop_before = min(drop_before, c->opts.journal_rewind);
		prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind);
	}

	*last_seq = drop_before;
	if (*start_seq != *blacklist_seq)
		prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1);
	bch_info(c, "%s", buf.buf);
	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		i = *_i;

		if (journal_replay_ignore(i))
			continue;

		seq = le64_to_cpu(i->j.seq);
		if (seq < drop_before) {
			journal_replay_free(c, i, false);
			continue;
		}

		if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
			fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
				    jset_seq_blacklisted,
				    "found blacklisted journal entry %llu", seq);
			i->ignore_blacklisted = true;
		}
	}

	ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1);
	if (ret)
		goto err;

	genradix_for_each(&c->journal_entries, radix_iter, _i) {
		union bch_replicas_padded replicas = {
			.e.data_type = BCH_DATA_journal,
			.e.nr_devs = 0,
			.e.nr_required = 1,
		};

		i = *_i;
		if (journal_replay_ignore(i))
			continue;

		/*
		 * Don't print checksum errors until we know we're going to use
		 * a given journal entry:
		 */
		darray_for_each(i->ptrs, ptr)
			if (!ptr->csum_good) {
				bch2_journal_print_checksum_error(c, i);
				break;
			}

		ret = jset_validate(c,
				    bch2_dev_have_ref(c, i->ptrs.data[0].dev),
				    &i->j,
				    i->ptrs.data[0].sector,
				    READ);
		if (ret)
			goto err;

		darray_for_each(i->ptrs, ptr)
			replicas_entry_add_dev(&replicas.e, ptr->dev);

		bch2_replicas_entry_sort(&replicas.e);

		printbuf_reset(&buf);
		bch2_replicas_entry_to_text(&buf, &replicas.e);

		if (!degraded &&
		    !bch2_replicas_marked(c, &replicas.e) &&
		    (le64_to_cpu(i->j.seq) == *last_seq ||
		     fsck_err(c, journal_entry_replicas_not_marked,
			      "superblock not marked as containing replicas for journal entry %llu\n%s",
			      le64_to_cpu(i->j.seq), buf.buf))) {
			ret = bch2_mark_replicas(c, &replicas.e);
			if (ret)
				goto err;
		}
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

/* journal write: */

static void journal_advance_devs_to_next_bucket(struct journal *j,
						struct dev_alloc_list *devs,
						unsigned sectors, __le64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);

	guard(rcu)();
	darray_for_each(*devs, i) {
		struct bch_dev *ca = rcu_dereference(c->devs[*i]);
		if (!ca)
			continue;

		struct journal_device *ja = &ca->journal;

		if (sectors > ja->sectors_free &&
		    sectors <= ca->mi.bucket_size &&
		    bch2_journal_dev_buckets_available(j, ja,
						       journal_space_discarded)) {
			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
			ja->sectors_free = ca->mi.bucket_size;

			/*
			 * ja->bucket_seq[ja->cur_idx] must always have
			 * something sensible:
			 */
			ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
		}
	}
}

static void __journal_write_alloc(struct journal *j,
				  struct journal_buf *w,
				  struct dev_alloc_list *devs,
				  unsigned sectors,
				  unsigned *replicas,
				  unsigned replicas_want)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);

	darray_for_each(*devs, i) {
		struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE,
					BCH_DEV_WRITE_REF_journal_write);
		if (!ca)
			continue;

		struct journal_device *ja = &ca->journal;

		/*
		 * Check that we can use this device, and aren't already using
		 * it:
		 */
		if (!ca->mi.durability ||
		    ca->mi.state != BCH_MEMBER_STATE_rw ||
		    !ja->nr ||
		    bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
		    sectors > ja->sectors_free) {
			enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
			continue;
		}

		bch2_dev_stripe_increment(ca, &j->wp.stripe);

		bch2_bkey_append_ptr(&w->key,
			(struct bch_extent_ptr) {
				  .offset = bucket_to_sector(ca,
					ja->buckets[ja->cur_idx]) +
					ca->mi.bucket_size -
					ja->sectors_free,
				  .dev = ca->dev_idx,
		});

		ja->sectors_free -= sectors;
		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);

		*replicas += ca->mi.durability;

		if (*replicas >= replicas_want)
			break;
	}
}

static int journal_write_alloc(struct journal *j, struct journal_buf *w,
			       unsigned *replicas)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct bch_devs_mask devs;
	struct dev_alloc_list devs_sorted;
	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
	unsigned target = c->opts.metadata_target ?:
		c->opts.foreground_target;
	unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas);
	unsigned replicas_need = min_t(unsigned, replicas_want,
				       READ_ONCE(c->opts.metadata_replicas_required));
	bool advance_done = false;

retry_target:
	devs = target_rw_devs(c, BCH_DATA_journal, target);
	bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted);
retry_alloc:
	__journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want);

	if (likely(*replicas >= replicas_want))
		goto done;

	if (!advance_done) {
		journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq);
		advance_done = true;
		goto retry_alloc;
	}

	if (*replicas < replicas_want && target) {
		/* Retry from all devices: */
		target = 0;
		advance_done = false;
		goto retry_target;
	}
done:
	BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);

#if 0
	/*
	 * XXX: we need a way to alert the user when we go degraded for any
	 * reason
	 */
	if (*replicas < min(replicas_want,
			    dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) {
	}
#endif

	return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
}

static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);

	/* we aren't holding j->lock: */
	unsigned new_size = READ_ONCE(j->buf_size_want);
	void *new_buf;

	if (buf->buf_size >= new_size)
		return;

	size_t btree_write_buffer_size = new_size / 64;

	if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
		return;

	new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
	if (!new_buf)
		return;

	memcpy(new_buf, buf->data, buf->buf_size);

	spin_lock(&j->lock);
	swap(buf->data,		new_buf);
	swap(buf->buf_size,	new_size);
	spin_unlock(&j->lock);

	kvfree(new_buf);
}

static CLOSURE_CALLBACK(journal_write_done)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	union bch_replicas_padded replicas;
	u64 seq = le64_to_cpu(w->data->seq);
	int err = 0;

	bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
			       ? j->flush_write_time
			       : j->noflush_write_time, j->write_start_time);

	if (!w->devs_written.nr) {
		err = bch_err_throw(c, journal_write_err);
	} else {
		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
					 w->devs_written);
		err = bch2_mark_replicas(c, &replicas.e);
	}

	if (err && !bch2_journal_error(j)) {
		struct printbuf buf = PRINTBUF;
		bch2_log_msg_start(c, &buf);

		if (err == -BCH_ERR_journal_write_err)
			prt_printf(&buf, "unable to write journal to sufficient devices\n");
		else
			prt_printf(&buf, "journal write error marking replicas: %s\n",
				   bch2_err_str(err));

		bch2_fs_emergency_read_only2(c, &buf);

		bch2_print_str(c, KERN_ERR, buf.buf);
		printbuf_exit(&buf);
	}

	closure_debug_destroy(cl);

	spin_lock(&j->lock);
	if (seq >= j->pin.front)
		journal_seq_pin(j, seq)->devs = w->devs_written;
	if (err && (!j->err_seq || seq < j->err_seq))
		j->err_seq	= seq;
	w->write_done = true;

	if (!j->free_buf || j->free_buf_size < w->buf_size) {
		swap(j->free_buf,	w->data);
		swap(j->free_buf_size,	w->buf_size);
	}

	if (w->data) {
		void *buf = w->data;
		w->data = NULL;
		w->buf_size = 0;

		spin_unlock(&j->lock);
		kvfree(buf);
		spin_lock(&j->lock);
	}

	bool completed = false;
	bool do_discards = false;

	for (seq = journal_last_unwritten_seq(j);
	     seq <= journal_cur_seq(j);
	     seq++) {
		w = j->buf + (seq & JOURNAL_BUF_MASK);
		if (!w->write_done)
			break;

		if (!j->err_seq && !w->noflush) {
			j->flushed_seq_ondisk = seq;
			j->last_seq_ondisk = w->last_seq;

			closure_wake_up(&c->freelist_wait);
			bch2_reset_alloc_cursors(c);
			do_discards = true;
		}

		j->seq_ondisk = seq;

		/*
		 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
		 * more buckets:
		 *
		 * Must come before signaling write completion, for
		 * bch2_fs_journal_stop():
		 */
		if (j->watermark != BCH_WATERMARK_stripe)
			journal_reclaim_kick(&c->journal);

		closure_wake_up(&w->wait);
		completed = true;
	}

	if (completed) {
		bch2_journal_reclaim_fast(j);
		bch2_journal_space_available(j);

		track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);

		journal_wake(j);
	}

	if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
	    j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
		struct journal_buf *buf = journal_cur_buf(j);
		long delta = buf->expires - jiffies;

		/*
		 * We don't close a journal entry to write it while there are
		 * previous entries still in flight - the current journal entry
		 * might want to be written now:
		 */
		mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
	}

	/*
	 * We don't typically trigger journal writes from here - the next journal
	 * write will be triggered immediately after the previous one is
	 * allocated, in bch2_journal_write() - but the journal write error path
	 * is special:
	 */
	bch2_journal_do_writes(j);
	spin_unlock(&j->lock);

	if (do_discards)
		bch2_do_discards(c);
}

static void journal_write_endio(struct bio *bio)
{
	struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
	struct bch_dev *ca = jbio->ca;
	struct journal *j = &ca->fs->journal;
	struct journal_buf *w = j->buf + jbio->buf_idx;

	bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write,
				   jbio->submit_time, !bio->bi_status);

	if (bio->bi_status) {
		bch_err_dev_ratelimited(ca,
			       "error writing journal entry %llu: %s",
			       le64_to_cpu(w->data->seq),
			       bch2_blk_status_to_str(bio->bi_status));

		unsigned long flags;
		spin_lock_irqsave(&j->err_lock, flags);
		bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
		spin_unlock_irqrestore(&j->err_lock, flags);
	}

	closure_put(&w->io);
	enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
}

static CLOSURE_CALLBACK(journal_write_submit)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	unsigned sectors = vstruct_sectors(w->data, c->block_bits);

	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);

		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
			     sectors);

		struct journal_device *ja = &ca->journal;
		struct journal_bio *jbio = ja->bio[w->idx];
		struct bio *bio = &jbio->bio;

		jbio->submit_time	= local_clock();

		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
		bio->bi_iter.bi_sector	= ptr->offset;
		bio->bi_end_io		= journal_write_endio;
		bio->bi_private		= ca;
		bio->bi_ioprio		= IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0);

		BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
		ca->prev_journal_sector = bio->bi_iter.bi_sector;

		if (!JSET_NO_FLUSH(w->data))
			bio->bi_opf |= REQ_FUA;
		if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
			bio->bi_opf |= REQ_PREFLUSH;

		bch2_bio_map(bio, w->data, sectors << 9);

		trace_and_count(c, journal_write, bio);
		closure_bio_submit(bio, cl);

		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
	}

	continue_at(cl, journal_write_done, j->wq);
}

static CLOSURE_CALLBACK(journal_write_preflush)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);

	/*
	 * Wait for previous journal writes to complete; they won't necessarily
	 * be flushed if they're still in flight
	 */
	if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
		spin_lock(&j->lock);
		if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) {
			closure_wait(&j->async_wait, cl);
			spin_unlock(&j->lock);
			continue_at(cl, journal_write_preflush, j->wq);
			return;
		}
		spin_unlock(&j->lock);
	}

	if (w->separate_flush) {
		for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) {
			enumerated_ref_get(&ca->io_ref[WRITE],
					   BCH_DEV_WRITE_REF_journal_write);

			struct journal_device *ja = &ca->journal;
			struct bio *bio = &ja->bio[w->idx]->bio;
			bio_reset(bio, ca->disk_sb.bdev,
				  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
			bio->bi_end_io		= journal_write_endio;
			bio->bi_private		= ca;
			closure_bio_submit(bio, cl);
		}

		continue_at(cl, journal_write_submit, j->wq);
	} else {
		/*
		 * no need to punt to another work item if we're not waiting on
		 * preflushes
		 */
		journal_write_submit(&cl->work);
	}
}

static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct jset_entry *start, *end;
	struct jset *jset = w->data;
	struct journal_keys_to_wb wb = { NULL };
	unsigned u64s;
	unsigned long btree_roots_have = 0;
	u64 seq = le64_to_cpu(jset->seq);
	int ret;

	/*
	 * Simple compaction, dropping empty jset_entries (from journal
	 * reservations that weren't fully used) and merging jset_entries that
	 * can be.
	 *
	 * If we wanted to be really fancy here, we could sort all the keys in
	 * the jset and drop keys that were overwritten - probably not worth it:
	 */
	vstruct_for_each(jset, i) {
		unsigned u64s = le16_to_cpu(i->u64s);

		/* Empty entry: */
		if (!u64s)
			continue;

		/*
		 * New btree roots are set by journalling them; when the journal
		 * entry gets written we have to propagate them to
		 * c->btree_roots
		 *
		 * But, every journal entry we write has to contain all the
		 * btree roots (at least for now); so after we copy btree roots
		 * to c->btree_roots we have to get any missing btree roots and
		 * add them to this journal entry:
		 */
		switch (i->type) {
		case BCH_JSET_ENTRY_btree_root:
			bch2_journal_entry_to_btree_root(c, i);
			__set_bit(i->btree_id, &btree_roots_have);
			break;
		case BCH_JSET_ENTRY_write_buffer_keys:
			EBUG_ON(!w->need_flush_to_write_buffer);

			if (!wb.wb)
				bch2_journal_keys_to_write_buffer_start(c, &wb, seq);

			jset_entry_for_each_key(i, k) {
				ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
				if (ret) {
					bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
							    bch2_err_str(ret));
					bch2_journal_keys_to_write_buffer_end(c, &wb);
					return ret;
				}
			}
			i->type = BCH_JSET_ENTRY_btree_keys;
			break;
		}
	}

	if (wb.wb) {
		ret = bch2_journal_keys_to_write_buffer_end(c, &wb);
		if (ret) {
			bch2_fs_fatal_error(c, "error flushing journal keys to btree write buffer: %s",
					    bch2_err_str(ret));
			return ret;
		}
	}

	spin_lock(&c->journal.lock);
	w->need_flush_to_write_buffer = false;
	spin_unlock(&c->journal.lock);

	start = end = vstruct_last(jset);

	end	= bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);

	struct jset_entry_datetime *d =
		container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
	d->entry.type	= BCH_JSET_ENTRY_datetime;
	d->seconds	= cpu_to_le64(ktime_get_real_seconds());

	bch2_journal_super_entries_add_common(c, &end, seq);
	u64s	= (u64 *) end - (u64 *) start;

	WARN_ON(u64s > j->entry_u64s_reserved);

	le32_add_cpu(&jset->u64s, u64s);

	unsigned sectors = vstruct_sectors(jset, c->block_bits);

	if (sectors > w->sectors) {
		bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
				    vstruct_bytes(jset), w->sectors << 9,
				    u64s, w->u64s_reserved, j->entry_u64s_reserved);
		return -EINVAL;
	}

	return 0;
}

static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct jset *jset = w->data;
	u64 seq = le64_to_cpu(jset->seq);
	bool validate_before_checksum = false;
	int ret = 0;

	jset->magic		= cpu_to_le64(jset_magic(c));
	jset->version		= cpu_to_le32(c->sb.version);

	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));

	if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
		j->last_empty_seq = seq;

	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
		validate_before_checksum = true;

	if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
		validate_before_checksum = true;

static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct jset *jset = w->data;
	u64 seq = le64_to_cpu(jset->seq);
	bool validate_before_checksum = false;
	int ret = 0;

	jset->magic = cpu_to_le64(jset_magic(c));
	jset->version = cpu_to_le32(c->sb.version);

	SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
	SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));

	if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
		j->last_empty_seq = seq;

	if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
		validate_before_checksum = true;

	if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
		validate_before_checksum = true;

	if (validate_before_checksum &&
	    (ret = jset_validate(c, NULL, jset, 0, WRITE)))
		return ret;

	ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
			   jset->encrypted_start,
			   vstruct_end(jset) - (void *) jset->encrypted_start);
	if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret)))
		return ret;

	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
				  journal_nonce(jset), jset);

	if (!validate_before_checksum &&
	    (ret = jset_validate(c, NULL, jset, 0, WRITE)))
		return ret;

	unsigned sectors = vstruct_sectors(jset, c->block_bits);
	unsigned bytes = vstruct_bytes(jset);
	memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
	return 0;
}

static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *w)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	int error = bch2_journal_error(j);

	/*
	 * If the journal is in an error state - we did an emergency shutdown -
	 * we prefer to continue doing journal writes. We just mark them as
	 * noflush so they'll never be used, but they'll still be visible to the
	 * list_journal tool - this helps in debugging.
	 *
	 * There's a caveat: the first journal write after marking the
	 * superblock dirty must always be a flush write, because on startup
	 * from a clean shutdown we didn't necessarily read the journal and the
	 * new journal write might overwrite whatever was in the journal
	 * previously - we can't leave the journal without any flush writes in
	 * it.
	 *
	 * So if we're in an error state, and we're still starting up, we don't
	 * write anything at all.
	 */
	if (error && test_bit(JOURNAL_need_flush_write, &j->flags))
		return error;

	if (error ||
	    w->noflush ||
	    (!w->must_flush &&
	     time_before(jiffies, j->last_flush_write +
			 msecs_to_jiffies(c->opts.journal_flush_delay)) &&
	     test_bit(JOURNAL_may_skip_flush, &j->flags))) {
		w->noflush = true;
		SET_JSET_NO_FLUSH(w->data, true);
		w->data->last_seq = 0;
		w->last_seq = 0;

		j->nr_noflush_writes++;
	} else {
		w->must_flush = true;
		j->last_flush_write = jiffies;
		j->nr_flush_writes++;
		clear_bit(JOURNAL_need_flush_write, &j->flags);
	}

	return 0;
}

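/*
 * Main entry point for writing out a journal buffer: pick flush vs. noflush,
 * prepare and compact the entry, allocate space on devices (discarding and
 * retrying while allocation fails and discards are available), checksum it,
 * mark the journal replicas so recovery can find the entry after a crash,
 * then continue to journal_write_preflush() or journal_write_submit().
 */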
CLOSURE_CALLBACK(bch2_journal_write)
{
	closure_type(w, struct journal_buf, io);
	struct journal *j = container_of(w, struct journal, buf[w->idx]);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	union bch_replicas_padded replicas;
	unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]);
	int ret;

	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
	BUG_ON(!w->write_started);
	BUG_ON(w->write_allocated);
	BUG_ON(w->write_done);

	j->write_start_time = local_clock();

	spin_lock(&j->lock);
	if (nr_rw_members > 1)
		w->separate_flush = true;

	ret = bch2_journal_write_pick_flush(j, w);
	spin_unlock(&j->lock);

	if (unlikely(ret))
		goto err;

	mutex_lock(&j->buf_lock);
	journal_buf_realloc(j, w);

	ret = bch2_journal_write_prep(j, w);
	mutex_unlock(&j->buf_lock);

	if (unlikely(ret))
		goto err;

	unsigned replicas_allocated = 0;
	while (1) {
		ret = journal_write_alloc(j, w, &replicas_allocated);
		if (!ret || !j->can_discard)
			break;

		bch2_journal_do_discards(j);
	}

	if (unlikely(ret))
		goto err_allocate_write;

	ret = bch2_journal_write_checksum(j, w);
	if (unlikely(ret))
		goto err;

	spin_lock(&j->lock);
	/*
	 * write is allocated, no longer need to account for it in
	 * bch2_journal_space_available():
	 */
	w->sectors = 0;
	w->write_allocated = true;
	j->entry_bytes_written += vstruct_bytes(w->data);

	/*
	 * journal entry has been compacted and allocated, recalculate space
	 * available:
	 */
	bch2_journal_space_available(j);
	bch2_journal_do_writes(j);
	spin_unlock(&j->lock);

	w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));

	/*
	 * Mark journal replicas before we submit the write to guarantee
	 * recovery will find the journal entries after a crash.
	 */
	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
				 w->devs_written);
	ret = bch2_mark_replicas(c, &replicas.e);
	if (ret)
		goto err;

	if (c->opts.nochanges)
		goto no_io;

	if (!JSET_NO_FLUSH(w->data))
		continue_at(cl, journal_write_preflush, j->wq);
	else
		continue_at(cl, journal_write_submit, j->wq);
	return;
err_allocate_write:
	if (!bch2_journal_error(j)) {
		struct printbuf buf = PRINTBUF;

		bch2_journal_debug_to_text(&buf, j);
		prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
			   le64_to_cpu(w->data->seq),
			   vstruct_sectors(w->data, c->block_bits),
			   bch2_err_str(ret));
		bch2_print_str(c, KERN_ERR, buf.buf);
		printbuf_exit(&buf);
	}
err:
	bch2_fatal_error(c);
no_io:
	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
		enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
	}

	continue_at(cl, journal_write_done, j->wq);
}
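
/*
 * Write path overview: bch2_journal_write() continues to
 * journal_write_preflush() for flush writes (which may issue separate
 * PREFLUSH bios per device), then to journal_write_submit() to issue the
 * journal entry bios, and finally to journal_write_done() once the submitted
 * bios have completed.
 */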