1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "btree_io.h" 6 #include "btree_update_interior.h" 7 #include "buckets.h" 8 #include "checksum.h" 9 #include "disk_groups.h" 10 #include "error.h" 11 #include "journal.h" 12 #include "journal_io.h" 13 #include "journal_reclaim.h" 14 #include "journal_seq_blacklist.h" 15 #include "replicas.h" 16 #include "sb-clean.h" 17 #include "trace.h" 18 19 static struct nonce journal_nonce(const struct jset *jset) 20 { 21 return (struct nonce) {{ 22 [0] = 0, 23 [1] = ((__le32 *) &jset->seq)[0], 24 [2] = ((__le32 *) &jset->seq)[1], 25 [3] = BCH_NONCE_JOURNAL, 26 }}; 27 } 28 29 static bool jset_csum_good(struct bch_fs *c, struct jset *j) 30 { 31 return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && 32 !bch2_crc_cmp(j->csum, 33 csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); 34 } 35 36 static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) 37 { 38 return (seq - c->journal_entries_base_seq) & (~0U >> 1); 39 } 40 41 static void __journal_replay_free(struct bch_fs *c, 42 struct journal_replay *i) 43 { 44 struct journal_replay **p = 45 genradix_ptr(&c->journal_entries, 46 journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); 47 48 BUG_ON(*p != i); 49 *p = NULL; 50 kvpfree(i, offsetof(struct journal_replay, j) + 51 vstruct_bytes(&i->j)); 52 } 53 54 static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) 55 { 56 i->ignore = true; 57 58 if (!c->opts.read_entire_journal) 59 __journal_replay_free(c, i); 60 } 61 62 struct journal_list { 63 struct closure cl; 64 u64 last_seq; 65 struct mutex lock; 66 int ret; 67 }; 68 69 #define JOURNAL_ENTRY_ADD_OK 0 70 #define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 71 72 /* 73 * Given a journal entry we just read, add it to the list of journal entries to 74 * be replayed: 75 */ 76 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, 77 struct journal_ptr entry_ptr, 78 struct journal_list *jlist, struct jset *j) 79 { 80 struct genradix_iter iter; 81 struct journal_replay **_i, *i, *dup; 82 struct journal_ptr *ptr; 83 size_t bytes = vstruct_bytes(j); 84 u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; 85 int ret = JOURNAL_ENTRY_ADD_OK; 86 87 /* Is this entry older than the range we need? */ 88 if (!c->opts.read_entire_journal && 89 le64_to_cpu(j->seq) < jlist->last_seq) 90 return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; 91 92 /* 93 * genradixes are indexed by a ulong, not a u64, so we can't index them 94 * by sequence number directly: Assume instead that they will all fall 95 * within the range of +-2billion of the filrst one we find. 96 */ 97 if (!c->journal_entries_base_seq) 98 c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); 99 100 /* Drop entries we don't need anymore */ 101 if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { 102 genradix_for_each_from(&c->journal_entries, iter, _i, 103 journal_entry_radix_idx(c, jlist->last_seq)) { 104 i = *_i; 105 106 if (!i || i->ignore) 107 continue; 108 109 if (le64_to_cpu(i->j.seq) >= last_seq) 110 break; 111 journal_replay_free(c, i); 112 } 113 } 114 115 jlist->last_seq = max(jlist->last_seq, last_seq); 116 117 _i = genradix_ptr_alloc(&c->journal_entries, 118 journal_entry_radix_idx(c, le64_to_cpu(j->seq)), 119 GFP_KERNEL); 120 if (!_i) 121 return -BCH_ERR_ENOMEM_journal_entry_add; 122 123 /* 124 * Duplicate journal entries? If so we want the one that didn't have a 125 * checksum error: 126 */ 127 dup = *_i; 128 if (dup) { 129 if (bytes == vstruct_bytes(&dup->j) && 130 !memcmp(j, &dup->j, bytes)) { 131 i = dup; 132 goto found; 133 } 134 135 if (!entry_ptr.csum_good) { 136 i = dup; 137 goto found; 138 } 139 140 if (!dup->csum_good) 141 goto replace; 142 143 fsck_err(c, "found duplicate but non identical journal entries (seq %llu)", 144 le64_to_cpu(j->seq)); 145 i = dup; 146 goto found; 147 } 148 replace: 149 i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); 150 if (!i) 151 return -BCH_ERR_ENOMEM_journal_entry_add; 152 153 i->nr_ptrs = 0; 154 i->csum_good = entry_ptr.csum_good; 155 i->ignore = false; 156 unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); 157 i->ptrs[i->nr_ptrs++] = entry_ptr; 158 159 if (dup) { 160 if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { 161 bch_err(c, "found too many copies of journal entry %llu", 162 le64_to_cpu(i->j.seq)); 163 dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; 164 } 165 166 /* The first ptr should represent the jset we kept: */ 167 memcpy(i->ptrs + i->nr_ptrs, 168 dup->ptrs, 169 sizeof(dup->ptrs[0]) * dup->nr_ptrs); 170 i->nr_ptrs += dup->nr_ptrs; 171 __journal_replay_free(c, dup); 172 } 173 174 *_i = i; 175 return 0; 176 found: 177 for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { 178 if (ptr->dev == ca->dev_idx) { 179 bch_err(c, "duplicate journal entry %llu on same device", 180 le64_to_cpu(i->j.seq)); 181 goto out; 182 } 183 } 184 185 if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { 186 bch_err(c, "found too many copies of journal entry %llu", 187 le64_to_cpu(i->j.seq)); 188 goto out; 189 } 190 191 i->ptrs[i->nr_ptrs++] = entry_ptr; 192 out: 193 fsck_err: 194 return ret; 195 } 196 197 /* this fills in a range with empty jset_entries: */ 198 static void journal_entry_null_range(void *start, void *end) 199 { 200 struct jset_entry *entry; 201 202 for (entry = start; entry != end; entry = vstruct_next(entry)) 203 memset(entry, 0, sizeof(*entry)); 204 } 205 206 #define JOURNAL_ENTRY_REREAD 5 207 #define JOURNAL_ENTRY_NONE 6 208 #define JOURNAL_ENTRY_BAD 7 209 210 static void journal_entry_err_msg(struct printbuf *out, 211 u32 version, 212 struct jset *jset, 213 struct jset_entry *entry) 214 { 215 prt_str(out, "invalid journal entry, version="); 216 bch2_version_to_text(out, version); 217 218 if (entry) { 219 prt_str(out, " type="); 220 prt_str(out, bch2_jset_entry_types[entry->type]); 221 } 222 223 if (!jset) { 224 prt_printf(out, " in superblock"); 225 } else { 226 227 prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); 228 229 if (entry) 230 prt_printf(out, " offset=%zi/%u", 231 (u64 *) entry - jset->_data, 232 le32_to_cpu(jset->u64s)); 233 } 234 235 prt_str(out, ": "); 236 } 237 238 #define journal_entry_err(c, version, jset, entry, msg, ...) \ 239 ({ \ 240 struct printbuf _buf = PRINTBUF; \ 241 \ 242 journal_entry_err_msg(&_buf, version, jset, entry); \ 243 prt_printf(&_buf, msg, ##__VA_ARGS__); \ 244 \ 245 switch (flags & BKEY_INVALID_WRITE) { \ 246 case READ: \ 247 mustfix_fsck_err(c, "%s", _buf.buf); \ 248 break; \ 249 case WRITE: \ 250 bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ 251 if (bch2_fs_inconsistent(c)) { \ 252 ret = -BCH_ERR_fsck_errors_not_fixed; \ 253 goto fsck_err; \ 254 } \ 255 break; \ 256 } \ 257 \ 258 printbuf_exit(&_buf); \ 259 true; \ 260 }) 261 262 #define journal_entry_err_on(cond, c, version, jset, entry, msg, ...) \ 263 ((cond) ? journal_entry_err(c, version, jset, entry, msg, ##__VA_ARGS__) : false) 264 265 #define FSCK_DELETED_KEY 5 266 267 static int journal_validate_key(struct bch_fs *c, 268 struct jset *jset, 269 struct jset_entry *entry, 270 unsigned level, enum btree_id btree_id, 271 struct bkey_i *k, 272 unsigned version, int big_endian, 273 enum bkey_invalid_flags flags) 274 { 275 int write = flags & BKEY_INVALID_WRITE; 276 void *next = vstruct_next(entry); 277 struct printbuf buf = PRINTBUF; 278 int ret = 0; 279 280 if (journal_entry_err_on(!k->k.u64s, c, version, jset, entry, "k->u64s 0")) { 281 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 282 journal_entry_null_range(vstruct_next(entry), next); 283 return FSCK_DELETED_KEY; 284 } 285 286 if (journal_entry_err_on((void *) bkey_next(k) > 287 (void *) vstruct_next(entry), 288 c, version, jset, entry, 289 "extends past end of journal entry")) { 290 entry->u64s = cpu_to_le16((u64 *) k - entry->_data); 291 journal_entry_null_range(vstruct_next(entry), next); 292 return FSCK_DELETED_KEY; 293 } 294 295 if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, 296 c, version, jset, entry, 297 "bad format %u", k->k.format)) { 298 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 299 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 300 journal_entry_null_range(vstruct_next(entry), next); 301 return FSCK_DELETED_KEY; 302 } 303 304 if (!write) 305 bch2_bkey_compat(level, btree_id, version, big_endian, 306 write, NULL, bkey_to_packed(k)); 307 308 if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), 309 __btree_node_type(level, btree_id), write, &buf)) { 310 printbuf_reset(&buf); 311 journal_entry_err_msg(&buf, version, jset, entry); 312 prt_newline(&buf); 313 printbuf_indent_add(&buf, 2); 314 315 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); 316 prt_newline(&buf); 317 bch2_bkey_invalid(c, bkey_i_to_s_c(k), 318 __btree_node_type(level, btree_id), write, &buf); 319 320 mustfix_fsck_err(c, "%s", buf.buf); 321 322 le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); 323 memmove(k, bkey_next(k), next - (void *) bkey_next(k)); 324 journal_entry_null_range(vstruct_next(entry), next); 325 326 printbuf_exit(&buf); 327 return FSCK_DELETED_KEY; 328 } 329 330 if (write) 331 bch2_bkey_compat(level, btree_id, version, big_endian, 332 write, NULL, bkey_to_packed(k)); 333 fsck_err: 334 printbuf_exit(&buf); 335 return ret; 336 } 337 338 static int journal_entry_btree_keys_validate(struct bch_fs *c, 339 struct jset *jset, 340 struct jset_entry *entry, 341 unsigned version, int big_endian, 342 enum bkey_invalid_flags flags) 343 { 344 struct bkey_i *k = entry->start; 345 346 while (k != vstruct_last(entry)) { 347 int ret = journal_validate_key(c, jset, entry, 348 entry->level, 349 entry->btree_id, 350 k, version, big_endian, 351 flags|BKEY_INVALID_JOURNAL); 352 if (ret == FSCK_DELETED_KEY) 353 continue; 354 355 k = bkey_next(k); 356 } 357 358 return 0; 359 } 360 361 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, 362 struct jset_entry *entry) 363 { 364 struct bkey_i *k; 365 bool first = true; 366 367 jset_entry_for_each_key(entry, k) { 368 if (!first) { 369 prt_newline(out); 370 prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); 371 } 372 prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); 373 bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); 374 first = false; 375 } 376 } 377 378 static int journal_entry_btree_root_validate(struct bch_fs *c, 379 struct jset *jset, 380 struct jset_entry *entry, 381 unsigned version, int big_endian, 382 enum bkey_invalid_flags flags) 383 { 384 struct bkey_i *k = entry->start; 385 int ret = 0; 386 387 if (journal_entry_err_on(!entry->u64s || 388 le16_to_cpu(entry->u64s) != k->k.u64s, 389 c, version, jset, entry, 390 "invalid btree root journal entry: wrong number of keys")) { 391 void *next = vstruct_next(entry); 392 /* 393 * we don't want to null out this jset_entry, 394 * just the contents, so that later we can tell 395 * we were _supposed_ to have a btree root 396 */ 397 entry->u64s = 0; 398 journal_entry_null_range(vstruct_next(entry), next); 399 return 0; 400 } 401 402 return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, 403 version, big_endian, flags); 404 fsck_err: 405 return ret; 406 } 407 408 static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, 409 struct jset_entry *entry) 410 { 411 journal_entry_btree_keys_to_text(out, c, entry); 412 } 413 414 static int journal_entry_prio_ptrs_validate(struct bch_fs *c, 415 struct jset *jset, 416 struct jset_entry *entry, 417 unsigned version, int big_endian, 418 enum bkey_invalid_flags flags) 419 { 420 /* obsolete, don't care: */ 421 return 0; 422 } 423 424 static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 425 struct jset_entry *entry) 426 { 427 } 428 429 static int journal_entry_blacklist_validate(struct bch_fs *c, 430 struct jset *jset, 431 struct jset_entry *entry, 432 unsigned version, int big_endian, 433 enum bkey_invalid_flags flags) 434 { 435 int ret = 0; 436 437 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, 438 c, version, jset, entry, 439 "invalid journal seq blacklist entry: bad size")) { 440 journal_entry_null_range(entry, vstruct_next(entry)); 441 } 442 fsck_err: 443 return ret; 444 } 445 446 static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, 447 struct jset_entry *entry) 448 { 449 struct jset_entry_blacklist *bl = 450 container_of(entry, struct jset_entry_blacklist, entry); 451 452 prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); 453 } 454 455 static int journal_entry_blacklist_v2_validate(struct bch_fs *c, 456 struct jset *jset, 457 struct jset_entry *entry, 458 unsigned version, int big_endian, 459 enum bkey_invalid_flags flags) 460 { 461 struct jset_entry_blacklist_v2 *bl_entry; 462 int ret = 0; 463 464 if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, 465 c, version, jset, entry, 466 "invalid journal seq blacklist entry: bad size")) { 467 journal_entry_null_range(entry, vstruct_next(entry)); 468 goto out; 469 } 470 471 bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); 472 473 if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > 474 le64_to_cpu(bl_entry->end), 475 c, version, jset, entry, 476 "invalid journal seq blacklist entry: start > end")) { 477 journal_entry_null_range(entry, vstruct_next(entry)); 478 } 479 out: 480 fsck_err: 481 return ret; 482 } 483 484 static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, 485 struct jset_entry *entry) 486 { 487 struct jset_entry_blacklist_v2 *bl = 488 container_of(entry, struct jset_entry_blacklist_v2, entry); 489 490 prt_printf(out, "start=%llu end=%llu", 491 le64_to_cpu(bl->start), 492 le64_to_cpu(bl->end)); 493 } 494 495 static int journal_entry_usage_validate(struct bch_fs *c, 496 struct jset *jset, 497 struct jset_entry *entry, 498 unsigned version, int big_endian, 499 enum bkey_invalid_flags flags) 500 { 501 struct jset_entry_usage *u = 502 container_of(entry, struct jset_entry_usage, entry); 503 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 504 int ret = 0; 505 506 if (journal_entry_err_on(bytes < sizeof(*u), 507 c, version, jset, entry, 508 "invalid journal entry usage: bad size")) { 509 journal_entry_null_range(entry, vstruct_next(entry)); 510 return ret; 511 } 512 513 fsck_err: 514 return ret; 515 } 516 517 static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, 518 struct jset_entry *entry) 519 { 520 struct jset_entry_usage *u = 521 container_of(entry, struct jset_entry_usage, entry); 522 523 prt_printf(out, "type=%s v=%llu", 524 bch2_fs_usage_types[u->entry.btree_id], 525 le64_to_cpu(u->v)); 526 } 527 528 static int journal_entry_data_usage_validate(struct bch_fs *c, 529 struct jset *jset, 530 struct jset_entry *entry, 531 unsigned version, int big_endian, 532 enum bkey_invalid_flags flags) 533 { 534 struct jset_entry_data_usage *u = 535 container_of(entry, struct jset_entry_data_usage, entry); 536 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 537 int ret = 0; 538 539 if (journal_entry_err_on(bytes < sizeof(*u) || 540 bytes < sizeof(*u) + u->r.nr_devs, 541 c, version, jset, entry, 542 "invalid journal entry usage: bad size")) { 543 journal_entry_null_range(entry, vstruct_next(entry)); 544 return ret; 545 } 546 547 fsck_err: 548 return ret; 549 } 550 551 static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, 552 struct jset_entry *entry) 553 { 554 struct jset_entry_data_usage *u = 555 container_of(entry, struct jset_entry_data_usage, entry); 556 557 bch2_replicas_entry_to_text(out, &u->r); 558 prt_printf(out, "=%llu", le64_to_cpu(u->v)); 559 } 560 561 static int journal_entry_clock_validate(struct bch_fs *c, 562 struct jset *jset, 563 struct jset_entry *entry, 564 unsigned version, int big_endian, 565 enum bkey_invalid_flags flags) 566 { 567 struct jset_entry_clock *clock = 568 container_of(entry, struct jset_entry_clock, entry); 569 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 570 int ret = 0; 571 572 if (journal_entry_err_on(bytes != sizeof(*clock), 573 c, version, jset, entry, "bad size")) { 574 journal_entry_null_range(entry, vstruct_next(entry)); 575 return ret; 576 } 577 578 if (journal_entry_err_on(clock->rw > 1, 579 c, version, jset, entry, "bad rw")) { 580 journal_entry_null_range(entry, vstruct_next(entry)); 581 return ret; 582 } 583 584 fsck_err: 585 return ret; 586 } 587 588 static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, 589 struct jset_entry *entry) 590 { 591 struct jset_entry_clock *clock = 592 container_of(entry, struct jset_entry_clock, entry); 593 594 prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); 595 } 596 597 static int journal_entry_dev_usage_validate(struct bch_fs *c, 598 struct jset *jset, 599 struct jset_entry *entry, 600 unsigned version, int big_endian, 601 enum bkey_invalid_flags flags) 602 { 603 struct jset_entry_dev_usage *u = 604 container_of(entry, struct jset_entry_dev_usage, entry); 605 unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); 606 unsigned expected = sizeof(*u); 607 unsigned dev; 608 int ret = 0; 609 610 if (journal_entry_err_on(bytes < expected, 611 c, version, jset, entry, "bad size (%u < %u)", 612 bytes, expected)) { 613 journal_entry_null_range(entry, vstruct_next(entry)); 614 return ret; 615 } 616 617 dev = le32_to_cpu(u->dev); 618 619 if (journal_entry_err_on(!bch2_dev_exists2(c, dev), 620 c, version, jset, entry, "bad dev")) { 621 journal_entry_null_range(entry, vstruct_next(entry)); 622 return ret; 623 } 624 625 if (journal_entry_err_on(u->pad, 626 c, version, jset, entry, "bad pad")) { 627 journal_entry_null_range(entry, vstruct_next(entry)); 628 return ret; 629 } 630 631 fsck_err: 632 return ret; 633 } 634 635 static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, 636 struct jset_entry *entry) 637 { 638 struct jset_entry_dev_usage *u = 639 container_of(entry, struct jset_entry_dev_usage, entry); 640 unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); 641 642 prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); 643 644 for (i = 0; i < nr_types; i++) { 645 if (i < BCH_DATA_NR) 646 prt_printf(out, " %s", bch2_data_types[i]); 647 else 648 prt_printf(out, " (unknown data type %u)", i); 649 prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", 650 le64_to_cpu(u->d[i].buckets), 651 le64_to_cpu(u->d[i].sectors), 652 le64_to_cpu(u->d[i].fragmented)); 653 } 654 655 prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); 656 } 657 658 static int journal_entry_log_validate(struct bch_fs *c, 659 struct jset *jset, 660 struct jset_entry *entry, 661 unsigned version, int big_endian, 662 enum bkey_invalid_flags flags) 663 { 664 return 0; 665 } 666 667 static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, 668 struct jset_entry *entry) 669 { 670 struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); 671 unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); 672 673 prt_printf(out, "%.*s", bytes, l->d); 674 } 675 676 static int journal_entry_overwrite_validate(struct bch_fs *c, 677 struct jset *jset, 678 struct jset_entry *entry, 679 unsigned version, int big_endian, 680 enum bkey_invalid_flags flags) 681 { 682 return journal_entry_btree_keys_validate(c, jset, entry, 683 version, big_endian, READ); 684 } 685 686 static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, 687 struct jset_entry *entry) 688 { 689 journal_entry_btree_keys_to_text(out, c, entry); 690 } 691 692 struct jset_entry_ops { 693 int (*validate)(struct bch_fs *, struct jset *, 694 struct jset_entry *, unsigned, int, 695 enum bkey_invalid_flags); 696 void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); 697 }; 698 699 static const struct jset_entry_ops bch2_jset_entry_ops[] = { 700 #define x(f, nr) \ 701 [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ 702 .validate = journal_entry_##f##_validate, \ 703 .to_text = journal_entry_##f##_to_text, \ 704 }, 705 BCH_JSET_ENTRY_TYPES() 706 #undef x 707 }; 708 709 int bch2_journal_entry_validate(struct bch_fs *c, 710 struct jset *jset, 711 struct jset_entry *entry, 712 unsigned version, int big_endian, 713 enum bkey_invalid_flags flags) 714 { 715 return entry->type < BCH_JSET_ENTRY_NR 716 ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, 717 version, big_endian, flags) 718 : 0; 719 } 720 721 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, 722 struct jset_entry *entry) 723 { 724 if (entry->type < BCH_JSET_ENTRY_NR) { 725 prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); 726 bch2_jset_entry_ops[entry->type].to_text(out, c, entry); 727 } else { 728 prt_printf(out, "(unknown type %u)", entry->type); 729 } 730 } 731 732 static int jset_validate_entries(struct bch_fs *c, struct jset *jset, 733 enum bkey_invalid_flags flags) 734 { 735 struct jset_entry *entry; 736 unsigned version = le32_to_cpu(jset->version); 737 int ret = 0; 738 739 vstruct_for_each(jset, entry) { 740 if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), 741 c, version, jset, entry, 742 "journal entry extends past end of jset")) { 743 jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); 744 break; 745 } 746 747 ret = bch2_journal_entry_validate(c, jset, entry, 748 version, JSET_BIG_ENDIAN(jset), flags); 749 if (ret) 750 break; 751 } 752 fsck_err: 753 return ret; 754 } 755 756 static int jset_validate(struct bch_fs *c, 757 struct bch_dev *ca, 758 struct jset *jset, u64 sector, 759 enum bkey_invalid_flags flags) 760 { 761 unsigned version; 762 int ret = 0; 763 764 if (le64_to_cpu(jset->magic) != jset_magic(c)) 765 return JOURNAL_ENTRY_NONE; 766 767 version = le32_to_cpu(jset->version); 768 if (journal_entry_err_on(!bch2_version_compatible(version), 769 c, version, jset, NULL, 770 "%s sector %llu seq %llu: incompatible journal entry version %u.%u", 771 ca ? ca->name : c->name, 772 sector, le64_to_cpu(jset->seq), 773 BCH_VERSION_MAJOR(version), 774 BCH_VERSION_MINOR(version))) { 775 /* don't try to continue: */ 776 return -EINVAL; 777 } 778 779 if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), 780 c, version, jset, NULL, 781 "%s sector %llu seq %llu: journal entry with unknown csum type %llu", 782 ca ? ca->name : c->name, 783 sector, le64_to_cpu(jset->seq), 784 JSET_CSUM_TYPE(jset))) 785 ret = JOURNAL_ENTRY_BAD; 786 787 /* last_seq is ignored when JSET_NO_FLUSH is true */ 788 if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && 789 le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), 790 c, version, jset, NULL, 791 "invalid journal entry: last_seq > seq (%llu > %llu)", 792 le64_to_cpu(jset->last_seq), 793 le64_to_cpu(jset->seq))) { 794 jset->last_seq = jset->seq; 795 return JOURNAL_ENTRY_BAD; 796 } 797 798 ret = jset_validate_entries(c, jset, flags); 799 fsck_err: 800 return ret; 801 } 802 803 static int jset_validate_early(struct bch_fs *c, 804 struct bch_dev *ca, 805 struct jset *jset, u64 sector, 806 unsigned bucket_sectors_left, 807 unsigned sectors_read) 808 { 809 size_t bytes = vstruct_bytes(jset); 810 unsigned version; 811 enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; 812 int ret = 0; 813 814 if (le64_to_cpu(jset->magic) != jset_magic(c)) 815 return JOURNAL_ENTRY_NONE; 816 817 version = le32_to_cpu(jset->version); 818 if (journal_entry_err_on(!bch2_version_compatible(version), 819 c, version, jset, NULL, 820 "%s sector %llu seq %llu: unknown journal entry version %u.%u", 821 ca ? ca->name : c->name, 822 sector, le64_to_cpu(jset->seq), 823 BCH_VERSION_MAJOR(version), 824 BCH_VERSION_MINOR(version))) { 825 /* don't try to continue: */ 826 return -EINVAL; 827 } 828 829 if (bytes > (sectors_read << 9) && 830 sectors_read < bucket_sectors_left) 831 return JOURNAL_ENTRY_REREAD; 832 833 if (journal_entry_err_on(bytes > bucket_sectors_left << 9, 834 c, version, jset, NULL, 835 "%s sector %llu seq %llu: journal entry too big (%zu bytes)", 836 ca ? ca->name : c->name, 837 sector, le64_to_cpu(jset->seq), bytes)) 838 le32_add_cpu(&jset->u64s, 839 -((bytes - (bucket_sectors_left << 9)) / 8)); 840 fsck_err: 841 return ret; 842 } 843 844 struct journal_read_buf { 845 void *data; 846 size_t size; 847 }; 848 849 static int journal_read_buf_realloc(struct journal_read_buf *b, 850 size_t new_size) 851 { 852 void *n; 853 854 /* the bios are sized for this many pages, max: */ 855 if (new_size > JOURNAL_ENTRY_SIZE_MAX) 856 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 857 858 new_size = roundup_pow_of_two(new_size); 859 n = kvpmalloc(new_size, GFP_KERNEL); 860 if (!n) 861 return -BCH_ERR_ENOMEM_journal_read_buf_realloc; 862 863 kvpfree(b->data, b->size); 864 b->data = n; 865 b->size = new_size; 866 return 0; 867 } 868 869 static int journal_read_bucket(struct bch_dev *ca, 870 struct journal_read_buf *buf, 871 struct journal_list *jlist, 872 unsigned bucket) 873 { 874 struct bch_fs *c = ca->fs; 875 struct journal_device *ja = &ca->journal; 876 struct jset *j = NULL; 877 unsigned sectors, sectors_read = 0; 878 u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), 879 end = offset + ca->mi.bucket_size; 880 bool saw_bad = false, csum_good; 881 int ret = 0; 882 883 pr_debug("reading %u", bucket); 884 885 while (offset < end) { 886 if (!sectors_read) { 887 struct bio *bio; 888 unsigned nr_bvecs; 889 reread: 890 sectors_read = min_t(unsigned, 891 end - offset, buf->size >> 9); 892 nr_bvecs = buf_pages(buf->data, sectors_read << 9); 893 894 bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); 895 bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); 896 897 bio->bi_iter.bi_sector = offset; 898 bch2_bio_map(bio, buf->data, sectors_read << 9); 899 900 ret = submit_bio_wait(bio); 901 kfree(bio); 902 903 if (bch2_dev_io_err_on(ret, ca, 904 "journal read error: sector %llu", 905 offset) || 906 bch2_meta_read_fault("journal")) { 907 /* 908 * We don't error out of the recovery process 909 * here, since the relevant journal entry may be 910 * found on a different device, and missing or 911 * no journal entries will be handled later 912 */ 913 return 0; 914 } 915 916 j = buf->data; 917 } 918 919 ret = jset_validate_early(c, ca, j, offset, 920 end - offset, sectors_read); 921 switch (ret) { 922 case 0: 923 sectors = vstruct_sectors(j, c->block_bits); 924 break; 925 case JOURNAL_ENTRY_REREAD: 926 if (vstruct_bytes(j) > buf->size) { 927 ret = journal_read_buf_realloc(buf, 928 vstruct_bytes(j)); 929 if (ret) 930 return ret; 931 } 932 goto reread; 933 case JOURNAL_ENTRY_NONE: 934 if (!saw_bad) 935 return 0; 936 /* 937 * On checksum error we don't really trust the size 938 * field of the journal entry we read, so try reading 939 * again at next block boundary: 940 */ 941 sectors = block_sectors(c); 942 goto next_block; 943 default: 944 return ret; 945 } 946 947 /* 948 * This happens sometimes if we don't have discards on - 949 * when we've partially overwritten a bucket with new 950 * journal entries. We don't need the rest of the 951 * bucket: 952 */ 953 if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) 954 return 0; 955 956 ja->bucket_seq[bucket] = le64_to_cpu(j->seq); 957 958 csum_good = jset_csum_good(c, j); 959 if (!csum_good) 960 saw_bad = true; 961 962 ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), 963 j->encrypted_start, 964 vstruct_end(j) - (void *) j->encrypted_start); 965 bch2_fs_fatal_err_on(ret, c, 966 "error decrypting journal entry: %i", ret); 967 968 mutex_lock(&jlist->lock); 969 ret = journal_entry_add(c, ca, (struct journal_ptr) { 970 .csum_good = csum_good, 971 .dev = ca->dev_idx, 972 .bucket = bucket, 973 .bucket_offset = offset - 974 bucket_to_sector(ca, ja->buckets[bucket]), 975 .sector = offset, 976 }, jlist, j); 977 mutex_unlock(&jlist->lock); 978 979 switch (ret) { 980 case JOURNAL_ENTRY_ADD_OK: 981 break; 982 case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: 983 break; 984 default: 985 return ret; 986 } 987 next_block: 988 pr_debug("next"); 989 offset += sectors; 990 sectors_read -= sectors; 991 j = ((void *) j) + (sectors << 9); 992 } 993 994 return 0; 995 } 996 997 static void bch2_journal_read_device(struct closure *cl) 998 { 999 struct journal_device *ja = 1000 container_of(cl, struct journal_device, read); 1001 struct bch_dev *ca = container_of(ja, struct bch_dev, journal); 1002 struct bch_fs *c = ca->fs; 1003 struct journal_list *jlist = 1004 container_of(cl->parent, struct journal_list, cl); 1005 struct journal_replay *r, **_r; 1006 struct genradix_iter iter; 1007 struct journal_read_buf buf = { NULL, 0 }; 1008 unsigned i; 1009 int ret = 0; 1010 1011 if (!ja->nr) 1012 goto out; 1013 1014 ret = journal_read_buf_realloc(&buf, PAGE_SIZE); 1015 if (ret) 1016 goto err; 1017 1018 pr_debug("%u journal buckets", ja->nr); 1019 1020 for (i = 0; i < ja->nr; i++) { 1021 ret = journal_read_bucket(ca, &buf, jlist, i); 1022 if (ret) 1023 goto err; 1024 } 1025 1026 ja->sectors_free = ca->mi.bucket_size; 1027 1028 mutex_lock(&jlist->lock); 1029 genradix_for_each_reverse(&c->journal_entries, iter, _r) { 1030 r = *_r; 1031 1032 if (!r) 1033 continue; 1034 1035 for (i = 0; i < r->nr_ptrs; i++) { 1036 if (r->ptrs[i].dev == ca->dev_idx) { 1037 unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + 1038 vstruct_sectors(&r->j, c->block_bits); 1039 1040 ja->cur_idx = r->ptrs[i].bucket; 1041 ja->sectors_free = ca->mi.bucket_size - wrote; 1042 goto found; 1043 } 1044 } 1045 } 1046 found: 1047 mutex_unlock(&jlist->lock); 1048 1049 if (ja->bucket_seq[ja->cur_idx] && 1050 ja->sectors_free == ca->mi.bucket_size) { 1051 bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); 1052 bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); 1053 for (i = 0; i < 3; i++) { 1054 unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; 1055 1056 bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); 1057 } 1058 ja->sectors_free = 0; 1059 } 1060 1061 /* 1062 * Set dirty_idx to indicate the entire journal is full and needs to be 1063 * reclaimed - journal reclaim will immediately reclaim whatever isn't 1064 * pinned when it first runs: 1065 */ 1066 ja->discard_idx = ja->dirty_idx_ondisk = 1067 ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; 1068 out: 1069 bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); 1070 kvpfree(buf.data, buf.size); 1071 percpu_ref_put(&ca->io_ref); 1072 closure_return(cl); 1073 return; 1074 err: 1075 mutex_lock(&jlist->lock); 1076 jlist->ret = ret; 1077 mutex_unlock(&jlist->lock); 1078 goto out; 1079 } 1080 1081 void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, 1082 struct journal_replay *j) 1083 { 1084 unsigned i; 1085 1086 for (i = 0; i < j->nr_ptrs; i++) { 1087 struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); 1088 u64 offset; 1089 1090 div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); 1091 1092 if (i) 1093 prt_printf(out, " "); 1094 prt_printf(out, "%u:%u:%u (sector %llu)", 1095 j->ptrs[i].dev, 1096 j->ptrs[i].bucket, 1097 j->ptrs[i].bucket_offset, 1098 j->ptrs[i].sector); 1099 } 1100 } 1101 1102 int bch2_journal_read(struct bch_fs *c, 1103 u64 *last_seq, 1104 u64 *blacklist_seq, 1105 u64 *start_seq) 1106 { 1107 struct journal_list jlist; 1108 struct journal_replay *i, **_i, *prev = NULL; 1109 struct genradix_iter radix_iter; 1110 struct bch_dev *ca; 1111 unsigned iter; 1112 struct printbuf buf = PRINTBUF; 1113 bool degraded = false, last_write_torn = false; 1114 u64 seq; 1115 int ret = 0; 1116 1117 closure_init_stack(&jlist.cl); 1118 mutex_init(&jlist.lock); 1119 jlist.last_seq = 0; 1120 jlist.ret = 0; 1121 1122 for_each_member_device(ca, c, iter) { 1123 if (!c->opts.fsck && 1124 !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) 1125 continue; 1126 1127 if ((ca->mi.state == BCH_MEMBER_STATE_rw || 1128 ca->mi.state == BCH_MEMBER_STATE_ro) && 1129 percpu_ref_tryget(&ca->io_ref)) 1130 closure_call(&ca->journal.read, 1131 bch2_journal_read_device, 1132 system_unbound_wq, 1133 &jlist.cl); 1134 else 1135 degraded = true; 1136 } 1137 1138 closure_sync(&jlist.cl); 1139 1140 if (jlist.ret) 1141 return jlist.ret; 1142 1143 *last_seq = 0; 1144 *start_seq = 0; 1145 *blacklist_seq = 0; 1146 1147 /* 1148 * Find most recent flush entry, and ignore newer non flush entries - 1149 * those entries will be blacklisted: 1150 */ 1151 genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { 1152 enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; 1153 1154 i = *_i; 1155 1156 if (!i || i->ignore) 1157 continue; 1158 1159 if (!*start_seq) 1160 *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; 1161 1162 if (JSET_NO_FLUSH(&i->j)) { 1163 i->ignore = true; 1164 continue; 1165 } 1166 1167 if (!last_write_torn && !i->csum_good) { 1168 last_write_torn = true; 1169 i->ignore = true; 1170 continue; 1171 } 1172 1173 if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), 1174 c, le32_to_cpu(i->j.version), &i->j, NULL, 1175 "invalid journal entry: last_seq > seq (%llu > %llu)", 1176 le64_to_cpu(i->j.last_seq), 1177 le64_to_cpu(i->j.seq))) 1178 i->j.last_seq = i->j.seq; 1179 1180 *last_seq = le64_to_cpu(i->j.last_seq); 1181 *blacklist_seq = le64_to_cpu(i->j.seq) + 1; 1182 break; 1183 } 1184 1185 if (!*start_seq) { 1186 bch_info(c, "journal read done, but no entries found"); 1187 return 0; 1188 } 1189 1190 if (!*last_seq) { 1191 fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); 1192 return 0; 1193 } 1194 1195 bch_info(c, "journal read done, replaying entries %llu-%llu", 1196 *last_seq, *blacklist_seq - 1); 1197 1198 if (*start_seq != *blacklist_seq) 1199 bch_info(c, "dropped unflushed entries %llu-%llu", 1200 *blacklist_seq, *start_seq - 1); 1201 1202 /* Drop blacklisted entries and entries older than last_seq: */ 1203 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1204 i = *_i; 1205 1206 if (!i || i->ignore) 1207 continue; 1208 1209 seq = le64_to_cpu(i->j.seq); 1210 if (seq < *last_seq) { 1211 journal_replay_free(c, i); 1212 continue; 1213 } 1214 1215 if (bch2_journal_seq_is_blacklisted(c, seq, true)) { 1216 fsck_err_on(!JSET_NO_FLUSH(&i->j), c, 1217 "found blacklisted journal entry %llu", seq); 1218 i->ignore = true; 1219 } 1220 } 1221 1222 /* Check for missing entries: */ 1223 seq = *last_seq; 1224 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1225 i = *_i; 1226 1227 if (!i || i->ignore) 1228 continue; 1229 1230 BUG_ON(seq > le64_to_cpu(i->j.seq)); 1231 1232 while (seq < le64_to_cpu(i->j.seq)) { 1233 u64 missing_start, missing_end; 1234 struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; 1235 1236 while (seq < le64_to_cpu(i->j.seq) && 1237 bch2_journal_seq_is_blacklisted(c, seq, false)) 1238 seq++; 1239 1240 if (seq == le64_to_cpu(i->j.seq)) 1241 break; 1242 1243 missing_start = seq; 1244 1245 while (seq < le64_to_cpu(i->j.seq) && 1246 !bch2_journal_seq_is_blacklisted(c, seq, false)) 1247 seq++; 1248 1249 if (prev) { 1250 bch2_journal_ptrs_to_text(&buf1, c, prev); 1251 prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); 1252 } else 1253 prt_printf(&buf1, "(none)"); 1254 bch2_journal_ptrs_to_text(&buf2, c, i); 1255 1256 missing_end = seq - 1; 1257 fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" 1258 " prev at %s\n" 1259 " next at %s", 1260 missing_start, missing_end, 1261 *last_seq, *blacklist_seq - 1, 1262 buf1.buf, buf2.buf); 1263 1264 printbuf_exit(&buf1); 1265 printbuf_exit(&buf2); 1266 } 1267 1268 prev = i; 1269 seq++; 1270 } 1271 1272 genradix_for_each(&c->journal_entries, radix_iter, _i) { 1273 struct bch_replicas_padded replicas = { 1274 .e.data_type = BCH_DATA_journal, 1275 .e.nr_required = 1, 1276 }; 1277 unsigned ptr; 1278 1279 i = *_i; 1280 if (!i || i->ignore) 1281 continue; 1282 1283 for (ptr = 0; ptr < i->nr_ptrs; ptr++) { 1284 ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); 1285 1286 if (!i->ptrs[ptr].csum_good) 1287 bch_err_dev_offset(ca, i->ptrs[ptr].sector, 1288 "invalid journal checksum, seq %llu%s", 1289 le64_to_cpu(i->j.seq), 1290 i->csum_good ? " (had good copy on another device)" : ""); 1291 } 1292 1293 ret = jset_validate(c, 1294 bch_dev_bkey_exists(c, i->ptrs[0].dev), 1295 &i->j, 1296 i->ptrs[0].sector, 1297 READ); 1298 if (ret) 1299 goto err; 1300 1301 for (ptr = 0; ptr < i->nr_ptrs; ptr++) 1302 replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; 1303 1304 bch2_replicas_entry_sort(&replicas.e); 1305 1306 printbuf_reset(&buf); 1307 bch2_replicas_entry_to_text(&buf, &replicas.e); 1308 1309 if (!degraded && 1310 !bch2_replicas_marked(c, &replicas.e) && 1311 (le64_to_cpu(i->j.seq) == *last_seq || 1312 fsck_err(c, "superblock not marked as containing replicas for journal entry %llu\n %s", 1313 le64_to_cpu(i->j.seq), buf.buf))) { 1314 ret = bch2_mark_replicas(c, &replicas.e); 1315 if (ret) 1316 goto err; 1317 } 1318 } 1319 err: 1320 fsck_err: 1321 printbuf_exit(&buf); 1322 return ret; 1323 } 1324 1325 /* journal write: */ 1326 1327 static void __journal_write_alloc(struct journal *j, 1328 struct journal_buf *w, 1329 struct dev_alloc_list *devs_sorted, 1330 unsigned sectors, 1331 unsigned *replicas, 1332 unsigned replicas_want) 1333 { 1334 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1335 struct journal_device *ja; 1336 struct bch_dev *ca; 1337 unsigned i; 1338 1339 if (*replicas >= replicas_want) 1340 return; 1341 1342 for (i = 0; i < devs_sorted->nr; i++) { 1343 ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); 1344 if (!ca) 1345 continue; 1346 1347 ja = &ca->journal; 1348 1349 /* 1350 * Check that we can use this device, and aren't already using 1351 * it: 1352 */ 1353 if (!ca->mi.durability || 1354 ca->mi.state != BCH_MEMBER_STATE_rw || 1355 !ja->nr || 1356 bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || 1357 sectors > ja->sectors_free) 1358 continue; 1359 1360 bch2_dev_stripe_increment(ca, &j->wp.stripe); 1361 1362 bch2_bkey_append_ptr(&w->key, 1363 (struct bch_extent_ptr) { 1364 .offset = bucket_to_sector(ca, 1365 ja->buckets[ja->cur_idx]) + 1366 ca->mi.bucket_size - 1367 ja->sectors_free, 1368 .dev = ca->dev_idx, 1369 }); 1370 1371 ja->sectors_free -= sectors; 1372 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1373 1374 *replicas += ca->mi.durability; 1375 1376 if (*replicas >= replicas_want) 1377 break; 1378 } 1379 } 1380 1381 /** 1382 * journal_write_alloc - decide where to write next journal entry 1383 * 1384 * @j: journal object 1385 * @w: journal buf (entry to be written) 1386 * 1387 * Returns: 0 on success, or -EROFS on failure 1388 */ 1389 static int journal_write_alloc(struct journal *j, struct journal_buf *w) 1390 { 1391 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1392 struct bch_devs_mask devs; 1393 struct journal_device *ja; 1394 struct bch_dev *ca; 1395 struct dev_alloc_list devs_sorted; 1396 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1397 unsigned target = c->opts.metadata_target ?: 1398 c->opts.foreground_target; 1399 unsigned i, replicas = 0, replicas_want = 1400 READ_ONCE(c->opts.metadata_replicas); 1401 1402 rcu_read_lock(); 1403 retry: 1404 devs = target_rw_devs(c, BCH_DATA_journal, target); 1405 1406 devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); 1407 1408 __journal_write_alloc(j, w, &devs_sorted, 1409 sectors, &replicas, replicas_want); 1410 1411 if (replicas >= replicas_want) 1412 goto done; 1413 1414 for (i = 0; i < devs_sorted.nr; i++) { 1415 ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); 1416 if (!ca) 1417 continue; 1418 1419 ja = &ca->journal; 1420 1421 if (sectors > ja->sectors_free && 1422 sectors <= ca->mi.bucket_size && 1423 bch2_journal_dev_buckets_available(j, ja, 1424 journal_space_discarded)) { 1425 ja->cur_idx = (ja->cur_idx + 1) % ja->nr; 1426 ja->sectors_free = ca->mi.bucket_size; 1427 1428 /* 1429 * ja->bucket_seq[ja->cur_idx] must always have 1430 * something sensible: 1431 */ 1432 ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); 1433 } 1434 } 1435 1436 __journal_write_alloc(j, w, &devs_sorted, 1437 sectors, &replicas, replicas_want); 1438 1439 if (replicas < replicas_want && target) { 1440 /* Retry from all devices: */ 1441 target = 0; 1442 goto retry; 1443 } 1444 done: 1445 rcu_read_unlock(); 1446 1447 BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); 1448 1449 return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; 1450 } 1451 1452 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) 1453 { 1454 /* we aren't holding j->lock: */ 1455 unsigned new_size = READ_ONCE(j->buf_size_want); 1456 void *new_buf; 1457 1458 if (buf->buf_size >= new_size) 1459 return; 1460 1461 new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); 1462 if (!new_buf) 1463 return; 1464 1465 memcpy(new_buf, buf->data, buf->buf_size); 1466 1467 spin_lock(&j->lock); 1468 swap(buf->data, new_buf); 1469 swap(buf->buf_size, new_size); 1470 spin_unlock(&j->lock); 1471 1472 kvpfree(new_buf, new_size); 1473 } 1474 1475 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) 1476 { 1477 return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); 1478 } 1479 1480 static void journal_write_done(struct closure *cl) 1481 { 1482 struct journal *j = container_of(cl, struct journal, io); 1483 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1484 struct journal_buf *w = journal_last_unwritten_buf(j); 1485 struct bch_replicas_padded replicas; 1486 union journal_res_state old, new; 1487 u64 v, seq; 1488 int err = 0; 1489 1490 bch2_time_stats_update(!JSET_NO_FLUSH(w->data) 1491 ? j->flush_write_time 1492 : j->noflush_write_time, j->write_start_time); 1493 1494 if (!w->devs_written.nr) { 1495 bch_err(c, "unable to write journal to sufficient devices"); 1496 err = -EIO; 1497 } else { 1498 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1499 w->devs_written); 1500 if (bch2_mark_replicas(c, &replicas.e)) 1501 err = -EIO; 1502 } 1503 1504 if (err) 1505 bch2_fatal_error(c); 1506 1507 spin_lock(&j->lock); 1508 seq = le64_to_cpu(w->data->seq); 1509 1510 if (seq >= j->pin.front) 1511 journal_seq_pin(j, seq)->devs = w->devs_written; 1512 1513 if (!err) { 1514 if (!JSET_NO_FLUSH(w->data)) { 1515 j->flushed_seq_ondisk = seq; 1516 j->last_seq_ondisk = w->last_seq; 1517 1518 bch2_do_discards(c); 1519 closure_wake_up(&c->freelist_wait); 1520 1521 bch2_reset_alloc_cursors(c); 1522 } 1523 } else if (!j->err_seq || seq < j->err_seq) 1524 j->err_seq = seq; 1525 1526 j->seq_ondisk = seq; 1527 1528 /* 1529 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard 1530 * more buckets: 1531 * 1532 * Must come before signaling write completion, for 1533 * bch2_fs_journal_stop(): 1534 */ 1535 if (j->watermark != BCH_WATERMARK_stripe) 1536 journal_reclaim_kick(&c->journal); 1537 1538 /* also must come before signalling write completion: */ 1539 closure_debug_destroy(cl); 1540 1541 v = atomic64_read(&j->reservations.counter); 1542 do { 1543 old.v = new.v = v; 1544 BUG_ON(journal_state_count(new, new.unwritten_idx)); 1545 1546 new.unwritten_idx++; 1547 } while ((v = atomic64_cmpxchg(&j->reservations.counter, 1548 old.v, new.v)) != old.v); 1549 1550 bch2_journal_space_available(j); 1551 1552 closure_wake_up(&w->wait); 1553 journal_wake(j); 1554 1555 if (!journal_state_count(new, new.unwritten_idx) && 1556 journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { 1557 spin_unlock(&j->lock); 1558 closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); 1559 } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && 1560 new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { 1561 struct journal_buf *buf = journal_cur_buf(j); 1562 long delta = buf->expires - jiffies; 1563 1564 /* 1565 * We don't close a journal entry to write it while there's 1566 * previous entries still in flight - the current journal entry 1567 * might want to be written now: 1568 */ 1569 1570 spin_unlock(&j->lock); 1571 mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); 1572 } else { 1573 spin_unlock(&j->lock); 1574 } 1575 } 1576 1577 static void journal_write_endio(struct bio *bio) 1578 { 1579 struct bch_dev *ca = bio->bi_private; 1580 struct journal *j = &ca->fs->journal; 1581 struct journal_buf *w = journal_last_unwritten_buf(j); 1582 unsigned long flags; 1583 1584 if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s", 1585 le64_to_cpu(w->data->seq), 1586 bch2_blk_status_to_str(bio->bi_status)) || 1587 bch2_meta_write_fault("journal")) { 1588 spin_lock_irqsave(&j->err_lock, flags); 1589 bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); 1590 spin_unlock_irqrestore(&j->err_lock, flags); 1591 } 1592 1593 closure_put(&j->io); 1594 percpu_ref_put(&ca->io_ref); 1595 } 1596 1597 static void do_journal_write(struct closure *cl) 1598 { 1599 struct journal *j = container_of(cl, struct journal, io); 1600 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1601 struct bch_dev *ca; 1602 struct journal_buf *w = journal_last_unwritten_buf(j); 1603 struct bch_extent_ptr *ptr; 1604 struct bio *bio; 1605 unsigned sectors = vstruct_sectors(w->data, c->block_bits); 1606 1607 extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { 1608 ca = bch_dev_bkey_exists(c, ptr->dev); 1609 if (!percpu_ref_tryget(&ca->io_ref)) { 1610 /* XXX: fix this */ 1611 bch_err(c, "missing device for journal write\n"); 1612 continue; 1613 } 1614 1615 this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], 1616 sectors); 1617 1618 bio = ca->journal.bio; 1619 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); 1620 bio->bi_iter.bi_sector = ptr->offset; 1621 bio->bi_end_io = journal_write_endio; 1622 bio->bi_private = ca; 1623 1624 BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); 1625 ca->prev_journal_sector = bio->bi_iter.bi_sector; 1626 1627 if (!JSET_NO_FLUSH(w->data)) 1628 bio->bi_opf |= REQ_FUA; 1629 if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) 1630 bio->bi_opf |= REQ_PREFLUSH; 1631 1632 bch2_bio_map(bio, w->data, sectors << 9); 1633 1634 trace_and_count(c, journal_write, bio); 1635 closure_bio_submit(bio, cl); 1636 1637 ca->journal.bucket_seq[ca->journal.cur_idx] = 1638 le64_to_cpu(w->data->seq); 1639 } 1640 1641 continue_at(cl, journal_write_done, c->io_complete_wq); 1642 } 1643 1644 static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset) 1645 { 1646 struct jset_entry *i, *next, *prev = NULL; 1647 1648 /* 1649 * Simple compaction, dropping empty jset_entries (from journal 1650 * reservations that weren't fully used) and merging jset_entries that 1651 * can be. 1652 * 1653 * If we wanted to be really fancy here, we could sort all the keys in 1654 * the jset and drop keys that were overwritten - probably not worth it: 1655 */ 1656 vstruct_for_each_safe(jset, i, next) { 1657 unsigned u64s = le16_to_cpu(i->u64s); 1658 1659 /* Empty entry: */ 1660 if (!u64s) 1661 continue; 1662 1663 if (i->type == BCH_JSET_ENTRY_btree_root) 1664 bch2_journal_entry_to_btree_root(c, i); 1665 1666 /* Can we merge with previous entry? */ 1667 if (prev && 1668 i->btree_id == prev->btree_id && 1669 i->level == prev->level && 1670 i->type == prev->type && 1671 i->type == BCH_JSET_ENTRY_btree_keys && 1672 le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { 1673 memmove_u64s_down(vstruct_next(prev), 1674 i->_data, 1675 u64s); 1676 le16_add_cpu(&prev->u64s, u64s); 1677 continue; 1678 } 1679 1680 /* Couldn't merge, move i into new position (after prev): */ 1681 prev = prev ? vstruct_next(prev) : jset->start; 1682 if (i != prev) 1683 memmove_u64s_down(prev, i, jset_u64s(u64s)); 1684 } 1685 1686 prev = prev ? vstruct_next(prev) : jset->start; 1687 jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); 1688 } 1689 1690 void bch2_journal_write(struct closure *cl) 1691 { 1692 struct journal *j = container_of(cl, struct journal, io); 1693 struct bch_fs *c = container_of(j, struct bch_fs, journal); 1694 struct bch_dev *ca; 1695 struct journal_buf *w = journal_last_unwritten_buf(j); 1696 struct bch_replicas_padded replicas; 1697 struct jset_entry *start, *end; 1698 struct jset *jset; 1699 struct bio *bio; 1700 struct printbuf journal_debug_buf = PRINTBUF; 1701 bool validate_before_checksum = false; 1702 unsigned i, sectors, bytes, u64s, nr_rw_members = 0; 1703 int ret; 1704 1705 BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); 1706 1707 journal_buf_realloc(j, w); 1708 jset = w->data; 1709 1710 j->write_start_time = local_clock(); 1711 1712 spin_lock(&j->lock); 1713 1714 /* 1715 * If the journal is in an error state - we did an emergency shutdown - 1716 * we prefer to continue doing journal writes. We just mark them as 1717 * noflush so they'll never be used, but they'll still be visible by the 1718 * list_journal tool - this helps in debugging. 1719 * 1720 * There's a caveat: the first journal write after marking the 1721 * superblock dirty must always be a flush write, because on startup 1722 * from a clean shutdown we didn't necessarily read the journal and the 1723 * new journal write might overwrite whatever was in the journal 1724 * previously - we can't leave the journal without any flush writes in 1725 * it. 1726 * 1727 * So if we're in an error state, and we're still starting up, we don't 1728 * write anything at all. 1729 */ 1730 if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) && 1731 (bch2_journal_error(j) || 1732 w->noflush || 1733 (!w->must_flush && 1734 (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && 1735 test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { 1736 w->noflush = true; 1737 SET_JSET_NO_FLUSH(jset, true); 1738 jset->last_seq = 0; 1739 w->last_seq = 0; 1740 1741 j->nr_noflush_writes++; 1742 } else if (!bch2_journal_error(j)) { 1743 j->last_flush_write = jiffies; 1744 j->nr_flush_writes++; 1745 clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); 1746 } else { 1747 spin_unlock(&j->lock); 1748 goto err; 1749 } 1750 spin_unlock(&j->lock); 1751 1752 /* 1753 * New btree roots are set by journalling them; when the journal entry 1754 * gets written we have to propagate them to c->btree_roots 1755 * 1756 * But, every journal entry we write has to contain all the btree roots 1757 * (at least for now); so after we copy btree roots to c->btree_roots we 1758 * have to get any missing btree roots and add them to this journal 1759 * entry: 1760 */ 1761 1762 bch2_journal_entries_postprocess(c, jset); 1763 1764 start = end = vstruct_last(jset); 1765 1766 end = bch2_btree_roots_to_journal_entries(c, jset->start, end); 1767 1768 bch2_journal_super_entries_add_common(c, &end, 1769 le64_to_cpu(jset->seq)); 1770 u64s = (u64 *) end - (u64 *) start; 1771 BUG_ON(u64s > j->entry_u64s_reserved); 1772 1773 le32_add_cpu(&jset->u64s, u64s); 1774 1775 sectors = vstruct_sectors(jset, c->block_bits); 1776 bytes = vstruct_bytes(jset); 1777 1778 if (sectors > w->sectors) { 1779 bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", 1780 vstruct_bytes(jset), w->sectors << 9, 1781 u64s, w->u64s_reserved, j->entry_u64s_reserved); 1782 goto err; 1783 } 1784 1785 jset->magic = cpu_to_le64(jset_magic(c)); 1786 jset->version = cpu_to_le32(c->sb.version); 1787 1788 SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); 1789 SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); 1790 1791 if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) 1792 j->last_empty_seq = le64_to_cpu(jset->seq); 1793 1794 if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) 1795 validate_before_checksum = true; 1796 1797 if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) 1798 validate_before_checksum = true; 1799 1800 if (validate_before_checksum && 1801 jset_validate(c, NULL, jset, 0, WRITE)) 1802 goto err; 1803 1804 ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), 1805 jset->encrypted_start, 1806 vstruct_end(jset) - (void *) jset->encrypted_start); 1807 if (bch2_fs_fatal_err_on(ret, c, 1808 "error decrypting journal entry: %i", ret)) 1809 goto err; 1810 1811 jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), 1812 journal_nonce(jset), jset); 1813 1814 if (!validate_before_checksum && 1815 jset_validate(c, NULL, jset, 0, WRITE)) 1816 goto err; 1817 1818 memset((void *) jset + bytes, 0, (sectors << 9) - bytes); 1819 1820 retry_alloc: 1821 spin_lock(&j->lock); 1822 ret = journal_write_alloc(j, w); 1823 1824 if (ret && j->can_discard) { 1825 spin_unlock(&j->lock); 1826 bch2_journal_do_discards(j); 1827 goto retry_alloc; 1828 } 1829 1830 if (ret) 1831 __bch2_journal_debug_to_text(&journal_debug_buf, j); 1832 1833 /* 1834 * write is allocated, no longer need to account for it in 1835 * bch2_journal_space_available(): 1836 */ 1837 w->sectors = 0; 1838 1839 /* 1840 * journal entry has been compacted and allocated, recalculate space 1841 * available: 1842 */ 1843 bch2_journal_space_available(j); 1844 spin_unlock(&j->lock); 1845 1846 if (ret) { 1847 bch_err(c, "Unable to allocate journal write:\n%s", 1848 journal_debug_buf.buf); 1849 printbuf_exit(&journal_debug_buf); 1850 goto err; 1851 } 1852 1853 w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); 1854 1855 if (c->opts.nochanges) 1856 goto no_io; 1857 1858 for_each_rw_member(ca, c, i) 1859 nr_rw_members++; 1860 1861 if (nr_rw_members > 1) 1862 w->separate_flush = true; 1863 1864 /* 1865 * Mark journal replicas before we submit the write to guarantee 1866 * recovery will find the journal entries after a crash. 1867 */ 1868 bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, 1869 w->devs_written); 1870 ret = bch2_mark_replicas(c, &replicas.e); 1871 if (ret) 1872 goto err; 1873 1874 if (!JSET_NO_FLUSH(jset) && w->separate_flush) { 1875 for_each_rw_member(ca, c, i) { 1876 percpu_ref_get(&ca->io_ref); 1877 1878 bio = ca->journal.bio; 1879 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); 1880 bio->bi_end_io = journal_write_endio; 1881 bio->bi_private = ca; 1882 closure_bio_submit(bio, cl); 1883 } 1884 } 1885 1886 continue_at(cl, do_journal_write, c->io_complete_wq); 1887 return; 1888 no_io: 1889 continue_at(cl, journal_write_done, c->io_complete_wq); 1890 return; 1891 err: 1892 bch2_fatal_error(c); 1893 continue_at(cl, journal_write_done, c->io_complete_wq); 1894 } 1895