// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
#include "trace.h"
#include "varint.h"

#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>

/* Persistent alloc info: */

static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
	BCH_ALLOC_FIELDS_V1()
#undef x
};

struct bkey_alloc_unpacked {
	u64		journal_seq;
	u8		gen;
	u8		oldest_gen;
	u8		data_type;
	bool		need_discard:1;
	bool		need_inc_gen:1;
#define x(_name, _bits)	u##_bits _name;
	BCH_ALLOC_FIELDS_V2()
#undef x
};

static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
				     const void **p, unsigned field)
{
	unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
	u64 v;

	if (!(a->fields & (1 << field)))
		return 0;

	switch (bytes) {
	case 1:
		v = *((const u8 *) *p);
		break;
	case 2:
		v = le16_to_cpup(*p);
		break;
	case 4:
		v = le32_to_cpup(*p);
		break;
	case 8:
		v = le64_to_cpup(*p);
		break;
	default:
		BUG();
	}

	*p += bytes;
	return v;
}

static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
				 struct bkey_s_c k)
{
	const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
	const void *d = in->data;
	unsigned idx = 0;

	out->gen = in->gen;

#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
	BCH_ALLOC_FIELDS_V1()
#undef x
}

static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
				struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
	const u8 *in = a.v->data;
	const u8 *end = bkey_val_end(a);
	unsigned fieldnr = 0;
	int ret;
	u64 v;

	out->gen	= a.v->gen;
	out->oldest_gen	= a.v->oldest_gen;
	out->data_type	= a.v->data_type;

#define x(_name, _bits)						\
	if (fieldnr < a.v->nr_fields) {				\
		ret = bch2_varint_decode_fast(in, end, &v);	\
		if (ret < 0)					\
			return ret;				\
		in += ret;					\
	} else {						\
		v = 0;						\
	}							\
	out->_name = v;						\
	if (v != out->_name)					\
		return -1;					\
	fieldnr++;

	BCH_ALLOC_FIELDS_V2()
#undef x
	return 0;
}

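/*
 * v3 extends v2 with a journal_seq field and need_discard/need_inc_gen flags;
 * the varint-encoded field list is still the v2 one:
 */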
static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
				struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
	const u8 *in = a.v->data;
	const u8 *end = bkey_val_end(a);
	unsigned fieldnr = 0;
	int ret;
	u64 v;

	out->gen	= a.v->gen;
	out->oldest_gen	= a.v->oldest_gen;
	out->data_type	= a.v->data_type;
	out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
	out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
	out->journal_seq = le64_to_cpu(a.v->journal_seq);

#define x(_name, _bits)						\
	if (fieldnr < a.v->nr_fields) {				\
		ret = bch2_varint_decode_fast(in, end, &v);	\
		if (ret < 0)					\
			return ret;				\
		in += ret;					\
	} else {						\
		v = 0;						\
	}							\
	out->_name = v;						\
	if (v != out->_name)					\
		return -1;					\
	fieldnr++;

	BCH_ALLOC_FIELDS_V2()
#undef x
	return 0;
}

static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
	struct bkey_alloc_unpacked ret = { .gen = 0 };

	switch (k.k->type) {
	case KEY_TYPE_alloc:
		bch2_alloc_unpack_v1(&ret, k);
		break;
	case KEY_TYPE_alloc_v2:
		bch2_alloc_unpack_v2(&ret, k);
		break;
	case KEY_TYPE_alloc_v3:
		bch2_alloc_unpack_v3(&ret, k);
		break;
	}

	return ret;
}

static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
{
	unsigned i, bytes = offsetof(struct bch_alloc, data);

	for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
		if (a->fields & (1 << i))
			bytes += BCH_ALLOC_V1_FIELD_BYTES[i];

	return DIV_ROUND_UP(bytes, sizeof(u64));
}

int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k,
			  enum bkey_invalid_flags flags,
			  struct printbuf *err)
{
	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);

	/* allow for unknown fields */
	if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) {
		prt_printf(err, "incorrect value size (%zu < %u)",
		       bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
		return -BCH_ERR_invalid_bkey;
	}

	return 0;
}

int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
			  enum bkey_invalid_flags flags,
			  struct printbuf *err)
{
	struct bkey_alloc_unpacked u;

	if (bch2_alloc_unpack_v2(&u, k)) {
		prt_printf(err, "unpack error");
		return -BCH_ERR_invalid_bkey;
	}

	return 0;
}

int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k,
			  enum bkey_invalid_flags flags,
			  struct printbuf *err)
{
	struct bkey_alloc_unpacked u;

	if (bch2_alloc_unpack_v3(&u, k)) {
		prt_printf(err, "unpack error");
		return -BCH_ERR_invalid_bkey;
	}

	return 0;
}

int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k,
			  enum bkey_invalid_flags flags, struct printbuf *err)
{
	struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);

	if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) {
		prt_printf(err, "bad val size (%u > %zu)",
		       alloc_v4_u64s(a.v), bkey_val_u64s(k.k));
		return -BCH_ERR_invalid_bkey;
	}

	if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
	    BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) {
		prt_printf(err, "invalid backpointers_start");
		return -BCH_ERR_invalid_bkey;
	}

	if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) {
		prt_printf(err, "invalid data type (got %u should be %u)",
		       a.v->data_type, alloc_data_type(*a.v, a.v->data_type));
		return -BCH_ERR_invalid_bkey;
	}

	switch (a.v->data_type) {
	case BCH_DATA_free:
	case BCH_DATA_need_gc_gens:
	case BCH_DATA_need_discard:
		if (a.v->dirty_sectors ||
		    a.v->cached_sectors ||
		    a.v->stripe) {
			prt_printf(err, "empty data type free but have data");
			return -BCH_ERR_invalid_bkey;
		}
		break;
	case BCH_DATA_sb:
	case BCH_DATA_journal:
	case BCH_DATA_btree:
	case BCH_DATA_user:
	case BCH_DATA_parity:
		if (!a.v->dirty_sectors) {
			prt_printf(err, "data_type %s but dirty_sectors==0",
			       bch2_data_types[a.v->data_type]);
			return -BCH_ERR_invalid_bkey;
		}
		break;
	case BCH_DATA_cached:
		if (!a.v->cached_sectors ||
		    a.v->dirty_sectors ||
		    a.v->stripe) {
			prt_printf(err, "data type inconsistency");
			return -BCH_ERR_invalid_bkey;
		}

		if (!a.v->io_time[READ] &&
		    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) {
			prt_printf(err, "cached bucket with read_time == 0");
			return -BCH_ERR_invalid_bkey;
		}
		break;
	case BCH_DATA_stripe:
		break;
	}

	return 0;
}

static inline u64 swab40(u64 x)
{
	return (((x & 0x00000000ffULL) << 32)|
		((x & 0x000000ff00ULL) << 16)|
		((x & 0x0000ff0000ULL) >>  0)|
		((x & 0x00ff000000ULL) >> 16)|
		((x & 0xff00000000ULL) >> 32));
}

void bch2_alloc_v4_swab(struct bkey_s k)
{
	struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
	struct bch_backpointer *bp, *bps;

	a->journal_seq		= swab64(a->journal_seq);
	a->flags		= swab32(a->flags);
	a->dirty_sectors	= swab32(a->dirty_sectors);
	a->cached_sectors	= swab32(a->cached_sectors);
	a->io_time[0]		= swab64(a->io_time[0]);
	a->io_time[1]		= swab64(a->io_time[1]);
	a->stripe		= swab32(a->stripe);
	a->nr_external_backpointers = swab32(a->nr_external_backpointers);

	bps = alloc_v4_backpointers(a);
	for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
		bp->bucket_offset	= swab40(bp->bucket_offset);
		bp->bucket_len		= swab32(bp->bucket_len);
		bch2_bpos_swab(&bp->pos);
	}
}

void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bch_alloc_v4 _a;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
	unsigned i;

	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_printf(out, "gen %u oldest_gen %u data_type %s",
	       a->gen, a->oldest_gen,
	       a->data_type < BCH_DATA_NR
	       ? bch2_data_types[a->data_type]
	       : "(invalid data type)");
	prt_newline(out);
	prt_printf(out, "journal_seq %llu", a->journal_seq);
	prt_newline(out);
	prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a));
	prt_newline(out);
	prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a));
	prt_newline(out);
	prt_printf(out, "dirty_sectors %u", a->dirty_sectors);
	prt_newline(out);
	prt_printf(out, "cached_sectors %u", a->cached_sectors);
	prt_newline(out);
	prt_printf(out, "stripe %u", a->stripe);
	prt_newline(out);
	prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy);
	prt_newline(out);
	prt_printf(out, "io_time[READ] %llu", a->io_time[READ]);
	prt_newline(out);
	prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]);
	prt_newline(out);
	prt_printf(out, "fragmentation %llu", a->fragmentation_lru);
	prt_newline(out);
	prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
	prt_newline(out);

	if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
		struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
		const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);

		prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
		printbuf_indent_add(out, 2);

		for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
			prt_newline(out);
			bch2_backpointer_to_text(out, &bps[i]);
		}

		printbuf_indent_sub(out, 2);
	}

	printbuf_indent_sub(out, 2);
}

void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
{
	if (k.k->type == KEY_TYPE_alloc_v4) {
		void *src, *dst;

		*out = *bkey_s_c_to_alloc_v4(k).v;

		src = alloc_v4_backpointers(out);
		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
		dst = alloc_v4_backpointers(out);

		if (src < dst)
			memset(src, 0, dst - src);

		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
	} else {
		struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);

		*out = (struct bch_alloc_v4) {
			.journal_seq		= u.journal_seq,
			.flags			= u.need_discard,
			.gen			= u.gen,
			.oldest_gen		= u.oldest_gen,
			.data_type		= u.data_type,
			.stripe_redundancy	= u.stripe_redundancy,
			.dirty_sectors		= u.dirty_sectors,
			.cached_sectors		= u.cached_sectors,
			.io_time[READ]		= u.read_time,
			.io_time[WRITE]		= u.write_time,
			.stripe			= u.stripe,
		};

		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
	}
}

static noinline struct bkey_i_alloc_v4 *
__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bkey_i_alloc_v4 *ret;

	ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
	if (IS_ERR(ret))
		return ret;

	if (k.k->type == KEY_TYPE_alloc_v4) {
		void *src, *dst;

		bkey_reassemble(&ret->k_i, k);

		src = alloc_v4_backpointers(&ret->v);
		SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
		dst = alloc_v4_backpointers(&ret->v);

		if (src < dst)
			memset(src, 0, dst - src);

		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
		set_alloc_v4_u64s(ret);
	} else {
		bkey_alloc_v4_init(&ret->k_i);
		ret->k.p = k.k->p;
		bch2_alloc_to_v4(k, &ret->v);
	}
	return ret;
}

static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v4 a;

	if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
	    ((a = bkey_s_c_to_alloc_v4(k), true) &&
	     BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
		return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);

	return __bch2_alloc_to_v4_mut(trans, k);
}

struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
	return bch2_alloc_to_v4_mut_inlined(trans, k);
}

struct bkey_i_alloc_v4 *
bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter,
			      struct bpos pos)
{
	struct bkey_s_c k;
	struct bkey_i_alloc_v4 *a;
	int ret;

	k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
			       BTREE_ITER_WITH_UPDATES|
			       BTREE_ITER_CACHED|
			       BTREE_ITER_INTENT);
	ret = bkey_err(k);
	if (unlikely(ret))
		return ERR_PTR(ret);

	a = bch2_alloc_to_v4_mut_inlined(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (unlikely(ret))
		goto err;
	return a;
err:
	bch2_trans_iter_exit(trans, iter);
	return ERR_PTR(ret);
}

static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
{
	*offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;

	pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
	return pos;
}

static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
{
	pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
	pos.offset += offset;
	return pos;
}

static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
{
	return k.k->type == KEY_TYPE_bucket_gens
		? bkey_s_c_to_bucket_gens(k).v->gens[offset]
		: 0;
}

int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k,
			     enum bkey_invalid_flags flags,
			     struct printbuf *err)
{
	if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) {
		prt_printf(err, "bad val size (%zu != %zu)",
		       bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
		return -BCH_ERR_invalid_bkey;
	}

	return 0;
}

void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
		if (i)
			prt_char(out, ' ');
		prt_printf(out, "%u", g.v->gens[i]);
	}
}

int bch2_bucket_gens_init(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_alloc_v4 a;
	struct bkey_i_bucket_gens g;
	bool have_bucket_gens_key = false;
	unsigned offset;
	struct bpos pos;
	u8 gen;
	int ret;

	for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
			   BTREE_ITER_PREFETCH, k, ret) {
		/*
		 * Not a fsck error because this is checked/repaired by
		 * bch2_check_alloc_key() which runs later:
		 */
		if (!bch2_dev_bucket_exists(c, k.k->p))
			continue;

		gen = bch2_alloc_to_v4(k, &a)->gen;
		pos = alloc_gens_pos(iter.pos, &offset);

		if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
			ret = commit_do(trans, NULL, NULL,
					BTREE_INSERT_NOFAIL|
					BTREE_INSERT_LAZY_RW,
				bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
			if (ret)
				break;
			have_bucket_gens_key = false;
		}

		if (!have_bucket_gens_key) {
			bkey_bucket_gens_init(&g.k_i);
			g.k.p = pos;
			have_bucket_gens_key = true;
		}

		g.v.gens[offset] = gen;
	}
	bch2_trans_iter_exit(trans, &iter);

	if (have_bucket_gens_key && !ret)
		ret = commit_do(trans, NULL, NULL,
				BTREE_INSERT_NOFAIL|
				BTREE_INSERT_LAZY_RW,
			bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));

	bch2_trans_put(trans);

	if (ret)
		bch_err_fn(c, ret);
	return ret;
}

int bch2_alloc_read(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bch_dev *ca;
	int ret;

	down_read(&c->gc_lock);

	if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
		const struct bch_bucket_gens *g;
		u64 b;

		for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
				   BTREE_ITER_PREFETCH, k, ret) {
			u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
			u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;

			if (k.k->type != KEY_TYPE_bucket_gens)
				continue;

			g = bkey_s_c_to_bucket_gens(k).v;

			/*
			 * Not a fsck error because this is checked/repaired by
			 * bch2_check_alloc_key() which runs later:
			 */
			if (!bch2_dev_exists2(c, k.k->p.inode))
				continue;

			ca = bch_dev_bkey_exists(c, k.k->p.inode);

			for (b = max_t(u64, ca->mi.first_bucket, start);
			     b < min_t(u64, ca->mi.nbuckets, end);
			     b++)
				*bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
		}
		bch2_trans_iter_exit(trans, &iter);
	} else {
		struct bch_alloc_v4 a;

		for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
				   BTREE_ITER_PREFETCH, k, ret) {
			/*
			 * Not a fsck error because this is checked/repaired by
			 * bch2_check_alloc_key() which runs later:
			 */
			if (!bch2_dev_bucket_exists(c, k.k->p))
				continue;

			ca = bch_dev_bkey_exists(c, k.k->p.inode);

			*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
		}
		bch2_trans_iter_exit(trans, &iter);
	}

	bch2_trans_put(trans);
	up_read(&c->gc_lock);

	if (ret)
		bch_err_fn(c, ret);

	return ret;
}

/* Free space/discard btree: */

static int bch2_bucket_do_index(struct btree_trans *trans,
				struct bkey_s_c alloc_k,
				const struct bch_alloc_v4 *a,
				bool set)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
	struct btree_iter iter;
	struct bkey_s_c old;
	struct bkey_i *k;
	enum btree_id btree;
	enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
	enum bch_bkey_type new_type =  set ? KEY_TYPE_set : KEY_TYPE_deleted;
	struct printbuf buf = PRINTBUF;
	int ret;

	if (a->data_type != BCH_DATA_free &&
	    a->data_type != BCH_DATA_need_discard)
		return 0;

	k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
	if (IS_ERR(k))
		return PTR_ERR(k);

	bkey_init(&k->k);
	k->k.type = new_type;

	switch (a->data_type) {
	case BCH_DATA_free:
		btree = BTREE_ID_freespace;
		k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
		bch2_key_resize(&k->k, 1);
		break;
	case BCH_DATA_need_discard:
		btree = BTREE_ID_need_discard;
		k->k.p = alloc_k.k->p;
		break;
	default:
		return 0;
	}

	old = bch2_bkey_get_iter(trans, &iter, btree,
				 bkey_start_pos(&k->k),
				 BTREE_ITER_INTENT);
	ret = bkey_err(old);
	if (ret)
		return ret;

	if (ca->mi.freespace_initialized &&
	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
	    bch2_trans_inconsistent_on(old.k->type != old_type, trans,
			"incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
			" for %s",
			set ? "setting" : "clearing",
			bch2_btree_ids[btree],
			iter.pos.inode,
			iter.pos.offset,
			bch2_bkey_types[old.k->type],
			bch2_bkey_types[old_type],
			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		ret = -EIO;
		goto err;
	}

	ret = bch2_trans_update(trans, &iter, k, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
	return ret;
}

static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
					   struct bpos bucket, u8 gen)
{
	struct btree_iter iter;
	unsigned offset;
	struct bpos pos = alloc_gens_pos(bucket, &offset);
	struct bkey_i_bucket_gens *g;
	struct bkey_s_c k;
	int ret;

	g = bch2_trans_kmalloc(trans, sizeof(*g));
	ret = PTR_ERR_OR_ZERO(g);
	if (ret)
		return ret;

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos,
			       BTREE_ITER_INTENT|
			       BTREE_ITER_WITH_UPDATES);
	ret = bkey_err(k);
	if (ret)
		return ret;

	if (k.k->type != KEY_TYPE_bucket_gens) {
		bkey_bucket_gens_init(&g->k_i);
		g->k.p = iter.pos;
	} else {
		bkey_reassemble(&g->k_i, k);
	}

	g->v.gens[offset] = gen;

	ret = bch2_trans_update(trans, &iter, &g->k_i, 0);
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

int bch2_trans_mark_alloc(struct btree_trans *trans,
			  enum btree_id btree_id, unsigned level,
			  struct bkey_s_c old, struct bkey_i *new,
			  unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bch_alloc_v4 old_a_convert, *new_a;
	const struct bch_alloc_v4 *old_a;
	u64 old_lru, new_lru;
	int ret = 0;

	/*
	 * Deletion only happens in the device removal path, with
	 * BTREE_TRIGGER_NORUN:
	 */
	BUG_ON(new->k.type != KEY_TYPE_alloc_v4);

	old_a = bch2_alloc_to_v4(old, &old_a_convert);
	new_a = &bkey_i_to_alloc_v4(new)->v;

	new_a->data_type = alloc_data_type(*new_a, new_a->data_type);

	if (new_a->dirty_sectors > old_a->dirty_sectors ||
	    new_a->cached_sectors > old_a->cached_sectors) {
		new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
		new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
		SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
		SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
	}

	if (data_type_is_empty(new_a->data_type) &&
	    BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
	    !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
		new_a->gen++;
		SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
	}

	if (old_a->data_type != new_a->data_type ||
	    (new_a->data_type == BCH_DATA_free &&
	     alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
		ret =   bch2_bucket_do_index(trans, old, old_a, false) ?:
			bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true);
		if (ret)
			return ret;
	}

	if (new_a->data_type == BCH_DATA_cached &&
	    !new_a->io_time[READ])
		new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));

	old_lru = alloc_lru_idx_read(*old_a);
	new_lru = alloc_lru_idx_read(*new_a);

	if (old_lru != new_lru) {
		ret = bch2_lru_change(trans, new->k.p.inode,
				      bucket_to_u64(new->k.p),
				      old_lru, new_lru);
		if (ret)
			return ret;
	}

	new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
					bch_dev_bkey_exists(c, new->k.p.inode));

	if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
		ret = bch2_lru_change(trans,
				      BCH_LRU_FRAGMENTATION_START,
				      bucket_to_u64(new->k.p),
				      old_a->fragmentation_lru, new_a->fragmentation_lru);
		if (ret)
			return ret;
	}

	if (old_a->gen != new_a->gen) {
		ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for
 * extents style btrees, but works on non-extents btrees:
 */
static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
{
	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);

	if (bkey_err(k))
		return k;

	if (k.k->type) {
		return k;
	} else {
		struct btree_iter iter2;
		struct bpos next;

		bch2_trans_copy_iter(&iter2, iter);

		if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX))
			end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p));

		end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));

		/*
		 * btree node min/max is a closed interval, upto takes a half
		 * open interval:
		 */
		k = bch2_btree_iter_peek_upto(&iter2, end);
		next = iter2.pos;
		bch2_trans_iter_exit(iter->trans, &iter2);

		BUG_ON(next.offset >= iter->pos.offset + U32_MAX);

		if (bkey_err(k))
			return k;

		bkey_init(hole);
		hole->p = iter->pos;

		bch2_key_resize(hole, next.offset - iter->pos.offset);
		return (struct bkey_s_c) { hole, NULL };
	}
}

static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
{
	struct bch_dev *ca;
	unsigned iter;

	if (bch2_dev_bucket_exists(c, *bucket))
		return true;

	if (bch2_dev_exists2(c, bucket->inode)) {
		ca = bch_dev_bkey_exists(c, bucket->inode);

		if (bucket->offset < ca->mi.first_bucket) {
			bucket->offset = ca->mi.first_bucket;
			return true;
		}

		bucket->inode++;
		bucket->offset = 0;
	}

	rcu_read_lock();
	iter = bucket->inode;
	ca = __bch2_next_dev(c, &iter, NULL);
	if (ca)
		*bucket = POS(ca->dev_idx, ca->mi.first_bucket);
	rcu_read_unlock();

	return ca != NULL;
}

static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole)
{
	struct bch_fs *c = iter->trans->c;
	struct bkey_s_c k;
again:
	k = bch2_get_key_or_hole(iter, POS_MAX, hole);
	if (bkey_err(k))
		return k;

	if (!k.k->type) {
		struct bpos bucket = bkey_start_pos(k.k);

		if (!bch2_dev_bucket_exists(c, bucket)) {
			if (!next_bucket(c, &bucket))
				return bkey_s_c_null;

			bch2_btree_iter_set_pos(iter, bucket);
			goto again;
		}

		if (!bch2_dev_bucket_exists(c, k.k->p)) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);

			bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset);
		}
	}

	return k;
}

static noinline_for_stack
int bch2_check_alloc_key(struct btree_trans *trans,
			 struct bkey_s_c alloc_k,
			 struct btree_iter *alloc_iter,
			 struct btree_iter *discard_iter,
			 struct btree_iter *freespace_iter,
			 struct btree_iter *bucket_gens_iter)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	unsigned discard_key_type, freespace_key_type;
	unsigned gens_offset;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	int ret;

	if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c,
			"alloc key for invalid device:bucket %llu:%llu",
			alloc_k.k->p.inode, alloc_k.k->p.offset))
		return bch2_btree_delete_at(trans, alloc_iter, 0);

	ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode);
	if (!ca->mi.freespace_initialized)
		return 0;

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
	bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
	k = bch2_btree_iter_peek_slot(discard_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != discard_key_type &&
	    (c->opts.reconstruct_alloc ||
	     fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n"
		      " %s",
		      bch2_bkey_types[k.k->type],
		      bch2_bkey_types[discard_key_type],
		      (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type	= discard_key_type;
		update->k.p	= discard_iter->pos;

		ret = bch2_trans_update(trans, discard_iter, update, 0);
		if (ret)
			goto err;
	}

	freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
	bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
	k = bch2_btree_iter_peek_slot(freespace_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (k.k->type != freespace_key_type &&
	    (c->opts.reconstruct_alloc ||
	     fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n"
		      " %s",
		      bch2_bkey_types[k.k->type],
		      bch2_bkey_types[freespace_key_type],
		      (printbuf_reset(&buf),
		       bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type	= freespace_key_type;
		update->k.p	= freespace_iter->pos;
		bch2_key_resize(&update->k, 1);

		ret = bch2_trans_update(trans, freespace_iter, update, 0);
		if (ret)
			goto err;
	}

	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (a->gen != alloc_gen(k, gens_offset) &&
	    (c->opts.reconstruct_alloc ||
	     fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n"
		      " %s",
		      alloc_gen(k, gens_offset), a->gen,
		      (printbuf_reset(&buf),
		       bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
		struct bkey_i_bucket_gens *g =
			bch2_trans_kmalloc(trans, sizeof(*g));

		ret = PTR_ERR_OR_ZERO(g);
		if (ret)
			goto err;

		if (k.k->type == KEY_TYPE_bucket_gens) {
			bkey_reassemble(&g->k_i, k);
		} else {
			bkey_bucket_gens_init(&g->k_i);
			g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
		}

		g->v.gens[gens_offset] = a->gen;

		ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
		if (ret)
			goto err;
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

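/*
 * A hole in the alloc btree means those buckets have no alloc key, i.e. they
 * are free - so the freespace btree should have a set key covering them:
 */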
static noinline_for_stack
int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
				    struct bpos start,
				    struct bpos *end,
				    struct btree_iter *freespace_iter)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	int ret;

	ca = bch_dev_bkey_exists(c, start.inode);
	if (!ca->mi.freespace_initialized)
		return 0;

	bch2_btree_iter_set_pos(freespace_iter, start);

	k = bch2_btree_iter_peek_slot(freespace_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	*end = bkey_min(k.k->p, *end);

	if (k.k->type != KEY_TYPE_set &&
	    (c->opts.reconstruct_alloc ||
	     fsck_err(c, "hole in alloc btree missing in freespace btree\n"
		      " device %llu buckets %llu-%llu",
		      freespace_iter->pos.inode,
		      freespace_iter->pos.offset,
		      end->offset))) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type	= KEY_TYPE_set;
		update->k.p	= freespace_iter->pos;
		bch2_key_resize(&update->k,
				min_t(u64, U32_MAX, end->offset -
				      freespace_iter->pos.offset));

		ret = bch2_trans_update(trans, freespace_iter, update, 0);
		if (ret)
			goto err;
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static noinline_for_stack
int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
				      struct bpos start,
				      struct bpos *end,
				      struct btree_iter *bucket_gens_iter)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	unsigned i, gens_offset, gens_end_offset;
	int ret;

	if (c->sb.version < bcachefs_metadata_version_bucket_gens)
		return 0;

	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));

	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
		     alloc_gens_pos(*end, &gens_end_offset)))
		gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;

	if (k.k->type == KEY_TYPE_bucket_gens) {
		struct bkey_i_bucket_gens g;
		bool need_update = false;

		bkey_reassemble(&g.k_i, k);

		for (i = gens_offset; i < gens_end_offset; i++) {
			if (fsck_err_on(g.v.gens[i], c,
					"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
					bucket_gens_pos_to_alloc(k.k->p, i).inode,
					bucket_gens_pos_to_alloc(k.k->p, i).offset,
					g.v.gens[i])) {
				g.v.gens[i] = 0;
				need_update = true;
			}
		}

		if (need_update) {
			struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));

			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			memcpy(u, &g, sizeof(g));

			ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
			if (ret)
				goto err;
		}
	}

	*end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

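/*
 * need_discard and freespace btree entries must point at buckets whose alloc
 * keys are in the corresponding state (and, for freespace, with matching
 * genbits); delete any entries that don't:
 */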
static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans,
								  struct btree_iter *iter)
{
	struct bch_fs *c = trans->c;
	struct btree_iter alloc_iter;
	struct bkey_s_c alloc_k;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	u64 genbits;
	struct bpos pos;
	enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
		? BCH_DATA_need_discard
		: BCH_DATA_free;
	struct printbuf buf = PRINTBUF;
	int ret;

	pos = iter->pos;
	pos.offset &= ~(~0ULL << 56);
	genbits = iter->pos.offset & (~0ULL << 56);

	alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
	ret = bkey_err(alloc_k);
	if (ret)
		return ret;

	if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
			"entry in %s btree for nonexistant dev:bucket %llu:%llu",
			bch2_btree_ids[iter->btree_id], pos.inode, pos.offset))
		goto delete;

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	if (fsck_err_on(a->data_type != state ||
			(state == BCH_DATA_free &&
			 genbits != alloc_freespace_genbits(*a)), c,
			"%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
			bch2_btree_ids[iter->btree_id],
			iter->pos.inode,
			iter->pos.offset,
			a->data_type == state,
			genbits >> 56, alloc_freespace_genbits(*a) >> 56))
		goto delete;
out:
fsck_err:
	set_btree_iter_dontneed(&alloc_iter);
	bch2_trans_iter_exit(trans, &alloc_iter);
	printbuf_exit(&buf);
	return ret;
delete:
	ret =   bch2_btree_delete_extent_at(trans, iter,
			iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
			BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
	goto out;
}

static int bch2_check_discard_freespace_key(struct btree_trans *trans,
					    struct btree_iter *iter,
					    struct bpos end)
{
	if (!btree_id_is_extents(iter->btree_id)) {
		return __bch2_check_discard_freespace_key(trans, iter);
	} else {
		int ret = 0;

		while (!bkey_eq(iter->pos, end) &&
		       !(ret = btree_trans_too_many_iters(trans) ?:
			       __bch2_check_discard_freespace_key(trans, iter)))
			bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));

		return ret;
	}
}

/*
 * We've already checked that generation numbers in the bucket_gens btree are
 * valid for buckets that exist; this just checks for keys for nonexistent
 * buckets.
 */
static noinline_for_stack
int bch2_check_bucket_gens_key(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_bucket_gens g;
	struct bch_dev *ca;
	u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
	u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
	u64 b;
	bool need_update = false, dev_exists;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
	bkey_reassemble(&g.k_i, k);

	/* if no bch_dev, skip out whether we repair or not */
	dev_exists = bch2_dev_exists2(c, k.k->p.inode);
	if (!dev_exists) {
		if (fsck_err_on(!dev_exists, c,
				"bucket_gens key for invalid device:\n %s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
			ret = bch2_btree_delete_at(trans, iter, 0);
		}
		goto out;
	}

	ca = bch_dev_bkey_exists(c, k.k->p.inode);
	if (fsck_err_on(end <= ca->mi.first_bucket ||
			start >= ca->mi.nbuckets, c,
			"bucket_gens key for invalid buckets:\n %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto out;
	}

	for (b = start; b < ca->mi.first_bucket; b++)
		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
				"bucket_gens key has nonzero gen for invalid bucket")) {
			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
			need_update = true;
		}

	for (b = ca->mi.nbuckets; b < end; b++)
		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
				"bucket_gens key has nonzero gen for invalid bucket")) {
			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
			need_update = true;
		}

	if (need_update) {
		struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));

		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto out;

		memcpy(u, &g, sizeof(g));
		ret = bch2_trans_update(trans, iter, u, 0);
	}
out:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_alloc_info(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
	struct bkey hole;
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
			     BTREE_ITER_PREFETCH);
	bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
			     BTREE_ITER_PREFETCH);
	bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
			     BTREE_ITER_PREFETCH);
	bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
			     BTREE_ITER_PREFETCH);

	while (1) {
		struct bpos next;

		bch2_trans_begin(trans);

		k = bch2_get_key_or_real_bucket_hole(&iter, &hole);
		ret = bkey_err(k);
		if (ret)
			goto bkey_err;

		if (!k.k)
			break;

		if (k.k->type) {
			next = bpos_nosnap_successor(k.k->p);

			ret = bch2_check_alloc_key(trans,
						   k, &iter,
						   &discard_iter,
						   &freespace_iter,
						   &bucket_gens_iter);
			if (ret)
				goto bkey_err;
		} else {
			next = k.k->p;

			ret = bch2_check_alloc_hole_freespace(trans,
						    bkey_start_pos(k.k),
						    &next,
						    &freespace_iter) ?:
				bch2_check_alloc_hole_bucket_gens(trans,
						    bkey_start_pos(k.k),
						    &next,
						    &bucket_gens_iter);
			if (ret)
				goto bkey_err;
		}
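
		/*
		 * Commit the updates for this key/hole before moving on to the
		 * next bucket range:
		 */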
		ret = bch2_trans_commit(trans, NULL, NULL,
					BTREE_INSERT_NOFAIL|
					BTREE_INSERT_LAZY_RW);
		if (ret)
			goto bkey_err;

		bch2_btree_iter_set_pos(&iter, next);
bkey_err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &bucket_gens_iter);
	bch2_trans_iter_exit(trans, &freespace_iter);
	bch2_trans_iter_exit(trans, &discard_iter);
	bch2_trans_iter_exit(trans, &iter);

	if (ret < 0)
		goto err;

	ret = for_each_btree_key2(trans, iter,
			BTREE_ID_need_discard, POS_MIN,
			BTREE_ITER_PREFETCH, k,
		bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
	      for_each_btree_key2(trans, iter,
			BTREE_ID_freespace, POS_MIN,
			BTREE_ITER_PREFETCH, k,
		bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
	      for_each_btree_key_commit(trans, iter,
			BTREE_ID_bucket_gens, POS_MIN,
			BTREE_ITER_PREFETCH, k,
			NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
		bch2_check_bucket_gens_key(trans, &iter, k));
err:
	bch2_trans_put(trans);
	if (ret)
		bch_err_fn(c, ret);
	return ret;
}

static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
				       struct btree_iter *alloc_iter)
{
	struct bch_fs *c = trans->c;
	struct btree_iter lru_iter;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	struct bkey_s_c alloc_k, lru_k;
	struct printbuf buf = PRINTBUF;
	int ret;

	alloc_k = bch2_btree_iter_peek(alloc_iter);
	if (!alloc_k.k)
		return 0;

	ret = bkey_err(alloc_k);
	if (ret)
		return ret;

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	if (a->data_type != BCH_DATA_cached)
		return 0;

	lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
			     lru_pos(alloc_k.k->p.inode,
				     bucket_to_u64(alloc_k.k->p),
				     a->io_time[READ]), 0);
	ret = bkey_err(lru_k);
	if (ret)
		return ret;

	if (fsck_err_on(!a->io_time[READ], c,
			"cached bucket with read_time 0\n"
			" %s",
		(printbuf_reset(&buf),
		 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
	    fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
			"missing lru entry\n"
			" %s",
		(printbuf_reset(&buf),
		 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		u64 read_time = a->io_time[READ] ?:
			atomic64_read(&c->io_clock[READ].now);

		ret = bch2_lru_set(trans,
				   alloc_k.k->p.inode,
				   bucket_to_u64(alloc_k.k->p),
				   read_time);
		if (ret)
			goto err;

		if (a->io_time[READ] != read_time) {
			struct bkey_i_alloc_v4 *a_mut =
				bch2_alloc_to_v4_mut(trans, alloc_k);
			ret = PTR_ERR_OR_ZERO(a_mut);
			if (ret)
				goto err;

			a_mut->v.io_time[READ] = read_time;
			ret = bch2_trans_update(trans, alloc_iter,
						&a_mut->k_i, BTREE_TRIGGER_NORUN);
			if (ret)
				goto err;
		}
	}
err:
fsck_err:
	bch2_trans_iter_exit(trans, &lru_iter);
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	int ret = 0;

	ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
				POS_MIN, BTREE_ITER_PREFETCH, k,
				NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
			bch2_check_alloc_to_lru_ref(trans, &iter)));
	if (ret)
		bch_err_fn(c, ret);
	return ret;
}

static int bch2_discard_one_bucket(struct btree_trans *trans,
				   struct btree_iter *need_discard_iter,
				   struct bpos *discard_pos_done,
				   u64 *seen,
				   u64 *open,
				   u64 *need_journal_commit,
				   u64 *discarded)
{
	struct bch_fs *c = trans->c;
	struct bpos pos = need_discard_iter->pos;
	struct btree_iter iter = { NULL };
	struct bkey_s_c k;
	struct bch_dev *ca;
	struct bkey_i_alloc_v4 *a;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	ca = bch_dev_bkey_exists(c, pos.inode);
	if (!percpu_ref_tryget(&ca->io_ref)) {
		bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
		return 0;
	}

	if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
		(*open)++;
		goto out;
	}

	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
			c->journal.flushed_seq_ondisk,
			pos.inode, pos.offset)) {
		(*need_journal_commit)++;
		goto out;
	}

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
			       need_discard_iter->pos,
			       BTREE_ITER_CACHED);
	ret = bkey_err(k);
	if (ret)
		goto out;

	a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		goto out;

	if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
		a->v.gen++;
		SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
		goto write;
	}

	if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
		if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
			bch2_trans_inconsistent(trans,
				"clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
				"%s",
				a->v.journal_seq,
				c->journal.flushed_seq_ondisk,
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
			ret = -EIO;
		}
		goto out;
	}

	if (a->v.data_type != BCH_DATA_need_discard) {
		if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
			bch2_trans_inconsistent(trans,
				"bucket incorrectly set in need_discard btree\n"
				"%s",
				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
			ret = -EIO;
		}

		goto out;
	}

	if (!bkey_eq(*discard_pos_done, iter.pos) &&
	    ca->mi.discard && !c->opts.nochanges) {
		/*
		 * This works without any other locks because this is the only
		 * thread that removes items from the need_discard tree
		 */
		bch2_trans_unlock(trans);
		blkdev_issue_discard(ca->disk_sb.bdev,
				     k.k->p.offset * ca->mi.bucket_size,
				     ca->mi.bucket_size,
				     GFP_KERNEL);
		*discard_pos_done = iter.pos;

		ret = bch2_trans_relock_notrace(trans);
		if (ret)
			goto out;
	}

	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
	a->v.data_type = alloc_data_type(a->v, a->v.data_type);
write:
	ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_WATERMARK_btree|
				  BTREE_INSERT_NOFAIL);
	if (ret)
		goto out;

	this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]);
	(*discarded)++;
out:
	(*seen)++;
	bch2_trans_iter_exit(trans, &iter);
	percpu_ref_put(&ca->io_ref);
	printbuf_exit(&buf);
	return ret;
}

static void bch2_do_discards_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
	struct btree_iter iter;
	struct bkey_s_c k;
	u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
	struct bpos discard_pos_done = POS_MAX;
	int ret;

	/*
	 * We're doing the commit in bch2_discard_one_bucket instead of using
	 * for_each_btree_key_commit() so that we can increment counters after
	 * successful commit:
	 */
	ret = bch2_trans_run(c,
		for_each_btree_key2(trans, iter,
				BTREE_ID_need_discard, POS_MIN, 0, k,
			bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
						&seen,
						&open,
						&need_journal_commit,
						&discarded)));

	if (need_journal_commit * 2 > seen)
		bch2_journal_flush_async(&c->journal, NULL);

	bch2_write_ref_put(c, BCH_WRITE_REF_discard);

	trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
			      bch2_err_str(ret));
}

void bch2_do_discards(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
	    !queue_work(c->write_ref_wq, &c->discard_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}

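/*
 * Invalidate a cached-data bucket picked off the LRU: bump the gen so existing
 * cached pointers become stale, and reset the bucket to empty:
 */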
static int invalidate_one_bucket(struct btree_trans *trans,
				 struct btree_iter *lru_iter,
				 struct bkey_s_c lru_k,
				 s64 *nr_to_invalidate)
{
	struct bch_fs *c = trans->c;
	struct btree_iter alloc_iter = { NULL };
	struct bkey_i_alloc_v4 *a = NULL;
	struct printbuf buf = PRINTBUF;
	struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
	unsigned cached_sectors;
	int ret = 0;

	if (*nr_to_invalidate <= 0)
		return 1;

	if (!bch2_dev_bucket_exists(c, bucket)) {
		prt_str(&buf, "lru entry points to invalid bucket");
		goto err;
	}

	if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
		return 0;

	a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		goto out;

	/* We expect harmless races here due to the btree write buffer: */
	if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
		goto out;

	BUG_ON(a->v.data_type != BCH_DATA_cached);

	if (!a->v.cached_sectors)
		bch_err(c, "invalidating empty bucket, confused");

	cached_sectors = a->v.cached_sectors;

	SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
	a->v.gen++;
	a->v.data_type		= 0;
	a->v.dirty_sectors	= 0;
	a->v.cached_sectors	= 0;
	a->v.io_time[READ]	= atomic64_read(&c->io_clock[READ].now);
	a->v.io_time[WRITE]	= atomic64_read(&c->io_clock[WRITE].now);

	ret =   bch2_trans_update(trans, &alloc_iter, &a->k_i,
				  BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_WATERMARK_btree|
				  BTREE_INSERT_NOFAIL);
	if (ret)
		goto out;

	trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
	--*nr_to_invalidate;
out:
	bch2_trans_iter_exit(trans, &alloc_iter);
	printbuf_exit(&buf);
	return ret;
err:
	prt_str(&buf, "\n lru key: ");
	bch2_bkey_val_to_text(&buf, c, lru_k);

	prt_str(&buf, "\n lru entry: ");
	bch2_lru_pos_to_text(&buf, lru_iter->pos);

	prt_str(&buf, "\n alloc key: ");
	if (!a)
		bch2_bpos_to_text(&buf, bucket);
	else
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));

	bch_err(c, "%s", buf.buf);
	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
		bch2_inconsistent_error(c);
		ret = -EINVAL;
	}

	goto out;
}

static void bch2_do_invalidates_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
	struct bch_dev *ca;
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	unsigned i;
	int ret = 0;

	ret = bch2_btree_write_buffer_flush(trans);
	if (ret)
		goto err;

	for_each_member_device(ca, c, i) {
		s64 nr_to_invalidate =
			should_invalidate_buckets(ca, bch2_dev_usage_read(ca));

		ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
				lru_pos(ca->dev_idx, 0, 0),
				lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
				BTREE_ITER_INTENT, k,
			invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));

		if (ret < 0) {
			percpu_ref_put(&ca->ref);
			break;
		}
	}
err:
	bch2_trans_put(trans);
	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}

void bch2_do_invalidates(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
	    !queue_work(c->write_ref_wq, &c->invalidate_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}

int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
			    u64 bucket_start, u64 bucket_end)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey hole;
	struct bpos end = POS(ca->dev_idx, bucket_end);
	struct bch_member *m;
	unsigned long last_updated = jiffies;
	int ret;

	BUG_ON(bucket_start > bucket_end);
	BUG_ON(bucket_end > ca->mi.nbuckets);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
		POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
		BTREE_ITER_PREFETCH);
	/*
	 * Scan the alloc btree for every bucket on @ca, and add buckets to the
	 * freespace/need_discard/need_gc_gens btrees as needed:
	 */
	while (1) {
		if (last_updated + HZ * 10 < jiffies) {
			bch_info(ca, "%s: currently at %llu/%llu",
				 __func__, iter.pos.offset, ca->mi.nbuckets);
			last_updated = jiffies;
		}

		bch2_trans_begin(trans);

		if (bkey_ge(iter.pos, end)) {
			ret = 0;
			break;
		}

		k = bch2_get_key_or_hole(&iter, end, &hole);
		ret = bkey_err(k);
		if (ret)
			goto bkey_err;

		if (k.k->type) {
			/*
			 * We process live keys in the alloc btree one at a
			 * time:
			 */
			struct bch_alloc_v4 a_convert;
			const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);

			ret =   bch2_bucket_do_index(trans, k, a, true) ?:
				bch2_trans_commit(trans, NULL, NULL,
						  BTREE_INSERT_LAZY_RW|
						  BTREE_INSERT_NOFAIL);
			if (ret)
				goto bkey_err;

			bch2_btree_iter_advance(&iter);
		} else {
			struct bkey_i *freespace;

			freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
			ret = PTR_ERR_OR_ZERO(freespace);
			if (ret)
				goto bkey_err;

			bkey_init(&freespace->k);
			freespace->k.type	= KEY_TYPE_set;
			freespace->k.p		= k.k->p;
			freespace->k.size	= k.k->size;

			ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
				bch2_trans_commit(trans, NULL, NULL,
						  BTREE_INSERT_LAZY_RW|
						  BTREE_INSERT_NOFAIL);
			if (ret)
				goto bkey_err;

			bch2_btree_iter_set_pos(&iter, k.k->p);
		}
bkey_err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);

	if (ret < 0) {
		bch_err_msg(ca, ret, "initializing free space");
		return ret;
	}

	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
	SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
	mutex_unlock(&c->sb_lock);

	return 0;
}

int bch2_fs_freespace_init(struct bch_fs *c)
{
	struct bch_dev *ca;
	unsigned i;
	int ret = 0;
	bool doing_init = false;

	/*
	 * We can crash during the device add path, so we need to check this on
	 * every mount:
	 */

	for_each_member_device(ca, c, i) {
		if (ca->mi.freespace_initialized)
			continue;

		if (!doing_init) {
			bch_info(c, "initializing freespace");
			doing_init = true;
		}

		ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
		if (ret) {
			percpu_ref_put(&ca->ref);
			bch_err_fn(c, ret);
			return ret;
		}
	}

	if (doing_init) {
		mutex_lock(&c->sb_lock);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
		bch_verbose(c, "done initializing freespace");
	}

	return 0;
}

/* Bucket IO clocks: */

int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
			      size_t bucket_nr, int rw)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_alloc_v4 *a;
	u64 now;
	int ret = 0;

	a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr));
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ret;

	now = atomic64_read(&c->io_clock[rw].now);
	if (a->v.io_time[rw] == now)
		goto out;

	a->v.io_time[rw] = now;

	ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Startup/shutdown (ro/rw): */

void bch2_recalc_capacity(struct bch_fs *c)
{
	struct bch_dev *ca;
	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
	unsigned bucket_size_max = 0;
	unsigned long ra_pages = 0;
	unsigned i;

	lockdep_assert_held(&c->state_lock);

	for_each_online_member(ca, c, i) {
		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;

		ra_pages += bdi->ra_pages;
	}

	bch2_set_ra_pages(c, ra_pages);

	for_each_rw_member(ca, c, i) {
		u64 dev_reserve = 0;

		/*
		 * We need to reserve buckets (from the number
		 * of currently available buckets) against
		 * foreground writes so that mainly copygc can
		 * make forward progress.
		 *
		 * We need enough to refill the various reserves
		 * from scratch - copygc will use its entire
		 * reserve all at once, then run against when
		 * its reserve is refilled (from the formerly
		 * available buckets).
		 *
		 * This reserve is just used when considering if
		 * allocations for foreground writes must wait -
		 * not -ENOSPC calculations.
		 */

		dev_reserve += ca->nr_btree_reserve * 2;
		dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */

		dev_reserve += 1;	/* btree write point */
		dev_reserve += 1;	/* copygc write point */
		dev_reserve += 1;	/* rebalance write point */

		dev_reserve *= ca->mi.bucket_size;

		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
					     ca->mi.first_bucket);

		reserved_sectors += dev_reserve * 2;

		bucket_size_max = max_t(unsigned, bucket_size_max,
					ca->mi.bucket_size);
	}

	gc_reserve = c->opts.gc_reserve_bytes
		? c->opts.gc_reserve_bytes >> 9
		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);

	reserved_sectors = max(gc_reserve, reserved_sectors);

	reserved_sectors = min(reserved_sectors, capacity);

	c->capacity = capacity - reserved_sectors;

	c->bucket_size_max = bucket_size_max;

	/* Wake up case someone was waiting for buckets */
	closure_wake_up(&c->freelist_wait);
}

static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
	struct open_bucket *ob;
	bool ret = false;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && !ob->on_partial_list &&
		    ob->dev == ca->dev_idx)
			ret = true;
		spin_unlock(&ob->lock);
	}

	return ret;
}

/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	/* First, remove device from allocation groups: */

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		clear_bit(ca->dev_idx, c->rw_devs[i].d);

	/*
	 * Capacity is calculated based off of devices in allocation groups:
	 */
	bch2_recalc_capacity(c);

	bch2_open_buckets_stop(c, ca, false);

	/*
	 * Wake up threads that were blocked on allocation, so they can notice
	 * the device can no longer be removed and the capacity has changed:
	 */
	closure_wake_up(&c->freelist_wait);

	/*
	 * journal_res_get() can block waiting for free space in the journal -
	 * it needs to notice there may not be devices to allocate from anymore:
	 */
	wake_up(&c->journal.wait);

	/* Now wait for any in flight writes: */

	closure_wait_event(&c->open_buckets_wait,
			   !bch2_dev_has_open_write_point(c, ca));
}

/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		if (ca->mi.data_allowed & (1 << i))
			set_bit(ca->dev_idx, c->rw_devs[i].d);
}

void bch2_fs_allocator_background_init(struct bch_fs *c)
{
	spin_lock_init(&c->freelist_lock);
	INIT_WORK(&c->discard_work, bch2_do_discards_work);
	INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}