1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "backpointers.h" 6 #include "btree_cache.h" 7 #include "btree_io.h" 8 #include "btree_key_cache.h" 9 #include "btree_update.h" 10 #include "btree_update_interior.h" 11 #include "btree_gc.h" 12 #include "btree_write_buffer.h" 13 #include "buckets.h" 14 #include "buckets_waiting_for_journal.h" 15 #include "clock.h" 16 #include "debug.h" 17 #include "ec.h" 18 #include "error.h" 19 #include "lru.h" 20 #include "recovery.h" 21 #include "trace.h" 22 #include "varint.h" 23 24 #include <linux/kthread.h> 25 #include <linux/math64.h> 26 #include <linux/random.h> 27 #include <linux/rculist.h> 28 #include <linux/rcupdate.h> 29 #include <linux/sched/task.h> 30 #include <linux/sort.h> 31 32 static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket); 33 34 /* Persistent alloc info: */ 35 36 static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { 37 #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, 38 BCH_ALLOC_FIELDS_V1() 39 #undef x 40 }; 41 42 struct bkey_alloc_unpacked { 43 u64 journal_seq; 44 u8 gen; 45 u8 oldest_gen; 46 u8 data_type; 47 bool need_discard:1; 48 bool need_inc_gen:1; 49 #define x(_name, _bits) u##_bits _name; 50 BCH_ALLOC_FIELDS_V2() 51 #undef x 52 }; 53 54 static inline u64 alloc_field_v1_get(const struct bch_alloc *a, 55 const void **p, unsigned field) 56 { 57 unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; 58 u64 v; 59 60 if (!(a->fields & (1 << field))) 61 return 0; 62 63 switch (bytes) { 64 case 1: 65 v = *((const u8 *) *p); 66 break; 67 case 2: 68 v = le16_to_cpup(*p); 69 break; 70 case 4: 71 v = le32_to_cpup(*p); 72 break; 73 case 8: 74 v = le64_to_cpup(*p); 75 break; 76 default: 77 BUG(); 78 } 79 80 *p += bytes; 81 return v; 82 } 83 84 static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, 85 struct bkey_s_c k) 86 { 87 const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; 88 const void *d = in->data; 89 unsigned idx = 0; 90 91 out->gen = in->gen; 92 93 #define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); 94 BCH_ALLOC_FIELDS_V1() 95 #undef x 96 } 97 98 static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, 99 struct bkey_s_c k) 100 { 101 struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); 102 const u8 *in = a.v->data; 103 const u8 *end = bkey_val_end(a); 104 unsigned fieldnr = 0; 105 int ret; 106 u64 v; 107 108 out->gen = a.v->gen; 109 out->oldest_gen = a.v->oldest_gen; 110 out->data_type = a.v->data_type; 111 112 #define x(_name, _bits) \ 113 if (fieldnr < a.v->nr_fields) { \ 114 ret = bch2_varint_decode_fast(in, end, &v); \ 115 if (ret < 0) \ 116 return ret; \ 117 in += ret; \ 118 } else { \ 119 v = 0; \ 120 } \ 121 out->_name = v; \ 122 if (v != out->_name) \ 123 return -1; \ 124 fieldnr++; 125 126 BCH_ALLOC_FIELDS_V2() 127 #undef x 128 return 0; 129 } 130 131 static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, 132 struct bkey_s_c k) 133 { 134 struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); 135 const u8 *in = a.v->data; 136 const u8 *end = bkey_val_end(a); 137 unsigned fieldnr = 0; 138 int ret; 139 u64 v; 140 141 out->gen = a.v->gen; 142 out->oldest_gen = a.v->oldest_gen; 143 out->data_type = a.v->data_type; 144 out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); 145 out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); 146 out->journal_seq = le64_to_cpu(a.v->journal_seq); 147 148 #define x(_name, _bits) \ 149 if (fieldnr < a.v->nr_fields) { \ 150 ret = bch2_varint_decode_fast(in, end, &v); \ 151 if (ret < 0) \ 152 return ret; \ 153 in += ret; \ 154 } else { \ 155 v = 0; \ 156 } \ 157 out->_name = v; \ 158 if (v != out->_name) \ 159 return -1; \ 160 fieldnr++; 161 162 BCH_ALLOC_FIELDS_V2() 163 #undef x 164 return 0; 165 } 166 167 static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) 168 { 169 struct bkey_alloc_unpacked ret = { .gen = 0 }; 170 171 switch (k.k->type) { 172 case KEY_TYPE_alloc: 173 bch2_alloc_unpack_v1(&ret, k); 174 break; 175 case KEY_TYPE_alloc_v2: 176 bch2_alloc_unpack_v2(&ret, k); 177 break; 178 case KEY_TYPE_alloc_v3: 179 bch2_alloc_unpack_v3(&ret, k); 180 break; 181 } 182 183 return ret; 184 } 185 186 static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) 187 { 188 unsigned i, bytes = offsetof(struct bch_alloc, data); 189 190 for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) 191 if (a->fields & (1 << i)) 192 bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; 193 194 return DIV_ROUND_UP(bytes, sizeof(u64)); 195 } 196 197 int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k, 198 enum bkey_invalid_flags flags, 199 struct printbuf *err) 200 { 201 struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); 202 int ret = 0; 203 204 /* allow for unknown fields */ 205 bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err, 206 alloc_v1_val_size_bad, 207 "incorrect value size (%zu < %u)", 208 bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); 209 fsck_err: 210 return ret; 211 } 212 213 int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k, 214 enum bkey_invalid_flags flags, 215 struct printbuf *err) 216 { 217 struct bkey_alloc_unpacked u; 218 int ret = 0; 219 220 bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err, 221 alloc_v2_unpack_error, 222 "unpack error"); 223 fsck_err: 224 return ret; 225 } 226 227 int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k, 228 enum bkey_invalid_flags flags, 229 struct printbuf *err) 230 { 231 struct bkey_alloc_unpacked u; 232 int ret = 0; 233 234 bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err, 235 alloc_v2_unpack_error, 236 "unpack error"); 237 fsck_err: 238 return ret; 239 } 240 241 int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k, 242 enum bkey_invalid_flags flags, struct printbuf *err) 243 { 244 struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); 245 int ret = 0; 246 247 bkey_fsck_err_on(alloc_v4_u64s(a.v) > bkey_val_u64s(k.k), c, err, 248 alloc_v4_val_size_bad, 249 "bad val size (%u > %zu)", 250 alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); 251 252 bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && 253 BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err, 254 alloc_v4_backpointers_start_bad, 255 "invalid backpointers_start"); 256 257 bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err, 258 alloc_key_data_type_bad, 259 "invalid data type (got %u should be %u)", 260 a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); 261 262 switch (a.v->data_type) { 263 case BCH_DATA_free: 264 case BCH_DATA_need_gc_gens: 265 case BCH_DATA_need_discard: 266 bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe, 267 c, err, alloc_key_empty_but_have_data, 268 "empty data type free but have data"); 269 break; 270 case BCH_DATA_sb: 271 case BCH_DATA_journal: 272 case BCH_DATA_btree: 273 case BCH_DATA_user: 274 case BCH_DATA_parity: 275 bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v), 276 c, err, alloc_key_dirty_sectors_0, 277 "data_type %s but dirty_sectors==0", 278 bch2_data_type_str(a.v->data_type)); 279 break; 280 case BCH_DATA_cached: 281 bkey_fsck_err_on(!a.v->cached_sectors || 282 bch2_bucket_sectors_dirty(*a.v) || 283 a.v->stripe, 284 c, err, alloc_key_cached_inconsistency, 285 "data type inconsistency"); 286 287 bkey_fsck_err_on(!a.v->io_time[READ] && 288 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, 289 c, err, alloc_key_cached_but_read_time_zero, 290 "cached bucket with read_time == 0"); 291 break; 292 case BCH_DATA_stripe: 293 break; 294 } 295 fsck_err: 296 return ret; 297 } 298 299 void bch2_alloc_v4_swab(struct bkey_s k) 300 { 301 struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; 302 struct bch_backpointer *bp, *bps; 303 304 a->journal_seq = swab64(a->journal_seq); 305 a->flags = swab32(a->flags); 306 a->dirty_sectors = swab32(a->dirty_sectors); 307 a->cached_sectors = swab32(a->cached_sectors); 308 a->io_time[0] = swab64(a->io_time[0]); 309 a->io_time[1] = swab64(a->io_time[1]); 310 a->stripe = swab32(a->stripe); 311 a->nr_external_backpointers = swab32(a->nr_external_backpointers); 312 a->fragmentation_lru = swab64(a->fragmentation_lru); 313 314 bps = alloc_v4_backpointers(a); 315 for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { 316 bp->bucket_offset = swab40(bp->bucket_offset); 317 bp->bucket_len = swab32(bp->bucket_len); 318 bch2_bpos_swab(&bp->pos); 319 } 320 } 321 322 void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) 323 { 324 struct bch_alloc_v4 _a; 325 const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); 326 327 prt_newline(out); 328 printbuf_indent_add(out, 2); 329 330 prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); 331 bch2_prt_data_type(out, a->data_type); 332 prt_newline(out); 333 prt_printf(out, "journal_seq %llu", a->journal_seq); 334 prt_newline(out); 335 prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a)); 336 prt_newline(out); 337 prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a)); 338 prt_newline(out); 339 prt_printf(out, "dirty_sectors %u", a->dirty_sectors); 340 prt_newline(out); 341 prt_printf(out, "cached_sectors %u", a->cached_sectors); 342 prt_newline(out); 343 prt_printf(out, "stripe %u", a->stripe); 344 prt_newline(out); 345 prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy); 346 prt_newline(out); 347 prt_printf(out, "io_time[READ] %llu", a->io_time[READ]); 348 prt_newline(out); 349 prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); 350 prt_newline(out); 351 prt_printf(out, "fragmentation %llu", a->fragmentation_lru); 352 prt_newline(out); 353 prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); 354 printbuf_indent_sub(out, 2); 355 } 356 357 void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) 358 { 359 if (k.k->type == KEY_TYPE_alloc_v4) { 360 void *src, *dst; 361 362 *out = *bkey_s_c_to_alloc_v4(k).v; 363 364 src = alloc_v4_backpointers(out); 365 SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); 366 dst = alloc_v4_backpointers(out); 367 368 if (src < dst) 369 memset(src, 0, dst - src); 370 371 SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0); 372 } else { 373 struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); 374 375 *out = (struct bch_alloc_v4) { 376 .journal_seq = u.journal_seq, 377 .flags = u.need_discard, 378 .gen = u.gen, 379 .oldest_gen = u.oldest_gen, 380 .data_type = u.data_type, 381 .stripe_redundancy = u.stripe_redundancy, 382 .dirty_sectors = u.dirty_sectors, 383 .cached_sectors = u.cached_sectors, 384 .io_time[READ] = u.read_time, 385 .io_time[WRITE] = u.write_time, 386 .stripe = u.stripe, 387 }; 388 389 SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); 390 } 391 } 392 393 static noinline struct bkey_i_alloc_v4 * 394 __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) 395 { 396 struct bkey_i_alloc_v4 *ret; 397 398 ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4))); 399 if (IS_ERR(ret)) 400 return ret; 401 402 if (k.k->type == KEY_TYPE_alloc_v4) { 403 void *src, *dst; 404 405 bkey_reassemble(&ret->k_i, k); 406 407 src = alloc_v4_backpointers(&ret->v); 408 SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); 409 dst = alloc_v4_backpointers(&ret->v); 410 411 if (src < dst) 412 memset(src, 0, dst - src); 413 414 SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0); 415 set_alloc_v4_u64s(ret); 416 } else { 417 bkey_alloc_v4_init(&ret->k_i); 418 ret->k.p = k.k->p; 419 bch2_alloc_to_v4(k, &ret->v); 420 } 421 return ret; 422 } 423 424 static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) 425 { 426 struct bkey_s_c_alloc_v4 a; 427 428 if (likely(k.k->type == KEY_TYPE_alloc_v4) && 429 ((a = bkey_s_c_to_alloc_v4(k), true) && 430 BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) 431 return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4); 432 433 return __bch2_alloc_to_v4_mut(trans, k); 434 } 435 436 struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) 437 { 438 return bch2_alloc_to_v4_mut_inlined(trans, k); 439 } 440 441 struct bkey_i_alloc_v4 * 442 bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, 443 struct bpos pos) 444 { 445 struct bkey_s_c k; 446 struct bkey_i_alloc_v4 *a; 447 int ret; 448 449 k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, 450 BTREE_ITER_WITH_UPDATES| 451 BTREE_ITER_CACHED| 452 BTREE_ITER_INTENT); 453 ret = bkey_err(k); 454 if (unlikely(ret)) 455 return ERR_PTR(ret); 456 457 a = bch2_alloc_to_v4_mut_inlined(trans, k); 458 ret = PTR_ERR_OR_ZERO(a); 459 if (unlikely(ret)) 460 goto err; 461 return a; 462 err: 463 bch2_trans_iter_exit(trans, iter); 464 return ERR_PTR(ret); 465 } 466 467 static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) 468 { 469 *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; 470 471 pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS; 472 return pos; 473 } 474 475 static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset) 476 { 477 pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS; 478 pos.offset += offset; 479 return pos; 480 } 481 482 static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) 483 { 484 return k.k->type == KEY_TYPE_bucket_gens 485 ? bkey_s_c_to_bucket_gens(k).v->gens[offset] 486 : 0; 487 } 488 489 int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k, 490 enum bkey_invalid_flags flags, 491 struct printbuf *err) 492 { 493 int ret = 0; 494 495 bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err, 496 bucket_gens_val_size_bad, 497 "bad val size (%zu != %zu)", 498 bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); 499 fsck_err: 500 return ret; 501 } 502 503 void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) 504 { 505 struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k); 506 unsigned i; 507 508 for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) { 509 if (i) 510 prt_char(out, ' '); 511 prt_printf(out, "%u", g.v->gens[i]); 512 } 513 } 514 515 int bch2_bucket_gens_init(struct bch_fs *c) 516 { 517 struct btree_trans *trans = bch2_trans_get(c); 518 struct bkey_i_bucket_gens g; 519 bool have_bucket_gens_key = false; 520 int ret; 521 522 ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, 523 BTREE_ITER_PREFETCH, k, ({ 524 /* 525 * Not a fsck error because this is checked/repaired by 526 * bch2_check_alloc_key() which runs later: 527 */ 528 if (!bch2_dev_bucket_exists(c, k.k->p)) 529 continue; 530 531 struct bch_alloc_v4 a; 532 u8 gen = bch2_alloc_to_v4(k, &a)->gen; 533 unsigned offset; 534 struct bpos pos = alloc_gens_pos(iter.pos, &offset); 535 536 if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { 537 ret = commit_do(trans, NULL, NULL, 538 BCH_TRANS_COMMIT_no_enospc, 539 bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); 540 if (ret) 541 break; 542 have_bucket_gens_key = false; 543 } 544 545 if (!have_bucket_gens_key) { 546 bkey_bucket_gens_init(&g.k_i); 547 g.k.p = pos; 548 have_bucket_gens_key = true; 549 } 550 551 g.v.gens[offset] = gen; 552 0; 553 })); 554 555 if (have_bucket_gens_key && !ret) 556 ret = commit_do(trans, NULL, NULL, 557 BCH_TRANS_COMMIT_no_enospc, 558 bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); 559 560 bch2_trans_put(trans); 561 562 bch_err_fn(c, ret); 563 return ret; 564 } 565 566 int bch2_alloc_read(struct bch_fs *c) 567 { 568 struct btree_trans *trans = bch2_trans_get(c); 569 int ret; 570 571 down_read(&c->gc_lock); 572 573 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { 574 ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, 575 BTREE_ITER_PREFETCH, k, ({ 576 u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; 577 u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; 578 579 if (k.k->type != KEY_TYPE_bucket_gens) 580 continue; 581 582 const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v; 583 584 /* 585 * Not a fsck error because this is checked/repaired by 586 * bch2_check_alloc_key() which runs later: 587 */ 588 if (!bch2_dev_exists2(c, k.k->p.inode)) 589 continue; 590 591 struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); 592 593 for (u64 b = max_t(u64, ca->mi.first_bucket, start); 594 b < min_t(u64, ca->mi.nbuckets, end); 595 b++) 596 *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; 597 0; 598 })); 599 } else { 600 ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, 601 BTREE_ITER_PREFETCH, k, ({ 602 /* 603 * Not a fsck error because this is checked/repaired by 604 * bch2_check_alloc_key() which runs later: 605 */ 606 if (!bch2_dev_bucket_exists(c, k.k->p)) 607 continue; 608 609 struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode); 610 611 struct bch_alloc_v4 a; 612 *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; 613 0; 614 })); 615 } 616 617 bch2_trans_put(trans); 618 up_read(&c->gc_lock); 619 620 bch_err_fn(c, ret); 621 return ret; 622 } 623 624 /* Free space/discard btree: */ 625 626 static int bch2_bucket_do_index(struct btree_trans *trans, 627 struct bkey_s_c alloc_k, 628 const struct bch_alloc_v4 *a, 629 bool set) 630 { 631 struct bch_fs *c = trans->c; 632 struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); 633 struct btree_iter iter; 634 struct bkey_s_c old; 635 struct bkey_i *k; 636 enum btree_id btree; 637 enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; 638 enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted; 639 struct printbuf buf = PRINTBUF; 640 int ret; 641 642 if (a->data_type != BCH_DATA_free && 643 a->data_type != BCH_DATA_need_discard) 644 return 0; 645 646 k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); 647 if (IS_ERR(k)) 648 return PTR_ERR(k); 649 650 bkey_init(&k->k); 651 k->k.type = new_type; 652 653 switch (a->data_type) { 654 case BCH_DATA_free: 655 btree = BTREE_ID_freespace; 656 k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); 657 bch2_key_resize(&k->k, 1); 658 break; 659 case BCH_DATA_need_discard: 660 btree = BTREE_ID_need_discard; 661 k->k.p = alloc_k.k->p; 662 break; 663 default: 664 return 0; 665 } 666 667 old = bch2_bkey_get_iter(trans, &iter, btree, 668 bkey_start_pos(&k->k), 669 BTREE_ITER_INTENT); 670 ret = bkey_err(old); 671 if (ret) 672 return ret; 673 674 if (ca->mi.freespace_initialized && 675 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && 676 bch2_trans_inconsistent_on(old.k->type != old_type, trans, 677 "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" 678 " for %s", 679 set ? "setting" : "clearing", 680 bch2_btree_id_str(btree), 681 iter.pos.inode, 682 iter.pos.offset, 683 bch2_bkey_types[old.k->type], 684 bch2_bkey_types[old_type], 685 (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 686 ret = -EIO; 687 goto err; 688 } 689 690 ret = bch2_trans_update(trans, &iter, k, 0); 691 err: 692 bch2_trans_iter_exit(trans, &iter); 693 printbuf_exit(&buf); 694 return ret; 695 } 696 697 static noinline int bch2_bucket_gen_update(struct btree_trans *trans, 698 struct bpos bucket, u8 gen) 699 { 700 struct btree_iter iter; 701 unsigned offset; 702 struct bpos pos = alloc_gens_pos(bucket, &offset); 703 struct bkey_i_bucket_gens *g; 704 struct bkey_s_c k; 705 int ret; 706 707 g = bch2_trans_kmalloc(trans, sizeof(*g)); 708 ret = PTR_ERR_OR_ZERO(g); 709 if (ret) 710 return ret; 711 712 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, 713 BTREE_ITER_INTENT| 714 BTREE_ITER_WITH_UPDATES); 715 ret = bkey_err(k); 716 if (ret) 717 return ret; 718 719 if (k.k->type != KEY_TYPE_bucket_gens) { 720 bkey_bucket_gens_init(&g->k_i); 721 g->k.p = iter.pos; 722 } else { 723 bkey_reassemble(&g->k_i, k); 724 } 725 726 g->v.gens[offset] = gen; 727 728 ret = bch2_trans_update(trans, &iter, &g->k_i, 0); 729 bch2_trans_iter_exit(trans, &iter); 730 return ret; 731 } 732 733 int bch2_trigger_alloc(struct btree_trans *trans, 734 enum btree_id btree, unsigned level, 735 struct bkey_s_c old, struct bkey_s new, 736 unsigned flags) 737 { 738 struct bch_fs *c = trans->c; 739 int ret = 0; 740 741 if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, 742 "alloc key for invalid device or bucket")) 743 return -EIO; 744 745 struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode); 746 747 struct bch_alloc_v4 old_a_convert; 748 const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); 749 750 if (flags & BTREE_TRIGGER_TRANSACTIONAL) { 751 struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; 752 753 new_a->data_type = alloc_data_type(*new_a, new_a->data_type); 754 755 if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) { 756 new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); 757 new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); 758 SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); 759 SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); 760 } 761 762 if (data_type_is_empty(new_a->data_type) && 763 BCH_ALLOC_V4_NEED_INC_GEN(new_a) && 764 !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { 765 new_a->gen++; 766 SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); 767 } 768 769 if (old_a->data_type != new_a->data_type || 770 (new_a->data_type == BCH_DATA_free && 771 alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { 772 ret = bch2_bucket_do_index(trans, old, old_a, false) ?: 773 bch2_bucket_do_index(trans, new.s_c, new_a, true); 774 if (ret) 775 return ret; 776 } 777 778 if (new_a->data_type == BCH_DATA_cached && 779 !new_a->io_time[READ]) 780 new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); 781 782 u64 old_lru = alloc_lru_idx_read(*old_a); 783 u64 new_lru = alloc_lru_idx_read(*new_a); 784 if (old_lru != new_lru) { 785 ret = bch2_lru_change(trans, new.k->p.inode, 786 bucket_to_u64(new.k->p), 787 old_lru, new_lru); 788 if (ret) 789 return ret; 790 } 791 792 new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, 793 bch_dev_bkey_exists(c, new.k->p.inode)); 794 if (old_a->fragmentation_lru != new_a->fragmentation_lru) { 795 ret = bch2_lru_change(trans, 796 BCH_LRU_FRAGMENTATION_START, 797 bucket_to_u64(new.k->p), 798 old_a->fragmentation_lru, new_a->fragmentation_lru); 799 if (ret) 800 return ret; 801 } 802 803 if (old_a->gen != new_a->gen) { 804 ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); 805 if (ret) 806 return ret; 807 } 808 809 /* 810 * need to know if we're getting called from the invalidate path or 811 * not: 812 */ 813 814 if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && 815 old_a->cached_sectors) { 816 ret = bch2_update_cached_sectors_list(trans, new.k->p.inode, 817 -((s64) old_a->cached_sectors)); 818 if (ret) 819 return ret; 820 } 821 } 822 823 if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) { 824 struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; 825 u64 journal_seq = trans->journal_res.seq; 826 u64 bucket_journal_seq = new_a->journal_seq; 827 828 if ((flags & BTREE_TRIGGER_INSERT) && 829 data_type_is_empty(old_a->data_type) != 830 data_type_is_empty(new_a->data_type) && 831 new.k->type == KEY_TYPE_alloc_v4) { 832 struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v; 833 834 /* 835 * If the btree updates referring to a bucket weren't flushed 836 * before the bucket became empty again, then the we don't have 837 * to wait on a journal flush before we can reuse the bucket: 838 */ 839 v->journal_seq = bucket_journal_seq = 840 data_type_is_empty(new_a->data_type) && 841 (journal_seq == v->journal_seq || 842 bch2_journal_noflush_seq(&c->journal, v->journal_seq)) 843 ? 0 : journal_seq; 844 } 845 846 if (!data_type_is_empty(old_a->data_type) && 847 data_type_is_empty(new_a->data_type) && 848 bucket_journal_seq) { 849 ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, 850 c->journal.flushed_seq_ondisk, 851 new.k->p.inode, new.k->p.offset, 852 bucket_journal_seq); 853 if (ret) { 854 bch2_fs_fatal_error(c, 855 "error setting bucket_needs_journal_commit: %i", ret); 856 return ret; 857 } 858 } 859 860 percpu_down_read(&c->mark_lock); 861 if (new_a->gen != old_a->gen) 862 *bucket_gen(ca, new.k->p.offset) = new_a->gen; 863 864 bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); 865 percpu_up_read(&c->mark_lock); 866 867 #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) 868 #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) 869 #define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) 870 871 if (statechange(a->data_type == BCH_DATA_free) && 872 bucket_flushed(new_a)) 873 closure_wake_up(&c->freelist_wait); 874 875 if (statechange(a->data_type == BCH_DATA_need_discard) && 876 !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && 877 bucket_flushed(new_a)) 878 bch2_discard_one_bucket_fast(c, new.k->p); 879 880 if (statechange(a->data_type == BCH_DATA_cached) && 881 !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && 882 should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) 883 bch2_do_invalidates(c); 884 885 if (statechange(a->data_type == BCH_DATA_need_gc_gens)) 886 bch2_do_gc_gens(c); 887 } 888 889 if ((flags & BTREE_TRIGGER_GC) && 890 (flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) { 891 struct bch_alloc_v4 new_a_convert; 892 const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert); 893 894 percpu_down_read(&c->mark_lock); 895 struct bucket *g = gc_bucket(ca, new.k->p.offset); 896 897 bucket_lock(g); 898 899 g->gen_valid = 1; 900 g->gen = new_a->gen; 901 g->data_type = new_a->data_type; 902 g->stripe = new_a->stripe; 903 g->stripe_redundancy = new_a->stripe_redundancy; 904 g->dirty_sectors = new_a->dirty_sectors; 905 g->cached_sectors = new_a->cached_sectors; 906 907 bucket_unlock(g); 908 percpu_up_read(&c->mark_lock); 909 } 910 911 return 0; 912 } 913 914 /* 915 * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for 916 * extents style btrees, but works on non-extents btrees: 917 */ 918 static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) 919 { 920 struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); 921 922 if (bkey_err(k)) 923 return k; 924 925 if (k.k->type) { 926 return k; 927 } else { 928 struct btree_iter iter2; 929 struct bpos next; 930 931 bch2_trans_copy_iter(&iter2, iter); 932 933 struct btree_path *path = btree_iter_path(iter->trans, iter); 934 if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) 935 end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); 936 937 end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); 938 939 /* 940 * btree node min/max is a closed interval, upto takes a half 941 * open interval: 942 */ 943 k = bch2_btree_iter_peek_upto(&iter2, end); 944 next = iter2.pos; 945 bch2_trans_iter_exit(iter->trans, &iter2); 946 947 BUG_ON(next.offset >= iter->pos.offset + U32_MAX); 948 949 if (bkey_err(k)) 950 return k; 951 952 bkey_init(hole); 953 hole->p = iter->pos; 954 955 bch2_key_resize(hole, next.offset - iter->pos.offset); 956 return (struct bkey_s_c) { hole, NULL }; 957 } 958 } 959 960 static bool next_bucket(struct bch_fs *c, struct bpos *bucket) 961 { 962 struct bch_dev *ca; 963 964 if (bch2_dev_bucket_exists(c, *bucket)) 965 return true; 966 967 if (bch2_dev_exists2(c, bucket->inode)) { 968 ca = bch_dev_bkey_exists(c, bucket->inode); 969 970 if (bucket->offset < ca->mi.first_bucket) { 971 bucket->offset = ca->mi.first_bucket; 972 return true; 973 } 974 975 bucket->inode++; 976 bucket->offset = 0; 977 } 978 979 rcu_read_lock(); 980 ca = __bch2_next_dev_idx(c, bucket->inode, NULL); 981 if (ca) 982 *bucket = POS(ca->dev_idx, ca->mi.first_bucket); 983 rcu_read_unlock(); 984 985 return ca != NULL; 986 } 987 988 static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) 989 { 990 struct bch_fs *c = iter->trans->c; 991 struct bkey_s_c k; 992 again: 993 k = bch2_get_key_or_hole(iter, POS_MAX, hole); 994 if (bkey_err(k)) 995 return k; 996 997 if (!k.k->type) { 998 struct bpos bucket = bkey_start_pos(k.k); 999 1000 if (!bch2_dev_bucket_exists(c, bucket)) { 1001 if (!next_bucket(c, &bucket)) 1002 return bkey_s_c_null; 1003 1004 bch2_btree_iter_set_pos(iter, bucket); 1005 goto again; 1006 } 1007 1008 if (!bch2_dev_bucket_exists(c, k.k->p)) { 1009 struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); 1010 1011 bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset); 1012 } 1013 } 1014 1015 return k; 1016 } 1017 1018 static noinline_for_stack 1019 int bch2_check_alloc_key(struct btree_trans *trans, 1020 struct bkey_s_c alloc_k, 1021 struct btree_iter *alloc_iter, 1022 struct btree_iter *discard_iter, 1023 struct btree_iter *freespace_iter, 1024 struct btree_iter *bucket_gens_iter) 1025 { 1026 struct bch_fs *c = trans->c; 1027 struct bch_dev *ca; 1028 struct bch_alloc_v4 a_convert; 1029 const struct bch_alloc_v4 *a; 1030 unsigned discard_key_type, freespace_key_type; 1031 unsigned gens_offset; 1032 struct bkey_s_c k; 1033 struct printbuf buf = PRINTBUF; 1034 int ret; 1035 1036 if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, 1037 alloc_key_to_missing_dev_bucket, 1038 "alloc key for invalid device:bucket %llu:%llu", 1039 alloc_k.k->p.inode, alloc_k.k->p.offset)) 1040 return bch2_btree_delete_at(trans, alloc_iter, 0); 1041 1042 ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); 1043 if (!ca->mi.freespace_initialized) 1044 return 0; 1045 1046 a = bch2_alloc_to_v4(alloc_k, &a_convert); 1047 1048 discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; 1049 bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); 1050 k = bch2_btree_iter_peek_slot(discard_iter); 1051 ret = bkey_err(k); 1052 if (ret) 1053 goto err; 1054 1055 if (fsck_err_on(k.k->type != discard_key_type, 1056 c, need_discard_key_wrong, 1057 "incorrect key in need_discard btree (got %s should be %s)\n" 1058 " %s", 1059 bch2_bkey_types[k.k->type], 1060 bch2_bkey_types[discard_key_type], 1061 (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 1062 struct bkey_i *update = 1063 bch2_trans_kmalloc(trans, sizeof(*update)); 1064 1065 ret = PTR_ERR_OR_ZERO(update); 1066 if (ret) 1067 goto err; 1068 1069 bkey_init(&update->k); 1070 update->k.type = discard_key_type; 1071 update->k.p = discard_iter->pos; 1072 1073 ret = bch2_trans_update(trans, discard_iter, update, 0); 1074 if (ret) 1075 goto err; 1076 } 1077 1078 freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0; 1079 bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); 1080 k = bch2_btree_iter_peek_slot(freespace_iter); 1081 ret = bkey_err(k); 1082 if (ret) 1083 goto err; 1084 1085 if (fsck_err_on(k.k->type != freespace_key_type, 1086 c, freespace_key_wrong, 1087 "incorrect key in freespace btree (got %s should be %s)\n" 1088 " %s", 1089 bch2_bkey_types[k.k->type], 1090 bch2_bkey_types[freespace_key_type], 1091 (printbuf_reset(&buf), 1092 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 1093 struct bkey_i *update = 1094 bch2_trans_kmalloc(trans, sizeof(*update)); 1095 1096 ret = PTR_ERR_OR_ZERO(update); 1097 if (ret) 1098 goto err; 1099 1100 bkey_init(&update->k); 1101 update->k.type = freespace_key_type; 1102 update->k.p = freespace_iter->pos; 1103 bch2_key_resize(&update->k, 1); 1104 1105 ret = bch2_trans_update(trans, freespace_iter, update, 0); 1106 if (ret) 1107 goto err; 1108 } 1109 1110 bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); 1111 k = bch2_btree_iter_peek_slot(bucket_gens_iter); 1112 ret = bkey_err(k); 1113 if (ret) 1114 goto err; 1115 1116 if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), 1117 c, bucket_gens_key_wrong, 1118 "incorrect gen in bucket_gens btree (got %u should be %u)\n" 1119 " %s", 1120 alloc_gen(k, gens_offset), a->gen, 1121 (printbuf_reset(&buf), 1122 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 1123 struct bkey_i_bucket_gens *g = 1124 bch2_trans_kmalloc(trans, sizeof(*g)); 1125 1126 ret = PTR_ERR_OR_ZERO(g); 1127 if (ret) 1128 goto err; 1129 1130 if (k.k->type == KEY_TYPE_bucket_gens) { 1131 bkey_reassemble(&g->k_i, k); 1132 } else { 1133 bkey_bucket_gens_init(&g->k_i); 1134 g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset); 1135 } 1136 1137 g->v.gens[gens_offset] = a->gen; 1138 1139 ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0); 1140 if (ret) 1141 goto err; 1142 } 1143 err: 1144 fsck_err: 1145 printbuf_exit(&buf); 1146 return ret; 1147 } 1148 1149 static noinline_for_stack 1150 int bch2_check_alloc_hole_freespace(struct btree_trans *trans, 1151 struct bpos start, 1152 struct bpos *end, 1153 struct btree_iter *freespace_iter) 1154 { 1155 struct bch_fs *c = trans->c; 1156 struct bch_dev *ca; 1157 struct bkey_s_c k; 1158 struct printbuf buf = PRINTBUF; 1159 int ret; 1160 1161 ca = bch_dev_bkey_exists(c, start.inode); 1162 if (!ca->mi.freespace_initialized) 1163 return 0; 1164 1165 bch2_btree_iter_set_pos(freespace_iter, start); 1166 1167 k = bch2_btree_iter_peek_slot(freespace_iter); 1168 ret = bkey_err(k); 1169 if (ret) 1170 goto err; 1171 1172 *end = bkey_min(k.k->p, *end); 1173 1174 if (fsck_err_on(k.k->type != KEY_TYPE_set, 1175 c, freespace_hole_missing, 1176 "hole in alloc btree missing in freespace btree\n" 1177 " device %llu buckets %llu-%llu", 1178 freespace_iter->pos.inode, 1179 freespace_iter->pos.offset, 1180 end->offset)) { 1181 struct bkey_i *update = 1182 bch2_trans_kmalloc(trans, sizeof(*update)); 1183 1184 ret = PTR_ERR_OR_ZERO(update); 1185 if (ret) 1186 goto err; 1187 1188 bkey_init(&update->k); 1189 update->k.type = KEY_TYPE_set; 1190 update->k.p = freespace_iter->pos; 1191 bch2_key_resize(&update->k, 1192 min_t(u64, U32_MAX, end->offset - 1193 freespace_iter->pos.offset)); 1194 1195 ret = bch2_trans_update(trans, freespace_iter, update, 0); 1196 if (ret) 1197 goto err; 1198 } 1199 err: 1200 fsck_err: 1201 printbuf_exit(&buf); 1202 return ret; 1203 } 1204 1205 static noinline_for_stack 1206 int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, 1207 struct bpos start, 1208 struct bpos *end, 1209 struct btree_iter *bucket_gens_iter) 1210 { 1211 struct bch_fs *c = trans->c; 1212 struct bkey_s_c k; 1213 struct printbuf buf = PRINTBUF; 1214 unsigned i, gens_offset, gens_end_offset; 1215 int ret; 1216 1217 bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); 1218 1219 k = bch2_btree_iter_peek_slot(bucket_gens_iter); 1220 ret = bkey_err(k); 1221 if (ret) 1222 goto err; 1223 1224 if (bkey_cmp(alloc_gens_pos(start, &gens_offset), 1225 alloc_gens_pos(*end, &gens_end_offset))) 1226 gens_end_offset = KEY_TYPE_BUCKET_GENS_NR; 1227 1228 if (k.k->type == KEY_TYPE_bucket_gens) { 1229 struct bkey_i_bucket_gens g; 1230 bool need_update = false; 1231 1232 bkey_reassemble(&g.k_i, k); 1233 1234 for (i = gens_offset; i < gens_end_offset; i++) { 1235 if (fsck_err_on(g.v.gens[i], c, 1236 bucket_gens_hole_wrong, 1237 "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", 1238 bucket_gens_pos_to_alloc(k.k->p, i).inode, 1239 bucket_gens_pos_to_alloc(k.k->p, i).offset, 1240 g.v.gens[i])) { 1241 g.v.gens[i] = 0; 1242 need_update = true; 1243 } 1244 } 1245 1246 if (need_update) { 1247 struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); 1248 1249 ret = PTR_ERR_OR_ZERO(u); 1250 if (ret) 1251 goto err; 1252 1253 memcpy(u, &g, sizeof(g)); 1254 1255 ret = bch2_trans_update(trans, bucket_gens_iter, u, 0); 1256 if (ret) 1257 goto err; 1258 } 1259 } 1260 1261 *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0)); 1262 err: 1263 fsck_err: 1264 printbuf_exit(&buf); 1265 return ret; 1266 } 1267 1268 static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans, 1269 struct btree_iter *iter) 1270 { 1271 struct bch_fs *c = trans->c; 1272 struct btree_iter alloc_iter; 1273 struct bkey_s_c alloc_k; 1274 struct bch_alloc_v4 a_convert; 1275 const struct bch_alloc_v4 *a; 1276 u64 genbits; 1277 struct bpos pos; 1278 enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard 1279 ? BCH_DATA_need_discard 1280 : BCH_DATA_free; 1281 struct printbuf buf = PRINTBUF; 1282 int ret; 1283 1284 pos = iter->pos; 1285 pos.offset &= ~(~0ULL << 56); 1286 genbits = iter->pos.offset & (~0ULL << 56); 1287 1288 alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); 1289 ret = bkey_err(alloc_k); 1290 if (ret) 1291 return ret; 1292 1293 if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, 1294 need_discard_freespace_key_to_invalid_dev_bucket, 1295 "entry in %s btree for nonexistant dev:bucket %llu:%llu", 1296 bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) 1297 goto delete; 1298 1299 a = bch2_alloc_to_v4(alloc_k, &a_convert); 1300 1301 if (fsck_err_on(a->data_type != state || 1302 (state == BCH_DATA_free && 1303 genbits != alloc_freespace_genbits(*a)), c, 1304 need_discard_freespace_key_bad, 1305 "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", 1306 (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), 1307 bch2_btree_id_str(iter->btree_id), 1308 iter->pos.inode, 1309 iter->pos.offset, 1310 a->data_type == state, 1311 genbits >> 56, alloc_freespace_genbits(*a) >> 56)) 1312 goto delete; 1313 out: 1314 fsck_err: 1315 set_btree_iter_dontneed(&alloc_iter); 1316 bch2_trans_iter_exit(trans, &alloc_iter); 1317 printbuf_exit(&buf); 1318 return ret; 1319 delete: 1320 ret = bch2_btree_delete_extent_at(trans, iter, 1321 iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: 1322 bch2_trans_commit(trans, NULL, NULL, 1323 BCH_TRANS_COMMIT_no_enospc); 1324 goto out; 1325 } 1326 1327 /* 1328 * We've already checked that generation numbers in the bucket_gens btree are 1329 * valid for buckets that exist; this just checks for keys for nonexistent 1330 * buckets. 1331 */ 1332 static noinline_for_stack 1333 int bch2_check_bucket_gens_key(struct btree_trans *trans, 1334 struct btree_iter *iter, 1335 struct bkey_s_c k) 1336 { 1337 struct bch_fs *c = trans->c; 1338 struct bkey_i_bucket_gens g; 1339 struct bch_dev *ca; 1340 u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; 1341 u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; 1342 u64 b; 1343 bool need_update = false, dev_exists; 1344 struct printbuf buf = PRINTBUF; 1345 int ret = 0; 1346 1347 BUG_ON(k.k->type != KEY_TYPE_bucket_gens); 1348 bkey_reassemble(&g.k_i, k); 1349 1350 /* if no bch_dev, skip out whether we repair or not */ 1351 dev_exists = bch2_dev_exists2(c, k.k->p.inode); 1352 if (!dev_exists) { 1353 if (fsck_err_on(!dev_exists, c, 1354 bucket_gens_to_invalid_dev, 1355 "bucket_gens key for invalid device:\n %s", 1356 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1357 ret = bch2_btree_delete_at(trans, iter, 0); 1358 } 1359 goto out; 1360 } 1361 1362 ca = bch_dev_bkey_exists(c, k.k->p.inode); 1363 if (fsck_err_on(end <= ca->mi.first_bucket || 1364 start >= ca->mi.nbuckets, c, 1365 bucket_gens_to_invalid_buckets, 1366 "bucket_gens key for invalid buckets:\n %s", 1367 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1368 ret = bch2_btree_delete_at(trans, iter, 0); 1369 goto out; 1370 } 1371 1372 for (b = start; b < ca->mi.first_bucket; b++) 1373 if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, 1374 bucket_gens_nonzero_for_invalid_buckets, 1375 "bucket_gens key has nonzero gen for invalid bucket")) { 1376 g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; 1377 need_update = true; 1378 } 1379 1380 for (b = ca->mi.nbuckets; b < end; b++) 1381 if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, 1382 bucket_gens_nonzero_for_invalid_buckets, 1383 "bucket_gens key has nonzero gen for invalid bucket")) { 1384 g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; 1385 need_update = true; 1386 } 1387 1388 if (need_update) { 1389 struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); 1390 1391 ret = PTR_ERR_OR_ZERO(u); 1392 if (ret) 1393 goto out; 1394 1395 memcpy(u, &g, sizeof(g)); 1396 ret = bch2_trans_update(trans, iter, u, 0); 1397 } 1398 out: 1399 fsck_err: 1400 printbuf_exit(&buf); 1401 return ret; 1402 } 1403 1404 int bch2_check_alloc_info(struct bch_fs *c) 1405 { 1406 struct btree_trans *trans = bch2_trans_get(c); 1407 struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; 1408 struct bkey hole; 1409 struct bkey_s_c k; 1410 int ret = 0; 1411 1412 bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, 1413 BTREE_ITER_PREFETCH); 1414 bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, 1415 BTREE_ITER_PREFETCH); 1416 bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, 1417 BTREE_ITER_PREFETCH); 1418 bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, 1419 BTREE_ITER_PREFETCH); 1420 1421 while (1) { 1422 struct bpos next; 1423 1424 bch2_trans_begin(trans); 1425 1426 k = bch2_get_key_or_real_bucket_hole(&iter, &hole); 1427 ret = bkey_err(k); 1428 if (ret) 1429 goto bkey_err; 1430 1431 if (!k.k) 1432 break; 1433 1434 if (k.k->type) { 1435 next = bpos_nosnap_successor(k.k->p); 1436 1437 ret = bch2_check_alloc_key(trans, 1438 k, &iter, 1439 &discard_iter, 1440 &freespace_iter, 1441 &bucket_gens_iter); 1442 if (ret) 1443 goto bkey_err; 1444 } else { 1445 next = k.k->p; 1446 1447 ret = bch2_check_alloc_hole_freespace(trans, 1448 bkey_start_pos(k.k), 1449 &next, 1450 &freespace_iter) ?: 1451 bch2_check_alloc_hole_bucket_gens(trans, 1452 bkey_start_pos(k.k), 1453 &next, 1454 &bucket_gens_iter); 1455 if (ret) 1456 goto bkey_err; 1457 } 1458 1459 ret = bch2_trans_commit(trans, NULL, NULL, 1460 BCH_TRANS_COMMIT_no_enospc); 1461 if (ret) 1462 goto bkey_err; 1463 1464 bch2_btree_iter_set_pos(&iter, next); 1465 bkey_err: 1466 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1467 continue; 1468 if (ret) 1469 break; 1470 } 1471 bch2_trans_iter_exit(trans, &bucket_gens_iter); 1472 bch2_trans_iter_exit(trans, &freespace_iter); 1473 bch2_trans_iter_exit(trans, &discard_iter); 1474 bch2_trans_iter_exit(trans, &iter); 1475 1476 if (ret < 0) 1477 goto err; 1478 1479 ret = for_each_btree_key(trans, iter, 1480 BTREE_ID_need_discard, POS_MIN, 1481 BTREE_ITER_PREFETCH, k, 1482 bch2_check_discard_freespace_key(trans, &iter)); 1483 if (ret) 1484 goto err; 1485 1486 bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, 1487 BTREE_ITER_PREFETCH); 1488 while (1) { 1489 bch2_trans_begin(trans); 1490 k = bch2_btree_iter_peek(&iter); 1491 if (!k.k) 1492 break; 1493 1494 ret = bkey_err(k) ?: 1495 bch2_check_discard_freespace_key(trans, &iter); 1496 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { 1497 ret = 0; 1498 continue; 1499 } 1500 if (ret) { 1501 struct printbuf buf = PRINTBUF; 1502 bch2_bkey_val_to_text(&buf, c, k); 1503 1504 bch_err(c, "while checking %s", buf.buf); 1505 printbuf_exit(&buf); 1506 break; 1507 } 1508 1509 bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); 1510 } 1511 bch2_trans_iter_exit(trans, &iter); 1512 if (ret) 1513 goto err; 1514 1515 ret = for_each_btree_key_commit(trans, iter, 1516 BTREE_ID_bucket_gens, POS_MIN, 1517 BTREE_ITER_PREFETCH, k, 1518 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1519 bch2_check_bucket_gens_key(trans, &iter, k)); 1520 err: 1521 bch2_trans_put(trans); 1522 bch_err_fn(c, ret); 1523 return ret; 1524 } 1525 1526 static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, 1527 struct btree_iter *alloc_iter) 1528 { 1529 struct bch_fs *c = trans->c; 1530 struct btree_iter lru_iter; 1531 struct bch_alloc_v4 a_convert; 1532 const struct bch_alloc_v4 *a; 1533 struct bkey_s_c alloc_k, lru_k; 1534 struct printbuf buf = PRINTBUF; 1535 int ret; 1536 1537 alloc_k = bch2_btree_iter_peek(alloc_iter); 1538 if (!alloc_k.k) 1539 return 0; 1540 1541 ret = bkey_err(alloc_k); 1542 if (ret) 1543 return ret; 1544 1545 a = bch2_alloc_to_v4(alloc_k, &a_convert); 1546 1547 if (a->data_type != BCH_DATA_cached) 1548 return 0; 1549 1550 if (fsck_err_on(!a->io_time[READ], c, 1551 alloc_key_cached_but_read_time_zero, 1552 "cached bucket with read_time 0\n" 1553 " %s", 1554 (printbuf_reset(&buf), 1555 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 1556 struct bkey_i_alloc_v4 *a_mut = 1557 bch2_alloc_to_v4_mut(trans, alloc_k); 1558 ret = PTR_ERR_OR_ZERO(a_mut); 1559 if (ret) 1560 goto err; 1561 1562 a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); 1563 ret = bch2_trans_update(trans, alloc_iter, 1564 &a_mut->k_i, BTREE_TRIGGER_NORUN); 1565 if (ret) 1566 goto err; 1567 1568 a = &a_mut->v; 1569 } 1570 1571 lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, 1572 lru_pos(alloc_k.k->p.inode, 1573 bucket_to_u64(alloc_k.k->p), 1574 a->io_time[READ]), 0); 1575 ret = bkey_err(lru_k); 1576 if (ret) 1577 return ret; 1578 1579 if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, 1580 alloc_key_to_missing_lru_entry, 1581 "missing lru entry\n" 1582 " %s", 1583 (printbuf_reset(&buf), 1584 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 1585 ret = bch2_lru_set(trans, 1586 alloc_k.k->p.inode, 1587 bucket_to_u64(alloc_k.k->p), 1588 a->io_time[READ]); 1589 if (ret) 1590 goto err; 1591 } 1592 err: 1593 fsck_err: 1594 bch2_trans_iter_exit(trans, &lru_iter); 1595 printbuf_exit(&buf); 1596 return ret; 1597 } 1598 1599 int bch2_check_alloc_to_lru_refs(struct bch_fs *c) 1600 { 1601 int ret = bch2_trans_run(c, 1602 for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, 1603 POS_MIN, BTREE_ITER_PREFETCH, k, 1604 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1605 bch2_check_alloc_to_lru_ref(trans, &iter))); 1606 bch_err_fn(c, ret); 1607 return ret; 1608 } 1609 1610 static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) 1611 { 1612 int ret; 1613 1614 mutex_lock(&c->discard_buckets_in_flight_lock); 1615 darray_for_each(c->discard_buckets_in_flight, i) 1616 if (bkey_eq(*i, bucket)) { 1617 ret = -EEXIST; 1618 goto out; 1619 } 1620 1621 ret = darray_push(&c->discard_buckets_in_flight, bucket); 1622 out: 1623 mutex_unlock(&c->discard_buckets_in_flight_lock); 1624 return ret; 1625 } 1626 1627 static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket) 1628 { 1629 mutex_lock(&c->discard_buckets_in_flight_lock); 1630 darray_for_each(c->discard_buckets_in_flight, i) 1631 if (bkey_eq(*i, bucket)) { 1632 darray_remove_item(&c->discard_buckets_in_flight, i); 1633 goto found; 1634 } 1635 BUG(); 1636 found: 1637 mutex_unlock(&c->discard_buckets_in_flight_lock); 1638 } 1639 1640 struct discard_buckets_state { 1641 u64 seen; 1642 u64 open; 1643 u64 need_journal_commit; 1644 u64 discarded; 1645 struct bch_dev *ca; 1646 u64 need_journal_commit_this_dev; 1647 }; 1648 1649 static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca) 1650 { 1651 if (s->ca == ca) 1652 return; 1653 1654 if (s->ca && s->need_journal_commit_this_dev > 1655 bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets) 1656 bch2_journal_flush_async(&c->journal, NULL); 1657 1658 if (s->ca) 1659 percpu_ref_put(&s->ca->ref); 1660 if (ca) 1661 percpu_ref_get(&ca->ref); 1662 s->ca = ca; 1663 s->need_journal_commit_this_dev = 0; 1664 } 1665 1666 static int bch2_discard_one_bucket(struct btree_trans *trans, 1667 struct btree_iter *need_discard_iter, 1668 struct bpos *discard_pos_done, 1669 struct discard_buckets_state *s) 1670 { 1671 struct bch_fs *c = trans->c; 1672 struct bpos pos = need_discard_iter->pos; 1673 struct btree_iter iter = { NULL }; 1674 struct bkey_s_c k; 1675 struct bch_dev *ca; 1676 struct bkey_i_alloc_v4 *a; 1677 struct printbuf buf = PRINTBUF; 1678 bool discard_locked = false; 1679 int ret = 0; 1680 1681 ca = bch_dev_bkey_exists(c, pos.inode); 1682 1683 if (!percpu_ref_tryget(&ca->io_ref)) { 1684 bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); 1685 return 0; 1686 } 1687 1688 discard_buckets_next_dev(c, s, ca); 1689 1690 if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { 1691 s->open++; 1692 goto out; 1693 } 1694 1695 if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, 1696 c->journal.flushed_seq_ondisk, 1697 pos.inode, pos.offset)) { 1698 s->need_journal_commit++; 1699 s->need_journal_commit_this_dev++; 1700 goto out; 1701 } 1702 1703 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, 1704 need_discard_iter->pos, 1705 BTREE_ITER_CACHED); 1706 ret = bkey_err(k); 1707 if (ret) 1708 goto out; 1709 1710 a = bch2_alloc_to_v4_mut(trans, k); 1711 ret = PTR_ERR_OR_ZERO(a); 1712 if (ret) 1713 goto out; 1714 1715 if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { 1716 a->v.gen++; 1717 SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); 1718 goto write; 1719 } 1720 1721 if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { 1722 if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { 1723 bch2_trans_inconsistent(trans, 1724 "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" 1725 "%s", 1726 a->v.journal_seq, 1727 c->journal.flushed_seq_ondisk, 1728 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 1729 ret = -EIO; 1730 } 1731 goto out; 1732 } 1733 1734 if (a->v.data_type != BCH_DATA_need_discard) { 1735 if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { 1736 bch2_trans_inconsistent(trans, 1737 "bucket incorrectly set in need_discard btree\n" 1738 "%s", 1739 (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); 1740 ret = -EIO; 1741 } 1742 1743 goto out; 1744 } 1745 1746 if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true))) 1747 goto out; 1748 1749 discard_locked = true; 1750 1751 if (!bkey_eq(*discard_pos_done, iter.pos) && 1752 ca->mi.discard && !c->opts.nochanges) { 1753 /* 1754 * This works without any other locks because this is the only 1755 * thread that removes items from the need_discard tree 1756 */ 1757 bch2_trans_unlock_long(trans); 1758 blkdev_issue_discard(ca->disk_sb.bdev, 1759 k.k->p.offset * ca->mi.bucket_size, 1760 ca->mi.bucket_size, 1761 GFP_KERNEL); 1762 *discard_pos_done = iter.pos; 1763 1764 ret = bch2_trans_relock_notrace(trans); 1765 if (ret) 1766 goto out; 1767 } 1768 1769 SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); 1770 a->v.data_type = alloc_data_type(a->v, a->v.data_type); 1771 write: 1772 ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: 1773 bch2_trans_commit(trans, NULL, NULL, 1774 BCH_WATERMARK_btree| 1775 BCH_TRANS_COMMIT_no_enospc); 1776 if (ret) 1777 goto out; 1778 1779 count_event(c, bucket_discard); 1780 s->discarded++; 1781 out: 1782 if (discard_locked) 1783 discard_in_flight_remove(c, iter.pos); 1784 s->seen++; 1785 bch2_trans_iter_exit(trans, &iter); 1786 percpu_ref_put(&ca->io_ref); 1787 printbuf_exit(&buf); 1788 return ret; 1789 } 1790 1791 static void bch2_do_discards_work(struct work_struct *work) 1792 { 1793 struct bch_fs *c = container_of(work, struct bch_fs, discard_work); 1794 struct discard_buckets_state s = {}; 1795 struct bpos discard_pos_done = POS_MAX; 1796 int ret; 1797 1798 /* 1799 * We're doing the commit in bch2_discard_one_bucket instead of using 1800 * for_each_btree_key_commit() so that we can increment counters after 1801 * successful commit: 1802 */ 1803 ret = bch2_trans_run(c, 1804 for_each_btree_key(trans, iter, 1805 BTREE_ID_need_discard, POS_MIN, 0, k, 1806 bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s))); 1807 1808 discard_buckets_next_dev(c, &s, NULL); 1809 1810 trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, 1811 bch2_err_str(ret)); 1812 1813 bch2_write_ref_put(c, BCH_WRITE_REF_discard); 1814 } 1815 1816 void bch2_do_discards(struct bch_fs *c) 1817 { 1818 if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && 1819 !queue_work(c->write_ref_wq, &c->discard_work)) 1820 bch2_write_ref_put(c, BCH_WRITE_REF_discard); 1821 } 1822 1823 static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) 1824 { 1825 struct btree_iter iter; 1826 bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT); 1827 struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); 1828 int ret = bkey_err(k); 1829 if (ret) 1830 goto err; 1831 1832 struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); 1833 ret = PTR_ERR_OR_ZERO(a); 1834 if (ret) 1835 goto err; 1836 1837 SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); 1838 a->v.data_type = alloc_data_type(a->v, a->v.data_type); 1839 1840 ret = bch2_trans_update(trans, &iter, &a->k_i, 0); 1841 err: 1842 bch2_trans_iter_exit(trans, &iter); 1843 return ret; 1844 } 1845 1846 static void bch2_do_discards_fast_work(struct work_struct *work) 1847 { 1848 struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work); 1849 1850 while (1) { 1851 bool got_bucket = false; 1852 struct bpos bucket; 1853 struct bch_dev *ca; 1854 1855 mutex_lock(&c->discard_buckets_in_flight_lock); 1856 darray_for_each(c->discard_buckets_in_flight, i) { 1857 if (i->snapshot) 1858 continue; 1859 1860 ca = bch_dev_bkey_exists(c, i->inode); 1861 1862 if (!percpu_ref_tryget(&ca->io_ref)) { 1863 darray_remove_item(&c->discard_buckets_in_flight, i); 1864 continue; 1865 } 1866 1867 got_bucket = true; 1868 bucket = *i; 1869 i->snapshot = true; 1870 break; 1871 } 1872 mutex_unlock(&c->discard_buckets_in_flight_lock); 1873 1874 if (!got_bucket) 1875 break; 1876 1877 if (ca->mi.discard && !c->opts.nochanges) 1878 blkdev_issue_discard(ca->disk_sb.bdev, 1879 bucket.offset * ca->mi.bucket_size, 1880 ca->mi.bucket_size, 1881 GFP_KERNEL); 1882 1883 int ret = bch2_trans_do(c, NULL, NULL, 1884 BCH_WATERMARK_btree| 1885 BCH_TRANS_COMMIT_no_enospc, 1886 bch2_clear_bucket_needs_discard(trans, bucket)); 1887 bch_err_fn(c, ret); 1888 1889 percpu_ref_put(&ca->io_ref); 1890 discard_in_flight_remove(c, bucket); 1891 1892 if (ret) 1893 break; 1894 } 1895 1896 bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); 1897 } 1898 1899 static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) 1900 { 1901 struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); 1902 1903 if (!percpu_ref_is_dying(&ca->io_ref) && 1904 !discard_in_flight_add(c, bucket) && 1905 bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && 1906 !queue_work(c->write_ref_wq, &c->discard_fast_work)) 1907 bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); 1908 } 1909 1910 static int invalidate_one_bucket(struct btree_trans *trans, 1911 struct btree_iter *lru_iter, 1912 struct bkey_s_c lru_k, 1913 s64 *nr_to_invalidate) 1914 { 1915 struct bch_fs *c = trans->c; 1916 struct btree_iter alloc_iter = { NULL }; 1917 struct bkey_i_alloc_v4 *a = NULL; 1918 struct printbuf buf = PRINTBUF; 1919 struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); 1920 unsigned cached_sectors; 1921 int ret = 0; 1922 1923 if (*nr_to_invalidate <= 0) 1924 return 1; 1925 1926 if (!bch2_dev_bucket_exists(c, bucket)) { 1927 prt_str(&buf, "lru entry points to invalid bucket"); 1928 goto err; 1929 } 1930 1931 if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) 1932 return 0; 1933 1934 a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); 1935 ret = PTR_ERR_OR_ZERO(a); 1936 if (ret) 1937 goto out; 1938 1939 /* We expect harmless races here due to the btree write buffer: */ 1940 if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) 1941 goto out; 1942 1943 BUG_ON(a->v.data_type != BCH_DATA_cached); 1944 1945 if (!a->v.cached_sectors) 1946 bch_err(c, "invalidating empty bucket, confused"); 1947 1948 cached_sectors = a->v.cached_sectors; 1949 1950 SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); 1951 a->v.gen++; 1952 a->v.data_type = 0; 1953 a->v.dirty_sectors = 0; 1954 a->v.cached_sectors = 0; 1955 a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); 1956 a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); 1957 1958 ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, 1959 BTREE_TRIGGER_BUCKET_INVALIDATE) ?: 1960 bch2_trans_commit(trans, NULL, NULL, 1961 BCH_WATERMARK_btree| 1962 BCH_TRANS_COMMIT_no_enospc); 1963 if (ret) 1964 goto out; 1965 1966 trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); 1967 --*nr_to_invalidate; 1968 out: 1969 bch2_trans_iter_exit(trans, &alloc_iter); 1970 printbuf_exit(&buf); 1971 return ret; 1972 err: 1973 prt_str(&buf, "\n lru key: "); 1974 bch2_bkey_val_to_text(&buf, c, lru_k); 1975 1976 prt_str(&buf, "\n lru entry: "); 1977 bch2_lru_pos_to_text(&buf, lru_iter->pos); 1978 1979 prt_str(&buf, "\n alloc key: "); 1980 if (!a) 1981 bch2_bpos_to_text(&buf, bucket); 1982 else 1983 bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); 1984 1985 bch_err(c, "%s", buf.buf); 1986 if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { 1987 bch2_inconsistent_error(c); 1988 ret = -EINVAL; 1989 } 1990 1991 goto out; 1992 } 1993 1994 static void bch2_do_invalidates_work(struct work_struct *work) 1995 { 1996 struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); 1997 struct btree_trans *trans = bch2_trans_get(c); 1998 int ret = 0; 1999 2000 ret = bch2_btree_write_buffer_tryflush(trans); 2001 if (ret) 2002 goto err; 2003 2004 for_each_member_device(c, ca) { 2005 s64 nr_to_invalidate = 2006 should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); 2007 2008 ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, 2009 lru_pos(ca->dev_idx, 0, 0), 2010 lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), 2011 BTREE_ITER_INTENT, k, 2012 invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate)); 2013 2014 if (ret < 0) { 2015 percpu_ref_put(&ca->ref); 2016 break; 2017 } 2018 } 2019 err: 2020 bch2_trans_put(trans); 2021 bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); 2022 } 2023 2024 void bch2_do_invalidates(struct bch_fs *c) 2025 { 2026 if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && 2027 !queue_work(c->write_ref_wq, &c->invalidate_work)) 2028 bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); 2029 } 2030 2031 int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, 2032 u64 bucket_start, u64 bucket_end) 2033 { 2034 struct btree_trans *trans = bch2_trans_get(c); 2035 struct btree_iter iter; 2036 struct bkey_s_c k; 2037 struct bkey hole; 2038 struct bpos end = POS(ca->dev_idx, bucket_end); 2039 struct bch_member *m; 2040 unsigned long last_updated = jiffies; 2041 int ret; 2042 2043 BUG_ON(bucket_start > bucket_end); 2044 BUG_ON(bucket_end > ca->mi.nbuckets); 2045 2046 bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, 2047 POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), 2048 BTREE_ITER_PREFETCH); 2049 /* 2050 * Scan the alloc btree for every bucket on @ca, and add buckets to the 2051 * freespace/need_discard/need_gc_gens btrees as needed: 2052 */ 2053 while (1) { 2054 if (last_updated + HZ * 10 < jiffies) { 2055 bch_info(ca, "%s: currently at %llu/%llu", 2056 __func__, iter.pos.offset, ca->mi.nbuckets); 2057 last_updated = jiffies; 2058 } 2059 2060 bch2_trans_begin(trans); 2061 2062 if (bkey_ge(iter.pos, end)) { 2063 ret = 0; 2064 break; 2065 } 2066 2067 k = bch2_get_key_or_hole(&iter, end, &hole); 2068 ret = bkey_err(k); 2069 if (ret) 2070 goto bkey_err; 2071 2072 if (k.k->type) { 2073 /* 2074 * We process live keys in the alloc btree one at a 2075 * time: 2076 */ 2077 struct bch_alloc_v4 a_convert; 2078 const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); 2079 2080 ret = bch2_bucket_do_index(trans, k, a, true) ?: 2081 bch2_trans_commit(trans, NULL, NULL, 2082 BCH_TRANS_COMMIT_no_enospc); 2083 if (ret) 2084 goto bkey_err; 2085 2086 bch2_btree_iter_advance(&iter); 2087 } else { 2088 struct bkey_i *freespace; 2089 2090 freespace = bch2_trans_kmalloc(trans, sizeof(*freespace)); 2091 ret = PTR_ERR_OR_ZERO(freespace); 2092 if (ret) 2093 goto bkey_err; 2094 2095 bkey_init(&freespace->k); 2096 freespace->k.type = KEY_TYPE_set; 2097 freespace->k.p = k.k->p; 2098 freespace->k.size = k.k->size; 2099 2100 ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?: 2101 bch2_trans_commit(trans, NULL, NULL, 2102 BCH_TRANS_COMMIT_no_enospc); 2103 if (ret) 2104 goto bkey_err; 2105 2106 bch2_btree_iter_set_pos(&iter, k.k->p); 2107 } 2108 bkey_err: 2109 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 2110 continue; 2111 if (ret) 2112 break; 2113 } 2114 2115 bch2_trans_iter_exit(trans, &iter); 2116 bch2_trans_put(trans); 2117 2118 if (ret < 0) { 2119 bch_err_msg(ca, ret, "initializing free space"); 2120 return ret; 2121 } 2122 2123 mutex_lock(&c->sb_lock); 2124 m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); 2125 SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); 2126 mutex_unlock(&c->sb_lock); 2127 2128 return 0; 2129 } 2130 2131 int bch2_fs_freespace_init(struct bch_fs *c) 2132 { 2133 int ret = 0; 2134 bool doing_init = false; 2135 2136 /* 2137 * We can crash during the device add path, so we need to check this on 2138 * every mount: 2139 */ 2140 2141 for_each_member_device(c, ca) { 2142 if (ca->mi.freespace_initialized) 2143 continue; 2144 2145 if (!doing_init) { 2146 bch_info(c, "initializing freespace"); 2147 doing_init = true; 2148 } 2149 2150 ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); 2151 if (ret) { 2152 percpu_ref_put(&ca->ref); 2153 bch_err_fn(c, ret); 2154 return ret; 2155 } 2156 } 2157 2158 if (doing_init) { 2159 mutex_lock(&c->sb_lock); 2160 bch2_write_super(c); 2161 mutex_unlock(&c->sb_lock); 2162 bch_verbose(c, "done initializing freespace"); 2163 } 2164 2165 return 0; 2166 } 2167 2168 /* Bucket IO clocks: */ 2169 2170 int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, 2171 size_t bucket_nr, int rw) 2172 { 2173 struct bch_fs *c = trans->c; 2174 struct btree_iter iter; 2175 struct bkey_i_alloc_v4 *a; 2176 u64 now; 2177 int ret = 0; 2178 2179 a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); 2180 ret = PTR_ERR_OR_ZERO(a); 2181 if (ret) 2182 return ret; 2183 2184 now = atomic64_read(&c->io_clock[rw].now); 2185 if (a->v.io_time[rw] == now) 2186 goto out; 2187 2188 a->v.io_time[rw] = now; 2189 2190 ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: 2191 bch2_trans_commit(trans, NULL, NULL, 0); 2192 out: 2193 bch2_trans_iter_exit(trans, &iter); 2194 return ret; 2195 } 2196 2197 /* Startup/shutdown (ro/rw): */ 2198 2199 void bch2_recalc_capacity(struct bch_fs *c) 2200 { 2201 u64 capacity = 0, reserved_sectors = 0, gc_reserve; 2202 unsigned bucket_size_max = 0; 2203 unsigned long ra_pages = 0; 2204 2205 lockdep_assert_held(&c->state_lock); 2206 2207 for_each_online_member(c, ca) { 2208 struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; 2209 2210 ra_pages += bdi->ra_pages; 2211 } 2212 2213 bch2_set_ra_pages(c, ra_pages); 2214 2215 for_each_rw_member(c, ca) { 2216 u64 dev_reserve = 0; 2217 2218 /* 2219 * We need to reserve buckets (from the number 2220 * of currently available buckets) against 2221 * foreground writes so that mainly copygc can 2222 * make forward progress. 2223 * 2224 * We need enough to refill the various reserves 2225 * from scratch - copygc will use its entire 2226 * reserve all at once, then run against when 2227 * its reserve is refilled (from the formerly 2228 * available buckets). 2229 * 2230 * This reserve is just used when considering if 2231 * allocations for foreground writes must wait - 2232 * not -ENOSPC calculations. 2233 */ 2234 2235 dev_reserve += ca->nr_btree_reserve * 2; 2236 dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ 2237 2238 dev_reserve += 1; /* btree write point */ 2239 dev_reserve += 1; /* copygc write point */ 2240 dev_reserve += 1; /* rebalance write point */ 2241 2242 dev_reserve *= ca->mi.bucket_size; 2243 2244 capacity += bucket_to_sector(ca, ca->mi.nbuckets - 2245 ca->mi.first_bucket); 2246 2247 reserved_sectors += dev_reserve * 2; 2248 2249 bucket_size_max = max_t(unsigned, bucket_size_max, 2250 ca->mi.bucket_size); 2251 } 2252 2253 gc_reserve = c->opts.gc_reserve_bytes 2254 ? c->opts.gc_reserve_bytes >> 9 2255 : div64_u64(capacity * c->opts.gc_reserve_percent, 100); 2256 2257 reserved_sectors = max(gc_reserve, reserved_sectors); 2258 2259 reserved_sectors = min(reserved_sectors, capacity); 2260 2261 c->capacity = capacity - reserved_sectors; 2262 2263 c->bucket_size_max = bucket_size_max; 2264 2265 /* Wake up case someone was waiting for buckets */ 2266 closure_wake_up(&c->freelist_wait); 2267 } 2268 2269 u64 bch2_min_rw_member_capacity(struct bch_fs *c) 2270 { 2271 u64 ret = U64_MAX; 2272 2273 for_each_rw_member(c, ca) 2274 ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); 2275 return ret; 2276 } 2277 2278 static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) 2279 { 2280 struct open_bucket *ob; 2281 bool ret = false; 2282 2283 for (ob = c->open_buckets; 2284 ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); 2285 ob++) { 2286 spin_lock(&ob->lock); 2287 if (ob->valid && !ob->on_partial_list && 2288 ob->dev == ca->dev_idx) 2289 ret = true; 2290 spin_unlock(&ob->lock); 2291 } 2292 2293 return ret; 2294 } 2295 2296 /* device goes ro: */ 2297 void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) 2298 { 2299 unsigned i; 2300 2301 /* First, remove device from allocation groups: */ 2302 2303 for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) 2304 clear_bit(ca->dev_idx, c->rw_devs[i].d); 2305 2306 /* 2307 * Capacity is calculated based off of devices in allocation groups: 2308 */ 2309 bch2_recalc_capacity(c); 2310 2311 bch2_open_buckets_stop(c, ca, false); 2312 2313 /* 2314 * Wake up threads that were blocked on allocation, so they can notice 2315 * the device can no longer be removed and the capacity has changed: 2316 */ 2317 closure_wake_up(&c->freelist_wait); 2318 2319 /* 2320 * journal_res_get() can block waiting for free space in the journal - 2321 * it needs to notice there may not be devices to allocate from anymore: 2322 */ 2323 wake_up(&c->journal.wait); 2324 2325 /* Now wait for any in flight writes: */ 2326 2327 closure_wait_event(&c->open_buckets_wait, 2328 !bch2_dev_has_open_write_point(c, ca)); 2329 } 2330 2331 /* device goes rw: */ 2332 void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) 2333 { 2334 unsigned i; 2335 2336 for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) 2337 if (ca->mi.data_allowed & (1 << i)) 2338 set_bit(ca->dev_idx, c->rw_devs[i].d); 2339 } 2340 2341 void bch2_fs_allocator_background_exit(struct bch_fs *c) 2342 { 2343 darray_exit(&c->discard_buckets_in_flight); 2344 } 2345 2346 void bch2_fs_allocator_background_init(struct bch_fs *c) 2347 { 2348 spin_lock_init(&c->freelist_lock); 2349 mutex_init(&c->discard_buckets_in_flight_lock); 2350 INIT_WORK(&c->discard_work, bch2_do_discards_work); 2351 INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work); 2352 INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); 2353 } 2354