// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "lru.h"
#include "recovery.h"
#include "trace.h"
#include "varint.h"

#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>

static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket);

/* Persistent alloc info: */

static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
	BCH_ALLOC_FIELDS_V1()
#undef x
};

struct bkey_alloc_unpacked {
	u64		journal_seq;
	u8		gen;
	u8		oldest_gen;
	u8		data_type;
	bool		need_discard:1;
	bool		need_inc_gen:1;
#define x(_name, _bits)	u##_bits _name;
	BCH_ALLOC_FIELDS_V2()
#undef x
};

static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
				     const void **p, unsigned field)
{
	unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
	u64 v;

	if (!(a->fields & (1 << field)))
		return 0;

	switch (bytes) {
	case 1:
		v = *((const u8 *) *p);
		break;
	case 2:
		v = le16_to_cpup(*p);
		break;
	case 4:
		v = le32_to_cpup(*p);
		break;
	case 8:
		v = le64_to_cpup(*p);
		break;
	default:
		BUG();
	}

	*p += bytes;
	return v;
}

static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
				 struct bkey_s_c k)
{
	const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
	const void *d = in->data;
	unsigned idx = 0;

	out->gen = in->gen;

#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
	BCH_ALLOC_FIELDS_V1()
#undef x
}

static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
				struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
	const u8 *in = a.v->data;
	const u8 *end = bkey_val_end(a);
	unsigned fieldnr = 0;
	int ret;
	u64 v;

	out->gen	= a.v->gen;
	out->oldest_gen	= a.v->oldest_gen;
	out->data_type	= a.v->data_type;

#define x(_name, _bits)						\
	if (fieldnr < a.v->nr_fields) {				\
		ret = bch2_varint_decode_fast(in, end, &v);	\
		if (ret < 0)					\
			return ret;				\
		in += ret;					\
	} else {						\
		v = 0;						\
	}							\
	out->_name = v;						\
	if (v != out->_name)					\
		return -1;					\
	fieldnr++;

	BCH_ALLOC_FIELDS_V2()
#undef x
	return 0;
}

static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
				struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
	const u8 *in = a.v->data;
	const u8 *end = bkey_val_end(a);
	unsigned fieldnr = 0;
	int ret;
	u64 v;

	out->gen	= a.v->gen;
	out->oldest_gen	= a.v->oldest_gen;
	out->data_type	= a.v->data_type;
	out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v);
	out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v);
	out->journal_seq = le64_to_cpu(a.v->journal_seq);

#define x(_name, _bits)						\
	if (fieldnr < a.v->nr_fields) {				\
		ret = bch2_varint_decode_fast(in, end, &v);	\
		if (ret < 0)					\
			return ret;				\
		in += ret;					\
	} else {						\
		v = 0;						\
	}							\
	out->_name = v;						\
	if (v != out->_name)					\
		return -1;					\
	fieldnr++;

	BCH_ALLOC_FIELDS_V2()
#undef x
	return 0;
}

static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
	struct bkey_alloc_unpacked ret = { .gen = 0 };

	switch (k.k->type) {
	case KEY_TYPE_alloc:
		bch2_alloc_unpack_v1(&ret, k);
		break;
	case KEY_TYPE_alloc_v2:
		bch2_alloc_unpack_v2(&ret, k);
		break;
	case KEY_TYPE_alloc_v3:
		bch2_alloc_unpack_v3(&ret, k);
		break;
	}

	return ret;
}

static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
{
	unsigned i, bytes = offsetof(struct bch_alloc, data);

	for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
		if (a->fields & (1 << i))
			bytes += BCH_ALLOC_V1_FIELD_BYTES[i];

	return DIV_ROUND_UP(bytes, sizeof(u64));
}

int bch2_alloc_v1_invalid(struct bch_fs *c, struct bkey_s_c k,
			  enum bch_validate_flags flags,
			  struct printbuf *err)
{
	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
	int ret = 0;

	/* allow for unknown fields */
	bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), c, err,
			 alloc_v1_val_size_bad,
			 "incorrect value size (%zu < %u)",
			 bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v));
fsck_err:
	return ret;
}

int bch2_alloc_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
			  enum bch_validate_flags flags,
			  struct printbuf *err)
{
	struct bkey_alloc_unpacked u;
	int ret = 0;

	bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), c, err,
			 alloc_v2_unpack_error,
			 "unpack error");
fsck_err:
	return ret;
}

int bch2_alloc_v3_invalid(struct bch_fs *c, struct bkey_s_c k,
			  enum bch_validate_flags flags,
			  struct printbuf *err)
{
	struct bkey_alloc_unpacked u;
	int ret = 0;

	bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), c, err,
			 alloc_v2_unpack_error,
			 "unpack error");
fsck_err:
	return ret;
}

int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
			  enum bch_validate_flags flags, struct printbuf *err)
{
	struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k);
	int ret = 0;

	bkey_fsck_err_on(alloc_v4_u64s_noerror(a.v) > bkey_val_u64s(k.k), c, err,
			 alloc_v4_val_size_bad,
			 "bad val size (%u > %zu)",
			 alloc_v4_u64s_noerror(a.v), bkey_val_u64s(k.k));

	bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) &&
			 BCH_ALLOC_V4_NR_BACKPOINTERS(a.v), c, err,
			 alloc_v4_backpointers_start_bad,
			 "invalid backpointers_start");

	bkey_fsck_err_on(alloc_data_type(*a.v, a.v->data_type) != a.v->data_type, c, err,
			 alloc_key_data_type_bad,
			 "invalid data type (got %u should be %u)",
			 a.v->data_type, alloc_data_type(*a.v, a.v->data_type));

	switch (a.v->data_type) {
	case BCH_DATA_free:
	case BCH_DATA_need_gc_gens:
	case BCH_DATA_need_discard:
		bkey_fsck_err_on(bch2_bucket_sectors_total(*a.v) || a.v->stripe,
				 c, err, alloc_key_empty_but_have_data,
				 "empty data type free but have data");
		break;
	case BCH_DATA_sb:
	case BCH_DATA_journal:
	case BCH_DATA_btree:
	case BCH_DATA_user:
	case BCH_DATA_parity:
		bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
				 c, err, alloc_key_dirty_sectors_0,
				 "data_type %s but dirty_sectors==0",
				 bch2_data_type_str(a.v->data_type));
		break;
	case BCH_DATA_cached:
		bkey_fsck_err_on(!a.v->cached_sectors ||
				 bch2_bucket_sectors_dirty(*a.v) ||
				 a.v->stripe,
				 c, err, alloc_key_cached_inconsistency,
				 "data type inconsistency");

		bkey_fsck_err_on(!a.v->io_time[READ] &&
				 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
				 c, err, alloc_key_cached_but_read_time_zero,
				 "cached bucket with read_time == 0");
		break;
	case BCH_DATA_stripe:
		break;
	}
fsck_err:
	return ret;
}

void bch2_alloc_v4_swab(struct bkey_s k)
{
	struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v;
	struct bch_backpointer *bp, *bps;

	a->journal_seq		= swab64(a->journal_seq);
	a->flags		= swab32(a->flags);
	a->dirty_sectors	= swab32(a->dirty_sectors);
	a->cached_sectors	= swab32(a->cached_sectors);
	a->io_time[0]		= swab64(a->io_time[0]);
	a->io_time[1]		= swab64(a->io_time[1]);
	a->stripe		= swab32(a->stripe);
	a->nr_external_backpointers = swab32(a->nr_external_backpointers);
	a->fragmentation_lru	= swab64(a->fragmentation_lru);

	bps = alloc_v4_backpointers(a);
	for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) {
		bp->bucket_offset	= swab40(bp->bucket_offset);
		bp->bucket_len		= swab32(bp->bucket_len);
		bch2_bpos_swab(&bp->pos);
	}
}

void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bch_alloc_v4 _a;
	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);

	prt_newline(out);
	printbuf_indent_add(out, 2);

	prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
	bch2_prt_data_type(out, a->data_type);
	prt_newline(out);
	prt_printf(out, "journal_seq %llu\n",		a->journal_seq);
	prt_printf(out, "need_discard %llu\n",		BCH_ALLOC_V4_NEED_DISCARD(a));
	prt_printf(out, "need_inc_gen %llu\n",		BCH_ALLOC_V4_NEED_INC_GEN(a));
	prt_printf(out, "dirty_sectors %u\n",		a->dirty_sectors);
	prt_printf(out, "cached_sectors %u\n",		a->cached_sectors);
	prt_printf(out, "stripe %u\n",			a->stripe);
	prt_printf(out, "stripe_redundancy %u\n",	a->stripe_redundancy);
	prt_printf(out, "io_time[READ] %llu\n",		a->io_time[READ]);
	prt_printf(out, "io_time[WRITE] %llu\n",	a->io_time[WRITE]);
	prt_printf(out, "fragmentation %llu\n",		a->fragmentation_lru);
	prt_printf(out, "bp_start %llu\n",		BCH_ALLOC_V4_BACKPOINTERS_START(a));
	printbuf_indent_sub(out, 2);
}

void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
{
	if (k.k->type == KEY_TYPE_alloc_v4) {
		void *src, *dst;

		*out = *bkey_s_c_to_alloc_v4(k).v;

		src = alloc_v4_backpointers(out);
		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
		dst = alloc_v4_backpointers(out);

		if (src < dst)
			memset(src, 0, dst - src);

		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0);
	} else {
		struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);

		*out = (struct bch_alloc_v4) {
			.journal_seq		= u.journal_seq,
			.flags			= u.need_discard,
			.gen			= u.gen,
			.oldest_gen		= u.oldest_gen,
			.data_type		= u.data_type,
			.stripe_redundancy	= u.stripe_redundancy,
			.dirty_sectors		= u.dirty_sectors,
			.cached_sectors		= u.cached_sectors,
			.io_time[READ]		= u.read_time,
			.io_time[WRITE]		= u.write_time,
			.stripe			= u.stripe,
		};

		SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s);
	}
}

static noinline struct bkey_i_alloc_v4 *
__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bkey_i_alloc_v4 *ret;

	ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4)));
	if (IS_ERR(ret))
		return ret;

	if (k.k->type == KEY_TYPE_alloc_v4) {
		void *src, *dst;

		bkey_reassemble(&ret->k_i, k);

		src = alloc_v4_backpointers(&ret->v);
		SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s);
		dst = alloc_v4_backpointers(&ret->v);

		if (src < dst)
			memset(src, 0, dst - src);

		SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0);
		set_alloc_v4_u64s(ret);
	} else {
		bkey_alloc_v4_init(&ret->k_i);
		ret->k.p = k.k->p;
		bch2_alloc_to_v4(k, &ret->v);
	}
	return ret;
}

static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v4 a;

	if (likely(k.k->type == KEY_TYPE_alloc_v4) &&
	    ((a = bkey_s_c_to_alloc_v4(k), true) &&
	     BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0))
		return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4);

	return __bch2_alloc_to_v4_mut(trans, k);
}

struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k)
{
	return bch2_alloc_to_v4_mut_inlined(trans, k);
}

struct bkey_i_alloc_v4 *
bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter,
				       struct bpos pos)
{
	struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos,
					       BTREE_ITER_with_updates|
					       BTREE_ITER_cached|
					       BTREE_ITER_intent);
	int ret = bkey_err(k);
	if (unlikely(ret))
		return ERR_PTR(ret);

	struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (unlikely(ret))
		goto err;
	return a;
err:
	bch2_trans_iter_exit(trans, iter);
	return ERR_PTR(ret);
}

__flatten
struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos)
{
	struct btree_iter iter;
	struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos);
	int ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ERR_PTR(ret);

	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
	bch2_trans_iter_exit(trans, &iter);
	return unlikely(ret) ? ERR_PTR(ret) : a;
}
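
/*
 * Descriptive note (added): bucket_gens keys pack the generation numbers of
 * several consecutive buckets into one key; the low KEY_TYPE_BUCKET_GENS_BITS
 * bits of the bucket offset index into the key's gens[] array, and the
 * remaining high bits form the bucket_gens btree position.
 */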
static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset)
{
	*offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK;

	pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS;
	return pos;
}

static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset)
{
	pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS;
	pos.offset += offset;
	return pos;
}

static unsigned alloc_gen(struct bkey_s_c k, unsigned offset)
{
	return k.k->type == KEY_TYPE_bucket_gens
		? bkey_s_c_to_bucket_gens(k).v->gens[offset]
		: 0;
}

int bch2_bucket_gens_invalid(struct bch_fs *c, struct bkey_s_c k,
			     enum bch_validate_flags flags,
			     struct printbuf *err)
{
	int ret = 0;

	bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), c, err,
			 bucket_gens_val_size_bad,
			 "bad val size (%zu != %zu)",
			 bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens));
fsck_err:
	return ret;
}

void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k);
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) {
		if (i)
			prt_char(out, ' ');
		prt_printf(out, "%u", g.v->gens[i]);
	}
}

int bch2_bucket_gens_init(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bkey_i_bucket_gens g;
	bool have_bucket_gens_key = false;
	int ret;

	ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
				 BTREE_ITER_prefetch, k, ({
		/*
		 * Not a fsck error because this is checked/repaired by
		 * bch2_check_alloc_key() which runs later:
		 */
		if (!bch2_dev_bucket_exists(c, k.k->p))
			continue;

		struct bch_alloc_v4 a;
		u8 gen = bch2_alloc_to_v4(k, &a)->gen;
		unsigned offset;
		struct bpos pos = alloc_gens_pos(iter.pos, &offset);
		int ret2 = 0;

		if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
			ret2 =  bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
			if (ret2)
				goto iter_err;
			have_bucket_gens_key = false;
		}

		if (!have_bucket_gens_key) {
			bkey_bucket_gens_init(&g.k_i);
			g.k.p = pos;
			have_bucket_gens_key = true;
		}

		g.v.gens[offset] = gen;
iter_err:
		ret2;
	}));

	if (have_bucket_gens_key && !ret)
		ret = commit_do(trans, NULL, NULL,
				BCH_TRANS_COMMIT_no_enospc,
			bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));

	bch2_trans_put(trans);

	bch_err_fn(c, ret);
	return ret;
}

int bch2_alloc_read(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct bch_dev *ca = NULL;
	int ret;

	down_read(&c->gc_lock);

	if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
		ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
					 BTREE_ITER_prefetch, k, ({
			u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
			u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;

			if (k.k->type != KEY_TYPE_bucket_gens)
				continue;

			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
			/*
			 * Not a fsck error because this is checked/repaired by
			 * bch2_check_alloc_key() which runs later:
			 */
			if (!ca) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				continue;
			}

			const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;

			for (u64 b = max_t(u64, ca->mi.first_bucket, start);
			     b < min_t(u64, ca->mi.nbuckets, end);
			     b++)
				*bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
			0;
		}));
	} else {
		ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
					 BTREE_ITER_prefetch, k, ({
			ca = bch2_dev_iterate(c, ca, k.k->p.inode);
			/*
			 * Not a fsck error because this is checked/repaired by
			 * bch2_check_alloc_key() which runs later:
			 */
			if (!ca) {
				bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
				continue;
			}

			struct bch_alloc_v4 a;
			*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
			0;
		}));
	}

	bch2_dev_put(ca);
	bch2_trans_put(trans);
	up_read(&c->gc_lock);

	bch_err_fn(c, ret);
	return ret;
}

/* Free space/discard btree: */

static int bch2_bucket_do_index(struct btree_trans *trans,
				struct bch_dev *ca,
				struct bkey_s_c alloc_k,
				const struct bch_alloc_v4 *a,
				bool set)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_s_c old;
	struct bkey_i *k;
	enum btree_id btree;
	enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
	enum bch_bkey_type new_type =  set ? KEY_TYPE_set : KEY_TYPE_deleted;
	struct printbuf buf = PRINTBUF;
	int ret;

	if (a->data_type != BCH_DATA_free &&
	    a->data_type != BCH_DATA_need_discard)
		return 0;

	k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
	if (IS_ERR(k))
		return PTR_ERR(k);

	bkey_init(&k->k);
	k->k.type = new_type;

	switch (a->data_type) {
	case BCH_DATA_free:
		btree = BTREE_ID_freespace;
		k->k.p = alloc_freespace_pos(alloc_k.k->p, *a);
		bch2_key_resize(&k->k, 1);
		break;
	case BCH_DATA_need_discard:
		btree = BTREE_ID_need_discard;
		k->k.p = alloc_k.k->p;
		break;
	default:
		return 0;
	}

	old = bch2_bkey_get_iter(trans, &iter, btree,
				 bkey_start_pos(&k->k),
				 BTREE_ITER_intent);
	ret = bkey_err(old);
	if (ret)
		return ret;

	if (ca->mi.freespace_initialized &&
	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info &&
	    bch2_trans_inconsistent_on(old.k->type != old_type, trans,
			"incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n"
			" for %s",
			set ? "setting" : "clearing",
"setting" : "clearing", 685 bch2_btree_id_str(btree), 686 iter.pos.inode, 687 iter.pos.offset, 688 bch2_bkey_types[old.k->type], 689 bch2_bkey_types[old_type], 690 (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 691 ret = -EIO; 692 goto err; 693 } 694 695 ret = bch2_trans_update(trans, &iter, k, 0); 696 err: 697 bch2_trans_iter_exit(trans, &iter); 698 printbuf_exit(&buf); 699 return ret; 700 } 701 702 static noinline int bch2_bucket_gen_update(struct btree_trans *trans, 703 struct bpos bucket, u8 gen) 704 { 705 struct btree_iter iter; 706 unsigned offset; 707 struct bpos pos = alloc_gens_pos(bucket, &offset); 708 struct bkey_i_bucket_gens *g; 709 struct bkey_s_c k; 710 int ret; 711 712 g = bch2_trans_kmalloc(trans, sizeof(*g)); 713 ret = PTR_ERR_OR_ZERO(g); 714 if (ret) 715 return ret; 716 717 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, 718 BTREE_ITER_intent| 719 BTREE_ITER_with_updates); 720 ret = bkey_err(k); 721 if (ret) 722 return ret; 723 724 if (k.k->type != KEY_TYPE_bucket_gens) { 725 bkey_bucket_gens_init(&g->k_i); 726 g->k.p = iter.pos; 727 } else { 728 bkey_reassemble(&g->k_i, k); 729 } 730 731 g->v.gens[offset] = gen; 732 733 ret = bch2_trans_update(trans, &iter, &g->k_i, 0); 734 bch2_trans_iter_exit(trans, &iter); 735 return ret; 736 } 737 738 int bch2_trigger_alloc(struct btree_trans *trans, 739 enum btree_id btree, unsigned level, 740 struct bkey_s_c old, struct bkey_s new, 741 enum btree_iter_update_trigger_flags flags) 742 { 743 struct bch_fs *c = trans->c; 744 struct printbuf buf = PRINTBUF; 745 int ret = 0; 746 747 struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); 748 if (!ca) 749 return -EIO; 750 751 struct bch_alloc_v4 old_a_convert; 752 const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); 753 754 if (flags & BTREE_TRIGGER_transactional) { 755 struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v; 756 757 alloc_data_type_set(new_a, new_a->data_type); 758 759 if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { 760 new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); 761 new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); 762 SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); 763 SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); 764 } 765 766 if (data_type_is_empty(new_a->data_type) && 767 BCH_ALLOC_V4_NEED_INC_GEN(new_a) && 768 !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { 769 new_a->gen++; 770 SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); 771 } 772 773 if (old_a->data_type != new_a->data_type || 774 (new_a->data_type == BCH_DATA_free && 775 alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { 776 ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?: 777 bch2_bucket_do_index(trans, ca, new.s_c, new_a, true); 778 if (ret) 779 goto err; 780 } 781 782 if (new_a->data_type == BCH_DATA_cached && 783 !new_a->io_time[READ]) 784 new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); 785 786 u64 old_lru = alloc_lru_idx_read(*old_a); 787 u64 new_lru = alloc_lru_idx_read(*new_a); 788 if (old_lru != new_lru) { 789 ret = bch2_lru_change(trans, new.k->p.inode, 790 bucket_to_u64(new.k->p), 791 old_lru, new_lru); 792 if (ret) 793 goto err; 794 } 795 796 new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca); 797 if (old_a->fragmentation_lru != new_a->fragmentation_lru) { 798 ret = bch2_lru_change(trans, 799 BCH_LRU_FRAGMENTATION_START, 800 bucket_to_u64(new.k->p), 801 old_a->fragmentation_lru, 
			if (ret)
				goto err;
		}

		if (old_a->gen != new_a->gen) {
			ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
			if (ret)
				goto err;
		}

		/*
		 * need to know if we're getting called from the invalidate path or
		 * not:
		 */

		if ((flags & BTREE_TRIGGER_bucket_invalidate) &&
		    old_a->cached_sectors) {
			ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
							      -((s64) old_a->cached_sectors));
			if (ret)
				goto err;
		}
	}

	if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
		struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
		u64 journal_seq = trans->journal_res.seq;
		u64 bucket_journal_seq = new_a->journal_seq;

		if ((flags & BTREE_TRIGGER_insert) &&
		    data_type_is_empty(old_a->data_type) !=
		    data_type_is_empty(new_a->data_type) &&
		    new.k->type == KEY_TYPE_alloc_v4) {
			struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v;

			/*
			 * If the btree updates referring to a bucket weren't flushed
			 * before the bucket became empty again, then we don't have
			 * to wait on a journal flush before we can reuse the bucket:
			 */
			v->journal_seq = bucket_journal_seq =
				data_type_is_empty(new_a->data_type) &&
				(journal_seq == v->journal_seq ||
				 bch2_journal_noflush_seq(&c->journal, v->journal_seq))
				? 0 : journal_seq;
		}

		if (!data_type_is_empty(old_a->data_type) &&
		    data_type_is_empty(new_a->data_type) &&
		    bucket_journal_seq) {
			ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
					c->journal.flushed_seq_ondisk,
					new.k->p.inode, new.k->p.offset,
					bucket_journal_seq);
			if (ret) {
				bch2_fs_fatal_error(c,
					"setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
				goto err;
			}
		}

		percpu_down_read(&c->mark_lock);
		if (new_a->gen != old_a->gen) {
			u8 *gen = bucket_gen(ca, new.k->p.offset);
			if (unlikely(!gen)) {
				percpu_up_read(&c->mark_lock);
				goto invalid_bucket;
			}
			*gen = new_a->gen;
		}

		bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
		percpu_up_read(&c->mark_lock);

#define eval_state(_a, expr)	({ const struct bch_alloc_v4 *a = _a; expr; })
#define statechange(expr)	!eval_state(old_a, expr) && eval_state(new_a, expr)
#define bucket_flushed(a)	(!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)

		if (statechange(a->data_type == BCH_DATA_free) &&
		    bucket_flushed(new_a))
			closure_wake_up(&c->freelist_wait);

		if (statechange(a->data_type == BCH_DATA_need_discard) &&
		    !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
		    bucket_flushed(new_a))
			bch2_discard_one_bucket_fast(c, new.k->p);

		if (statechange(a->data_type == BCH_DATA_cached) &&
		    !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
		    should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
			bch2_do_invalidates(c);

		if (statechange(a->data_type == BCH_DATA_need_gc_gens))
			bch2_gc_gens_async(c);
	}

	if ((flags & BTREE_TRIGGER_gc) &&
	    (flags & BTREE_TRIGGER_bucket_invalidate)) {
		struct bch_alloc_v4 new_a_convert;
		const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert);

		percpu_down_read(&c->mark_lock);
		struct bucket *g = gc_bucket(ca, new.k->p.offset);
		if (unlikely(!g)) {
			percpu_up_read(&c->mark_lock);
			goto invalid_bucket;
		}
		g->gen_valid = 1;

		bucket_lock(g);

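		/* Mirror the updated alloc key into the in-memory gc bucket: */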
		g->gen_valid		= 1;
		g->gen			= new_a->gen;
		g->data_type		= new_a->data_type;
		g->stripe		= new_a->stripe;
		g->stripe_redundancy	= new_a->stripe_redundancy;
		g->dirty_sectors	= new_a->dirty_sectors;
		g->cached_sectors	= new_a->cached_sectors;

		bucket_unlock(g);
		percpu_up_read(&c->mark_lock);
	}
err:
	printbuf_exit(&buf);
	bch2_dev_put(ca);
	return ret;
invalid_bucket:
	bch2_fs_inconsistent(c, "reference to invalid bucket\n %s",
			     (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf));
	ret = -EIO;
	goto err;
}

/*
 * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for
 * extents style btrees, but works on non-extents btrees:
 */
static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole)
{
	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);

	if (bkey_err(k))
		return k;

	if (k.k->type) {
		return k;
	} else {
		struct btree_iter iter2;
		struct bpos next;

		bch2_trans_copy_iter(&iter2, iter);

		struct btree_path *path = btree_iter_path(iter->trans, iter);
		if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
			end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));

		end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));

		/*
		 * btree node min/max is a closed interval, upto takes a half
		 * open interval:
		 */
		k = bch2_btree_iter_peek_upto(&iter2, end);
		next = iter2.pos;
		bch2_trans_iter_exit(iter->trans, &iter2);

		BUG_ON(next.offset >= iter->pos.offset + U32_MAX);

		if (bkey_err(k))
			return k;

		bkey_init(hole);
		hole->p = iter->pos;

		bch2_key_resize(hole, next.offset - iter->pos.offset);
		return (struct bkey_s_c) { hole, NULL };
	}
}

static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket)
{
	if (*ca) {
		if (bucket->offset < (*ca)->mi.first_bucket)
			bucket->offset = (*ca)->mi.first_bucket;

		if (bucket->offset < (*ca)->mi.nbuckets)
			return true;

		bch2_dev_put(*ca);
		*ca = NULL;
		bucket->inode++;
		bucket->offset = 0;
	}

	rcu_read_lock();
	*ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
	if (*ca) {
		*bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket);
		bch2_dev_get(*ca);
	}
	rcu_read_unlock();

	return *ca != NULL;
}

static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter,
							struct bch_dev **ca, struct bkey *hole)
{
	struct bch_fs *c = iter->trans->c;
	struct bkey_s_c k;
again:
	k = bch2_get_key_or_hole(iter, POS_MAX, hole);
	if (bkey_err(k))
		return k;

	*ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode);

	if (!k.k->type) {
		struct bpos hole_start = bkey_start_pos(k.k);

		if (!*ca || !bucket_valid(*ca, hole_start.offset)) {
			if (!next_bucket(c, ca, &hole_start))
				return bkey_s_c_null;

			bch2_btree_iter_set_pos(iter, hole_start);
			goto again;
		}

		if (k.k->p.offset > (*ca)->mi.nbuckets)
			bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset);
	}

	return k;
}

static noinline_for_stack
int bch2_check_alloc_key(struct btree_trans *trans,
			 struct bkey_s_c alloc_k,
			 struct btree_iter *alloc_iter,
			 struct btree_iter *discard_iter,
			 struct btree_iter *freespace_iter,
			 struct btree_iter *bucket_gens_iter)
{
	struct bch_fs *c = trans->c;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	unsigned discard_key_type, freespace_key_type;
	unsigned gens_offset;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p);
	if (fsck_err_on(!ca,
			c, alloc_key_to_missing_dev_bucket,
			"alloc key for invalid device:bucket %llu:%llu",
			alloc_k.k->p.inode, alloc_k.k->p.offset))
		ret = bch2_btree_delete_at(trans, alloc_iter, 0);
	if (!ca)
		return ret;

	if (!ca->mi.freespace_initialized)
		goto out;

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0;
	bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p);
	k = bch2_btree_iter_peek_slot(discard_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (fsck_err_on(k.k->type != discard_key_type,
			c, need_discard_key_wrong,
			"incorrect key in need_discard btree (got %s should be %s)\n"
			" %s",
			bch2_bkey_types[k.k->type],
			bch2_bkey_types[discard_key_type],
			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type	= discard_key_type;
		update->k.p	= discard_iter->pos;

		ret = bch2_trans_update(trans, discard_iter, update, 0);
		if (ret)
			goto err;
	}

	freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0;
	bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a));
	k = bch2_btree_iter_peek_slot(freespace_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (fsck_err_on(k.k->type != freespace_key_type,
			c, freespace_key_wrong,
			"incorrect key in freespace btree (got %s should be %s)\n"
			" %s",
			bch2_bkey_types[k.k->type],
			bch2_bkey_types[freespace_key_type],
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type	= freespace_key_type;
		update->k.p	= freespace_iter->pos;
		bch2_key_resize(&update->k, 1);

		ret = bch2_trans_update(trans, freespace_iter, update, 0);
		if (ret)
			goto err;
	}

	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset));
	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
			c, bucket_gens_key_wrong,
			"incorrect gen in bucket_gens btree (got %u should be %u)\n"
			" %s",
			alloc_gen(k, gens_offset), a->gen,
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i_bucket_gens *g =
			bch2_trans_kmalloc(trans, sizeof(*g));

		ret = PTR_ERR_OR_ZERO(g);
		if (ret)
			goto err;

		if (k.k->type == KEY_TYPE_bucket_gens) {
			bkey_reassemble(&g->k_i, k);
		} else {
			bkey_bucket_gens_init(&g->k_i);
			g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset);
		}

		g->v.gens[gens_offset] = a->gen;

		ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0);
		if (ret)
			goto err;
	}
out:
err:
fsck_err:
	bch2_dev_put(ca);
	printbuf_exit(&buf);
	return ret;
}

static noinline_for_stack
int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
				    struct bch_dev *ca,
				    struct bpos start,
				    struct bpos *end,
				    struct btree_iter *freespace_iter)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	int ret;

	if (!ca->mi.freespace_initialized)
		return 0;

	bch2_btree_iter_set_pos(freespace_iter, start);

	k = bch2_btree_iter_peek_slot(freespace_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	*end = bkey_min(k.k->p, *end);

	if (fsck_err_on(k.k->type != KEY_TYPE_set,
			c, freespace_hole_missing,
			"hole in alloc btree missing in freespace btree\n"
			" device %llu buckets %llu-%llu",
			freespace_iter->pos.inode,
			freespace_iter->pos.offset,
			end->offset)) {
		struct bkey_i *update =
			bch2_trans_kmalloc(trans, sizeof(*update));

		ret = PTR_ERR_OR_ZERO(update);
		if (ret)
			goto err;

		bkey_init(&update->k);
		update->k.type	= KEY_TYPE_set;
		update->k.p	= freespace_iter->pos;
		bch2_key_resize(&update->k,
				min_t(u64, U32_MAX, end->offset -
				      freespace_iter->pos.offset));

		ret = bch2_trans_update(trans, freespace_iter, update, 0);
		if (ret)
			goto err;
	}
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static noinline_for_stack
int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
				      struct bpos start,
				      struct bpos *end,
				      struct btree_iter *bucket_gens_iter)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	struct printbuf buf = PRINTBUF;
	unsigned i, gens_offset, gens_end_offset;
	int ret;

	bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));

	k = bch2_btree_iter_peek_slot(bucket_gens_iter);
	ret = bkey_err(k);
	if (ret)
		goto err;

	if (bkey_cmp(alloc_gens_pos(start, &gens_offset),
		     alloc_gens_pos(*end, &gens_end_offset)))
		gens_end_offset = KEY_TYPE_BUCKET_GENS_NR;

	if (k.k->type == KEY_TYPE_bucket_gens) {
		struct bkey_i_bucket_gens g;
		bool need_update = false;

		bkey_reassemble(&g.k_i, k);

		for (i = gens_offset; i < gens_end_offset; i++) {
			if (fsck_err_on(g.v.gens[i], c,
					bucket_gens_hole_wrong,
					"hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)",
					bucket_gens_pos_to_alloc(k.k->p, i).inode,
					bucket_gens_pos_to_alloc(k.k->p, i).offset,
					g.v.gens[i])) {
				g.v.gens[i] = 0;
				need_update = true;
			}
		}

		if (need_update) {
			struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));

			ret = PTR_ERR_OR_ZERO(u);
			if (ret)
				goto err;

			memcpy(u, &g, sizeof(g));

			ret = bch2_trans_update(trans, bucket_gens_iter, u, 0);
			if (ret)
				goto err;
		}
	}

	*end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0));
err:
fsck_err:
	printbuf_exit(&buf);
	return ret;
}

static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
								struct btree_iter *iter)
{
	struct bch_fs *c = trans->c;
	struct btree_iter alloc_iter;
	struct bkey_s_c alloc_k;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	u64 genbits;
	struct bpos pos;
	enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard
		? BCH_DATA_need_discard
		: BCH_DATA_free;
	struct printbuf buf = PRINTBUF;
	int ret;

	pos = iter->pos;
	pos.offset &= ~(~0ULL << 56);
	genbits = iter->pos.offset & (~0ULL << 56);

	alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
	ret = bkey_err(alloc_k);
	if (ret)
		return ret;

	if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
			need_discard_freespace_key_to_invalid_dev_bucket,
			"entry in %s btree for nonexistant dev:bucket %llu:%llu",
			bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset))
		goto delete;

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	if (fsck_err_on(a->data_type != state ||
			(state == BCH_DATA_free &&
			 genbits != alloc_freespace_genbits(*a)), c,
			need_discard_freespace_key_bad,
			"%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)",
			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
			bch2_btree_id_str(iter->btree_id),
			iter->pos.inode,
			iter->pos.offset,
			a->data_type == state,
			genbits >> 56, alloc_freespace_genbits(*a) >> 56))
		goto delete;
out:
fsck_err:
	bch2_set_btree_iter_dontneed(&alloc_iter);
	bch2_trans_iter_exit(trans, &alloc_iter);
	printbuf_exit(&buf);
	return ret;
delete:
	ret =   bch2_btree_delete_extent_at(trans, iter,
			iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
			BCH_TRANS_COMMIT_no_enospc);
	goto out;
}

/*
 * We've already checked that generation numbers in the bucket_gens btree are
 * valid for buckets that exist; this just checks for keys for nonexistent
 * buckets.
 */
static noinline_for_stack
int bch2_check_bucket_gens_key(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_bucket_gens g;
	u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
	u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
	u64 b;
	bool need_update = false;
	struct printbuf buf = PRINTBUF;
	int ret = 0;

	BUG_ON(k.k->type != KEY_TYPE_bucket_gens);
	bkey_reassemble(&g.k_i, k);

	struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode);
	if (!ca) {
		if (fsck_err(c, bucket_gens_to_invalid_dev,
			     "bucket_gens key for invalid device:\n %s",
			     (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = bch2_btree_delete_at(trans, iter, 0);
		goto out;
	}

	if (fsck_err_on(end <= ca->mi.first_bucket ||
			start >= ca->mi.nbuckets, c,
			bucket_gens_to_invalid_buckets,
			"bucket_gens key for invalid buckets:\n %s",
			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
		ret = bch2_btree_delete_at(trans, iter, 0);
		goto out;
	}

	for (b = start; b < ca->mi.first_bucket; b++)
		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
				bucket_gens_nonzero_for_invalid_buckets,
				"bucket_gens key has nonzero gen for invalid bucket")) {
			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
			need_update = true;
		}

	for (b = ca->mi.nbuckets; b < end; b++)
		if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c,
				bucket_gens_nonzero_for_invalid_buckets,
				"bucket_gens key has nonzero gen for invalid bucket")) {
			g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0;
			need_update = true;
		}

	if (need_update) {
		struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g));

		ret = PTR_ERR_OR_ZERO(u);
		if (ret)
			goto out;

		memcpy(u, &g, sizeof(g));
		ret = bch2_trans_update(trans, iter, u, 0);
	}
out:
fsck_err:
	bch2_dev_put(ca);
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_alloc_info(struct bch_fs *c)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter;
	struct bch_dev *ca = NULL;
	struct bkey hole;
	struct bkey_s_c k;
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN,
			     BTREE_ITER_prefetch);
	bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN,
			     BTREE_ITER_prefetch);
	bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN,
			     BTREE_ITER_prefetch);
	bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN,
			     BTREE_ITER_prefetch);

	while (1) {
		struct bpos next;

		bch2_trans_begin(trans);

		k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole);
		ret = bkey_err(k);
		if (ret)
			goto bkey_err;

		if (!k.k)
			break;

		if (k.k->type) {
			next = bpos_nosnap_successor(k.k->p);

			ret = bch2_check_alloc_key(trans,
						   k, &iter,
						   &discard_iter,
						   &freespace_iter,
						   &bucket_gens_iter);
			if (ret)
				goto bkey_err;
		} else {
			next = k.k->p;

			ret = bch2_check_alloc_hole_freespace(trans, ca,
						    bkey_start_pos(k.k),
						    &next,
						    &freespace_iter) ?:
				bch2_check_alloc_hole_bucket_gens(trans,
						    bkey_start_pos(k.k),
						    &next,
						    &bucket_gens_iter);
			if (ret)
				goto bkey_err;
		}

		ret = bch2_trans_commit(trans, NULL, NULL,
					BCH_TRANS_COMMIT_no_enospc);
		if (ret)
			goto bkey_err;

		bch2_btree_iter_set_pos(&iter, next);
bkey_err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;
	}
	bch2_trans_iter_exit(trans, &bucket_gens_iter);
	bch2_trans_iter_exit(trans, &freespace_iter);
	bch2_trans_iter_exit(trans, &discard_iter);
	bch2_trans_iter_exit(trans, &iter);
	bch2_dev_put(ca);
	ca = NULL;

	if (ret < 0)
		goto err;

	ret = for_each_btree_key(trans, iter,
				 BTREE_ID_need_discard, POS_MIN,
				 BTREE_ITER_prefetch, k,
		bch2_check_discard_freespace_key(trans, &iter));
	if (ret)
		goto err;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
			     BTREE_ITER_prefetch);
	while (1) {
		bch2_trans_begin(trans);
		k = bch2_btree_iter_peek(&iter);
		if (!k.k)
			break;

		ret = bkey_err(k) ?:
			bch2_check_discard_freespace_key(trans, &iter);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
			ret = 0;
			continue;
		}
		if (ret) {
			struct printbuf buf = PRINTBUF;
			bch2_bkey_val_to_text(&buf, c, k);

			bch_err(c, "while checking %s", buf.buf);
			printbuf_exit(&buf);
			break;
		}

		bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
	}
	bch2_trans_iter_exit(trans, &iter);
	if (ret)
		goto err;

	ret = for_each_btree_key_commit(trans, iter,
			BTREE_ID_bucket_gens, POS_MIN,
			BTREE_ITER_prefetch, k,
			NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
		bch2_check_bucket_gens_key(trans, &iter, k));
err:
	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}

static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
				       struct btree_iter *alloc_iter)
{
	struct bch_fs *c = trans->c;
	struct btree_iter lru_iter;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	struct bkey_s_c alloc_k, lru_k;
	struct printbuf buf = PRINTBUF;
	int ret;

	alloc_k = bch2_btree_iter_peek(alloc_iter);
	if (!alloc_k.k)
		return 0;

	ret = bkey_err(alloc_k);
	if (ret)
		return ret;

	a = bch2_alloc_to_v4(alloc_k, &a_convert);

	if (a->data_type != BCH_DATA_cached)
		return 0;

	if (fsck_err_on(!a->io_time[READ], c,
			alloc_key_cached_but_read_time_zero,
			"cached bucket with read_time 0\n"
			" %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		struct bkey_i_alloc_v4 *a_mut =
			bch2_alloc_to_v4_mut(trans, alloc_k);
		ret = PTR_ERR_OR_ZERO(a_mut);
		if (ret)
			goto err;

		a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
		ret = bch2_trans_update(trans, alloc_iter,
					&a_mut->k_i, BTREE_TRIGGER_norun);
		if (ret)
			goto err;

		a = &a_mut->v;
	}

	lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
				   lru_pos(alloc_k.k->p.inode,
					   bucket_to_u64(alloc_k.k->p),
					   a->io_time[READ]), 0);
	ret = bkey_err(lru_k);
	if (ret)
		return ret;

	if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
			alloc_key_to_missing_lru_entry,
			"missing lru entry\n"
			" %s",
			(printbuf_reset(&buf),
			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
		ret = bch2_lru_set(trans,
				   alloc_k.k->p.inode,
				   bucket_to_u64(alloc_k.k->p),
				   a->io_time[READ]);
		if (ret)
			goto err;
	}
err:
fsck_err:
	bch2_trans_iter_exit(trans, &lru_iter);
	printbuf_exit(&buf);
	return ret;
}

int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
	int ret = bch2_trans_run(c,
		for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
				POS_MIN, BTREE_ITER_prefetch, k,
				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
			bch2_check_alloc_to_lru_ref(trans, &iter)));
	bch_err_fn(c, ret);
	return ret;
}

static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
{
	int ret;

	mutex_lock(&c->discard_buckets_in_flight_lock);
	darray_for_each(c->discard_buckets_in_flight, i)
		if (bkey_eq(*i, bucket)) {
			ret = -EEXIST;
			goto out;
		}

	ret = darray_push(&c->discard_buckets_in_flight, bucket);
out:
	mutex_unlock(&c->discard_buckets_in_flight_lock);
	return ret;
}

static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket)
{
	mutex_lock(&c->discard_buckets_in_flight_lock);
	darray_for_each(c->discard_buckets_in_flight, i)
		if (bkey_eq(*i, bucket)) {
			darray_remove_item(&c->discard_buckets_in_flight, i);
			goto found;
		}
	BUG();
found:
	mutex_unlock(&c->discard_buckets_in_flight_lock);
}

struct discard_buckets_state {
	u64		seen;
	u64		open;
	u64		need_journal_commit;
	u64		discarded;
	struct bch_dev	*ca;
	u64		need_journal_commit_this_dev;
};

static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
{
	if (s->ca == ca)
		return;

	if (s->ca && s->need_journal_commit_this_dev >
	    bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
		bch2_journal_flush_async(&c->journal, NULL);

	if (s->ca)
		percpu_ref_put(&s->ca->io_ref);
	s->ca = ca;
	s->need_journal_commit_this_dev = 0;
}

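/*
 * Descriptive note (added): process one bucket from the need_discard btree -
 * issue the discard to the device when appropriate, then clear the bucket's
 * need_discard flag in its alloc key.
 */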
static int bch2_discard_one_bucket(struct btree_trans *trans,
				   struct btree_iter *need_discard_iter,
				   struct bpos *discard_pos_done,
				   struct discard_buckets_state *s)
{
	struct bch_fs *c = trans->c;
	struct bpos pos = need_discard_iter->pos;
	struct btree_iter iter = { NULL };
	struct bkey_s_c k;
	struct bkey_i_alloc_v4 *a;
	struct printbuf buf = PRINTBUF;
	bool discard_locked = false;
	int ret = 0;

	struct bch_dev *ca = s->ca && s->ca->dev_idx == pos.inode
		? s->ca
		: bch2_dev_get_ioref(c, pos.inode, WRITE);
	if (!ca) {
		bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
		return 0;
	}

	discard_buckets_next_dev(c, s, ca);

	if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
		s->open++;
		goto out;
	}

	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
			c->journal.flushed_seq_ondisk,
			pos.inode, pos.offset)) {
		s->need_journal_commit++;
		s->need_journal_commit_this_dev++;
		goto out;
	}

	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
			       need_discard_iter->pos,
			       BTREE_ITER_cached);
	ret = bkey_err(k);
	if (ret)
		goto out;

	a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		goto out;

	if (bch2_bucket_sectors_total(a->v)) {
		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
					       trans, "attempting to discard bucket with dirty data\n%s",
					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = -EIO;
		goto out;
	}

	if (a->v.data_type != BCH_DATA_need_discard) {
		if (data_type_is_empty(a->v.data_type) &&
		    BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
			a->v.gen++;
			SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
			goto write;
		}

		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
					       trans, "bucket incorrectly set in need_discard btree\n"
					       "%s",
					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = -EIO;
		goto out;
	}

	if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
					       trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
					       a->v.journal_seq,
					       c->journal.flushed_seq_ondisk,
					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
			ret = -EIO;
		goto out;
	}

	if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true)))
		goto out;

	discard_locked = true;

	if (!bkey_eq(*discard_pos_done, iter.pos) &&
	    ca->mi.discard && !c->opts.nochanges) {
		/*
		 * This works without any other locks because this is the only
		 * thread that removes items from the need_discard tree
		 */
		bch2_trans_unlock_long(trans);
		blkdev_issue_discard(ca->disk_sb.bdev,
				     k.k->p.offset * ca->mi.bucket_size,
				     ca->mi.bucket_size,
				     GFP_KERNEL);
		*discard_pos_done = iter.pos;

		ret = bch2_trans_relock_notrace(trans);
		if (ret)
			goto out;
	}

	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
	alloc_data_type_set(&a->v, a->v.data_type);
write:
	ret =   bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BCH_WATERMARK_btree|
				  BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto out;

	count_event(c, bucket_discard);
	s->discarded++;
out:
	if (discard_locked)
		discard_in_flight_remove(c, iter.pos);
	s->seen++;
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
	return ret;
}

static void bch2_do_discards_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
	struct discard_buckets_state s = {};
	struct bpos discard_pos_done = POS_MAX;
	int ret;

	/*
	 * We're doing the commit in bch2_discard_one_bucket instead of using
	 * for_each_btree_key_commit() so that we can increment counters after
	 * successful commit:
	 */
	ret = bch2_trans_run(c,
		for_each_btree_key(trans, iter,
				   BTREE_ID_need_discard, POS_MIN, 0, k,
			bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));

	discard_buckets_next_dev(c, &s, NULL);

	trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
			      bch2_err_str(ret));

	bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}

void bch2_do_discards(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) &&
	    !queue_work(c->write_ref_wq, &c->discard_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}

static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
{
	struct btree_iter iter;
	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent);
	struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
	int ret = bkey_err(k);
	if (ret)
		goto err;

	struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		goto err;

	BUG_ON(a->v.dirty_sectors);
	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
	alloc_data_type_set(&a->v, a->v.data_type);

	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

static void bch2_do_discards_fast_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work);

	while (1) {
		bool got_bucket = false;
		struct bpos bucket;
		struct bch_dev *ca;

		mutex_lock(&c->discard_buckets_in_flight_lock);
		darray_for_each(c->discard_buckets_in_flight, i) {
			if (i->snapshot)
				continue;

			ca = bch2_dev_get_ioref(c, i->inode, WRITE);
			if (!ca) {
				darray_remove_item(&c->discard_buckets_in_flight, i);
				continue;
			}

			got_bucket = true;
			bucket = *i;
			i->snapshot = true;
			break;
		}
		mutex_unlock(&c->discard_buckets_in_flight_lock);

		if (!got_bucket)
			break;

		if (ca->mi.discard && !c->opts.nochanges)
			blkdev_issue_discard(ca->disk_sb.bdev,
					     bucket.offset * ca->mi.bucket_size,
					     ca->mi.bucket_size,
					     GFP_KERNEL);

		int ret = bch2_trans_do(c, NULL, NULL,
					BCH_WATERMARK_btree|
					BCH_TRANS_COMMIT_no_enospc,
				bch2_clear_bucket_needs_discard(trans, bucket));
		bch_err_fn(c, ret);

		percpu_ref_put(&ca->io_ref);
		discard_in_flight_remove(c, bucket);

		if (ret)
			break;
	}

	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}

static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
{
	rcu_read_lock();
	struct bch_dev *ca = bch2_dev_rcu(c, bucket.inode);
	bool dead = !ca || percpu_ref_is_dying(&ca->io_ref);
	rcu_read_unlock();

	if (!dead &&
	    !discard_in_flight_add(c, bucket) &&
	    bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
	    !queue_work(c->write_ref_wq, &c->discard_fast_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
}

static int invalidate_one_bucket(struct btree_trans *trans,
				 struct btree_iter *lru_iter,
				 struct bkey_s_c lru_k,
				 s64 *nr_to_invalidate)
{
	struct bch_fs *c = trans->c;
	struct bkey_i_alloc_v4 *a = NULL;
	struct printbuf buf = PRINTBUF;
	struct bpos bucket = u64_to_bucket(lru_k.k->p.offset);
	unsigned cached_sectors;
	int ret = 0;

	if (*nr_to_invalidate <= 0)
		return 1;

	if (!bch2_dev_bucket_exists(c, bucket)) {
		prt_str(&buf, "lru entry points to invalid bucket");
		goto err;
	}

	if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset))
		return 0;

	a = bch2_trans_start_alloc_update(trans, bucket);
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		goto out;

	/* We expect harmless races here due to the btree write buffer: */
	if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v))
		goto out;

	BUG_ON(a->v.data_type != BCH_DATA_cached);
	BUG_ON(a->v.dirty_sectors);

	if (!a->v.cached_sectors)
		bch_err(c, "invalidating empty bucket, confused");

	cached_sectors = a->v.cached_sectors;

	SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
	a->v.gen++;
	a->v.data_type		= 0;
	a->v.dirty_sectors	= 0;
	a->v.cached_sectors	= 0;
	a->v.io_time[READ]	= atomic64_read(&c->io_clock[READ].now);
	a->v.io_time[WRITE]	= atomic64_read(&c->io_clock[WRITE].now);

	ret = bch2_trans_commit(trans, NULL, NULL,
				BCH_WATERMARK_btree|
				BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto out;

	trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
	--*nr_to_invalidate;
out:
	printbuf_exit(&buf);
	return ret;
err:
	prt_str(&buf, "\n lru key: ");
	bch2_bkey_val_to_text(&buf, c, lru_k);

	prt_str(&buf, "\n lru entry: ");
	bch2_lru_pos_to_text(&buf, lru_iter->pos);

	prt_str(&buf, "\n alloc key: ");
	if (!a)
		bch2_bpos_to_text(&buf, bucket);
	else
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));

	bch_err(c, "%s", buf.buf);
	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
		bch2_inconsistent_error(c);
		ret = -EINVAL;
	}

	goto out;
}

static void bch2_do_invalidates_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
	struct btree_trans *trans = bch2_trans_get(c);
	int ret = 0;

	ret = bch2_btree_write_buffer_tryflush(trans);
	if (ret)
		goto err;

	for_each_member_device(c, ca) {
		s64 nr_to_invalidate =
			should_invalidate_buckets(ca, bch2_dev_usage_read(ca));

		ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
				lru_pos(ca->dev_idx, 0, 0),
				lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
				BTREE_ITER_intent, k,
			invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate));

		if (ret < 0) {
			bch2_dev_put(ca);
			break;
		}
	}
err:
	bch2_trans_put(trans);
	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}

void bch2_do_invalidates(struct bch_fs *c)
{
	if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) &&
	    !queue_work(c->write_ref_wq, &c->invalidate_work))
		bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}

int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
			    u64 bucket_start, u64 bucket_end)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey hole;
	struct bpos end = POS(ca->dev_idx, bucket_end);
	struct bch_member *m;
	unsigned long last_updated = jiffies;
int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
			    u64 bucket_start, u64 bucket_end)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey hole;
	struct bpos end = POS(ca->dev_idx, bucket_end);
	struct bch_member *m;
	unsigned long last_updated = jiffies;
	int ret;

	BUG_ON(bucket_start > bucket_end);
	BUG_ON(bucket_end > ca->mi.nbuckets);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
		POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
		BTREE_ITER_prefetch);
	/*
	 * Scan the alloc btree for every bucket on @ca, and add buckets to the
	 * freespace/need_discard/need_gc_gens btrees as needed:
	 */
	while (1) {
		if (last_updated + HZ * 10 < jiffies) {
			bch_info(ca, "%s: currently at %llu/%llu",
				 __func__, iter.pos.offset, ca->mi.nbuckets);
			last_updated = jiffies;
		}

		bch2_trans_begin(trans);

		if (bkey_ge(iter.pos, end)) {
			ret = 0;
			break;
		}

		k = bch2_get_key_or_hole(&iter, end, &hole);
		ret = bkey_err(k);
		if (ret)
			goto bkey_err;

		if (k.k->type) {
			/*
			 * We process live keys in the alloc btree one at a
			 * time:
			 */
			struct bch_alloc_v4 a_convert;
			const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);

			ret = bch2_bucket_do_index(trans, ca, k, a, true) ?:
				bch2_trans_commit(trans, NULL, NULL,
						  BCH_TRANS_COMMIT_no_enospc);
			if (ret)
				goto bkey_err;

			bch2_btree_iter_advance(&iter);
		} else {
			struct bkey_i *freespace;

			freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
			ret = PTR_ERR_OR_ZERO(freespace);
			if (ret)
				goto bkey_err;

			bkey_init(&freespace->k);
			freespace->k.type = KEY_TYPE_set;
			freespace->k.p = k.k->p;
			freespace->k.size = k.k->size;

			ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
				bch2_trans_commit(trans, NULL, NULL,
						  BCH_TRANS_COMMIT_no_enospc);
			if (ret)
				goto bkey_err;

			bch2_btree_iter_set_pos(&iter, k.k->p);
		}
bkey_err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);

	if (ret < 0) {
		bch_err_msg(ca, ret, "initializing free space");
		return ret;
	}

	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
	SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
	mutex_unlock(&c->sb_lock);

	return 0;
}

int bch2_fs_freespace_init(struct bch_fs *c)
{
	int ret = 0;
	bool doing_init = false;

	/*
	 * We can crash during the device add path, so we need to check this on
	 * every mount:
	 */

	for_each_member_device(c, ca) {
		if (ca->mi.freespace_initialized)
			continue;

		if (!doing_init) {
			bch_info(c, "initializing freespace");
			doing_init = true;
		}

		ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
		if (ret) {
			bch2_dev_put(ca);
			bch_err_fn(c, ret);
			return ret;
		}
	}

	if (doing_init) {
		mutex_lock(&c->sb_lock);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
		bch_verbose(c, "done initializing freespace");
	}

	return 0;
}

/* Bucket IO clocks: */

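/*
 * Reset the last-IO time of a bucket to "now" on the given clock (READ or
 * WRITE). The alloc key is only rewritten if the stored time differs from the
 * current clock value; the io_time fields feed the LRU index used above when
 * picking cached buckets to invalidate.
 */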
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
			      size_t bucket_nr, int rw)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_alloc_v4 *a;
	u64 now;
	int ret = 0;

	if (bch2_trans_relock(trans))
		bch2_trans_begin(trans);

	a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ret;

	now = atomic64_read(&c->io_clock[rw].now);
	if (a->v.io_time[rw] == now)
		goto out;

	a->v.io_time[rw] = now;

	ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Startup/shutdown (ro/rw): */

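/*
 * Recompute c->capacity from the currently-rw member devices: sum their
 * usable sectors, then subtract a reservation large enough for copygc and the
 * internal (btree/copygc/rebalance) write points to keep making forward
 * progress, bounded below by the configured gc reserve (gc_reserve_bytes or
 * gc_reserve_percent) and above by the raw capacity. Caller must hold
 * c->state_lock; called e.g. from bch2_dev_allocator_remove() below when a
 * device goes ro.
 */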
void bch2_recalc_capacity(struct bch_fs *c)
{
	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
	unsigned bucket_size_max = 0;
	unsigned long ra_pages = 0;

	lockdep_assert_held(&c->state_lock);

	for_each_online_member(c, ca) {
		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;

		ra_pages += bdi->ra_pages;
	}

	bch2_set_ra_pages(c, ra_pages);

	for_each_rw_member(c, ca) {
		u64 dev_reserve = 0;

		/*
		 * We need to reserve buckets (from the number
		 * of currently available buckets) against
		 * foreground writes so that mainly copygc can
		 * make forward progress.
		 *
		 * We need enough to refill the various reserves
		 * from scratch - copygc will use its entire
		 * reserve all at once, then run again when
		 * its reserve is refilled (from the formerly
		 * available buckets).
		 *
		 * This reserve is just used when considering if
		 * allocations for foreground writes must wait -
		 * not -ENOSPC calculations.
		 */

		dev_reserve += ca->nr_btree_reserve * 2;
		dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */

		dev_reserve += 1;	/* btree write point */
		dev_reserve += 1;	/* copygc write point */
		dev_reserve += 1;	/* rebalance write point */

		dev_reserve *= ca->mi.bucket_size;

		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
					     ca->mi.first_bucket);

		reserved_sectors += dev_reserve * 2;

		bucket_size_max = max_t(unsigned, bucket_size_max,
					ca->mi.bucket_size);
	}

	gc_reserve = c->opts.gc_reserve_bytes
		? c->opts.gc_reserve_bytes >> 9
		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);

	reserved_sectors = max(gc_reserve, reserved_sectors);

	reserved_sectors = min(reserved_sectors, capacity);

	c->capacity = capacity - reserved_sectors;

	c->bucket_size_max = bucket_size_max;

	/* Wake up in case someone was waiting for buckets */
	closure_wake_up(&c->freelist_wait);
}

u64 bch2_min_rw_member_capacity(struct bch_fs *c)
{
	u64 ret = U64_MAX;

	for_each_rw_member(c, ca)
		ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
	return ret;
}

static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
	struct open_bucket *ob;
	bool ret = false;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && !ob->on_partial_list &&
		    ob->dev == ca->dev_idx)
			ret = true;
		spin_unlock(&ob->lock);
	}

	return ret;
}

/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	/* First, remove device from allocation groups: */

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		clear_bit(ca->dev_idx, c->rw_devs[i].d);

	/*
	 * Capacity is calculated based off of devices in allocation groups:
	 */
	bch2_recalc_capacity(c);

	bch2_open_buckets_stop(c, ca, false);

	/*
	 * Wake up threads that were blocked on allocation, so they can notice
	 * the device can no longer be removed and the capacity has changed:
	 */
	closure_wake_up(&c->freelist_wait);

	/*
	 * journal_res_get() can block waiting for free space in the journal -
	 * it needs to notice there may not be devices to allocate from anymore:
	 */
	wake_up(&c->journal.wait);

	/* Now wait for any in flight writes: */

	closure_wait_event(&c->open_buckets_wait,
			   !bch2_dev_has_open_write_point(c, ca));
}

/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		if (ca->mi.data_allowed & (1 << i))
			set_bit(ca->dev_idx, c->rw_devs[i].d);
}

void bch2_fs_allocator_background_exit(struct bch_fs *c)
{
	darray_exit(&c->discard_buckets_in_flight);
}

void bch2_fs_allocator_background_init(struct bch_fs *c)
{
	spin_lock_init(&c->freelist_lock);
	mutex_init(&c->discard_buckets_in_flight_lock);
	INIT_WORK(&c->discard_work, bch2_do_discards_work);
	INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work);
	INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
}