1 // SPDX-License-Identifier: GPL-2.0 2 #include "bcachefs.h" 3 #include "alloc_background.h" 4 #include "alloc_foreground.h" 5 #include "backpointers.h" 6 #include "bkey_buf.h" 7 #include "btree_cache.h" 8 #include "btree_io.h" 9 #include "btree_key_cache.h" 10 #include "btree_update.h" 11 #include "btree_update_interior.h" 12 #include "btree_gc.h" 13 #include "btree_write_buffer.h" 14 #include "buckets.h" 15 #include "buckets_waiting_for_journal.h" 16 #include "clock.h" 17 #include "debug.h" 18 #include "disk_accounting.h" 19 #include "ec.h" 20 #include "error.h" 21 #include "lru.h" 22 #include "recovery.h" 23 #include "trace.h" 24 #include "varint.h" 25 26 #include <linux/kthread.h> 27 #include <linux/math64.h> 28 #include <linux/random.h> 29 #include <linux/rculist.h> 30 #include <linux/rcupdate.h> 31 #include <linux/sched/task.h> 32 #include <linux/sort.h> 33 34 static void bch2_discard_one_bucket_fast(struct bch_dev *, u64); 35 36 /* Persistent alloc info: */ 37 38 static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { 39 #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, 40 BCH_ALLOC_FIELDS_V1() 41 #undef x 42 }; 43 44 struct bkey_alloc_unpacked { 45 u64 journal_seq; 46 u8 gen; 47 u8 oldest_gen; 48 u8 data_type; 49 bool need_discard:1; 50 bool need_inc_gen:1; 51 #define x(_name, _bits) u##_bits _name; 52 BCH_ALLOC_FIELDS_V2() 53 #undef x 54 }; 55 56 static inline u64 alloc_field_v1_get(const struct bch_alloc *a, 57 const void **p, unsigned field) 58 { 59 unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; 60 u64 v; 61 62 if (!(a->fields & (1 << field))) 63 return 0; 64 65 switch (bytes) { 66 case 1: 67 v = *((const u8 *) *p); 68 break; 69 case 2: 70 v = le16_to_cpup(*p); 71 break; 72 case 4: 73 v = le32_to_cpup(*p); 74 break; 75 case 8: 76 v = le64_to_cpup(*p); 77 break; 78 default: 79 BUG(); 80 } 81 82 *p += bytes; 83 return v; 84 } 85 86 static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, 87 struct bkey_s_c k) 88 { 89 const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; 90 const void *d = in->data; 91 unsigned idx = 0; 92 93 out->gen = in->gen; 94 95 #define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); 96 BCH_ALLOC_FIELDS_V1() 97 #undef x 98 } 99 100 static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, 101 struct bkey_s_c k) 102 { 103 struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); 104 const u8 *in = a.v->data; 105 const u8 *end = bkey_val_end(a); 106 unsigned fieldnr = 0; 107 int ret; 108 u64 v; 109 110 out->gen = a.v->gen; 111 out->oldest_gen = a.v->oldest_gen; 112 out->data_type = a.v->data_type; 113 114 #define x(_name, _bits) \ 115 if (fieldnr < a.v->nr_fields) { \ 116 ret = bch2_varint_decode_fast(in, end, &v); \ 117 if (ret < 0) \ 118 return ret; \ 119 in += ret; \ 120 } else { \ 121 v = 0; \ 122 } \ 123 out->_name = v; \ 124 if (v != out->_name) \ 125 return -1; \ 126 fieldnr++; 127 128 BCH_ALLOC_FIELDS_V2() 129 #undef x 130 return 0; 131 } 132 133 static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, 134 struct bkey_s_c k) 135 { 136 struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); 137 const u8 *in = a.v->data; 138 const u8 *end = bkey_val_end(a); 139 unsigned fieldnr = 0; 140 int ret; 141 u64 v; 142 143 out->gen = a.v->gen; 144 out->oldest_gen = a.v->oldest_gen; 145 out->data_type = a.v->data_type; 146 out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); 147 out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); 148 out->journal_seq = le64_to_cpu(a.v->journal_seq); 149 150 #define x(_name, _bits) \ 151 
if (fieldnr < a.v->nr_fields) { \ 152 ret = bch2_varint_decode_fast(in, end, &v); \ 153 if (ret < 0) \ 154 return ret; \ 155 in += ret; \ 156 } else { \ 157 v = 0; \ 158 } \ 159 out->_name = v; \ 160 if (v != out->_name) \ 161 return -1; \ 162 fieldnr++; 163 164 BCH_ALLOC_FIELDS_V2() 165 #undef x 166 return 0; 167 } 168 169 static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) 170 { 171 struct bkey_alloc_unpacked ret = { .gen = 0 }; 172 173 switch (k.k->type) { 174 case KEY_TYPE_alloc: 175 bch2_alloc_unpack_v1(&ret, k); 176 break; 177 case KEY_TYPE_alloc_v2: 178 bch2_alloc_unpack_v2(&ret, k); 179 break; 180 case KEY_TYPE_alloc_v3: 181 bch2_alloc_unpack_v3(&ret, k); 182 break; 183 } 184 185 return ret; 186 } 187 188 static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) 189 { 190 unsigned i, bytes = offsetof(struct bch_alloc, data); 191 192 for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) 193 if (a->fields & (1 << i)) 194 bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; 195 196 return DIV_ROUND_UP(bytes, sizeof(u64)); 197 } 198 199 int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, 200 enum bch_validate_flags flags) 201 { 202 struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); 203 int ret = 0; 204 205 /* allow for unknown fields */ 206 bkey_fsck_err_on(bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v), 207 c, alloc_v1_val_size_bad, 208 "incorrect value size (%zu < %u)", 209 bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); 210 fsck_err: 211 return ret; 212 } 213 214 int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, 215 enum bch_validate_flags flags) 216 { 217 struct bkey_alloc_unpacked u; 218 int ret = 0; 219 220 bkey_fsck_err_on(bch2_alloc_unpack_v2(&u, k), 221 c, alloc_v2_unpack_error, 222 "unpack error"); 223 fsck_err: 224 return ret; 225 } 226 227 int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, 228 enum bch_validate_flags flags) 229 { 230 struct bkey_alloc_unpacked u; 231 int ret = 0; 232 233 bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), 234 c, alloc_v2_unpack_error, 235 "unpack error"); 236 fsck_err: 237 return ret; 238 } 239 240 int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, 241 enum bch_validate_flags flags) 242 { 243 struct bch_alloc_v4 a; 244 int ret = 0; 245 246 bkey_val_copy(&a, bkey_s_c_to_alloc_v4(k)); 247 248 bkey_fsck_err_on(alloc_v4_u64s_noerror(&a) > bkey_val_u64s(k.k), 249 c, alloc_v4_val_size_bad, 250 "bad val size (%u > %zu)", 251 alloc_v4_u64s_noerror(&a), bkey_val_u64s(k.k)); 252 253 bkey_fsck_err_on(!BCH_ALLOC_V4_BACKPOINTERS_START(&a) && 254 BCH_ALLOC_V4_NR_BACKPOINTERS(&a), 255 c, alloc_v4_backpointers_start_bad, 256 "invalid backpointers_start"); 257 258 bkey_fsck_err_on(alloc_data_type(a, a.data_type) != a.data_type, 259 c, alloc_key_data_type_bad, 260 "invalid data type (got %u should be %u)", 261 a.data_type, alloc_data_type(a, a.data_type)); 262 263 for (unsigned i = 0; i < 2; i++) 264 bkey_fsck_err_on(a.io_time[i] > LRU_TIME_MAX, 265 c, alloc_key_io_time_bad, 266 "invalid io_time[%s]: %llu, max %llu", 267 i == READ ? "read" : "write", 268 a.io_time[i], LRU_TIME_MAX); 269 270 unsigned stripe_sectors = BCH_ALLOC_V4_BACKPOINTERS_START(&a) * sizeof(u64) > 271 offsetof(struct bch_alloc_v4, stripe_sectors) 272 ? 
a.stripe_sectors 273 : 0; 274 275 switch (a.data_type) { 276 case BCH_DATA_free: 277 case BCH_DATA_need_gc_gens: 278 case BCH_DATA_need_discard: 279 bkey_fsck_err_on(stripe_sectors || 280 a.dirty_sectors || 281 a.cached_sectors || 282 a.stripe, 283 c, alloc_key_empty_but_have_data, 284 "empty data type free but have data %u.%u.%u %u", 285 stripe_sectors, 286 a.dirty_sectors, 287 a.cached_sectors, 288 a.stripe); 289 break; 290 case BCH_DATA_sb: 291 case BCH_DATA_journal: 292 case BCH_DATA_btree: 293 case BCH_DATA_user: 294 case BCH_DATA_parity: 295 bkey_fsck_err_on(!a.dirty_sectors && 296 !stripe_sectors, 297 c, alloc_key_dirty_sectors_0, 298 "data_type %s but dirty_sectors==0", 299 bch2_data_type_str(a.data_type)); 300 break; 301 case BCH_DATA_cached: 302 bkey_fsck_err_on(!a.cached_sectors || 303 a.dirty_sectors || 304 stripe_sectors || 305 a.stripe, 306 c, alloc_key_cached_inconsistency, 307 "data type inconsistency"); 308 309 bkey_fsck_err_on(!a.io_time[READ] && 310 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, 311 c, alloc_key_cached_but_read_time_zero, 312 "cached bucket with read_time == 0"); 313 break; 314 case BCH_DATA_stripe: 315 break; 316 } 317 fsck_err: 318 return ret; 319 } 320 321 void bch2_alloc_v4_swab(struct bkey_s k) 322 { 323 struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; 324 struct bch_backpointer *bp, *bps; 325 326 a->journal_seq = swab64(a->journal_seq); 327 a->flags = swab32(a->flags); 328 a->dirty_sectors = swab32(a->dirty_sectors); 329 a->cached_sectors = swab32(a->cached_sectors); 330 a->io_time[0] = swab64(a->io_time[0]); 331 a->io_time[1] = swab64(a->io_time[1]); 332 a->stripe = swab32(a->stripe); 333 a->nr_external_backpointers = swab32(a->nr_external_backpointers); 334 a->fragmentation_lru = swab64(a->fragmentation_lru); 335 a->stripe_sectors = swab32(a->stripe_sectors); 336 337 bps = alloc_v4_backpointers(a); 338 for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { 339 bp->bucket_offset = swab40(bp->bucket_offset); 340 bp->bucket_len = swab32(bp->bucket_len); 341 bch2_bpos_swab(&bp->pos); 342 } 343 } 344 345 void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) 346 { 347 struct bch_alloc_v4 _a; 348 const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); 349 350 prt_newline(out); 351 printbuf_indent_add(out, 2); 352 353 prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); 354 bch2_prt_data_type(out, a->data_type); 355 prt_newline(out); 356 prt_printf(out, "journal_seq %llu\n", a->journal_seq); 357 prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); 358 prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); 359 prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); 360 prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); 361 prt_printf(out, "cached_sectors %u\n", a->cached_sectors); 362 prt_printf(out, "stripe %u\n", a->stripe); 363 prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); 364 prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); 365 prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); 366 prt_printf(out, "fragmentation %llu\n", a->fragmentation_lru); 367 prt_printf(out, "bp_start %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a)); 368 printbuf_indent_sub(out, 2); 369 } 370 371 void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) 372 { 373 if (k.k->type == KEY_TYPE_alloc_v4) { 374 void *src, *dst; 375 376 *out = *bkey_s_c_to_alloc_v4(k).v; 377 378 src = alloc_v4_backpointers(out); 379 
SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); 380 dst = alloc_v4_backpointers(out); 381 382 if (src < dst) 383 memset(src, 0, dst - src); 384 385 SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0); 386 } else { 387 struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); 388 389 *out = (struct bch_alloc_v4) { 390 .journal_seq = u.journal_seq, 391 .flags = u.need_discard, 392 .gen = u.gen, 393 .oldest_gen = u.oldest_gen, 394 .data_type = u.data_type, 395 .stripe_redundancy = u.stripe_redundancy, 396 .dirty_sectors = u.dirty_sectors, 397 .cached_sectors = u.cached_sectors, 398 .io_time[READ] = u.read_time, 399 .io_time[WRITE] = u.write_time, 400 .stripe = u.stripe, 401 }; 402 403 SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); 404 } 405 } 406 407 static noinline struct bkey_i_alloc_v4 * 408 __bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) 409 { 410 struct bkey_i_alloc_v4 *ret; 411 412 ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4))); 413 if (IS_ERR(ret)) 414 return ret; 415 416 if (k.k->type == KEY_TYPE_alloc_v4) { 417 void *src, *dst; 418 419 bkey_reassemble(&ret->k_i, k); 420 421 src = alloc_v4_backpointers(&ret->v); 422 SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); 423 dst = alloc_v4_backpointers(&ret->v); 424 425 if (src < dst) 426 memset(src, 0, dst - src); 427 428 SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0); 429 set_alloc_v4_u64s(ret); 430 } else { 431 bkey_alloc_v4_init(&ret->k_i); 432 ret->k.p = k.k->p; 433 bch2_alloc_to_v4(k, &ret->v); 434 } 435 return ret; 436 } 437 438 static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) 439 { 440 struct bkey_s_c_alloc_v4 a; 441 442 if (likely(k.k->type == KEY_TYPE_alloc_v4) && 443 ((a = bkey_s_c_to_alloc_v4(k), true) && 444 BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) 445 return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4); 446 447 return __bch2_alloc_to_v4_mut(trans, k); 448 } 449 450 struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) 451 { 452 return bch2_alloc_to_v4_mut_inlined(trans, k); 453 } 454 455 struct bkey_i_alloc_v4 * 456 bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter, 457 struct bpos pos) 458 { 459 struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, 460 BTREE_ITER_with_updates| 461 BTREE_ITER_cached| 462 BTREE_ITER_intent); 463 int ret = bkey_err(k); 464 if (unlikely(ret)) 465 return ERR_PTR(ret); 466 467 struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); 468 ret = PTR_ERR_OR_ZERO(a); 469 if (unlikely(ret)) 470 goto err; 471 return a; 472 err: 473 bch2_trans_iter_exit(trans, iter); 474 return ERR_PTR(ret); 475 } 476 477 __flatten 478 struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos, 479 enum btree_iter_update_trigger_flags flags) 480 { 481 struct btree_iter iter; 482 struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); 483 int ret = PTR_ERR_OR_ZERO(a); 484 if (ret) 485 return ERR_PTR(ret); 486 487 ret = bch2_trans_update(trans, &iter, &a->k_i, flags); 488 bch2_trans_iter_exit(trans, &iter); 489 return unlikely(ret) ? 
ERR_PTR(ret) : a; 490 } 491 492 static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) 493 { 494 *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; 495 496 pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS; 497 return pos; 498 } 499 500 static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset) 501 { 502 pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS; 503 pos.offset += offset; 504 return pos; 505 } 506 507 static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) 508 { 509 return k.k->type == KEY_TYPE_bucket_gens 510 ? bkey_s_c_to_bucket_gens(k).v->gens[offset] 511 : 0; 512 } 513 514 int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, 515 enum bch_validate_flags flags) 516 { 517 int ret = 0; 518 519 bkey_fsck_err_on(bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens), 520 c, bucket_gens_val_size_bad, 521 "bad val size (%zu != %zu)", 522 bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); 523 fsck_err: 524 return ret; 525 } 526 527 void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) 528 { 529 struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k); 530 unsigned i; 531 532 for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) { 533 if (i) 534 prt_char(out, ' '); 535 prt_printf(out, "%u", g.v->gens[i]); 536 } 537 } 538 539 int bch2_bucket_gens_init(struct bch_fs *c) 540 { 541 struct btree_trans *trans = bch2_trans_get(c); 542 struct bkey_i_bucket_gens g; 543 bool have_bucket_gens_key = false; 544 int ret; 545 546 ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, 547 BTREE_ITER_prefetch, k, ({ 548 /* 549 * Not a fsck error because this is checked/repaired by 550 * bch2_check_alloc_key() which runs later: 551 */ 552 if (!bch2_dev_bucket_exists(c, k.k->p)) 553 continue; 554 555 struct bch_alloc_v4 a; 556 u8 gen = bch2_alloc_to_v4(k, &a)->gen; 557 unsigned offset; 558 struct bpos pos = alloc_gens_pos(iter.pos, &offset); 559 int ret2 = 0; 560 561 if (have_bucket_gens_key && !bkey_eq(g.k.p, pos)) { 562 ret2 = bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?: 563 bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 564 if (ret2) 565 goto iter_err; 566 have_bucket_gens_key = false; 567 } 568 569 if (!have_bucket_gens_key) { 570 bkey_bucket_gens_init(&g.k_i); 571 g.k.p = pos; 572 have_bucket_gens_key = true; 573 } 574 575 g.v.gens[offset] = gen; 576 iter_err: 577 ret2; 578 })); 579 580 if (have_bucket_gens_key && !ret) 581 ret = commit_do(trans, NULL, NULL, 582 BCH_TRANS_COMMIT_no_enospc, 583 bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); 584 585 bch2_trans_put(trans); 586 587 bch_err_fn(c, ret); 588 return ret; 589 } 590 591 int bch2_alloc_read(struct bch_fs *c) 592 { 593 struct btree_trans *trans = bch2_trans_get(c); 594 struct bch_dev *ca = NULL; 595 int ret; 596 597 if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { 598 ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN, 599 BTREE_ITER_prefetch, k, ({ 600 u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; 601 u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; 602 603 if (k.k->type != KEY_TYPE_bucket_gens) 604 continue; 605 606 ca = bch2_dev_iterate(c, ca, k.k->p.inode); 607 /* 608 * Not a fsck error because this is checked/repaired by 609 * bch2_check_alloc_key() which runs later: 610 */ 611 if (!ca) { 612 bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); 613 continue; 614 } 615 616 const struct bch_bucket_gens *g = 
bkey_s_c_to_bucket_gens(k).v; 617 618 for (u64 b = max_t(u64, ca->mi.first_bucket, start); 619 b < min_t(u64, ca->mi.nbuckets, end); 620 b++) 621 *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; 622 0; 623 })); 624 } else { 625 ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, 626 BTREE_ITER_prefetch, k, ({ 627 ca = bch2_dev_iterate(c, ca, k.k->p.inode); 628 /* 629 * Not a fsck error because this is checked/repaired by 630 * bch2_check_alloc_key() which runs later: 631 */ 632 if (!ca) { 633 bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); 634 continue; 635 } 636 637 struct bch_alloc_v4 a; 638 *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; 639 0; 640 })); 641 } 642 643 bch2_dev_put(ca); 644 bch2_trans_put(trans); 645 646 bch_err_fn(c, ret); 647 return ret; 648 } 649 650 /* Free space/discard btree: */ 651 652 static int bch2_bucket_do_index(struct btree_trans *trans, 653 struct bch_dev *ca, 654 struct bkey_s_c alloc_k, 655 const struct bch_alloc_v4 *a, 656 bool set) 657 { 658 struct bch_fs *c = trans->c; 659 struct btree_iter iter; 660 struct bkey_s_c old; 661 struct bkey_i *k; 662 enum btree_id btree; 663 enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; 664 enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted; 665 struct printbuf buf = PRINTBUF; 666 int ret; 667 668 if (a->data_type != BCH_DATA_free && 669 a->data_type != BCH_DATA_need_discard) 670 return 0; 671 672 k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); 673 if (IS_ERR(k)) 674 return PTR_ERR(k); 675 676 bkey_init(&k->k); 677 k->k.type = new_type; 678 679 switch (a->data_type) { 680 case BCH_DATA_free: 681 btree = BTREE_ID_freespace; 682 k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); 683 bch2_key_resize(&k->k, 1); 684 break; 685 case BCH_DATA_need_discard: 686 btree = BTREE_ID_need_discard; 687 k->k.p = alloc_k.k->p; 688 break; 689 default: 690 return 0; 691 } 692 693 old = bch2_bkey_get_iter(trans, &iter, btree, 694 bkey_start_pos(&k->k), 695 BTREE_ITER_intent); 696 ret = bkey_err(old); 697 if (ret) 698 return ret; 699 700 if (ca->mi.freespace_initialized && 701 c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && 702 bch2_trans_inconsistent_on(old.k->type != old_type, trans, 703 "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" 704 " for %s", 705 set ? 
"setting" : "clearing", 706 bch2_btree_id_str(btree), 707 iter.pos.inode, 708 iter.pos.offset, 709 bch2_bkey_types[old.k->type], 710 bch2_bkey_types[old_type], 711 (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 712 ret = -EIO; 713 goto err; 714 } 715 716 ret = bch2_trans_update(trans, &iter, k, 0); 717 err: 718 bch2_trans_iter_exit(trans, &iter); 719 printbuf_exit(&buf); 720 return ret; 721 } 722 723 static noinline int bch2_bucket_gen_update(struct btree_trans *trans, 724 struct bpos bucket, u8 gen) 725 { 726 struct btree_iter iter; 727 unsigned offset; 728 struct bpos pos = alloc_gens_pos(bucket, &offset); 729 struct bkey_i_bucket_gens *g; 730 struct bkey_s_c k; 731 int ret; 732 733 g = bch2_trans_kmalloc(trans, sizeof(*g)); 734 ret = PTR_ERR_OR_ZERO(g); 735 if (ret) 736 return ret; 737 738 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, 739 BTREE_ITER_intent| 740 BTREE_ITER_with_updates); 741 ret = bkey_err(k); 742 if (ret) 743 return ret; 744 745 if (k.k->type != KEY_TYPE_bucket_gens) { 746 bkey_bucket_gens_init(&g->k_i); 747 g->k.p = iter.pos; 748 } else { 749 bkey_reassemble(&g->k_i, k); 750 } 751 752 g->v.gens[offset] = gen; 753 754 ret = bch2_trans_update(trans, &iter, &g->k_i, 0); 755 bch2_trans_iter_exit(trans, &iter); 756 return ret; 757 } 758 759 static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, struct bch_dev *ca, 760 enum bch_data_type data_type, 761 s64 delta_buckets, 762 s64 delta_sectors, 763 s64 delta_fragmented, unsigned flags) 764 { 765 struct disk_accounting_pos acc = { 766 .type = BCH_DISK_ACCOUNTING_dev_data_type, 767 .dev_data_type.dev = ca->dev_idx, 768 .dev_data_type.data_type = data_type, 769 }; 770 s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; 771 772 return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc); 773 } 774 775 int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, 776 const struct bch_alloc_v4 *old, 777 const struct bch_alloc_v4 *new, 778 unsigned flags) 779 { 780 s64 old_sectors = bch2_bucket_sectors(*old); 781 s64 new_sectors = bch2_bucket_sectors(*new); 782 if (old->data_type != new->data_type) { 783 int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type, 784 1, new_sectors, bch2_bucket_sectors_fragmented(ca, *new), flags) ?: 785 bch2_dev_data_type_accounting_mod(trans, ca, old->data_type, 786 -1, -old_sectors, -bch2_bucket_sectors_fragmented(ca, *old), flags); 787 if (ret) 788 return ret; 789 } else if (old_sectors != new_sectors) { 790 int ret = bch2_dev_data_type_accounting_mod(trans, ca, new->data_type, 791 0, 792 new_sectors - old_sectors, 793 bch2_bucket_sectors_fragmented(ca, *new) - 794 bch2_bucket_sectors_fragmented(ca, *old), flags); 795 if (ret) 796 return ret; 797 } 798 799 s64 old_unstriped = bch2_bucket_sectors_unstriped(*old); 800 s64 new_unstriped = bch2_bucket_sectors_unstriped(*new); 801 if (old_unstriped != new_unstriped) { 802 int ret = bch2_dev_data_type_accounting_mod(trans, ca, BCH_DATA_unstriped, 803 !!new_unstriped - !!old_unstriped, 804 new_unstriped - old_unstriped, 805 0, 806 flags); 807 if (ret) 808 return ret; 809 } 810 811 return 0; 812 } 813 814 int bch2_trigger_alloc(struct btree_trans *trans, 815 enum btree_id btree, unsigned level, 816 struct bkey_s_c old, struct bkey_s new, 817 enum btree_iter_update_trigger_flags flags) 818 { 819 struct bch_fs *c = trans->c; 820 struct printbuf buf = PRINTBUF; 821 int ret = 0; 822 823 struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); 824 if 
(!ca) 825 return -EIO; 826 827 struct bch_alloc_v4 old_a_convert; 828 const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); 829 830 struct bch_alloc_v4 *new_a; 831 if (likely(new.k->type == KEY_TYPE_alloc_v4)) { 832 new_a = bkey_s_to_alloc_v4(new).v; 833 } else { 834 BUG_ON(!(flags & (BTREE_TRIGGER_gc|BTREE_TRIGGER_check_repair))); 835 836 struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); 837 ret = PTR_ERR_OR_ZERO(new_ka); 838 if (unlikely(ret)) 839 goto err; 840 new_a = &new_ka->v; 841 } 842 843 if (flags & BTREE_TRIGGER_transactional) { 844 alloc_data_type_set(new_a, new_a->data_type); 845 846 if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { 847 new_a->io_time[READ] = bch2_current_io_time(c, READ); 848 new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); 849 SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); 850 SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); 851 } 852 853 if (data_type_is_empty(new_a->data_type) && 854 BCH_ALLOC_V4_NEED_INC_GEN(new_a) && 855 !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { 856 new_a->gen++; 857 SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); 858 alloc_data_type_set(new_a, new_a->data_type); 859 } 860 861 if (old_a->data_type != new_a->data_type || 862 (new_a->data_type == BCH_DATA_free && 863 alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { 864 ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?: 865 bch2_bucket_do_index(trans, ca, new.s_c, new_a, true); 866 if (ret) 867 goto err; 868 } 869 870 if (new_a->data_type == BCH_DATA_cached && 871 !new_a->io_time[READ]) 872 new_a->io_time[READ] = bch2_current_io_time(c, READ); 873 874 u64 old_lru = alloc_lru_idx_read(*old_a); 875 u64 new_lru = alloc_lru_idx_read(*new_a); 876 if (old_lru != new_lru) { 877 ret = bch2_lru_change(trans, new.k->p.inode, 878 bucket_to_u64(new.k->p), 879 old_lru, new_lru); 880 if (ret) 881 goto err; 882 } 883 884 new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca); 885 if (old_a->fragmentation_lru != new_a->fragmentation_lru) { 886 ret = bch2_lru_change(trans, 887 BCH_LRU_FRAGMENTATION_START, 888 bucket_to_u64(new.k->p), 889 old_a->fragmentation_lru, new_a->fragmentation_lru); 890 if (ret) 891 goto err; 892 } 893 894 if (old_a->gen != new_a->gen) { 895 ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); 896 if (ret) 897 goto err; 898 } 899 900 if ((flags & BTREE_TRIGGER_bucket_invalidate) && 901 old_a->cached_sectors) { 902 ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx, 903 -((s64) old_a->cached_sectors), 904 flags & BTREE_TRIGGER_gc); 905 if (ret) 906 goto err; 907 } 908 909 ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags); 910 if (ret) 911 goto err; 912 } 913 914 if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { 915 u64 journal_seq = trans->journal_res.seq; 916 u64 bucket_journal_seq = new_a->journal_seq; 917 918 if ((flags & BTREE_TRIGGER_insert) && 919 data_type_is_empty(old_a->data_type) != 920 data_type_is_empty(new_a->data_type) && 921 new.k->type == KEY_TYPE_alloc_v4) { 922 struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v; 923 924 /* 925 * If the btree updates referring to a bucket weren't flushed 926 * before the bucket became empty again, then the we don't have 927 * to wait on a journal flush before we can reuse the bucket: 928 */ 929 v->journal_seq = bucket_journal_seq = 930 data_type_is_empty(new_a->data_type) && 931 (journal_seq == v->journal_seq || 932 bch2_journal_noflush_seq(&c->journal, 
v->journal_seq)) 933 ? 0 : journal_seq; 934 } 935 936 if (!data_type_is_empty(old_a->data_type) && 937 data_type_is_empty(new_a->data_type) && 938 bucket_journal_seq) { 939 ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, 940 c->journal.flushed_seq_ondisk, 941 new.k->p.inode, new.k->p.offset, 942 bucket_journal_seq); 943 if (bch2_fs_fatal_err_on(ret, c, 944 "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) 945 goto err; 946 } 947 948 if (new_a->gen != old_a->gen) { 949 rcu_read_lock(); 950 u8 *gen = bucket_gen(ca, new.k->p.offset); 951 if (unlikely(!gen)) { 952 rcu_read_unlock(); 953 goto invalid_bucket; 954 } 955 *gen = new_a->gen; 956 rcu_read_unlock(); 957 } 958 959 #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) 960 #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) 961 #define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) 962 963 if (statechange(a->data_type == BCH_DATA_free) && 964 bucket_flushed(new_a)) 965 closure_wake_up(&c->freelist_wait); 966 967 if (statechange(a->data_type == BCH_DATA_need_discard) && 968 !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset) && 969 bucket_flushed(new_a)) 970 bch2_discard_one_bucket_fast(ca, new.k->p.offset); 971 972 if (statechange(a->data_type == BCH_DATA_cached) && 973 !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && 974 should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) 975 bch2_dev_do_invalidates(ca); 976 977 if (statechange(a->data_type == BCH_DATA_need_gc_gens)) 978 bch2_gc_gens_async(c); 979 } 980 981 if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) { 982 rcu_read_lock(); 983 struct bucket *g = gc_bucket(ca, new.k->p.offset); 984 if (unlikely(!g)) { 985 rcu_read_unlock(); 986 goto invalid_bucket; 987 } 988 g->gen_valid = 1; 989 g->gen = new_a->gen; 990 rcu_read_unlock(); 991 } 992 err: 993 printbuf_exit(&buf); 994 bch2_dev_put(ca); 995 return ret; 996 invalid_bucket: 997 bch2_fs_inconsistent(c, "reference to invalid bucket\n %s", 998 (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); 999 ret = -EIO; 1000 goto err; 1001 } 1002 1003 /* 1004 * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for 1005 * extents style btrees, but works on non-extents btrees: 1006 */ 1007 static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) 1008 { 1009 struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); 1010 1011 if (bkey_err(k)) 1012 return k; 1013 1014 if (k.k->type) { 1015 return k; 1016 } else { 1017 struct btree_iter iter2; 1018 struct bpos next; 1019 1020 bch2_trans_copy_iter(&iter2, iter); 1021 1022 struct btree_path *path = btree_iter_path(iter->trans, iter); 1023 if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) 1024 end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); 1025 1026 end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); 1027 1028 /* 1029 * btree node min/max is a closed interval, upto takes a half 1030 * open interval: 1031 */ 1032 k = bch2_btree_iter_peek_upto(&iter2, end); 1033 next = iter2.pos; 1034 bch2_trans_iter_exit(iter->trans, &iter2); 1035 1036 BUG_ON(next.offset >= iter->pos.offset + U32_MAX); 1037 1038 if (bkey_err(k)) 1039 return k; 1040 1041 bkey_init(hole); 1042 hole->p = iter->pos; 1043 1044 bch2_key_resize(hole, next.offset - iter->pos.offset); 1045 return (struct bkey_s_c) { hole, NULL }; 1046 } 1047 } 1048 1049 static bool 
next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *bucket) 1050 { 1051 if (*ca) { 1052 if (bucket->offset < (*ca)->mi.first_bucket) 1053 bucket->offset = (*ca)->mi.first_bucket; 1054 1055 if (bucket->offset < (*ca)->mi.nbuckets) 1056 return true; 1057 1058 bch2_dev_put(*ca); 1059 *ca = NULL; 1060 bucket->inode++; 1061 bucket->offset = 0; 1062 } 1063 1064 rcu_read_lock(); 1065 *ca = __bch2_next_dev_idx(c, bucket->inode, NULL); 1066 if (*ca) { 1067 *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket); 1068 bch2_dev_get(*ca); 1069 } 1070 rcu_read_unlock(); 1071 1072 return *ca != NULL; 1073 } 1074 1075 static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, 1076 struct bch_dev **ca, struct bkey *hole) 1077 { 1078 struct bch_fs *c = iter->trans->c; 1079 struct bkey_s_c k; 1080 again: 1081 k = bch2_get_key_or_hole(iter, POS_MAX, hole); 1082 if (bkey_err(k)) 1083 return k; 1084 1085 *ca = bch2_dev_iterate_noerror(c, *ca, k.k->p.inode); 1086 1087 if (!k.k->type) { 1088 struct bpos hole_start = bkey_start_pos(k.k); 1089 1090 if (!*ca || !bucket_valid(*ca, hole_start.offset)) { 1091 if (!next_bucket(c, ca, &hole_start)) 1092 return bkey_s_c_null; 1093 1094 bch2_btree_iter_set_pos(iter, hole_start); 1095 goto again; 1096 } 1097 1098 if (k.k->p.offset > (*ca)->mi.nbuckets) 1099 bch2_key_resize(hole, (*ca)->mi.nbuckets - hole_start.offset); 1100 } 1101 1102 return k; 1103 } 1104 1105 static noinline_for_stack 1106 int bch2_check_alloc_key(struct btree_trans *trans, 1107 struct bkey_s_c alloc_k, 1108 struct btree_iter *alloc_iter, 1109 struct btree_iter *discard_iter, 1110 struct btree_iter *freespace_iter, 1111 struct btree_iter *bucket_gens_iter) 1112 { 1113 struct bch_fs *c = trans->c; 1114 struct bch_alloc_v4 a_convert; 1115 const struct bch_alloc_v4 *a; 1116 unsigned discard_key_type, freespace_key_type; 1117 unsigned gens_offset; 1118 struct bkey_s_c k; 1119 struct printbuf buf = PRINTBUF; 1120 int ret = 0; 1121 1122 struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p); 1123 if (fsck_err_on(!ca, 1124 trans, alloc_key_to_missing_dev_bucket, 1125 "alloc key for invalid device:bucket %llu:%llu", 1126 alloc_k.k->p.inode, alloc_k.k->p.offset)) 1127 ret = bch2_btree_delete_at(trans, alloc_iter, 0); 1128 if (!ca) 1129 return ret; 1130 1131 if (!ca->mi.freespace_initialized) 1132 goto out; 1133 1134 a = bch2_alloc_to_v4(alloc_k, &a_convert); 1135 1136 discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; 1137 bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); 1138 k = bch2_btree_iter_peek_slot(discard_iter); 1139 ret = bkey_err(k); 1140 if (ret) 1141 goto err; 1142 1143 if (fsck_err_on(k.k->type != discard_key_type, 1144 trans, need_discard_key_wrong, 1145 "incorrect key in need_discard btree (got %s should be %s)\n" 1146 " %s", 1147 bch2_bkey_types[k.k->type], 1148 bch2_bkey_types[discard_key_type], 1149 (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 1150 struct bkey_i *update = 1151 bch2_trans_kmalloc(trans, sizeof(*update)); 1152 1153 ret = PTR_ERR_OR_ZERO(update); 1154 if (ret) 1155 goto err; 1156 1157 bkey_init(&update->k); 1158 update->k.type = discard_key_type; 1159 update->k.p = discard_iter->pos; 1160 1161 ret = bch2_trans_update(trans, discard_iter, update, 0); 1162 if (ret) 1163 goto err; 1164 } 1165 1166 freespace_key_type = a->data_type == BCH_DATA_free ? 
KEY_TYPE_set : 0; 1167 bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); 1168 k = bch2_btree_iter_peek_slot(freespace_iter); 1169 ret = bkey_err(k); 1170 if (ret) 1171 goto err; 1172 1173 if (fsck_err_on(k.k->type != freespace_key_type, 1174 trans, freespace_key_wrong, 1175 "incorrect key in freespace btree (got %s should be %s)\n" 1176 " %s", 1177 bch2_bkey_types[k.k->type], 1178 bch2_bkey_types[freespace_key_type], 1179 (printbuf_reset(&buf), 1180 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 1181 struct bkey_i *update = 1182 bch2_trans_kmalloc(trans, sizeof(*update)); 1183 1184 ret = PTR_ERR_OR_ZERO(update); 1185 if (ret) 1186 goto err; 1187 1188 bkey_init(&update->k); 1189 update->k.type = freespace_key_type; 1190 update->k.p = freespace_iter->pos; 1191 bch2_key_resize(&update->k, 1); 1192 1193 ret = bch2_trans_update(trans, freespace_iter, update, 0); 1194 if (ret) 1195 goto err; 1196 } 1197 1198 bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); 1199 k = bch2_btree_iter_peek_slot(bucket_gens_iter); 1200 ret = bkey_err(k); 1201 if (ret) 1202 goto err; 1203 1204 if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), 1205 trans, bucket_gens_key_wrong, 1206 "incorrect gen in bucket_gens btree (got %u should be %u)\n" 1207 " %s", 1208 alloc_gen(k, gens_offset), a->gen, 1209 (printbuf_reset(&buf), 1210 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 1211 struct bkey_i_bucket_gens *g = 1212 bch2_trans_kmalloc(trans, sizeof(*g)); 1213 1214 ret = PTR_ERR_OR_ZERO(g); 1215 if (ret) 1216 goto err; 1217 1218 if (k.k->type == KEY_TYPE_bucket_gens) { 1219 bkey_reassemble(&g->k_i, k); 1220 } else { 1221 bkey_bucket_gens_init(&g->k_i); 1222 g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset); 1223 } 1224 1225 g->v.gens[gens_offset] = a->gen; 1226 1227 ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0); 1228 if (ret) 1229 goto err; 1230 } 1231 out: 1232 err: 1233 fsck_err: 1234 bch2_dev_put(ca); 1235 printbuf_exit(&buf); 1236 return ret; 1237 } 1238 1239 static noinline_for_stack 1240 int bch2_check_alloc_hole_freespace(struct btree_trans *trans, 1241 struct bch_dev *ca, 1242 struct bpos start, 1243 struct bpos *end, 1244 struct btree_iter *freespace_iter) 1245 { 1246 struct bkey_s_c k; 1247 struct printbuf buf = PRINTBUF; 1248 int ret; 1249 1250 if (!ca->mi.freespace_initialized) 1251 return 0; 1252 1253 bch2_btree_iter_set_pos(freespace_iter, start); 1254 1255 k = bch2_btree_iter_peek_slot(freespace_iter); 1256 ret = bkey_err(k); 1257 if (ret) 1258 goto err; 1259 1260 *end = bkey_min(k.k->p, *end); 1261 1262 if (fsck_err_on(k.k->type != KEY_TYPE_set, 1263 trans, freespace_hole_missing, 1264 "hole in alloc btree missing in freespace btree\n" 1265 " device %llu buckets %llu-%llu", 1266 freespace_iter->pos.inode, 1267 freespace_iter->pos.offset, 1268 end->offset)) { 1269 struct bkey_i *update = 1270 bch2_trans_kmalloc(trans, sizeof(*update)); 1271 1272 ret = PTR_ERR_OR_ZERO(update); 1273 if (ret) 1274 goto err; 1275 1276 bkey_init(&update->k); 1277 update->k.type = KEY_TYPE_set; 1278 update->k.p = freespace_iter->pos; 1279 bch2_key_resize(&update->k, 1280 min_t(u64, U32_MAX, end->offset - 1281 freespace_iter->pos.offset)); 1282 1283 ret = bch2_trans_update(trans, freespace_iter, update, 0); 1284 if (ret) 1285 goto err; 1286 } 1287 err: 1288 fsck_err: 1289 printbuf_exit(&buf); 1290 return ret; 1291 } 1292 1293 static noinline_for_stack 1294 int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, 1295 struct 
bpos start, 1296 struct bpos *end, 1297 struct btree_iter *bucket_gens_iter) 1298 { 1299 struct bkey_s_c k; 1300 struct printbuf buf = PRINTBUF; 1301 unsigned i, gens_offset, gens_end_offset; 1302 int ret; 1303 1304 bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); 1305 1306 k = bch2_btree_iter_peek_slot(bucket_gens_iter); 1307 ret = bkey_err(k); 1308 if (ret) 1309 goto err; 1310 1311 if (bkey_cmp(alloc_gens_pos(start, &gens_offset), 1312 alloc_gens_pos(*end, &gens_end_offset))) 1313 gens_end_offset = KEY_TYPE_BUCKET_GENS_NR; 1314 1315 if (k.k->type == KEY_TYPE_bucket_gens) { 1316 struct bkey_i_bucket_gens g; 1317 bool need_update = false; 1318 1319 bkey_reassemble(&g.k_i, k); 1320 1321 for (i = gens_offset; i < gens_end_offset; i++) { 1322 if (fsck_err_on(g.v.gens[i], trans, 1323 bucket_gens_hole_wrong, 1324 "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", 1325 bucket_gens_pos_to_alloc(k.k->p, i).inode, 1326 bucket_gens_pos_to_alloc(k.k->p, i).offset, 1327 g.v.gens[i])) { 1328 g.v.gens[i] = 0; 1329 need_update = true; 1330 } 1331 } 1332 1333 if (need_update) { 1334 struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); 1335 1336 ret = PTR_ERR_OR_ZERO(u); 1337 if (ret) 1338 goto err; 1339 1340 memcpy(u, &g, sizeof(g)); 1341 1342 ret = bch2_trans_update(trans, bucket_gens_iter, u, 0); 1343 if (ret) 1344 goto err; 1345 } 1346 } 1347 1348 *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0)); 1349 err: 1350 fsck_err: 1351 printbuf_exit(&buf); 1352 return ret; 1353 } 1354 1355 static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans, 1356 struct btree_iter *iter) 1357 { 1358 struct bch_fs *c = trans->c; 1359 struct btree_iter alloc_iter; 1360 struct bkey_s_c alloc_k; 1361 struct bch_alloc_v4 a_convert; 1362 const struct bch_alloc_v4 *a; 1363 u64 genbits; 1364 struct bpos pos; 1365 enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard 1366 ? BCH_DATA_need_discard 1367 : BCH_DATA_free; 1368 struct printbuf buf = PRINTBUF; 1369 int ret; 1370 1371 pos = iter->pos; 1372 pos.offset &= ~(~0ULL << 56); 1373 genbits = iter->pos.offset & (~0ULL << 56); 1374 1375 alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); 1376 ret = bkey_err(alloc_k); 1377 if (ret) 1378 return ret; 1379 1380 if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), 1381 trans, need_discard_freespace_key_to_invalid_dev_bucket, 1382 "entry in %s btree for nonexistant dev:bucket %llu:%llu", 1383 bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) 1384 goto delete; 1385 1386 a = bch2_alloc_to_v4(alloc_k, &a_convert); 1387 1388 if (fsck_err_on(a->data_type != state || 1389 (state == BCH_DATA_free && 1390 genbits != alloc_freespace_genbits(*a)), 1391 trans, need_discard_freespace_key_bad, 1392 "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", 1393 (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), 1394 bch2_btree_id_str(iter->btree_id), 1395 iter->pos.inode, 1396 iter->pos.offset, 1397 a->data_type == state, 1398 genbits >> 56, alloc_freespace_genbits(*a) >> 56)) 1399 goto delete; 1400 out: 1401 fsck_err: 1402 bch2_set_btree_iter_dontneed(&alloc_iter); 1403 bch2_trans_iter_exit(trans, &alloc_iter); 1404 printbuf_exit(&buf); 1405 return ret; 1406 delete: 1407 ret = bch2_btree_delete_extent_at(trans, iter, 1408 iter->btree_id == BTREE_ID_freespace ? 
1 : 0, 0) ?: 1409 bch2_trans_commit(trans, NULL, NULL, 1410 BCH_TRANS_COMMIT_no_enospc); 1411 goto out; 1412 } 1413 1414 /* 1415 * We've already checked that generation numbers in the bucket_gens btree are 1416 * valid for buckets that exist; this just checks for keys for nonexistent 1417 * buckets. 1418 */ 1419 static noinline_for_stack 1420 int bch2_check_bucket_gens_key(struct btree_trans *trans, 1421 struct btree_iter *iter, 1422 struct bkey_s_c k) 1423 { 1424 struct bch_fs *c = trans->c; 1425 struct bkey_i_bucket_gens g; 1426 u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; 1427 u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; 1428 u64 b; 1429 bool need_update = false; 1430 struct printbuf buf = PRINTBUF; 1431 int ret = 0; 1432 1433 BUG_ON(k.k->type != KEY_TYPE_bucket_gens); 1434 bkey_reassemble(&g.k_i, k); 1435 1436 struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); 1437 if (!ca) { 1438 if (fsck_err(trans, bucket_gens_to_invalid_dev, 1439 "bucket_gens key for invalid device:\n %s", 1440 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1441 ret = bch2_btree_delete_at(trans, iter, 0); 1442 goto out; 1443 } 1444 1445 if (fsck_err_on(end <= ca->mi.first_bucket || 1446 start >= ca->mi.nbuckets, 1447 trans, bucket_gens_to_invalid_buckets, 1448 "bucket_gens key for invalid buckets:\n %s", 1449 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1450 ret = bch2_btree_delete_at(trans, iter, 0); 1451 goto out; 1452 } 1453 1454 for (b = start; b < ca->mi.first_bucket; b++) 1455 if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], 1456 trans, bucket_gens_nonzero_for_invalid_buckets, 1457 "bucket_gens key has nonzero gen for invalid bucket")) { 1458 g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; 1459 need_update = true; 1460 } 1461 1462 for (b = ca->mi.nbuckets; b < end; b++) 1463 if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], 1464 trans, bucket_gens_nonzero_for_invalid_buckets, 1465 "bucket_gens key has nonzero gen for invalid bucket")) { 1466 g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; 1467 need_update = true; 1468 } 1469 1470 if (need_update) { 1471 struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); 1472 1473 ret = PTR_ERR_OR_ZERO(u); 1474 if (ret) 1475 goto out; 1476 1477 memcpy(u, &g, sizeof(g)); 1478 ret = bch2_trans_update(trans, iter, u, 0); 1479 } 1480 out: 1481 fsck_err: 1482 bch2_dev_put(ca); 1483 printbuf_exit(&buf); 1484 return ret; 1485 } 1486 1487 int bch2_check_alloc_info(struct bch_fs *c) 1488 { 1489 struct btree_trans *trans = bch2_trans_get(c); 1490 struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; 1491 struct bch_dev *ca = NULL; 1492 struct bkey hole; 1493 struct bkey_s_c k; 1494 int ret = 0; 1495 1496 bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, 1497 BTREE_ITER_prefetch); 1498 bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, 1499 BTREE_ITER_prefetch); 1500 bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, 1501 BTREE_ITER_prefetch); 1502 bch2_trans_iter_init(trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, 1503 BTREE_ITER_prefetch); 1504 1505 while (1) { 1506 struct bpos next; 1507 1508 bch2_trans_begin(trans); 1509 1510 k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole); 1511 ret = bkey_err(k); 1512 if (ret) 1513 goto bkey_err; 1514 1515 if (!k.k) 1516 break; 1517 1518 if (k.k->type) { 1519 next = bpos_nosnap_successor(k.k->p); 1520 1521 ret = bch2_check_alloc_key(trans, 1522 k, &iter, 1523 &discard_iter, 1524 
&freespace_iter, 1525 &bucket_gens_iter); 1526 if (ret) 1527 goto bkey_err; 1528 } else { 1529 next = k.k->p; 1530 1531 ret = bch2_check_alloc_hole_freespace(trans, ca, 1532 bkey_start_pos(k.k), 1533 &next, 1534 &freespace_iter) ?: 1535 bch2_check_alloc_hole_bucket_gens(trans, 1536 bkey_start_pos(k.k), 1537 &next, 1538 &bucket_gens_iter); 1539 if (ret) 1540 goto bkey_err; 1541 } 1542 1543 ret = bch2_trans_commit(trans, NULL, NULL, 1544 BCH_TRANS_COMMIT_no_enospc); 1545 if (ret) 1546 goto bkey_err; 1547 1548 bch2_btree_iter_set_pos(&iter, next); 1549 bkey_err: 1550 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) 1551 continue; 1552 if (ret) 1553 break; 1554 } 1555 bch2_trans_iter_exit(trans, &bucket_gens_iter); 1556 bch2_trans_iter_exit(trans, &freespace_iter); 1557 bch2_trans_iter_exit(trans, &discard_iter); 1558 bch2_trans_iter_exit(trans, &iter); 1559 bch2_dev_put(ca); 1560 ca = NULL; 1561 1562 if (ret < 0) 1563 goto err; 1564 1565 ret = for_each_btree_key(trans, iter, 1566 BTREE_ID_need_discard, POS_MIN, 1567 BTREE_ITER_prefetch, k, 1568 bch2_check_discard_freespace_key(trans, &iter)); 1569 if (ret) 1570 goto err; 1571 1572 bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, 1573 BTREE_ITER_prefetch); 1574 while (1) { 1575 bch2_trans_begin(trans); 1576 k = bch2_btree_iter_peek(&iter); 1577 if (!k.k) 1578 break; 1579 1580 ret = bkey_err(k) ?: 1581 bch2_check_discard_freespace_key(trans, &iter); 1582 if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { 1583 ret = 0; 1584 continue; 1585 } 1586 if (ret) { 1587 struct printbuf buf = PRINTBUF; 1588 bch2_bkey_val_to_text(&buf, c, k); 1589 1590 bch_err(c, "while checking %s", buf.buf); 1591 printbuf_exit(&buf); 1592 break; 1593 } 1594 1595 bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); 1596 } 1597 bch2_trans_iter_exit(trans, &iter); 1598 if (ret) 1599 goto err; 1600 1601 ret = for_each_btree_key_commit(trans, iter, 1602 BTREE_ID_bucket_gens, POS_MIN, 1603 BTREE_ITER_prefetch, k, 1604 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1605 bch2_check_bucket_gens_key(trans, &iter, k)); 1606 err: 1607 bch2_trans_put(trans); 1608 bch_err_fn(c, ret); 1609 return ret; 1610 } 1611 1612 static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, 1613 struct btree_iter *alloc_iter, 1614 struct bkey_buf *last_flushed) 1615 { 1616 struct bch_fs *c = trans->c; 1617 struct bch_alloc_v4 a_convert; 1618 const struct bch_alloc_v4 *a; 1619 struct bkey_s_c alloc_k; 1620 struct printbuf buf = PRINTBUF; 1621 int ret; 1622 1623 alloc_k = bch2_btree_iter_peek(alloc_iter); 1624 if (!alloc_k.k) 1625 return 0; 1626 1627 ret = bkey_err(alloc_k); 1628 if (ret) 1629 return ret; 1630 1631 a = bch2_alloc_to_v4(alloc_k, &a_convert); 1632 1633 if (a->fragmentation_lru) { 1634 ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START, 1635 a->fragmentation_lru, 1636 alloc_k, last_flushed); 1637 if (ret) 1638 return ret; 1639 } 1640 1641 if (a->data_type != BCH_DATA_cached) 1642 return 0; 1643 1644 if (fsck_err_on(!a->io_time[READ], 1645 trans, alloc_key_cached_but_read_time_zero, 1646 "cached bucket with read_time 0\n" 1647 " %s", 1648 (printbuf_reset(&buf), 1649 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { 1650 struct bkey_i_alloc_v4 *a_mut = 1651 bch2_alloc_to_v4_mut(trans, alloc_k); 1652 ret = PTR_ERR_OR_ZERO(a_mut); 1653 if (ret) 1654 goto err; 1655 1656 a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); 1657 ret = bch2_trans_update(trans, alloc_iter, 1658 &a_mut->k_i, BTREE_TRIGGER_norun); 1659 if (ret) 1660 goto err; 
1661 1662 a = &a_mut->v; 1663 } 1664 1665 ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ], 1666 alloc_k, last_flushed); 1667 if (ret) 1668 goto err; 1669 err: 1670 fsck_err: 1671 printbuf_exit(&buf); 1672 return ret; 1673 } 1674 1675 int bch2_check_alloc_to_lru_refs(struct bch_fs *c) 1676 { 1677 struct bkey_buf last_flushed; 1678 1679 bch2_bkey_buf_init(&last_flushed); 1680 bkey_init(&last_flushed.k->k); 1681 1682 int ret = bch2_trans_run(c, 1683 for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, 1684 POS_MIN, BTREE_ITER_prefetch, k, 1685 NULL, NULL, BCH_TRANS_COMMIT_no_enospc, 1686 bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))); 1687 1688 bch2_bkey_buf_exit(&last_flushed, c); 1689 bch_err_fn(c, ret); 1690 return ret; 1691 } 1692 1693 static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) 1694 { 1695 int ret; 1696 1697 mutex_lock(&ca->discard_buckets_in_flight_lock); 1698 darray_for_each(ca->discard_buckets_in_flight, i) 1699 if (i->bucket == bucket) { 1700 ret = -BCH_ERR_EEXIST_discard_in_flight_add; 1701 goto out; 1702 } 1703 1704 ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { 1705 .in_progress = in_progress, 1706 .bucket = bucket, 1707 })); 1708 out: 1709 mutex_unlock(&ca->discard_buckets_in_flight_lock); 1710 return ret; 1711 } 1712 1713 static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) 1714 { 1715 mutex_lock(&ca->discard_buckets_in_flight_lock); 1716 darray_for_each(ca->discard_buckets_in_flight, i) 1717 if (i->bucket == bucket) { 1718 BUG_ON(!i->in_progress); 1719 darray_remove_item(&ca->discard_buckets_in_flight, i); 1720 goto found; 1721 } 1722 BUG(); 1723 found: 1724 mutex_unlock(&ca->discard_buckets_in_flight_lock); 1725 } 1726 1727 struct discard_buckets_state { 1728 u64 seen; 1729 u64 open; 1730 u64 need_journal_commit; 1731 u64 discarded; 1732 u64 need_journal_commit_this_dev; 1733 }; 1734 1735 static int bch2_discard_one_bucket(struct btree_trans *trans, 1736 struct bch_dev *ca, 1737 struct btree_iter *need_discard_iter, 1738 struct bpos *discard_pos_done, 1739 struct discard_buckets_state *s) 1740 { 1741 struct bch_fs *c = trans->c; 1742 struct bpos pos = need_discard_iter->pos; 1743 struct btree_iter iter = { NULL }; 1744 struct bkey_s_c k; 1745 struct bkey_i_alloc_v4 *a; 1746 struct printbuf buf = PRINTBUF; 1747 bool discard_locked = false; 1748 int ret = 0; 1749 1750 if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { 1751 s->open++; 1752 goto out; 1753 } 1754 1755 if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, 1756 c->journal.flushed_seq_ondisk, 1757 pos.inode, pos.offset)) { 1758 s->need_journal_commit++; 1759 s->need_journal_commit_this_dev++; 1760 goto out; 1761 } 1762 1763 k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, 1764 need_discard_iter->pos, 1765 BTREE_ITER_cached); 1766 ret = bkey_err(k); 1767 if (ret) 1768 goto out; 1769 1770 a = bch2_alloc_to_v4_mut(trans, k); 1771 ret = PTR_ERR_OR_ZERO(a); 1772 if (ret) 1773 goto out; 1774 1775 if (bch2_bucket_sectors_total(a->v)) { 1776 if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, 1777 trans, "attempting to discard bucket with dirty data\n%s", 1778 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1779 ret = -EIO; 1780 goto out; 1781 } 1782 1783 if (a->v.data_type != BCH_DATA_need_discard) { 1784 if (data_type_is_empty(a->v.data_type) && 1785 BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { 1786 a->v.gen++; 1787 
SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); 1788 goto write; 1789 } 1790 1791 if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, 1792 trans, "bucket incorrectly set in need_discard btree\n" 1793 "%s", 1794 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1795 ret = -EIO; 1796 goto out; 1797 } 1798 1799 if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { 1800 if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, 1801 trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s", 1802 a->v.journal_seq, 1803 c->journal.flushed_seq_ondisk, 1804 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) 1805 ret = -EIO; 1806 goto out; 1807 } 1808 1809 if (discard_in_flight_add(ca, iter.pos.offset, true)) 1810 goto out; 1811 1812 discard_locked = true; 1813 1814 if (!bkey_eq(*discard_pos_done, iter.pos) && 1815 ca->mi.discard && !c->opts.nochanges) { 1816 /* 1817 * This works without any other locks because this is the only 1818 * thread that removes items from the need_discard tree 1819 */ 1820 bch2_trans_unlock_long(trans); 1821 blkdev_issue_discard(ca->disk_sb.bdev, 1822 k.k->p.offset * ca->mi.bucket_size, 1823 ca->mi.bucket_size, 1824 GFP_KERNEL); 1825 *discard_pos_done = iter.pos; 1826 1827 ret = bch2_trans_relock_notrace(trans); 1828 if (ret) 1829 goto out; 1830 } 1831 1832 SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); 1833 write: 1834 alloc_data_type_set(&a->v, a->v.data_type); 1835 1836 ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: 1837 bch2_trans_commit(trans, NULL, NULL, 1838 BCH_WATERMARK_btree| 1839 BCH_TRANS_COMMIT_no_enospc); 1840 if (ret) 1841 goto out; 1842 1843 count_event(c, bucket_discard); 1844 s->discarded++; 1845 out: 1846 if (discard_locked) 1847 discard_in_flight_remove(ca, iter.pos.offset); 1848 s->seen++; 1849 bch2_trans_iter_exit(trans, &iter); 1850 printbuf_exit(&buf); 1851 return ret; 1852 } 1853 1854 static void bch2_do_discards_work(struct work_struct *work) 1855 { 1856 struct bch_dev *ca = container_of(work, struct bch_dev, discard_work); 1857 struct bch_fs *c = ca->fs; 1858 struct discard_buckets_state s = {}; 1859 struct bpos discard_pos_done = POS_MAX; 1860 int ret; 1861 1862 /* 1863 * We're doing the commit in bch2_discard_one_bucket instead of using 1864 * for_each_btree_key_commit() so that we can increment counters after 1865 * successful commit: 1866 */ 1867 ret = bch2_trans_run(c, 1868 for_each_btree_key_upto(trans, iter, 1869 BTREE_ID_need_discard, 1870 POS(ca->dev_idx, 0), 1871 POS(ca->dev_idx, U64_MAX), 0, k, 1872 bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s))); 1873 1874 trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, 1875 bch2_err_str(ret)); 1876 1877 percpu_ref_put(&ca->io_ref); 1878 bch2_write_ref_put(c, BCH_WRITE_REF_discard); 1879 } 1880 1881 void bch2_dev_do_discards(struct bch_dev *ca) 1882 { 1883 struct bch_fs *c = ca->fs; 1884 1885 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) 1886 return; 1887 1888 if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) 1889 goto put_write_ref; 1890 1891 if (queue_work(c->write_ref_wq, &ca->discard_work)) 1892 return; 1893 1894 percpu_ref_put(&ca->io_ref); 1895 put_write_ref: 1896 bch2_write_ref_put(c, BCH_WRITE_REF_discard); 1897 } 1898 1899 void bch2_do_discards(struct bch_fs *c) 1900 { 1901 for_each_member_device(c, ca) 1902 bch2_dev_do_discards(ca); 1903 } 1904 1905 static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) 1906 { 1907 
struct btree_iter iter; 1908 bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent); 1909 struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); 1910 int ret = bkey_err(k); 1911 if (ret) 1912 goto err; 1913 1914 struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); 1915 ret = PTR_ERR_OR_ZERO(a); 1916 if (ret) 1917 goto err; 1918 1919 BUG_ON(a->v.dirty_sectors); 1920 SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); 1921 alloc_data_type_set(&a->v, a->v.data_type); 1922 1923 ret = bch2_trans_update(trans, &iter, &a->k_i, 0); 1924 err: 1925 bch2_trans_iter_exit(trans, &iter); 1926 return ret; 1927 } 1928 1929 static void bch2_do_discards_fast_work(struct work_struct *work) 1930 { 1931 struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); 1932 struct bch_fs *c = ca->fs; 1933 1934 while (1) { 1935 bool got_bucket = false; 1936 u64 bucket; 1937 1938 mutex_lock(&ca->discard_buckets_in_flight_lock); 1939 darray_for_each(ca->discard_buckets_in_flight, i) { 1940 if (i->in_progress) 1941 continue; 1942 1943 got_bucket = true; 1944 bucket = i->bucket; 1945 i->in_progress = true; 1946 break; 1947 } 1948 mutex_unlock(&ca->discard_buckets_in_flight_lock); 1949 1950 if (!got_bucket) 1951 break; 1952 1953 if (ca->mi.discard && !c->opts.nochanges) 1954 blkdev_issue_discard(ca->disk_sb.bdev, 1955 bucket_to_sector(ca, bucket), 1956 ca->mi.bucket_size, 1957 GFP_KERNEL); 1958 1959 int ret = bch2_trans_do(c, NULL, NULL, 1960 BCH_WATERMARK_btree| 1961 BCH_TRANS_COMMIT_no_enospc, 1962 bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); 1963 bch_err_fn(c, ret); 1964 1965 discard_in_flight_remove(ca, bucket); 1966 1967 if (ret) 1968 break; 1969 } 1970 1971 percpu_ref_put(&ca->io_ref); 1972 bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); 1973 } 1974 1975 static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) 1976 { 1977 struct bch_fs *c = ca->fs; 1978 1979 if (discard_in_flight_add(ca, bucket, false)) 1980 return; 1981 1982 if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) 1983 return; 1984 1985 if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) 1986 goto put_ref; 1987 1988 if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) 1989 return; 1990 1991 percpu_ref_put(&ca->io_ref); 1992 put_ref: 1993 bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); 1994 } 1995 1996 static int invalidate_one_bucket(struct btree_trans *trans, 1997 struct btree_iter *lru_iter, 1998 struct bkey_s_c lru_k, 1999 s64 *nr_to_invalidate) 2000 { 2001 struct bch_fs *c = trans->c; 2002 struct bkey_i_alloc_v4 *a = NULL; 2003 struct printbuf buf = PRINTBUF; 2004 struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); 2005 unsigned cached_sectors; 2006 int ret = 0; 2007 2008 if (*nr_to_invalidate <= 0) 2009 return 1; 2010 2011 if (!bch2_dev_bucket_exists(c, bucket)) { 2012 prt_str(&buf, "lru entry points to invalid bucket"); 2013 goto err; 2014 } 2015 2016 if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) 2017 return 0; 2018 2019 a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate); 2020 ret = PTR_ERR_OR_ZERO(a); 2021 if (ret) 2022 goto out; 2023 2024 /* We expect harmless races here due to the btree write buffer: */ 2025 if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) 2026 goto out; 2027 2028 BUG_ON(a->v.data_type != BCH_DATA_cached); 2029 BUG_ON(a->v.dirty_sectors); 2030 2031 if (!a->v.cached_sectors) 2032 bch_err(c, "invalidating empty bucket, confused"); 2033 2034 cached_sectors = a->v.cached_sectors; 2035 
	SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
	a->v.gen++;
	a->v.data_type		= 0;
	a->v.dirty_sectors	= 0;
	a->v.stripe_sectors	= 0;
	a->v.cached_sectors	= 0;
	a->v.io_time[READ]	= bch2_current_io_time(c, READ);
	a->v.io_time[WRITE]	= bch2_current_io_time(c, WRITE);

	ret = bch2_trans_commit(trans, NULL, NULL,
				BCH_WATERMARK_btree|
				BCH_TRANS_COMMIT_no_enospc);
	if (ret)
		goto out;

	trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors);
	--*nr_to_invalidate;
out:
	printbuf_exit(&buf);
	return ret;
err:
	prt_str(&buf, "\n  lru key: ");
	bch2_bkey_val_to_text(&buf, c, lru_k);

	prt_str(&buf, "\n  lru entry: ");
	bch2_lru_pos_to_text(&buf, lru_iter->pos);

	prt_str(&buf, "\n  alloc key: ");
	if (!a)
		bch2_bpos_to_text(&buf, bucket);
	else
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i));

	bch_err(c, "%s", buf.buf);
	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) {
		bch2_inconsistent_error(c);
		ret = -EINVAL;
	}

	goto out;
}

static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter,
				    struct bch_dev *ca, bool *wrapped)
{
	struct bkey_s_c k;
again:
	k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX));
	if (!k.k && !*wrapped) {
		bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0));
		*wrapped = true;
		goto again;
	}

	return k;
}

static void bch2_do_invalidates_work(struct work_struct *work)
{
	struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work);
	struct bch_fs *c = ca->fs;
	struct btree_trans *trans = bch2_trans_get(c);
	int ret = 0;

	ret = bch2_btree_write_buffer_tryflush(trans);
	if (ret)
		goto err;

	s64 nr_to_invalidate =
		should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
	struct btree_iter iter;
	bool wrapped = false;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
			     lru_pos(ca->dev_idx, 0,
				     ((bch2_current_io_time(c, READ) + U32_MAX) &
				      LRU_TIME_MAX)), 0);

	while (true) {
		bch2_trans_begin(trans);

		struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
		ret = bkey_err(k);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;
		if (!k.k)
			break;

		ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
		if (ret)
			break;

		bch2_btree_iter_advance(&iter);
	}
	bch2_trans_iter_exit(trans, &iter);
err:
	bch2_trans_put(trans);
	percpu_ref_put(&ca->io_ref);
	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}

void bch2_dev_do_invalidates(struct bch_dev *ca)
{
	struct bch_fs *c = ca->fs;

	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate))
		return;

	if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE))
		goto put_ref;

	if (queue_work(c->write_ref_wq, &ca->invalidate_work))
		return;

	percpu_ref_put(&ca->io_ref);
put_ref:
	bch2_write_ref_put(c, BCH_WRITE_REF_invalidate);
}

void bch2_do_invalidates(struct bch_fs *c)
{
	for_each_member_device(c, ca)
		bch2_dev_do_invalidates(ca);
}

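/*
 * Walk the alloc btree for @ca and (re)create the freespace/need_discard
 * index entries for buckets in [@bucket_start, @bucket_end). Runs when the
 * member's freespace_initialized flag isn't set - typically on device add.
 */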
int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
			    u64 bucket_start, u64 bucket_end)
{
	struct btree_trans *trans = bch2_trans_get(c);
	struct btree_iter iter;
	struct bkey_s_c k;
	struct bkey hole;
	struct bpos end = POS(ca->dev_idx, bucket_end);
	struct bch_member *m;
	unsigned long last_updated = jiffies;
	int ret;

	BUG_ON(bucket_start > bucket_end);
	BUG_ON(bucket_end > ca->mi.nbuckets);

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)),
			     BTREE_ITER_prefetch);
	/*
	 * Scan the alloc btree for every bucket on @ca, and add buckets to the
	 * freespace/need_discard/need_gc_gens btrees as needed:
	 */
	while (1) {
		if (last_updated + HZ * 10 < jiffies) {
			bch_info(ca, "%s: currently at %llu/%llu",
				 __func__, iter.pos.offset, ca->mi.nbuckets);
			last_updated = jiffies;
		}

		bch2_trans_begin(trans);

		if (bkey_ge(iter.pos, end)) {
			ret = 0;
			break;
		}

		k = bch2_get_key_or_hole(&iter, end, &hole);
		ret = bkey_err(k);
		if (ret)
			goto bkey_err;

		if (k.k->type) {
			/*
			 * We process live keys in the alloc btree one at a
			 * time:
			 */
			struct bch_alloc_v4 a_convert;
			const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);

			ret =   bch2_bucket_do_index(trans, ca, k, a, true) ?:
				bch2_trans_commit(trans, NULL, NULL,
						  BCH_TRANS_COMMIT_no_enospc);
			if (ret)
				goto bkey_err;

			bch2_btree_iter_advance(&iter);
		} else {
			struct bkey_i *freespace;

			freespace = bch2_trans_kmalloc(trans, sizeof(*freespace));
			ret = PTR_ERR_OR_ZERO(freespace);
			if (ret)
				goto bkey_err;

			bkey_init(&freespace->k);
			freespace->k.type	= KEY_TYPE_set;
			freespace->k.p		= k.k->p;
			freespace->k.size	= k.k->size;

			ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
				bch2_trans_commit(trans, NULL, NULL,
						  BCH_TRANS_COMMIT_no_enospc);
			if (ret)
				goto bkey_err;

			bch2_btree_iter_set_pos(&iter, k.k->p);
		}
bkey_err:
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			continue;
		if (ret)
			break;
	}

	bch2_trans_iter_exit(trans, &iter);
	bch2_trans_put(trans);

	if (ret < 0) {
		bch_err_msg(ca, ret, "initializing free space");
		return ret;
	}

	mutex_lock(&c->sb_lock);
	m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
	SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
	mutex_unlock(&c->sb_lock);

	return 0;
}

int bch2_fs_freespace_init(struct bch_fs *c)
{
	int ret = 0;
	bool doing_init = false;

	/*
	 * We can crash during the device add path, so we need to check this on
	 * every mount:
	 */

	for_each_member_device(c, ca) {
		if (ca->mi.freespace_initialized)
			continue;

		if (!doing_init) {
			bch_info(c, "initializing freespace");
			doing_init = true;
		}

		ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
		if (ret) {
			bch2_dev_put(ca);
			bch_err_fn(c, ret);
			return ret;
		}
	}

	if (doing_init) {
		mutex_lock(&c->sb_lock);
		bch2_write_super(c);
		mutex_unlock(&c->sb_lock);
		bch_verbose(c, "done initializing freespace");
	}

	return 0;
}

/* Bucket IO clocks: */
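/*
 * Each bucket records the IO-clock "time" it was last read and written; the
 * read clock value is what positions a cached bucket in the LRU btree, so
 * invalidation reclaims the least-recently-read cached data first.
 */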
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
			      size_t bucket_nr, int rw)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_i_alloc_v4 *a;
	u64 now;
	int ret = 0;

	if (bch2_trans_relock(trans))
		bch2_trans_begin(trans);

	a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
	ret = PTR_ERR_OR_ZERO(a);
	if (ret)
		return ret;

	now = bch2_current_io_time(c, rw);
	if (a->v.io_time[rw] == now)
		goto out;

	a->v.io_time[rw] = now;

	ret   = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
		bch2_trans_commit(trans, NULL, NULL, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Startup/shutdown (ro/rw): */

void bch2_recalc_capacity(struct bch_fs *c)
{
	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
	unsigned bucket_size_max = 0;
	unsigned long ra_pages = 0;

	lockdep_assert_held(&c->state_lock);

	for_each_online_member(c, ca) {
		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;

		ra_pages += bdi->ra_pages;
	}

	bch2_set_ra_pages(c, ra_pages);

	for_each_rw_member(c, ca) {
		u64 dev_reserve = 0;

		/*
		 * We need to reserve buckets (from the number
		 * of currently available buckets) against
		 * foreground writes so that mainly copygc can
		 * make forward progress.
		 *
		 * We need enough to refill the various reserves
		 * from scratch - copygc will use its entire
		 * reserve all at once, then run again when
		 * its reserve is refilled (from the formerly
		 * available buckets).
		 *
		 * This reserve is just used when considering if
		 * allocations for foreground writes must wait -
		 * not -ENOSPC calculations.
		 */

		dev_reserve += ca->nr_btree_reserve * 2;
		dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */

		dev_reserve += 1;	/* btree write point */
		dev_reserve += 1;	/* copygc write point */
		dev_reserve += 1;	/* rebalance write point */

		dev_reserve *= ca->mi.bucket_size;

		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
					     ca->mi.first_bucket);

		reserved_sectors += dev_reserve * 2;

		bucket_size_max = max_t(unsigned, bucket_size_max,
					ca->mi.bucket_size);
	}

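	/*
	 * The GC/copygc reserve below is sized either explicitly
	 * (gc_reserve_bytes, converted to 512-byte sectors) or as a
	 * percentage of capacity. Illustrative numbers: with
	 * gc_reserve_percent = 8 and 2 TB of raw capacity (~3.9 billion
	 * sectors), gc_reserve comes to ~312 million sectors (~160 GB),
	 * which takes effect unless the per-device reserves computed above
	 * already sum to more.
	 */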
	gc_reserve = c->opts.gc_reserve_bytes
		? c->opts.gc_reserve_bytes >> 9
		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);

	reserved_sectors = max(gc_reserve, reserved_sectors);

	reserved_sectors = min(reserved_sectors, capacity);

	c->reserved = reserved_sectors;
	c->capacity = capacity - reserved_sectors;

	c->bucket_size_max = bucket_size_max;

	/* Wake up in case someone was waiting for buckets */
	closure_wake_up(&c->freelist_wait);
}

u64 bch2_min_rw_member_capacity(struct bch_fs *c)
{
	u64 ret = U64_MAX;

	for_each_rw_member(c, ca)
		ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
	return ret;
}

static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
	struct open_bucket *ob;
	bool ret = false;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && !ob->on_partial_list &&
		    ob->dev == ca->dev_idx)
			ret = true;
		spin_unlock(&ob->lock);
	}

	return ret;
}

/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	/* First, remove device from allocation groups: */

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		clear_bit(ca->dev_idx, c->rw_devs[i].d);

	/*
	 * Capacity is calculated based off of devices in allocation groups:
	 */
	bch2_recalc_capacity(c);

	bch2_open_buckets_stop(c, ca, false);

	/*
	 * Wake up threads that were blocked on allocation, so they can notice
	 * the device can no longer be allocated from and the capacity has
	 * changed:
	 */
	closure_wake_up(&c->freelist_wait);

	/*
	 * journal_res_get() can block waiting for free space in the journal -
	 * it needs to notice there may not be devices to allocate from anymore:
	 */
	wake_up(&c->journal.wait);

	/* Now wait for any in flight writes: */

	closure_wait_event(&c->open_buckets_wait,
			   !bch2_dev_has_open_write_point(c, ca));
}

/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		if (ca->mi.data_allowed & (1 << i))
			set_bit(ca->dev_idx, c->rw_devs[i].d);
}

void bch2_dev_allocator_background_exit(struct bch_dev *ca)
{
	darray_exit(&ca->discard_buckets_in_flight);
}

void bch2_dev_allocator_background_init(struct bch_dev *ca)
{
	mutex_init(&ca->discard_buckets_in_flight_lock);
	INIT_WORK(&ca->discard_work, bch2_do_discards_work);
	INIT_WORK(&ca->discard_fast_work, bch2_do_discards_fast_work);
	INIT_WORK(&ca->invalidate_work, bch2_do_invalidates_work);
}

void bch2_fs_allocator_background_init(struct bch_fs *c)
{
	spin_lock_init(&c->freelist_lock);
}