// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bcachefs_ioctl.h"
#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "compress.h"
#include "disk_accounting.h"
#include "error.h"
#include "journal_io.h"
#include "replicas.h"

/*
 * Notes on disk accounting:
 *
 * We have two parallel sets of counters to be concerned with, and both must be
 * kept in sync.
 *
 * - Persistent/on disk accounting, stored in the accounting btree and updated
 *   via btree write buffer updates that treat new accounting keys as deltas to
 *   apply to existing values. But reading from a write buffer btree is
 *   expensive, so we also have
 *
 * - In memory accounting, where accounting is stored as an array of percpu
 *   counters, indexed by an eytzinger array of disk accounting keys/bpos (which
 *   are the same thing, excepting byte swabbing on big endian).
 *
 *   Cheap to read, but non-persistent.
 *
 * Disk accounting updates are generated by transactional triggers; these run as
 * keys enter and leave the btree, and can compare old and new versions of keys;
 * the output of these triggers is a set of deltas to the various counters.
 *
 * Disk accounting updates are done as btree write buffer updates, where the
 * counters in the disk accounting key are deltas that will be applied to the
 * counter in the btree when the key is flushed by the write buffer (or journal
 * replay).
 *
 * To do a disk accounting update:
 * - initialize a disk_accounting_pos, to specify which counter is being updated
 * - initialize counter deltas, as an array of 1-3 s64s
 * - call bch2_disk_accounting_mod()
 *
 * This queues up the accounting update to be done at transaction commit time.
 * Underneath, it's a normal btree write buffer update.
 *
 * The transaction commit path is responsible for propagating updates to the in
 * memory counters, with bch2_accounting_mem_mod().
 *
 * The commit path also assigns every disk accounting update a unique version
 * number, based on the journal sequence number and offset within that journal
 * buffer; this is used by journal replay to determine which updates have been
 * done.
 *
 * The transaction commit path also ensures that replicas entry accounting
 * updates are properly marked in the superblock (so that we know whether we can
 * mount without data being unavailable); it will update the superblock if
 * bch2_accounting_mem_mod() tells it to.
 */
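
/*
 * For example, the inode triggers maintain the nr_inodes counter with a single
 * delta - roughly (an illustrative sketch, not a call site in this file):
 *
 *	struct disk_accounting_pos acc;
 *	memset(&acc, 0, sizeof(acc));
 *	acc.type = BCH_DISK_ACCOUNTING_nr_inodes;
 *
 *	s64 d = 1;
 *	int ret = bch2_disk_accounting_mod(trans, &acc, &d, 1, false);
 *
 * At commit time the delta is journalled, queued up for the write buffer, and
 * applied to the in-memory percpu counters.
 */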

static const char * const disk_accounting_type_strs[] = {
#define x(t, n, ...)	[n] = #t,
	BCH_DISK_ACCOUNTING_TYPES()
#undef x
	NULL
};

static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos,
					 s64 *d, unsigned nr)
{
	struct bkey_i_accounting *acc = bkey_accounting_init(k);

	acc->k.p = pos;
	set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);

	memcpy_u64s_small(acc->v.d, d, nr);
}

static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
				       s64 *d, unsigned nr)
{
	return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr);
}

static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos);

int bch2_disk_accounting_mod(struct btree_trans *trans,
			     struct disk_accounting_pos *k,
			     s64 *d, unsigned nr, bool gc)
{
	BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);

	/* Normalize: */
	switch (k->type) {
	case BCH_DISK_ACCOUNTING_replicas:
		bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp);
		break;
	}

	struct bpos pos = disk_accounting_pos_to_bpos(k);

	if (likely(!gc)) {
		struct bkey_i_accounting *a;
#if 0
		for (a = btree_trans_subbuf_base(trans, &trans->accounting);
		     a != btree_trans_subbuf_top(trans, &trans->accounting);
		     a = (void *) bkey_next(&a->k_i))
			if (bpos_eq(a->k.p, pos)) {
				BUG_ON(nr != bch2_accounting_counters(&a->k));
				acc_u64s(a->v.d, d, nr);

				if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) {
					unsigned offset = (u64 *) a -
						(u64 *) btree_trans_subbuf_base(trans, &trans->accounting);

					trans->accounting.u64s -= a->k.u64s;
					memmove_u64s_down(a,
							  bkey_next(&a->k_i),
							  trans->accounting.u64s - offset);
				}
				return 0;
			}
#endif
		unsigned u64s = sizeof(*a) / sizeof(u64) + nr;
		a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s);
		int ret = PTR_ERR_OR_ZERO(a);
		if (ret)
			return ret;

		__accounting_key_init(&a->k_i, pos, d, nr);
		return 0;
	} else {
		struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;

		__accounting_key_init(&k_i.k, pos, d, nr);

		int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
		if (ret == -BCH_ERR_btree_insert_need_mark_replicas)
			ret = drop_locks_do(trans,
				bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?:
				bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
		return ret;
	}
}

int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
				unsigned dev, s64 sectors,
				bool gc)
{
	struct disk_accounting_pos acc;
	memset(&acc, 0, sizeof(acc));
	acc.type = BCH_DISK_ACCOUNTING_replicas;
	bch2_replicas_entry_cached(&acc.replicas, dev);

	return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
}
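
/*
 * A caller that isn't already inside a transaction can wrap the update in one
 * and commit it - a minimal sketch, mirroring the bch2_trans_do() usage in
 * bch2_dev_usage_init() below (@dev and @sectors are hypothetical):
 *
 *	int ret = bch2_trans_do(c,
 *		bch2_mod_dev_cached_sectors(trans, dev, sectors, false) ?:
 *		bch2_trans_commit(trans, NULL, NULL, 0));
 */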

static inline bool is_zero(char *start, char *end)
{
	BUG_ON(start > end);

	for (; start < end; start++)
		if (*start)
			return false;
	return true;
}

#define field_end(p, member)	(((void *) (&p.member)) + sizeof(p.member))

static const unsigned bch2_accounting_type_nr_counters[] = {
#define x(f, id, nr)	[BCH_DISK_ACCOUNTING_##f] = nr,
	BCH_DISK_ACCOUNTING_TYPES()
#undef x
};

int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
			     struct bkey_validate_context from)
{
	struct disk_accounting_pos acc_k;
	bpos_to_disk_accounting_pos(&acc_k, k.k->p);
	void *end = &acc_k + 1;
	int ret = 0;

	bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) &&
			 bversion_zero(k.k->bversion),
			 c, accounting_key_version_0,
			 "accounting key with version=0");

	switch (acc_k.type) {
	case BCH_DISK_ACCOUNTING_nr_inodes:
		end = field_end(acc_k, nr_inodes);
		break;
	case BCH_DISK_ACCOUNTING_persistent_reserved:
		end = field_end(acc_k, persistent_reserved);
		break;
	case BCH_DISK_ACCOUNTING_replicas:
		bkey_fsck_err_on(!acc_k.replicas.nr_devs,
				 c, accounting_key_replicas_nr_devs_0,
				 "accounting key replicas entry with nr_devs=0");

		bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs ||
				 (acc_k.replicas.nr_required > 1 &&
				  acc_k.replicas.nr_required == acc_k.replicas.nr_devs),
				 c, accounting_key_replicas_nr_required_bad,
				 "accounting key replicas entry with bad nr_required");

		for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++)
			bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1],
					 c, accounting_key_replicas_devs_unsorted,
					 "accounting key replicas entry with unsorted devs");

		end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas);
		break;
	case BCH_DISK_ACCOUNTING_dev_data_type:
		end = field_end(acc_k, dev_data_type);
		break;
	case BCH_DISK_ACCOUNTING_compression:
		end = field_end(acc_k, compression);
		break;
	case BCH_DISK_ACCOUNTING_snapshot:
		end = field_end(acc_k, snapshot);
		break;
	case BCH_DISK_ACCOUNTING_btree:
		end = field_end(acc_k, btree);
		break;
	case BCH_DISK_ACCOUNTING_rebalance_work:
		end = field_end(acc_k, rebalance_work);
		break;
	}

	bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)),
			 c, accounting_key_junk_at_end,
			 "junk at end of accounting key");

	bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type],
			 c, accounting_key_nr_counters_wrong,
			 "accounting key with %u counters, should be %u",
			 bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]);
fsck_err:
	return ret;
}

void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k)
{
	if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) {
		prt_printf(out, "unknown type %u", k->type);
		return;
	}

	prt_str(out, disk_accounting_type_strs[k->type]);
	prt_str(out, " ");

	switch (k->type) {
	case BCH_DISK_ACCOUNTING_nr_inodes:
		break;
	case BCH_DISK_ACCOUNTING_persistent_reserved:
		prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas);
		break;
	case BCH_DISK_ACCOUNTING_replicas:
		bch2_replicas_entry_to_text(out, &k->replicas);
		break;
	case BCH_DISK_ACCOUNTING_dev_data_type:
		prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev);
		bch2_prt_data_type(out, k->dev_data_type.data_type);
		break;
	case BCH_DISK_ACCOUNTING_compression:
		bch2_prt_compression_type(out, k->compression.type);
		break;
	case BCH_DISK_ACCOUNTING_snapshot:
		prt_printf(out, "id=%u", k->snapshot.id);
		break;
	case BCH_DISK_ACCOUNTING_btree:
		prt_str(out, "btree=");
		bch2_btree_id_to_text(out, k->btree.id);
		break;
	}
}
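
/*
 * Typical usage, as in the tracepoint and fsck paths below - a sketch:
 *
 *	struct printbuf buf = PRINTBUF;
 *
 *	bch2_accounting_key_to_text(&buf, &acc_k);
 *	pr_info("%s", buf.buf);
 *	printbuf_exit(&buf);
 */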

void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k);
	struct disk_accounting_pos acc_k;
	bpos_to_disk_accounting_pos(&acc_k, k.k->p);

	bch2_accounting_key_to_text(out, &acc_k);

	for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++)
		prt_printf(out, " %lli", acc.v->d[i]);
}

void bch2_accounting_swab(struct bkey_s k)
{
	for (u64 *p = (u64 *) k.v;
	     p < (u64 *) bkey_val_end(k);
	     p++)
		*p = swab64(*p);
}

static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
					    struct disk_accounting_pos *acc)
{
	unsafe_memcpy(r, &acc->replicas,
		      replicas_entry_bytes(&acc->replicas),
		      "variable length struct");
}

static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
{
	struct disk_accounting_pos acc_k;
	bpos_to_disk_accounting_pos(&acc_k, p);

	switch (acc_k.type) {
	case BCH_DISK_ACCOUNTING_replicas:
		__accounting_to_replicas(r, &acc_k);
		return true;
	default:
		return false;
	}
}

static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
{
	union bch_replicas_padded r;
	return accounting_to_replicas(&r.e, p)
		? bch2_mark_replicas(c, &r.e)
		: 0;
}

/*
 * Ensure accounting keys being updated are present in the superblock, when
 * applicable (i.e. replicas updates)
 */
int bch2_accounting_update_sb(struct btree_trans *trans)
{
	for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
	     i != btree_trans_subbuf_top(trans, &trans->accounting);
	     i = bkey_next(i)) {
		int ret = bch2_accounting_update_sb_one(trans->c, i->k.p);
		if (ret)
			return ret;
	}

	return 0;
}

static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a)
{
	struct bch_accounting_mem *acc = &c->accounting;

	/* raced with another insert, already present: */
	if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
			    accounting_pos_cmp, &a.k->p) < acc->k.nr)
		return 0;

	struct accounting_mem_entry n = {
		.pos		= a.k->p,
		.bversion	= a.k->bversion,
		.nr_counters	= bch2_accounting_counters(a.k),
		.v[0]		= __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
						     sizeof(u64), GFP_KERNEL),
	};

	if (!n.v[0])
		goto err;

	if (acc->gc_running) {
		n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
					    sizeof(u64), GFP_KERNEL);
		if (!n.v[1])
			goto err;
	}

	if (darray_push(&acc->k, n))
		goto err;

	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
			accounting_pos_cmp, NULL);

	if (trace_accounting_mem_insert_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_accounting_to_text(&buf, c, a.s_c);
		trace_accounting_mem_insert(c, buf.buf);
		printbuf_exit(&buf);
	}
	return 0;
err:
	free_percpu(n.v[1]);
	free_percpu(n.v[0]);
	return bch_err_throw(c, ENOMEM_disk_accounting);
}

int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
			       enum bch_accounting_mode mode)
{
	union bch_replicas_padded r;

	if (mode != BCH_ACCOUNTING_read &&
	    accounting_to_replicas(&r.e, a.k->p) &&
	    !bch2_replicas_marked_locked(c, &r.e))
		return bch_err_throw(c, btree_insert_need_mark_replicas);

	/*
	 * percpu rwsems can't be upgraded, so drop the read lock before
	 * retaking it in write mode:
	 */
	percpu_up_read(&c->mark_lock);
	percpu_down_write(&c->mark_lock);
	int ret = __bch2_accounting_mem_insert(c, a);
	percpu_up_write(&c->mark_lock);
	percpu_down_read(&c->mark_lock);
	return ret;
}

int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a,
				      enum bch_accounting_mode mode)
{
	union bch_replicas_padded r;

	if (mode != BCH_ACCOUNTING_read &&
	    accounting_to_replicas(&r.e, a.k->p) &&
	    !bch2_replicas_marked_locked(c, &r.e))
		return bch_err_throw(c, btree_insert_need_mark_replicas);

	return __bch2_accounting_mem_insert(c, a);
}

static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
{
	for (unsigned i = 0; i < e->nr_counters; i++)
		if (percpu_u64_get(e->v[0] + i) ||
		    (e->v[1] &&
		     percpu_u64_get(e->v[1] + i)))
			return false;
	return true;
}

void bch2_accounting_mem_gc(struct bch_fs *c)
{
	struct bch_accounting_mem *acc = &c->accounting;

	percpu_down_write(&c->mark_lock);
	struct accounting_mem_entry *dst = acc->k.data;

	darray_for_each(acc->k, src) {
		if (accounting_mem_entry_is_zero(src)) {
			free_percpu(src->v[0]);
			free_percpu(src->v[1]);
		} else {
			*dst++ = *src;
		}
	}

	acc->k.nr = dst - acc->k.data;
	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
			accounting_pos_cmp, NULL);
	percpu_up_write(&c->mark_lock);
}

/*
 * Read out accounting keys for replicas entries, as an array of
 * bch_replicas_usage entries.
 *
 * Note: this may be deprecated/removed at some point in the future and replaced
 * with something more general, it exists to support the ioctl used by the
 * 'bcachefs fs usage' command.
 */
int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
{
	struct bch_accounting_mem *acc = &c->accounting;
	int ret = 0;

	darray_init(usage);

	percpu_down_read(&c->mark_lock);
	darray_for_each(acc->k, i) {
		union {
			u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs,
					       BCH_BKEY_PTRS_MAX)];
			struct bch_replicas_usage r;
		} u;
		u.r.r.nr_devs = BCH_BKEY_PTRS_MAX;

		if (!accounting_to_replicas(&u.r.r, i->pos))
			continue;

		u64 sectors;
		bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false);
		u.r.sectors = sectors;

		ret = darray_make_room(usage, replicas_usage_bytes(&u.r));
		if (ret)
			break;

		memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r));
		usage->nr += replicas_usage_bytes(&u.r);
	}
	percpu_up_read(&c->mark_lock);

	if (ret)
		darray_exit(usage);
	return ret;
}
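
/*
 * The entries written above are variable length, so a consumer has to step
 * through them by size - a sketch, assuming the replicas_usage_next() helper
 * from bcachefs_ioctl.h:
 *
 *	darray_char usage;
 *	int ret = bch2_fs_replicas_usage_read(c, &usage);
 *	if (!ret) {
 *		for (struct bch_replicas_usage *u = (void *) usage.data;
 *		     (void *) u < (void *) usage.data + usage.nr;
 *		     u = replicas_usage_next(u))
 *			pr_info("%llu sectors", u->sectors);
 *		darray_exit(&usage);
 *	}
 */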

int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask)
{
	struct bch_accounting_mem *acc = &c->accounting;
	int ret = 0;

	darray_init(out_buf);

	percpu_down_read(&c->mark_lock);
	darray_for_each(acc->k, i) {
		struct disk_accounting_pos a_p;
		bpos_to_disk_accounting_pos(&a_p, i->pos);

		if (!(accounting_types_mask & BIT(a_p.type)))
			continue;

		ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
				       sizeof(u64) * i->nr_counters);
		if (ret)
			break;

		struct bkey_i_accounting *a_out =
			bkey_accounting_init((void *) &darray_top(*out_buf));
		set_bkey_val_u64s(&a_out->k, i->nr_counters);
		a_out->k.p = i->pos;
		bch2_accounting_mem_read_counters(acc, i - acc->k.data,
						  a_out->v.d, i->nr_counters, false);

		if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out)))
			out_buf->nr += bkey_bytes(&a_out->k);
	}

	percpu_up_read(&c->mark_lock);

	if (ret)
		darray_exit(out_buf);
	return ret;
}

static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)
{
	darray_for_each(acc->k, e) {
		free_percpu(e->v[gc]);
		e->v[gc] = NULL;
	}
}

int bch2_gc_accounting_start(struct bch_fs *c)
{
	struct bch_accounting_mem *acc = &c->accounting;
	int ret = 0;

	percpu_down_write(&c->mark_lock);
	darray_for_each(acc->k, e) {
		e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64),
					     sizeof(u64), GFP_KERNEL);
		if (!e->v[1]) {
			bch2_accounting_free_counters(acc, true);
			ret = bch_err_throw(c, ENOMEM_disk_accounting);
			break;
		}
	}

	acc->gc_running = !ret;
	percpu_up_write(&c->mark_lock);

	return ret;
}

int bch2_gc_accounting_done(struct bch_fs *c)
{
	struct bch_accounting_mem *acc = &c->accounting;
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
	struct bpos pos = POS_MIN;
	int ret = 0;

	percpu_down_write(&c->mark_lock);
	while (1) {
		unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
						  accounting_pos_cmp, &pos);

		if (idx >= acc->k.nr)
			break;

		struct accounting_mem_entry *e = acc->k.data + idx;
		pos = bpos_successor(e->pos);

		struct disk_accounting_pos acc_k;
		bpos_to_disk_accounting_pos(&acc_k, e->pos);

		if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
			continue;

		u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS];
		u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS];

		unsigned nr = e->nr_counters;
		/* dst_v: the runtime counters; src_v: the counters gc recomputed */
		bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false);
		bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true);

		if (memcmp(dst_v, src_v, nr * sizeof(u64))) {
			printbuf_reset(&buf);
			prt_str(&buf, "accounting mismatch for ");
			bch2_accounting_key_to_text(&buf, &acc_k);

			prt_str(&buf, ":\n got");
			for (unsigned j = 0; j < nr; j++)
				prt_printf(&buf, " %llu", dst_v[j]);

			prt_str(&buf, "\nshould be");
			for (unsigned j = 0; j < nr; j++)
				prt_printf(&buf, " %llu", src_v[j]);

			for (unsigned j = 0; j < nr; j++)
				src_v[j] -= dst_v[j];

			bch2_trans_unlock_long(trans);

			if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) {
				percpu_up_write(&c->mark_lock);
				ret = commit_do(trans, NULL, NULL, 0,
						bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false));
				percpu_down_write(&c->mark_lock);
				if (ret)
					goto err;

				if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
					memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
					struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;

					accounting_key_init(&k_i.k, &acc_k, src_v, nr);
					bch2_accounting_mem_mod_locked(trans,
								       bkey_i_to_s_c_accounting(&k_i.k),
								       BCH_ACCOUNTING_normal, true);

					preempt_disable();
					struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
					struct bch_fs_usage_base *src = &trans->fs_usage_delta;
					acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
					preempt_enable();
				}
			}
		}
	}
err:
fsck_err:
	percpu_up_write(&c->mark_lock);
	printbuf_exit(&buf);
	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}
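
/*
 * Worked example of the fixup in bch2_gc_accounting_done(): if the runtime
 * counter reads 100 but gc recomputed 80, the delta committed via
 * bch2_disk_accounting_mod() is 80 - 100 = -20, bringing both the btree key
 * and the in-memory counter back to the gc value.
 */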

static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;

	if (k.k->type != KEY_TYPE_accounting)
		return 0;

	percpu_down_read(&c->mark_lock);
	int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
						 BCH_ACCOUNTING_read, false);
	percpu_up_read(&c->mark_lock);
	return ret;
}

static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
					      struct disk_accounting_pos *acc,
					      u64 *v, unsigned nr)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0, invalid_dev = -1;

	switch (acc->type) {
	case BCH_DISK_ACCOUNTING_replicas: {
		union bch_replicas_padded r;
		__accounting_to_replicas(&r.e, acc);

		for (unsigned i = 0; i < r.e.nr_devs; i++)
			if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
			    !bch2_dev_exists(c, r.e.devs[i])) {
				invalid_dev = r.e.devs[i];
				goto invalid_device;
			}

		/*
		 * All replicas entry checks except for invalid device are done
		 * in bch2_accounting_validate
		 */
		BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));

		if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
				trans, accounting_replicas_not_marked,
				"accounting not marked in superblock replicas\n%s",
				(printbuf_reset(&buf),
				 bch2_accounting_key_to_text(&buf, acc),
				 buf.buf))) {
			/*
			 * We're not RW yet and still single threaded, dropping
			 * and retaking lock is ok:
			 */
			percpu_up_write(&c->mark_lock);
			ret = bch2_mark_replicas(c, &r.e);
			if (ret)
				goto fsck_err;
			percpu_down_write(&c->mark_lock);
		}
		break;
	}

	case BCH_DISK_ACCOUNTING_dev_data_type:
		if (!bch2_dev_exists(c, acc->dev_data_type.dev)) {
			invalid_dev = acc->dev_data_type.dev;
			goto invalid_device;
		}
		break;
	}

fsck_err:
	printbuf_exit(&buf);
	return ret;
invalid_device:
	if (fsck_err(trans, accounting_to_invalid_device,
		     "accounting entry points to invalid device %i\n%s",
		     invalid_dev,
		     (printbuf_reset(&buf),
		      bch2_accounting_key_to_text(&buf, acc),
		      buf.buf))) {
		for (unsigned i = 0; i < nr; i++)
			v[i] = -v[i];

		ret = commit_do(trans, NULL, NULL, 0,
				bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?:
			-BCH_ERR_remove_disk_accounting_entry;
	} else {
		ret = bch_err_throw(c, remove_disk_accounting_entry);
	}
	goto fsck_err;
}
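
/*
 * Worked example of the invalid_device path above: an entry pointing at a
 * removed device with counters { 10, 20, 30 } has { -10, -20, -30 } committed
 * on top of it, zeroing the persistent counters;
 * -BCH_ERR_remove_disk_accounting_entry then tells the caller
 * (bch2_accounting_read(), below) to drop the in-memory entry as well.
 */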

/*
 * At startup time, initialize the in memory accounting from the btree (and
 * journal)
 */
int bch2_accounting_read(struct bch_fs *c)
{
	struct bch_accounting_mem *acc = &c->accounting;
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;

	/*
	 * We might run more than once if we rewind to start topology repair or
	 * btree node scan - and those might cause us to get different results,
	 * so we can't just skip if we've already run.
	 *
	 * Instead, zero out any accounting we have:
	 */
	percpu_down_write(&c->mark_lock);
	darray_for_each(acc->k, e)
		percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
	for_each_member_device(c, ca)
		percpu_memset(ca->usage, 0, sizeof(*ca->usage));
	percpu_memset(c->usage, 0, sizeof(*c->usage));
	percpu_up_write(&c->mark_lock);

	struct btree_iter iter;
	bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
			     BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
	iter.flags &= ~BTREE_ITER_with_journal;
	int ret = for_each_btree_key_continue(trans, iter,
				 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
			struct bkey u;
			struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);

			if (k.k->type != KEY_TYPE_accounting)
				continue;

			struct disk_accounting_pos acc_k;
			bpos_to_disk_accounting_pos(&acc_k, k.k->p);

			if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
				break;

			if (!bch2_accounting_is_mem(&acc_k)) {
				struct disk_accounting_pos next;
				memset(&next, 0, sizeof(next));
				next.type = acc_k.type + 1;
				bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
				continue;
			}

			accounting_read_key(trans, k);
		}));
	if (ret)
		goto err;

	struct journal_keys *keys = &c->journal_keys;
	struct journal_key *dst = keys->data;
	move_gap(keys, keys->nr);

	darray_for_each(*keys, i) {
		if (i->k->k.type == KEY_TYPE_accounting) {
			struct disk_accounting_pos acc_k;
			bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);

			if (!bch2_accounting_is_mem(&acc_k))
				continue;

			struct bkey_s_c k = bkey_i_to_s_c(i->k);
			unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
						       sizeof(acc->k.data[0]),
						       accounting_pos_cmp, &k.k->p);

			bool applied = idx < acc->k.nr &&
				bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;

			if (applied)
				continue;

			/*
			 * Adjacent journal keys at the same position:
			 * accumulate this key into the next (newer) one, so
			 * the combined delta is applied exactly once:
			 */
			if (i + 1 < &darray_top(*keys) &&
			    i[1].k->k.type == KEY_TYPE_accounting &&
			    !journal_key_cmp(i, i + 1)) {
				WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);

				i[1].journal_seq = i[0].journal_seq;

				bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k),
							   bkey_s_c_to_accounting(k));
				continue;
			}

			ret = accounting_read_key(trans, k);
			if (ret)
				goto err;
		}

		*dst++ = *i;
	}
	keys->gap = keys->nr = dst - keys->data;

	percpu_down_write(&c->mark_lock);

	darray_for_each_reverse(acc->k, i) {
		struct disk_accounting_pos acc_k;
		bpos_to_disk_accounting_pos(&acc_k, i->pos);

		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
		memset(v, 0, sizeof(v));

		for (unsigned j = 0; j < i->nr_counters; j++)
			v[j] = percpu_u64_get(i->v[0] + j);

		/*
		 * If the entry counters are zeroed, it should be treated as
		 * nonexistent - it might point to an invalid device.
		 *
		 * Remove it, so that if it's re-added it gets re-marked in the
		 * superblock:
		 */
		ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
			? -BCH_ERR_remove_disk_accounting_entry
			: bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);

		if (ret == -BCH_ERR_remove_disk_accounting_entry) {
			free_percpu(i->v[0]);
			free_percpu(i->v[1]);
			darray_remove_item(&acc->k, i);
			ret = 0;
			continue;
		}

		if (ret)
			goto fsck_err;
	}

	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
			accounting_pos_cmp, NULL);

	preempt_disable();
	struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);

	for (unsigned i = 0; i < acc->k.nr; i++) {
		struct disk_accounting_pos k;
		bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);

		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
		bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);

		switch (k.type) {
		case BCH_DISK_ACCOUNTING_persistent_reserved:
			usage->reserved += v[0] * k.persistent_reserved.nr_replicas;
			break;
		case BCH_DISK_ACCOUNTING_replicas:
			fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
			break;
		case BCH_DISK_ACCOUNTING_dev_data_type: {
			guard(rcu)();
			struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
			if (ca) {
				struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
				percpu_u64_set(&d->buckets, v[0]);
				percpu_u64_set(&d->sectors, v[1]);
				percpu_u64_set(&d->fragmented, v[2]);

				if (k.dev_data_type.data_type == BCH_DATA_sb ||
				    k.dev_data_type.data_type == BCH_DATA_journal)
					usage->hidden += v[0] * ca->mi.bucket_size;
			}
			break;
		}
		}
	}
	preempt_enable();
fsck_err:
	percpu_up_write(&c->mark_lock);
err:
	printbuf_exit(&buf);
	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}

int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev)
{
	return bch2_trans_run(c,
		bch2_btree_write_buffer_flush_sync(trans) ?:
		for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN,
				BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({
			struct disk_accounting_pos acc;
			bpos_to_disk_accounting_pos(&acc, k.k->p);

			acc.type == BCH_DISK_ACCOUNTING_dev_data_type &&
			acc.dev_data_type.dev == dev
				? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0)
				: 0;
		})) ?:
		bch2_btree_write_buffer_flush_sync(trans));
}

int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
{
	struct bch_fs *c = ca->fs;
	u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };

	int ret = bch2_trans_do(c, ({
		bch2_disk_accounting_mod2(trans, gc,
					  v, dev_data_type,
					  .dev		= ca->dev_idx,
					  .data_type	= BCH_DATA_free) ?:
		(!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0);
	}));
	bch_err_fn(c, ret);
	return ret;
}
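
/*
 * The three dev_data_type counters are (buckets, sectors, fragmented), per the
 * apply loop in bch2_accounting_read() above. So for a hypothetical device
 * with nbuckets=1024 and first_bucket=16, bch2_dev_usage_init() records
 * 1008 free buckets, 0 sectors, 0 fragmented.
 */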

void bch2_verify_accounting_clean(struct bch_fs *c)
{
	bool mismatch = false;
	struct bch_fs_usage_base base = {}, base_inmem = {};

	bch2_trans_run(c,
		for_each_btree_key(trans, iter,
				   BTREE_ID_accounting, POS_MIN,
				   BTREE_ITER_all_snapshots, k, ({
			u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
			struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k);
			unsigned nr = bch2_accounting_counters(k.k);

			struct disk_accounting_pos acc_k;
			bpos_to_disk_accounting_pos(&acc_k, k.k->p);

			if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
				break;

			if (!bch2_accounting_is_mem(&acc_k)) {
				struct disk_accounting_pos next;
				memset(&next, 0, sizeof(next));
				next.type = acc_k.type + 1;
				bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
				continue;
			}

			bch2_accounting_mem_read(c, k.k->p, v, nr);

			if (memcmp(a.v->d, v, nr * sizeof(u64))) {
				struct printbuf buf = PRINTBUF;

				bch2_bkey_val_to_text(&buf, c, k);
				prt_str(&buf, " !=");
				for (unsigned j = 0; j < nr; j++)
					prt_printf(&buf, " %llu", v[j]);

				pr_err("%s", buf.buf);
				printbuf_exit(&buf);
				mismatch = true;
			}

			switch (acc_k.type) {
			case BCH_DISK_ACCOUNTING_persistent_reserved:
				base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
				break;
			case BCH_DISK_ACCOUNTING_replicas:
				fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]);
				break;
			case BCH_DISK_ACCOUNTING_dev_data_type:
			{
				guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */
				struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
				if (!ca)
					continue;

				v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
				v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
				v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
			}

			if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
				struct printbuf buf = PRINTBUF;

				bch2_bkey_val_to_text(&buf, c, k);
				prt_str(&buf, " in mem");
				for (unsigned j = 0; j < nr; j++)
					prt_printf(&buf, " %llu", v[j]);

				pr_err("dev accounting mismatch: %s", buf.buf);
				printbuf_exit(&buf);
				mismatch = true;
			}
			}

			0;
		})));

	acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64));

#define check(x)									\
	if (base.x != base_inmem.x) {							\
		pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x); \
		mismatch = true;							\
	}

	//check(hidden);
	check(btree);
	check(data);
	check(cached);
	check(reserved);
	check(nr_inodes);

	WARN_ON(mismatch);
}

void bch2_accounting_gc_free(struct bch_fs *c)
{
	lockdep_assert_held(&c->mark_lock);

	struct bch_accounting_mem *acc = &c->accounting;

	bch2_accounting_free_counters(acc, true);
	acc->gc_running = false;
}

void bch2_fs_accounting_exit(struct bch_fs *c)
{
	struct bch_accounting_mem *acc = &c->accounting;

	bch2_accounting_free_counters(acc, false);
	darray_exit(&acc->k);
}