// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bcachefs_ioctl.h"
#include "btree_cache.h"
#include "btree_journal_iter.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "compress.h"
#include "disk_accounting.h"
#include "error.h"
#include "journal_io.h"
#include "replicas.h"

/*
 * Notes on disk accounting:
 *
 * We have two parallel sets of counters to be concerned with, and both must be
 * kept in sync.
 *
 * - Persistent/on disk accounting, stored in the accounting btree and updated
 *   via btree write buffer updates that treat new accounting keys as deltas to
 *   apply to existing values. But reading from a write buffer btree is
 *   expensive, so we also have
 *
 * - In memory accounting, where accounting is stored as an array of percpu
 *   counters, indexed by an eytzinger array of disk accounting keys/bpos (which
 *   are the same thing, excepting byte swabbing on big endian).
 *
 *   Cheap to read, but non persistent.
 *
 * Disk accounting updates are generated by transactional triggers; these run as
 * keys enter and leave the btree, and can compare old and new versions of keys;
 * the output of these triggers is deltas to the various counters.
 *
 * Disk accounting updates are done as btree write buffer updates, where the
 * counters in the disk accounting key are deltas that will be applied to the
 * counter in the btree when the key is flushed by the write buffer (or journal
 * replay).
 *
 * To do a disk accounting update:
 * - initialize a disk_accounting_pos, to specify which counter is being updated
 * - initialize counter deltas, as an array of 1-3 s64s
 * - call bch2_disk_accounting_mod()
 *
 * This queues up the accounting update to be done at transaction commit time.
 * Underneath, it's a normal btree write buffer update.
 *
 * The transaction commit path is responsible for propagating updates to the in
 * memory counters, with bch2_accounting_mem_mod().
 *
 * The commit path also assigns every disk accounting update a unique version
 * number, based on the journal sequence number and offset within that journal
 * buffer; this is used by journal replay to determine which updates have been
 * done.
 *
 * The transaction commit path also ensures that replicas entry accounting
 * updates are properly marked in the superblock (so that we know whether we can
 * mount without data being unavailable); it will update the superblock if
 * bch2_accounting_mem_mod() tells it to.
 */
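
/*
 * For illustration only - a minimal sketch of the update sequence described
 * above, bumping the inode counter by one from inside a btree transaction
 * (the counter type and helpers are real; the enclosing transaction context
 * is assumed):
 *
 *	struct disk_accounting_pos acc;
 *	memset(&acc, 0, sizeof(acc));
 *	acc.type = BCH_DISK_ACCOUNTING_nr_inodes;
 *
 *	s64 d[1] = { 1 };
 *	int ret = bch2_disk_accounting_mod(trans, &acc, d, 1, false);
 *
 * The delta is applied to both counter sets at transaction commit time; the
 * in memory copy can then be read back cheaply with bch2_accounting_mem_read().
 */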

static const char * const disk_accounting_type_strs[] = {
#define x(t, n, ...) [n] = #t,
	BCH_DISK_ACCOUNTING_TYPES()
#undef x
	NULL
};

static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos,
					 s64 *d, unsigned nr)
{
	struct bkey_i_accounting *acc = bkey_accounting_init(k);

	acc->k.p = pos;
	set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr);

	memcpy_u64s_small(acc->v.d, d, nr);
}

static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos,
				       s64 *d, unsigned nr)
{
	return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr);
}

static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos);

int bch2_disk_accounting_mod(struct btree_trans *trans,
			     struct disk_accounting_pos *k,
			     s64 *d, unsigned nr, bool gc)
{
	BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS);

	/* Normalize: */
	switch (k->type) {
	case BCH_DISK_ACCOUNTING_replicas:
		bubble_sort(k->replicas.devs, k->replicas.nr_devs, u8_cmp);
		break;
	}

	struct bpos pos = disk_accounting_pos_to_bpos(k);

	if (likely(!gc)) {
		struct bkey_i_accounting *a;
#if 0
		for (a = btree_trans_subbuf_base(trans, &trans->accounting);
		     a != btree_trans_subbuf_top(trans, &trans->accounting);
		     a = (void *) bkey_next(&a->k_i))
			if (bpos_eq(a->k.p, pos)) {
				BUG_ON(nr != bch2_accounting_counters(&a->k));
				acc_u64s(a->v.d, d, nr);

				if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) {
					unsigned offset = (u64 *) a -
						(u64 *) btree_trans_subbuf_base(trans, &trans->accounting);

					trans->accounting.u64s -= a->k.u64s;
					memmove_u64s_down(a,
							  bkey_next(&a->k_i),
							  trans->accounting.u64s - offset);
				}
				return 0;
			}
#endif
		unsigned u64s = sizeof(*a) / sizeof(u64) + nr;
		a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s);
		int ret = PTR_ERR_OR_ZERO(a);
		if (ret)
			return ret;

		__accounting_key_init(&a->k_i, pos, d, nr);
		return 0;
	} else {
		struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;

		__accounting_key_init(&k_i.k, pos, d, nr);

		int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
		if (ret == -BCH_ERR_btree_insert_need_mark_replicas)
			ret = drop_locks_do(trans,
				bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?:
				bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true);
		return ret;
	}
}

int bch2_mod_dev_cached_sectors(struct btree_trans *trans,
				unsigned dev, s64 sectors,
				bool gc)
{
	struct disk_accounting_pos acc;
	memset(&acc, 0, sizeof(acc));
	acc.type = BCH_DISK_ACCOUNTING_replicas;
	bch2_replicas_entry_cached(&acc.replicas, dev);

	return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc);
}

static inline bool is_zero(char *start, char *end)
{
	BUG_ON(start > end);

	for (; start < end; start++)
		if (*start)
			return false;
	return true;
}

#define field_end(p, member)	(((void *) (&p.member)) + sizeof(p.member))

static const unsigned bch2_accounting_type_nr_counters[] = {
#define x(f, id, nr)	[BCH_DISK_ACCOUNTING_##f] = nr,
	BCH_DISK_ACCOUNTING_TYPES()
#undef x
};

int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k,
			     struct bkey_validate_context from)
{
	struct disk_accounting_pos acc_k;
	bpos_to_disk_accounting_pos(&acc_k, k.k->p);
	void *end = &acc_k + 1;
	int ret = 0;

	bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) &&
			 bversion_zero(k.k->bversion),
			 c, accounting_key_version_0,
			 "accounting key with version=0");

	switch (acc_k.type) {
	case BCH_DISK_ACCOUNTING_nr_inodes:
		end = field_end(acc_k, nr_inodes);
		break;
	case BCH_DISK_ACCOUNTING_persistent_reserved:
		end = field_end(acc_k, persistent_reserved);
		break;
	case BCH_DISK_ACCOUNTING_replicas:
		bkey_fsck_err_on(!acc_k.replicas.nr_devs,
				 c, accounting_key_replicas_nr_devs_0,
				 "accounting key replicas entry with nr_devs=0");

		bkey_fsck_err_on(acc_k.replicas.nr_required > acc_k.replicas.nr_devs ||
				 (acc_k.replicas.nr_required > 1 &&
				  acc_k.replicas.nr_required == acc_k.replicas.nr_devs),
				 c, accounting_key_replicas_nr_required_bad,
				 "accounting key replicas entry with bad nr_required");

		for (unsigned i = 0; i + 1 < acc_k.replicas.nr_devs; i++)
			bkey_fsck_err_on(acc_k.replicas.devs[i] >= acc_k.replicas.devs[i + 1],
					 c, accounting_key_replicas_devs_unsorted,
					 "accounting key replicas entry with unsorted devs");

		end = (void *) &acc_k.replicas + replicas_entry_bytes(&acc_k.replicas);
		break;
	case BCH_DISK_ACCOUNTING_dev_data_type:
		end = field_end(acc_k, dev_data_type);
		break;
	case BCH_DISK_ACCOUNTING_compression:
		end = field_end(acc_k, compression);
		break;
	case BCH_DISK_ACCOUNTING_snapshot:
		end = field_end(acc_k, snapshot);
		break;
	case BCH_DISK_ACCOUNTING_btree:
		end = field_end(acc_k, btree);
		break;
	case BCH_DISK_ACCOUNTING_rebalance_work:
		end = field_end(acc_k, rebalance_work);
		break;
	}

	bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)),
			 c, accounting_key_junk_at_end,
			 "junk at end of accounting key");

	bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type],
			 c, accounting_key_nr_counters_wrong,
			 "accounting key with %u counters, should be %u",
			 bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]);
fsck_err:
	return ret;
}

void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k)
{
	if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) {
		prt_printf(out, "unknown type %u", k->type);
		return;
	}

	prt_str(out, disk_accounting_type_strs[k->type]);
	prt_str(out, " ");

	switch (k->type) {
	case BCH_DISK_ACCOUNTING_nr_inodes:
		break;
	case BCH_DISK_ACCOUNTING_persistent_reserved:
		prt_printf(out, "replicas=%u", k->persistent_reserved.nr_replicas);
		break;
	case BCH_DISK_ACCOUNTING_replicas:
		bch2_replicas_entry_to_text(out, &k->replicas);
		break;
	case BCH_DISK_ACCOUNTING_dev_data_type:
		prt_printf(out, "dev=%u data_type=", k->dev_data_type.dev);
		bch2_prt_data_type(out, k->dev_data_type.data_type);
		break;
	case BCH_DISK_ACCOUNTING_compression:
		bch2_prt_compression_type(out, k->compression.type);
		break;
	case BCH_DISK_ACCOUNTING_snapshot:
		prt_printf(out, "id=%u", k->snapshot.id);
		break;
	case BCH_DISK_ACCOUNTING_btree:
		prt_str(out, "btree=");
		bch2_btree_id_to_text(out, k->btree.id);
		break;
	}
}

void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_accounting acc = bkey_s_c_to_accounting(k);
	struct disk_accounting_pos acc_k;
	bpos_to_disk_accounting_pos(&acc_k, k.k->p);

	bch2_accounting_key_to_text(out, &acc_k);

	for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++)
		prt_printf(out, " %lli", acc.v->d[i]);
}

void bch2_accounting_swab(struct bkey_s k)
{
	for (u64 *p = (u64 *) k.v;
	     p < (u64 *) bkey_val_end(k);
	     p++)
		*p = swab64(*p);
}

static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
					    struct disk_accounting_pos *acc)
{
	unsafe_memcpy(r, &acc->replicas,
		      replicas_entry_bytes(&acc->replicas),
		      "variable length struct");
}

static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
{
	struct disk_accounting_pos acc_k;
	bpos_to_disk_accounting_pos(&acc_k, p);

	switch (acc_k.type) {
	case BCH_DISK_ACCOUNTING_replicas:
		__accounting_to_replicas(r, &acc_k);
		return true;
	default:
		return false;
	}
}

static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p)
{
	union bch_replicas_padded r;
	return accounting_to_replicas(&r.e, p)
		? bch2_mark_replicas(c, &r.e)
		: 0;
}

/*
 * Ensure accounting keys being updated are present in the superblock, when
 * applicable (i.e. replicas updates)
 */
int bch2_accounting_update_sb(struct btree_trans *trans)
{
	for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
	     i != btree_trans_subbuf_top(trans, &trans->accounting);
	     i = bkey_next(i)) {
		int ret = bch2_accounting_update_sb_one(trans->c, i->k.p);
		if (ret)
			return ret;
	}

	return 0;
}

static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a)
{
	struct bch_accounting_mem *acc = &c->accounting;

	/* raced with another insert, already present: */
	if (eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
			    accounting_pos_cmp, &a.k->p) < acc->k.nr)
		return 0;

	struct accounting_mem_entry n = {
		.pos		= a.k->p,
		.bversion	= a.k->bversion,
		.nr_counters	= bch2_accounting_counters(a.k),
		.v[0]		= __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
						     sizeof(u64), GFP_KERNEL),
	};

	if (!n.v[0])
		goto err;

	if (acc->gc_running) {
		n.v[1] = __alloc_percpu_gfp(n.nr_counters * sizeof(u64),
					    sizeof(u64), GFP_KERNEL);
		if (!n.v[1])
			goto err;
	}

	if (darray_push(&acc->k, n))
		goto err;

	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
			accounting_pos_cmp, NULL);

	if (trace_accounting_mem_insert_enabled()) {
		struct printbuf buf = PRINTBUF;

		bch2_accounting_to_text(&buf, c, a.s_c);
		trace_accounting_mem_insert(c, buf.buf);
		printbuf_exit(&buf);
	}
	return 0;
err:
	free_percpu(n.v[1]);
	free_percpu(n.v[0]);
	return bch_err_throw(c, ENOMEM_disk_accounting);
}

int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a,
			       enum bch_accounting_mode mode)
{
	union bch_replicas_padded r;

	if (mode != BCH_ACCOUNTING_read &&
	    accounting_to_replicas(&r.e, a.k->p) &&
	    !bch2_replicas_marked_locked(c, &r.e))
		return bch_err_throw(c, btree_insert_need_mark_replicas);

	/* caller holds mark_lock for read; upgrade to write for the insert: */
	percpu_up_read(&c->mark_lock);
	percpu_down_write(&c->mark_lock);
	int ret = __bch2_accounting_mem_insert(c, a);
	percpu_up_write(&c->mark_lock);
	percpu_down_read(&c->mark_lock);
	return ret;
}

int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a,
				      enum bch_accounting_mode mode)
{
	union bch_replicas_padded r;

	if (mode != BCH_ACCOUNTING_read &&
	    accounting_to_replicas(&r.e, a.k->p) &&
	    !bch2_replicas_marked_locked(c, &r.e))
		return bch_err_throw(c, btree_insert_need_mark_replicas);

	return __bch2_accounting_mem_insert(c, a);
}

static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
{
	for (unsigned i = 0; i < e->nr_counters; i++)
		if (percpu_u64_get(e->v[0] + i) ||
		    (e->v[1] &&
		     percpu_u64_get(e->v[1] + i)))
			return false;
	return true;
}

void bch2_accounting_mem_gc(struct bch_fs *c)
{
	struct bch_accounting_mem *acc = &c->accounting;

	percpu_down_write(&c->mark_lock);
	struct accounting_mem_entry *dst = acc->k.data;

	darray_for_each(acc->k, src) {
		if (accounting_mem_entry_is_zero(src)) {
			free_percpu(src->v[0]);
			free_percpu(src->v[1]);
		} else {
			*dst++ = *src;
		}
	}

	acc->k.nr = dst - acc->k.data;
	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
			accounting_pos_cmp, NULL);
	percpu_up_write(&c->mark_lock);
}

/*
 * Read out accounting keys for replicas entries, as an array of
 * bch_replicas_usage entries.
 *
 * Note: this may be deprecated/removed at some point in the future and replaced
 * with something more general; it exists to support the ioctl used by the
 * 'bcachefs fs usage' command.
 */
int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
{
	struct bch_accounting_mem *acc = &c->accounting;
	int ret = 0;

	darray_init(usage);

	percpu_down_read(&c->mark_lock);
	darray_for_each(acc->k, i) {
		union {
			u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs,
					       BCH_BKEY_PTRS_MAX)];
			struct bch_replicas_usage r;
		} u;
		u.r.r.nr_devs = BCH_BKEY_PTRS_MAX;

		if (!accounting_to_replicas(&u.r.r, i->pos))
			continue;

		u64 sectors;
		bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false);
		u.r.sectors = sectors;

		ret = darray_make_room(usage, replicas_usage_bytes(&u.r));
		if (ret)
			break;

		memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r));
		usage->nr += replicas_usage_bytes(&u.r);
	}
	percpu_up_read(&c->mark_lock);

	if (ret)
		darray_exit(usage);
	return ret;
}

int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask)
{
	struct bch_accounting_mem *acc = &c->accounting;
	int ret = 0;

	darray_init(out_buf);

	percpu_down_read(&c->mark_lock);
	darray_for_each(acc->k, i) {
		struct disk_accounting_pos a_p;
		bpos_to_disk_accounting_pos(&a_p, i->pos);

		if (!(accounting_types_mask & BIT(a_p.type)))
			continue;

		ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
				       sizeof(u64) * i->nr_counters);
		if (ret)
			break;

		struct bkey_i_accounting *a_out =
			bkey_accounting_init((void *) &darray_top(*out_buf));
		set_bkey_val_u64s(&a_out->k, i->nr_counters);
		a_out->k.p = i->pos;
		bch2_accounting_mem_read_counters(acc, i - acc->k.data,
						  a_out->v.d, i->nr_counters, false);

		if (!bch2_accounting_key_is_zero(accounting_i_to_s_c(a_out)))
			out_buf->nr += bkey_bytes(&a_out->k);
	}

	percpu_up_read(&c->mark_lock);

	if (ret)
		darray_exit(out_buf);
	return ret;
}
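
/*
 * For illustration only - a hedged sketch of how the read-out helpers above
 * might be consumed, as in the ioctl path behind 'bcachefs fs usage' (error
 * handling trimmed; the loop body is just a placeholder):
 *
 *	darray_char usage;
 *	int ret = bch2_fs_replicas_usage_read(c, &usage);
 *	if (ret)
 *		return ret;
 *
 *	// each entry pairs a replicas entry with its sector count:
 *	for (struct bch_replicas_usage *u = (void *) usage.data;
 *	     (void *) u < (void *) usage.data + usage.nr;
 *	     u = (void *) u + replicas_usage_bytes(u))
 *		pr_info("%llu sectors", u->sectors);
 *
 *	darray_exit(&usage);
 */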

static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)
{
	darray_for_each(acc->k, e) {
		free_percpu(e->v[gc]);
		e->v[gc] = NULL;
	}
}

int bch2_gc_accounting_start(struct bch_fs *c)
{
	struct bch_accounting_mem *acc = &c->accounting;
	int ret = 0;

	percpu_down_write(&c->mark_lock);
	darray_for_each(acc->k, e) {
		e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64),
					     sizeof(u64), GFP_KERNEL);
		if (!e->v[1]) {
			bch2_accounting_free_counters(acc, true);
			ret = bch_err_throw(c, ENOMEM_disk_accounting);
			break;
		}
	}

	acc->gc_running = !ret;
	percpu_up_write(&c->mark_lock);

	return ret;
}

int bch2_gc_accounting_done(struct bch_fs *c)
{
	struct bch_accounting_mem *acc = &c->accounting;
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;
	struct bpos pos = POS_MIN;
	int ret = 0;

	percpu_down_write(&c->mark_lock);
	while (1) {
		unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
						  accounting_pos_cmp, &pos);

		if (idx >= acc->k.nr)
			break;

		struct accounting_mem_entry *e = acc->k.data + idx;
		pos = bpos_successor(e->pos);

		struct disk_accounting_pos acc_k;
		bpos_to_disk_accounting_pos(&acc_k, e->pos);

		if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
			continue;

		u64 src_v[BCH_ACCOUNTING_MAX_COUNTERS];
		u64 dst_v[BCH_ACCOUNTING_MAX_COUNTERS];

		unsigned nr = e->nr_counters;
		bch2_accounting_mem_read_counters(acc, idx, dst_v, nr, false);
		bch2_accounting_mem_read_counters(acc, idx, src_v, nr, true);

		if (memcmp(dst_v, src_v, nr * sizeof(u64))) {
			printbuf_reset(&buf);
			prt_str(&buf, "accounting mismatch for ");
			bch2_accounting_key_to_text(&buf, &acc_k);

			prt_str(&buf, ":\n      got");
			for (unsigned j = 0; j < nr; j++)
				prt_printf(&buf, " %llu", dst_v[j]);

			prt_str(&buf, "\nshould be");
			for (unsigned j = 0; j < nr; j++)
				prt_printf(&buf, " %llu", src_v[j]);

			for (unsigned j = 0; j < nr; j++)
				src_v[j] -= dst_v[j];

			if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) {
				percpu_up_write(&c->mark_lock);
				ret = commit_do(trans, NULL, NULL, 0,
						bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false));
				percpu_down_write(&c->mark_lock);
				if (ret)
					goto err;

				if (!test_bit(BCH_FS_may_go_rw, &c->flags)) {
					memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
					struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i;

					accounting_key_init(&k_i.k, &acc_k, src_v, nr);
					bch2_accounting_mem_mod_locked(trans,
								bkey_i_to_s_c_accounting(&k_i.k),
								BCH_ACCOUNTING_normal, true);

					preempt_disable();
					struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage);
					struct bch_fs_usage_base *src = &trans->fs_usage_delta;
					acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64));
					preempt_enable();
				}
			}
		}
	}
err:
fsck_err:
	percpu_up_write(&c->mark_lock);
	printbuf_exit(&buf);
	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}

static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;

	if (k.k->type != KEY_TYPE_accounting)
		return 0;

	percpu_down_read(&c->mark_lock);
	int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k),
						 BCH_ACCOUNTING_read, false);
	percpu_up_read(&c->mark_lock);
	return ret;
}

static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
					      struct disk_accounting_pos *acc,
					      u64 *v, unsigned nr)
{
	struct bch_fs *c = trans->c;
	struct printbuf buf = PRINTBUF;
	int ret = 0, invalid_dev = -1;

	switch (acc->type) {
	case BCH_DISK_ACCOUNTING_replicas: {
		union bch_replicas_padded r;
		__accounting_to_replicas(&r.e, acc);

		for (unsigned i = 0; i < r.e.nr_devs; i++)
			if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
			    !bch2_dev_exists(c, r.e.devs[i])) {
				invalid_dev = r.e.devs[i];
				goto invalid_device;
			}

		/*
		 * All replicas entry checks except for invalid device are done
		 * in bch2_accounting_validate
		 */
		BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));

		if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
				trans, accounting_replicas_not_marked,
				"accounting not marked in superblock replicas\n%s",
				(printbuf_reset(&buf),
				 bch2_accounting_key_to_text(&buf, acc),
				 buf.buf))) {
			/*
			 * We're not RW yet and still single threaded, dropping
			 * and retaking lock is ok:
			 */
			percpu_up_write(&c->mark_lock);
			ret = bch2_mark_replicas(c, &r.e);
			if (ret)
				goto fsck_err;
			percpu_down_write(&c->mark_lock);
		}
		break;
	}

	case BCH_DISK_ACCOUNTING_dev_data_type:
		if (!bch2_dev_exists(c, acc->dev_data_type.dev)) {
			invalid_dev = acc->dev_data_type.dev;
			goto invalid_device;
		}
		break;
	}

fsck_err:
	printbuf_exit(&buf);
	return ret;
invalid_device:
	if (fsck_err(trans, accounting_to_invalid_device,
		     "accounting entry points to invalid device %i\n%s",
		     invalid_dev,
		     (printbuf_reset(&buf),
		      bch2_accounting_key_to_text(&buf, acc),
		      buf.buf))) {
		for (unsigned i = 0; i < nr; i++)
			v[i] = -v[i];

		ret = commit_do(trans, NULL, NULL, 0,
				bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?:
			-BCH_ERR_remove_disk_accounting_entry;
	} else {
		ret = bch_err_throw(c, remove_disk_accounting_entry);
	}
	goto fsck_err;
}

/*
 * At startup time, initialize the in memory accounting from the btree (and
 * journal)
 */
int bch2_accounting_read(struct bch_fs *c)
{
	struct bch_accounting_mem *acc = &c->accounting;
	struct btree_trans *trans = bch2_trans_get(c);
	struct printbuf buf = PRINTBUF;

	/*
	 * We might run more than once if we rewind to start topology repair or
	 * btree node scan - and those might cause us to get different results,
	 * so we can't just skip if we've already run.
	 *
	 * Instead, zero out any accounting we have:
	 */
	percpu_down_write(&c->mark_lock);
	darray_for_each(acc->k, e)
		percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
	for_each_member_device(c, ca)
		percpu_memset(ca->usage, 0, sizeof(*ca->usage));
	percpu_memset(c->usage, 0, sizeof(*c->usage));
	percpu_up_write(&c->mark_lock);

	struct btree_iter iter;
	bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
			     BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
	iter.flags &= ~BTREE_ITER_with_journal;
	int ret = for_each_btree_key_continue(trans, iter,
				 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
		struct bkey u;
		struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u);

		if (k.k->type != KEY_TYPE_accounting)
			continue;

		struct disk_accounting_pos acc_k;
		bpos_to_disk_accounting_pos(&acc_k, k.k->p);

		if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
			break;

		if (!bch2_accounting_is_mem(&acc_k)) {
			struct disk_accounting_pos next;
			memset(&next, 0, sizeof(next));
			next.type = acc_k.type + 1;
			bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
			continue;
		}

		accounting_read_key(trans, k);
	}));
	if (ret)
		goto err;

	struct journal_keys *keys = &c->journal_keys;
	struct journal_key *dst = keys->data;
	move_gap(keys, keys->nr);

	darray_for_each(*keys, i) {
		if (i->k->k.type == KEY_TYPE_accounting) {
			struct disk_accounting_pos acc_k;
			bpos_to_disk_accounting_pos(&acc_k, i->k->k.p);

			if (!bch2_accounting_is_mem(&acc_k))
				continue;

			struct bkey_s_c k = bkey_i_to_s_c(i->k);
			unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr,
						       sizeof(acc->k.data[0]),
						       accounting_pos_cmp, &k.k->p);

			bool applied = idx < acc->k.nr &&
				bversion_cmp(acc->k.data[idx].bversion, k.k->bversion) >= 0;

			if (applied)
				continue;

			if (i + 1 < &darray_top(*keys) &&
			    i[1].k->k.type == KEY_TYPE_accounting &&
			    !journal_key_cmp(i, i + 1)) {
				WARN_ON(bversion_cmp(i[0].k->k.bversion, i[1].k->k.bversion) >= 0);

				i[1].journal_seq = i[0].journal_seq;

				bch2_accounting_accumulate(bkey_i_to_accounting(i[1].k),
							   bkey_s_c_to_accounting(k));
				continue;
			}

			ret = accounting_read_key(trans, k);
			if (ret)
				goto err;
		}

		*dst++ = *i;
	}
	keys->gap = keys->nr = dst - keys->data;

	percpu_down_write(&c->mark_lock);

	darray_for_each_reverse(acc->k, i) {
		struct disk_accounting_pos acc_k;
		bpos_to_disk_accounting_pos(&acc_k, i->pos);

		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
		memset(v, 0, sizeof(v));

		for (unsigned j = 0; j < i->nr_counters; j++)
			v[j] = percpu_u64_get(i->v[0] + j);

		/*
		 * If the entry counters are zeroed, it should be treated as
		 * nonexistent - it might point to an invalid device.
		 *
		 * Remove it, so that if it's re-added it gets re-marked in the
		 * superblock:
		 */
		ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
			? -BCH_ERR_remove_disk_accounting_entry
			: bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);

		if (ret == -BCH_ERR_remove_disk_accounting_entry) {
			free_percpu(i->v[0]);
			free_percpu(i->v[1]);
			darray_remove_item(&acc->k, i);
			ret = 0;
			continue;
		}

		if (ret)
			goto fsck_err;
	}

	eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
			accounting_pos_cmp, NULL);

	preempt_disable();
	struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage);

	for (unsigned i = 0; i < acc->k.nr; i++) {
		struct disk_accounting_pos k;
		bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);

		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
		bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);

		switch (k.type) {
		case BCH_DISK_ACCOUNTING_persistent_reserved:
			usage->reserved += v[0] * k.persistent_reserved.nr_replicas;
			break;
		case BCH_DISK_ACCOUNTING_replicas:
			fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]);
			break;
		case BCH_DISK_ACCOUNTING_dev_data_type: {
			guard(rcu)();
			struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev);
			if (ca) {
				struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type];
				percpu_u64_set(&d->buckets,	v[0]);
				percpu_u64_set(&d->sectors,	v[1]);
				percpu_u64_set(&d->fragmented,	v[2]);

				if (k.dev_data_type.data_type == BCH_DATA_sb ||
				    k.dev_data_type.data_type == BCH_DATA_journal)
					usage->hidden += v[0] * ca->mi.bucket_size;
			}
			break;
		}
		}
	}
	preempt_enable();
fsck_err:
	percpu_up_write(&c->mark_lock);
err:
	printbuf_exit(&buf);
	bch2_trans_put(trans);
	bch_err_fn(c, ret);
	return ret;
}

int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev)
{
	return bch2_trans_run(c,
		bch2_btree_write_buffer_flush_sync(trans) ?:
		for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN,
				BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({
			struct disk_accounting_pos acc;
			bpos_to_disk_accounting_pos(&acc, k.k->p);

			acc.type == BCH_DISK_ACCOUNTING_dev_data_type &&
			acc.dev_data_type.dev == dev
				? bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0)
				: 0;
		})) ?:
		bch2_btree_write_buffer_flush_sync(trans));
}

int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
{
	struct bch_fs *c = ca->fs;
	u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };

	int ret = bch2_trans_do(c, ({
		bch2_disk_accounting_mod2(trans, gc,
					  v, dev_data_type,
					  .dev = ca->dev_idx,
					  .data_type = BCH_DATA_free) ?:
		(!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0);
	}));
	bch_err_fn(c, ret);
	return ret;
}

void bch2_verify_accounting_clean(struct bch_fs *c)
{
	bool mismatch = false;
	struct bch_fs_usage_base base = {}, base_inmem = {};

	bch2_trans_run(c,
		for_each_btree_key(trans, iter,
				   BTREE_ID_accounting, POS_MIN,
				   BTREE_ITER_all_snapshots, k, ({
			u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
			struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k);
			unsigned nr = bch2_accounting_counters(k.k);

			struct disk_accounting_pos acc_k;
			bpos_to_disk_accounting_pos(&acc_k, k.k->p);

			if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
				break;

			if (!bch2_accounting_is_mem(&acc_k)) {
				struct disk_accounting_pos next;
				memset(&next, 0, sizeof(next));
				next.type = acc_k.type + 1;
				bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next));
				continue;
			}

			bch2_accounting_mem_read(c, k.k->p, v, nr);

			if (memcmp(a.v->d, v, nr * sizeof(u64))) {
				struct printbuf buf = PRINTBUF;

				bch2_bkey_val_to_text(&buf, c, k);
				prt_str(&buf, " !=");
				for (unsigned j = 0; j < nr; j++)
					prt_printf(&buf, " %llu", v[j]);

				pr_err("%s", buf.buf);
				printbuf_exit(&buf);
				mismatch = true;
			}

			switch (acc_k.type) {
			case BCH_DISK_ACCOUNTING_persistent_reserved:
				base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0];
				break;
			case BCH_DISK_ACCOUNTING_replicas:
				fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]);
				break;
			case BCH_DISK_ACCOUNTING_dev_data_type:
			{
				guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */
				struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev);
				if (!ca)
					continue;

				v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets);
				v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors);
				v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented);
			}

				if (memcmp(a.v->d, v, 3 * sizeof(u64))) {
					struct printbuf buf = PRINTBUF;

					bch2_bkey_val_to_text(&buf, c, k);
					prt_str(&buf, " in mem");
					for (unsigned j = 0; j < nr; j++)
						prt_printf(&buf, " %llu", v[j]);

					pr_err("dev accounting mismatch: %s", buf.buf);
					printbuf_exit(&buf);
					mismatch = true;
				}
			}

			0;
		})));

	acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64));

#define check(x)										\
	if (base.x != base_inmem.x) {								\
		pr_err("fs_usage_base.%s mismatch: %llu != %llu", #x, base.x, base_inmem.x);	\
		mismatch = true;								\
	}

	//check(hidden);
	check(btree);
	check(data);
	check(cached);
	check(reserved);
	check(nr_inodes);

	WARN_ON(mismatch);
}

void bch2_accounting_gc_free(struct bch_fs *c)
{
	lockdep_assert_held(&c->mark_lock);

	struct bch_accounting_mem *acc = &c->accounting;

	bch2_accounting_free_counters(acc, true);
	acc->gc_running = false;
}

void bch2_fs_accounting_exit(struct bch_fs *c)
{
	struct bch_accounting_mem *acc = &c->accounting;

	bch2_accounting_free_counters(acc, false);
	darray_exit(&acc->k);
}