// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "buckets.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
        struct bch_replicas_cpu *);

/* Replicas tracking - in memory: */

static void verify_replicas_entry(struct bch_replicas_entry *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
        unsigned i;

        BUG_ON(e->data_type >= BCH_DATA_NR);
        BUG_ON(!e->nr_devs);
        BUG_ON(e->nr_required > 1 &&
               e->nr_required >= e->nr_devs);

        for (i = 0; i + 1 < e->nr_devs; i++)
                BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}

void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
{
        bubble_sort(e->devs, e->nr_devs, u8_cmp);
}

static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
        eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}

static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
        struct bch_replicas_entry_v0 *e)
{
        unsigned i;

        if (e->data_type < BCH_DATA_NR)
                prt_printf(out, "%s", bch2_data_types[e->data_type]);
        else
                prt_printf(out, "(invalid data type %u)", e->data_type);

        prt_printf(out, ": %u [", e->nr_devs);
        for (i = 0; i < e->nr_devs; i++)
                prt_printf(out, i ? " %u" : "%u", e->devs[i]);
        prt_printf(out, "]");
}

void bch2_replicas_entry_to_text(struct printbuf *out,
        struct bch_replicas_entry *e)
{
        unsigned i;

        if (e->data_type < BCH_DATA_NR)
                prt_printf(out, "%s", bch2_data_types[e->data_type]);
        else
                prt_printf(out, "(invalid data type %u)", e->data_type);

        prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
        for (i = 0; i < e->nr_devs; i++)
                prt_printf(out, i ? " %u" : "%u", e->devs[i]);
        prt_printf(out, "]");
}

void bch2_cpu_replicas_to_text(struct printbuf *out,
        struct bch_replicas_cpu *r)
{
        struct bch_replicas_entry *e;
        bool first = true;

        for_each_cpu_replicas_entry(r, e) {
                if (!first)
                        prt_printf(out, " ");
                first = false;

                bch2_replicas_entry_to_text(out, e);
        }
}

static void extent_to_replicas(struct bkey_s_c k,
        struct bch_replicas_entry *r)
{
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;

        r->nr_required = 1;

        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
                if (p.ptr.cached)
                        continue;

                if (!p.has_ec)
                        r->devs[r->nr_devs++] = p.ptr.dev;
                else
                        r->nr_required = 0;
        }
}

static void stripe_to_replicas(struct bkey_s_c k,
        struct bch_replicas_entry *r)
{
        struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
        const struct bch_extent_ptr *ptr;

        r->nr_required = s.v->nr_blocks - s.v->nr_redundant;

        for (ptr = s.v->ptrs;
             ptr < s.v->ptrs + s.v->nr_blocks;
             ptr++)
                r->devs[r->nr_devs++] = ptr->dev;
}

void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
        struct bkey_s_c k)
{
        e->nr_devs = 0;

        switch (k.k->type) {
        case KEY_TYPE_btree_ptr:
        case KEY_TYPE_btree_ptr_v2:
                e->data_type = BCH_DATA_btree;
                extent_to_replicas(k, e);
                break;
        case KEY_TYPE_extent:
        case KEY_TYPE_reflink_v:
                e->data_type = BCH_DATA_user;
                extent_to_replicas(k, e);
                break;
        case KEY_TYPE_stripe:
                e->data_type = BCH_DATA_parity;
                stripe_to_replicas(k, e);
                break;
        }

        bch2_replicas_entry_sort(e);
}
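
/*
 * How keys map to replicas entries (see the helpers above): cached pointers
 * are not counted; an extent with any erasure coded pointer gets
 * nr_required = 0, since its redundancy comes from the stripe; a stripe
 * requires nr_blocks - nr_redundant of its blocks.
 * bch2_replicas_entry_to_text() prints entries as e.g. "user: 1/2 [0 2]"
 * (data type, nr_required/nr_devs, device indexes).
 */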

void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
        enum bch_data_type data_type,
        struct bch_devs_list devs)
{
        unsigned i;

        BUG_ON(!data_type ||
               data_type == BCH_DATA_sb ||
               data_type >= BCH_DATA_NR);

        e->data_type = data_type;
        e->nr_devs = 0;
        e->nr_required = 1;

        for (i = 0; i < devs.nr; i++)
                e->devs[e->nr_devs++] = devs.devs[i];

        bch2_replicas_entry_sort(e);
}

static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
        struct bch_replicas_entry *new_entry)
{
        unsigned i;
        struct bch_replicas_cpu new = {
                .nr             = old->nr + 1,
                .entry_size     = max_t(unsigned, old->entry_size,
                                        replicas_entry_bytes(new_entry)),
        };

        BUG_ON(!new_entry->data_type);
        verify_replicas_entry(new_entry);

        new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
        if (!new.entries)
                return new;

        for (i = 0; i < old->nr; i++)
                memcpy(cpu_replicas_entry(&new, i),
                       cpu_replicas_entry(old, i),
                       old->entry_size);

        memcpy(cpu_replicas_entry(&new, old->nr),
               new_entry,
               replicas_entry_bytes(new_entry));

        bch2_cpu_replicas_sort(&new);
        return new;
}

static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
        struct bch_replicas_entry *search)
{
        int idx, entry_size = replicas_entry_bytes(search);

        if (unlikely(entry_size > r->entry_size))
                return -1;

        verify_replicas_entry(search);

#define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
        idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
                              entry_cmp, search);
#undef entry_cmp

        return idx < r->nr ? idx : -1;
}

int bch2_replicas_entry_idx(struct bch_fs *c,
        struct bch_replicas_entry *search)
{
        bch2_replicas_entry_sort(search);

        return __replicas_entry_idx(&c->replicas, search);
}

static bool __replicas_has_entry(struct bch_replicas_cpu *r,
        struct bch_replicas_entry *search)
{
        return __replicas_entry_idx(r, search) >= 0;
}

bool bch2_replicas_marked(struct bch_fs *c,
        struct bch_replicas_entry *search)
{
        bool marked;

        if (!search->nr_devs)
                return true;

        verify_replicas_entry(search);

        percpu_down_read(&c->mark_lock);
        marked = __replicas_has_entry(&c->replicas, search) &&
                (likely((!c->replicas_gc.entries)) ||
                 __replicas_has_entry(&c->replicas_gc, search));
        percpu_up_read(&c->mark_lock);

        return marked;
}
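
/*
 * Typical use of the lookup interface, as a sketch (not taken from this file;
 * bch_replicas_padded is assumed to be the stack-allocatable wrapper declared
 * in the replicas headers):
 *
 *	struct bch_replicas_padded r;
 *
 *	bch2_bkey_to_replicas(&r.e, k);
 *	if (!bch2_replicas_marked(c, &r.e))
 *		ret = bch2_mark_replicas(c, &r.e);
 *
 * bch2_mark_replicas() (below) already does the marked check as its fast
 * path, so callers may also just call it unconditionally.
 */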

static void __replicas_table_update(struct bch_fs_usage *dst,
        struct bch_replicas_cpu *dst_r,
        struct bch_fs_usage *src,
        struct bch_replicas_cpu *src_r)
{
        int src_idx, dst_idx;

        *dst = *src;

        for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
                if (!src->replicas[src_idx])
                        continue;

                dst_idx = __replicas_entry_idx(dst_r,
                                cpu_replicas_entry(src_r, src_idx));
                BUG_ON(dst_idx < 0);

                dst->replicas[dst_idx] = src->replicas[src_idx];
        }
}

static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
        struct bch_replicas_cpu *dst_r,
        struct bch_fs_usage __percpu *src_p,
        struct bch_replicas_cpu *src_r)
{
        unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
        struct bch_fs_usage *dst, *src = (void *)
                bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr);

        preempt_disable();
        dst = this_cpu_ptr(dst_p);
        preempt_enable();

        __replicas_table_update(dst, dst_r, src, src_r);
}

/*
 * Resize filesystem accounting:
 */
static int replicas_table_update(struct bch_fs *c,
        struct bch_replicas_cpu *new_r)
{
        struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
        struct bch_fs_usage_online *new_scratch = NULL;
        struct bch_fs_usage __percpu *new_gc = NULL;
        struct bch_fs_usage *new_base = NULL;
        unsigned i, bytes = sizeof(struct bch_fs_usage) +
                sizeof(u64) * new_r->nr;
        unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
                sizeof(u64) * new_r->nr;
        int ret = 0;

        memset(new_usage, 0, sizeof(new_usage));

        for (i = 0; i < ARRAY_SIZE(new_usage); i++)
                if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
                                        sizeof(u64), GFP_KERNEL)))
                        goto err;

        if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
            !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) ||
            (c->usage_gc &&
             !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
                goto err;

        for (i = 0; i < ARRAY_SIZE(new_usage); i++)
                if (c->usage[i])
                        __replicas_table_update_pcpu(new_usage[i], new_r,
                                        c->usage[i], &c->replicas);
        if (c->usage_base)
                __replicas_table_update(new_base, new_r,
                                c->usage_base, &c->replicas);
        if (c->usage_gc)
                __replicas_table_update_pcpu(new_gc, new_r,
                                c->usage_gc, &c->replicas);

        for (i = 0; i < ARRAY_SIZE(new_usage); i++)
                swap(c->usage[i], new_usage[i]);
        swap(c->usage_base, new_base);
        swap(c->usage_scratch, new_scratch);
        swap(c->usage_gc, new_gc);
        swap(c->replicas, *new_r);
out:
        free_percpu(new_gc);
        kfree(new_scratch);
        for (i = 0; i < ARRAY_SIZE(new_usage); i++)
                free_percpu(new_usage[i]);
        kfree(new_base);
        return ret;
err:
        bch_err(c, "error updating replicas table: memory allocation failure");
        ret = -BCH_ERR_ENOMEM_replicas_table;
        goto out;
}

static unsigned reserve_journal_replicas(struct bch_fs *c,
        struct bch_replicas_cpu *r)
{
        struct bch_replicas_entry *e;
        unsigned journal_res_u64s = 0;

        /* nr_inodes: */
        journal_res_u64s +=
                DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

        /* key_version: */
        journal_res_u64s +=
                DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

        /* persistent_reserved: */
        journal_res_u64s +=
                DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
                BCH_REPLICAS_MAX;

        for_each_cpu_replicas_entry(r, e)
                journal_res_u64s +=
                        DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
                                     e->nr_devs, sizeof(u64));
        return journal_res_u64s;
}
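
/*
 * Adding a new entry (the slowpath below) has to happen in a specific order:
 * add the entry to the superblock copy (and to replicas_gc, if gc is in
 * progress), resize the journal entry reservation so every journal write has
 * room for the usage entries, write the superblock, and only then resize the
 * in-memory accounting tables - usage counters must never reference an entry
 * that isn't persistent yet.
 */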

noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
        struct bch_replicas_entry *new_entry)
{
        struct bch_replicas_cpu new_r, new_gc;
        int ret = 0;

        verify_replicas_entry(new_entry);

        memset(&new_r, 0, sizeof(new_r));
        memset(&new_gc, 0, sizeof(new_gc));

        mutex_lock(&c->sb_lock);

        if (c->replicas_gc.entries &&
            !__replicas_has_entry(&c->replicas_gc, new_entry)) {
                new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry);
                if (!new_gc.entries) {
                        ret = -BCH_ERR_ENOMEM_cpu_replicas;
                        goto err;
                }
        }

        if (!__replicas_has_entry(&c->replicas, new_entry)) {
                new_r = cpu_replicas_add_entry(&c->replicas, new_entry);
                if (!new_r.entries) {
                        ret = -BCH_ERR_ENOMEM_cpu_replicas;
                        goto err;
                }

                ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
                if (ret)
                        goto err;

                bch2_journal_entry_res_resize(&c->journal,
                                &c->replicas_journal_res,
                                reserve_journal_replicas(c, &new_r));
        }

        if (!new_r.entries &&
            !new_gc.entries)
                goto out;

        /* allocations done, now commit: */

        if (new_r.entries)
                bch2_write_super(c);

        /* don't update in memory replicas until changes are persistent */
        percpu_down_write(&c->mark_lock);
        if (new_r.entries)
                ret = replicas_table_update(c, &new_r);
        if (new_gc.entries)
                swap(new_gc, c->replicas_gc);
        percpu_up_write(&c->mark_lock);
out:
        mutex_unlock(&c->sb_lock);

        kfree(new_r.entries);
        kfree(new_gc.entries);

        return ret;
err:
        bch_err_msg(c, ret, "adding replicas entry");
        goto out;
}

int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
{
        return likely(bch2_replicas_marked(c, r))
                ? 0 : bch2_mark_replicas_slowpath(c, r);
}

/* replicas delta list: */

int bch2_replicas_delta_list_mark(struct bch_fs *c,
        struct replicas_delta_list *r)
{
        struct replicas_delta *d = r->d;
        struct replicas_delta *top = (void *) r->d + r->used;
        int ret = 0;

        for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
                ret = bch2_mark_replicas(c, &d->r);
        return ret;
}

/*
 * Old replicas_gc mechanism: only used for journal replicas entries now,
 * should die at some point:
 */

int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
        lockdep_assert_held(&c->replicas_gc_lock);

        /*
         * Take the locks before checking ret, so that the error path below
         * can share the unlock and cleanup:
         */
        mutex_lock(&c->sb_lock);
        percpu_down_write(&c->mark_lock);

        if (ret)
                goto err;

        ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
        if (ret)
                goto err;

        ret = replicas_table_update(c, &c->replicas_gc);
err:
        kfree(c->replicas_gc.entries);
        c->replicas_gc.entries = NULL;

        percpu_up_write(&c->mark_lock);

        if (!ret)
                bch2_write_super(c);

        mutex_unlock(&c->sb_lock);

        return ret;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
        struct bch_replicas_entry *e;
        unsigned i = 0;

        lockdep_assert_held(&c->replicas_gc_lock);

        mutex_lock(&c->sb_lock);
        BUG_ON(c->replicas_gc.entries);

        c->replicas_gc.nr = 0;
        c->replicas_gc.entry_size = 0;

        for_each_cpu_replicas_entry(&c->replicas, e)
                if (!((1 << e->data_type) & typemask)) {
                        c->replicas_gc.nr++;
                        c->replicas_gc.entry_size =
                                max_t(unsigned, c->replicas_gc.entry_size,
                                      replicas_entry_bytes(e));
                }

        c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
                                         c->replicas_gc.entry_size,
                                         GFP_KERNEL);
        if (!c->replicas_gc.entries) {
                mutex_unlock(&c->sb_lock);
                bch_err(c, "error allocating c->replicas_gc");
                return -BCH_ERR_ENOMEM_replicas_gc;
        }

        for_each_cpu_replicas_entry(&c->replicas, e)
                if (!((1 << e->data_type) & typemask))
                        memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
                               e, c->replicas_gc.entry_size);

        bch2_cpu_replicas_sort(&c->replicas_gc);
        mutex_unlock(&c->sb_lock);

        return 0;
}
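
/*
 * Sketch of the intended calling convention for the old gc mechanism,
 * inferred from the lockdep assertions above (the real caller lives in the
 * journal code):
 *
 *	// with c->replicas_gc_lock held:
 *	ret = bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
 *	... re-mark, via bch2_mark_replicas(), every entry of that type
 *	    that is still in use ...
 *	ret = bch2_replicas_gc_end(c, ret);
 *
 * Entries of the types in typemask that were not re-marked in between are
 * dropped when bch2_replicas_gc_end() installs c->replicas_gc as the new
 * table.
 */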

/*
 * New much simpler mechanism for clearing out unneeded replicas entries -
 * drop replicas entries that have 0 sectors used.
 *
 * However, we don't track sector counts for journal usage, so this doesn't
 * drop any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end)
 * mechanism is retained for that.
 */
int bch2_replicas_gc2(struct bch_fs *c)
{
        struct bch_replicas_cpu new = { 0 };
        unsigned i, nr;
        int ret = 0;

        bch2_journal_meta(&c->journal);
retry:
        nr              = READ_ONCE(c->replicas.nr);
        new.entry_size  = READ_ONCE(c->replicas.entry_size);
        new.entries     = kcalloc(nr, new.entry_size, GFP_KERNEL);
        if (!new.entries) {
                bch_err(c, "error allocating c->replicas_gc");
                return -BCH_ERR_ENOMEM_replicas_gc;
        }

        mutex_lock(&c->sb_lock);
        percpu_down_write(&c->mark_lock);

        if (nr                  != c->replicas.nr ||
            new.entry_size      != c->replicas.entry_size) {
                percpu_up_write(&c->mark_lock);
                mutex_unlock(&c->sb_lock);
                kfree(new.entries);
                goto retry;
        }

        for (i = 0; i < c->replicas.nr; i++) {
                struct bch_replicas_entry *e =
                        cpu_replicas_entry(&c->replicas, i);

                if (e->data_type == BCH_DATA_journal ||
                    c->usage_base->replicas[i] ||
                    percpu_u64_get(&c->usage[0]->replicas[i]) ||
                    percpu_u64_get(&c->usage[1]->replicas[i]) ||
                    percpu_u64_get(&c->usage[2]->replicas[i]) ||
                    percpu_u64_get(&c->usage[3]->replicas[i]))
                        memcpy(cpu_replicas_entry(&new, new.nr++),
                               e, new.entry_size);
        }

        bch2_cpu_replicas_sort(&new);

        ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
        if (ret)
                goto err;

        ret = replicas_table_update(c, &new);
err:
        kfree(new.entries);

        percpu_up_write(&c->mark_lock);

        if (!ret)
                bch2_write_super(c);

        mutex_unlock(&c->sb_lock);

        return ret;
}

int bch2_replicas_set_usage(struct bch_fs *c,
        struct bch_replicas_entry *r,
        u64 sectors)
{
        int ret, idx = bch2_replicas_entry_idx(c, r);

        if (idx < 0) {
                struct bch_replicas_cpu n;

                n = cpu_replicas_add_entry(&c->replicas, r);
                if (!n.entries)
                        return -BCH_ERR_ENOMEM_cpu_replicas;

                ret = replicas_table_update(c, &n);
                if (ret)
                        return ret;

                kfree(n.entries);

                /* the entry was just added, so it must now be found: */
                idx = bch2_replicas_entry_idx(c, r);
                BUG_ON(idx < 0);
        }

        c->usage_base->replicas[idx] = sectors;

        return 0;
}

/* Replicas tracking - superblock: */

static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
        struct bch_replicas_cpu *cpu_r)
{
        struct bch_replicas_entry *e, *dst;
        unsigned nr = 0, entry_size = 0, idx = 0;

        for_each_replicas_entry(sb_r, e) {
                entry_size = max_t(unsigned, entry_size,
                                   replicas_entry_bytes(e));
                nr++;
        }

        cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
        if (!cpu_r->entries)
                return -BCH_ERR_ENOMEM_cpu_replicas;

        cpu_r->nr = nr;
        cpu_r->entry_size = entry_size;

        for_each_replicas_entry(sb_r, e) {
                dst = cpu_replicas_entry(cpu_r, idx++);
                memcpy(dst, e, replicas_entry_bytes(e));
                bch2_replicas_entry_sort(dst);
        }

        return 0;
}
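
/*
 * v0 on-disk entries (below) lack the nr_required field: converting them to
 * the in-memory format pads entry_size by the difference in struct size and
 * fills in nr_required = 1. Going the other way,
 * bch2_cpu_replicas_to_sb_replicas() only falls back to the v0 field when
 * every entry has nr_required == 1.
 */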

static int
__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
        struct bch_replicas_cpu *cpu_r)
{
        struct bch_replicas_entry_v0 *e;
        unsigned nr = 0, entry_size = 0, idx = 0;

        for_each_replicas_entry(sb_r, e) {
                entry_size = max_t(unsigned, entry_size,
                                   replicas_entry_bytes(e));
                nr++;
        }

        entry_size += sizeof(struct bch_replicas_entry) -
                sizeof(struct bch_replicas_entry_v0);

        cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
        if (!cpu_r->entries)
                return -BCH_ERR_ENOMEM_cpu_replicas;

        cpu_r->nr = nr;
        cpu_r->entry_size = entry_size;

        for_each_replicas_entry(sb_r, e) {
                struct bch_replicas_entry *dst =
                        cpu_replicas_entry(cpu_r, idx++);

                dst->data_type = e->data_type;
                dst->nr_devs = e->nr_devs;
                dst->nr_required = 1;
                memcpy(dst->devs, e->devs, e->nr_devs);
                bch2_replicas_entry_sort(dst);
        }

        return 0;
}

int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
        struct bch_sb_field_replicas *sb_v1;
        struct bch_sb_field_replicas_v0 *sb_v0;
        struct bch_replicas_cpu new_r = { 0, 0, NULL };
        int ret = 0;

        if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
                ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
        else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
                ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
        if (ret)
                return ret;

        bch2_cpu_replicas_sort(&new_r);

        percpu_down_write(&c->mark_lock);

        ret = replicas_table_update(c, &new_r);
        percpu_up_write(&c->mark_lock);

        kfree(new_r.entries);

        return 0;
}

static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
        struct bch_replicas_cpu *r)
{
        struct bch_sb_field_replicas_v0 *sb_r;
        struct bch_replicas_entry_v0 *dst;
        struct bch_replicas_entry *src;
        size_t bytes;

        bytes = sizeof(struct bch_sb_field_replicas);

        for_each_cpu_replicas_entry(r, src)
                bytes += replicas_entry_bytes(src) - 1;

        sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
                        DIV_ROUND_UP(bytes, sizeof(u64)));
        if (!sb_r)
                return -BCH_ERR_ENOSPC_sb_replicas;

        bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
        sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);

        memset(&sb_r->entries, 0,
               vstruct_end(&sb_r->field) -
               (void *) &sb_r->entries);

        dst = sb_r->entries;
        for_each_cpu_replicas_entry(r, src) {
                dst->data_type = src->data_type;
                dst->nr_devs = src->nr_devs;
                memcpy(dst->devs, src->devs, src->nr_devs);

                dst = replicas_entry_next(dst);

                BUG_ON((void *) dst > vstruct_end(&sb_r->field));
        }

        return 0;
}

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
        struct bch_replicas_cpu *r)
{
        struct bch_sb_field_replicas *sb_r;
        struct bch_replicas_entry *dst, *src;
        bool need_v1 = false;
        size_t bytes;

        bytes = sizeof(struct bch_sb_field_replicas);

        for_each_cpu_replicas_entry(r, src) {
                bytes += replicas_entry_bytes(src);
                if (src->nr_required != 1)
                        need_v1 = true;
        }

        if (!need_v1)
                return bch2_cpu_replicas_to_sb_replicas_v0(c, r);

        sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
                        DIV_ROUND_UP(bytes, sizeof(u64)));
        if (!sb_r)
                return -BCH_ERR_ENOSPC_sb_replicas;

        bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
        sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);

        memset(&sb_r->entries, 0,
               vstruct_end(&sb_r->field) -
               (void *) &sb_r->entries);

        dst = sb_r->entries;
        for_each_cpu_replicas_entry(r, src) {
                memcpy(dst, src, replicas_entry_bytes(src));

                dst = replicas_entry_next(dst);

                BUG_ON((void *) dst > vstruct_end(&sb_r->field));
        }

        return 0;
}

static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
        struct bch_sb *sb,
        struct printbuf *err)
{
        unsigned i, j;

        sort_cmp_size(cpu_r->entries,
                      cpu_r->nr,
                      cpu_r->entry_size,
                      memcmp, NULL);

        for (i = 0; i < cpu_r->nr; i++) {
                struct bch_replicas_entry *e =
                        cpu_replicas_entry(cpu_r, i);

                if (e->data_type >= BCH_DATA_NR) {
                        prt_printf(err, "invalid data type in entry ");
                        bch2_replicas_entry_to_text(err, e);
                        return -BCH_ERR_invalid_sb_replicas;
                }

                if (!e->nr_devs) {
                        prt_printf(err, "no devices in entry ");
                        bch2_replicas_entry_to_text(err, e);
                        return -BCH_ERR_invalid_sb_replicas;
                }

                if (e->nr_required > 1 &&
                    e->nr_required >= e->nr_devs) {
                        prt_printf(err, "bad nr_required in entry ");
                        bch2_replicas_entry_to_text(err, e);
                        return -BCH_ERR_invalid_sb_replicas;
                }

                for (j = 0; j < e->nr_devs; j++)
                        if (!bch2_dev_exists(sb, e->devs[j])) {
                                prt_printf(err, "invalid device %u in entry ", e->devs[j]);
                                bch2_replicas_entry_to_text(err, e);
                                return -BCH_ERR_invalid_sb_replicas;
                        }

                if (i + 1 < cpu_r->nr) {
                        struct bch_replicas_entry *n =
                                cpu_replicas_entry(cpu_r, i + 1);

                        BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);

                        if (!memcmp(e, n, cpu_r->entry_size)) {
                                prt_printf(err, "duplicate replicas entry ");
                                bch2_replicas_entry_to_text(err, e);
                                return -BCH_ERR_invalid_sb_replicas;
                        }
                }
        }

        return 0;
}

static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
        struct printbuf *err)
{
        struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
        struct bch_replicas_cpu cpu_r;
        int ret;

        ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
        if (ret)
                return ret;

        ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
        kfree(cpu_r.entries);
        return ret;
}

static void bch2_sb_replicas_to_text(struct printbuf *out,
        struct bch_sb *sb,
        struct bch_sb_field *f)
{
        struct bch_sb_field_replicas *r = field_to_type(f, replicas);
        struct bch_replicas_entry *e;
        bool first = true;

        for_each_replicas_entry(r, e) {
                if (!first)
                        prt_printf(out, " ");
                first = false;

                bch2_replicas_entry_to_text(out, e);
        }
        prt_newline(out);
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
        .validate       = bch2_sb_replicas_validate,
        .to_text        = bch2_sb_replicas_to_text,
};

static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
        struct printbuf *err)
{
        struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
        struct bch_replicas_cpu cpu_r;
        int ret;

        ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
        if (ret)
                return ret;

        ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
        kfree(cpu_r.entries);
        return ret;
}

static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
        struct bch_sb *sb,
        struct bch_sb_field *f)
{
        struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
        struct bch_replicas_entry_v0 *e;
        bool first = true;

        for_each_replicas_entry(sb_r, e) {
                if (!first)
                        prt_printf(out, " ");
                first = false;

                bch2_replicas_entry_v0_to_text(out, e);
        }
        prt_newline(out);
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
        .validate       = bch2_sb_replicas_v0_validate,
        .to_text        = bch2_sb_replicas_v0_to_text,
};

/* Query replicas: */
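
/*
 * bch2_have_enough_devs() decides whether the filesystem is usable with only
 * the devices in @devs online: for each non-cached replicas entry it counts
 * online and failed devices, skips entries that live entirely on failed
 * devices, and otherwise requires the corresponding BCH_FORCE_IF_*_LOST flag
 * when fewer than nr_required devices are online, or BCH_FORCE_IF_*_DEGRADED
 * when fewer than nr_devs are.
 */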

bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
        unsigned flags, bool print)
{
        struct bch_replicas_entry *e;
        bool ret = true;

        percpu_down_read(&c->mark_lock);
        for_each_cpu_replicas_entry(&c->replicas, e) {
                unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
                bool metadata = e->data_type < BCH_DATA_user;

                if (e->data_type == BCH_DATA_cached)
                        continue;

                for (i = 0; i < e->nr_devs; i++) {
                        struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);

                        nr_online += test_bit(e->devs[i], devs.d);
                        nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
                }

                if (nr_failed == e->nr_devs)
                        continue;

                if (nr_online < e->nr_required)
                        dflags |= metadata
                                ? BCH_FORCE_IF_METADATA_LOST
                                : BCH_FORCE_IF_DATA_LOST;

                if (nr_online < e->nr_devs)
                        dflags |= metadata
                                ? BCH_FORCE_IF_METADATA_DEGRADED
                                : BCH_FORCE_IF_DATA_DEGRADED;

                if (dflags & ~flags) {
                        if (print) {
                                struct printbuf buf = PRINTBUF;

                                bch2_replicas_entry_to_text(&buf, e);
                                bch_err(c, "insufficient devices online (%u) for replicas entry %s",
                                        nr_online, buf.buf);
                                printbuf_exit(&buf);
                        }
                        ret = false;
                        break;
                }
        }
        percpu_up_read(&c->mark_lock);

        return ret;
}

unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
        struct bch_sb_field_replicas *replicas;
        struct bch_sb_field_replicas_v0 *replicas_v0;
        unsigned i, data_has = 0;

        replicas = bch2_sb_field_get(sb, replicas);
        replicas_v0 = bch2_sb_field_get(sb, replicas_v0);

        if (replicas) {
                struct bch_replicas_entry *r;

                for_each_replicas_entry(replicas, r)
                        for (i = 0; i < r->nr_devs; i++)
                                if (r->devs[i] == dev)
                                        data_has |= 1 << r->data_type;
        } else if (replicas_v0) {
                struct bch_replicas_entry_v0 *r;

                for_each_replicas_entry_v0(replicas_v0, r)
                        for (i = 0; i < r->nr_devs; i++)
                                if (r->devs[i] == dev)
                                        data_has |= 1 << r->data_type;
        }

        return data_has;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
        unsigned ret;

        mutex_lock(&c->sb_lock);
        ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
        mutex_unlock(&c->sb_lock);

        return ret;
}

void bch2_fs_replicas_exit(struct bch_fs *c)
{
        unsigned i;

        kfree(c->usage_scratch);
        for (i = 0; i < ARRAY_SIZE(c->usage); i++)
                free_percpu(c->usage[i]);
        kfree(c->usage_base);
        kfree(c->replicas.entries);
        kfree(c->replicas_gc.entries);

        mempool_exit(&c->replicas_delta_pool);
}

int bch2_fs_replicas_init(struct bch_fs *c)
{
        bch2_journal_entry_res_resize(&c->journal,
                        &c->replicas_journal_res,
                        reserve_journal_replicas(c, &c->replicas));

        return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
                                         REPLICAS_DELTA_LIST_MAX) ?:
                replicas_table_update(c, &c->replicas);
}