// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "buckets.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"

#include <linux/sort.h>

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
					    struct bch_replicas_cpu *);

/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
static int bch2_memcmp(const void *l, const void *r, const void *priv)
{
	size_t size = (size_t) priv;
	return memcmp(l, r, size);
}

/* Replicas tracking - in memory: */

static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	BUG_ON(e->data_type >= BCH_DATA_NR);
	BUG_ON(!e->nr_devs);
	BUG_ON(e->nr_required > 1 &&
	       e->nr_required >= e->nr_devs);

	for (unsigned i = 0; i + 1 < e->nr_devs; i++)
		BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}

void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
{
	bubble_sort(e->devs, e->nr_devs, u8_cmp);
}

static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
	eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
			  bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
}

static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
					   struct bch_replicas_entry_v0 *e)
{
	bch2_prt_data_type(out, e->data_type);

	prt_printf(out, ": %u [", e->nr_devs);
	for (unsigned i = 0; i < e->nr_devs; i++)
		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
	prt_printf(out, "]");
}

void bch2_replicas_entry_to_text(struct printbuf *out,
				 struct bch_replicas_entry_v1 *e)
{
	bch2_prt_data_type(out, e->data_type);

	prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
	for (unsigned i = 0; i < e->nr_devs; i++)
		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
	prt_printf(out, "]");
}

int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
				 struct bch_sb *sb,
				 struct printbuf *err)
{
	if (!r->nr_devs) {
		prt_printf(err, "no devices in entry ");
		goto bad;
	}

	if (r->nr_required > 1 &&
	    r->nr_required >= r->nr_devs) {
		prt_printf(err, "bad nr_required in entry ");
		goto bad;
	}

	for (unsigned i = 0; i < r->nr_devs; i++)
		if (!bch2_member_exists(sb, r->devs[i])) {
			prt_printf(err, "invalid device %u in entry ", r->devs[i]);
			goto bad;
		}

	return 0;
bad:
	bch2_replicas_entry_to_text(err, r);
	return -BCH_ERR_invalid_replicas_entry;
}

void bch2_cpu_replicas_to_text(struct printbuf *out,
			       struct bch_replicas_cpu *r)
{
	struct bch_replicas_entry_v1 *e;
	bool first = true;

	for_each_cpu_replicas_entry(r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_to_text(out, e);
	}
}

static void extent_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry_v1 *r)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	r->nr_required = 1;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.ptr.cached)
			continue;

		if (!p.has_ec)
			r->devs[r->nr_devs++] = p.ptr.dev;
		else
			r->nr_required = 0;
	}
}

static void stripe_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry_v1 *r)
{
	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
	const struct bch_extent_ptr *ptr;

	r->nr_required = s.v->nr_blocks - s.v->nr_redundant;

	for (ptr = s.v->ptrs;
	     ptr < s.v->ptrs + s.v->nr_blocks;
	     ptr++)
		r->devs[r->nr_devs++] = ptr->dev;
}

void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
			   struct bkey_s_c k)
{
	e->nr_devs = 0;

	switch (k.k->type) {
	case KEY_TYPE_btree_ptr:
	case KEY_TYPE_btree_ptr_v2:
		e->data_type = BCH_DATA_btree;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_extent:
	case KEY_TYPE_reflink_v:
		e->data_type = BCH_DATA_user;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_stripe:
		e->data_type = BCH_DATA_parity;
		stripe_to_replicas(k, e);
		break;
	}

	bch2_replicas_entry_sort(e);
}

void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
			      enum bch_data_type data_type,
			      struct bch_devs_list devs)
{
	BUG_ON(!data_type ||
	       data_type == BCH_DATA_sb ||
	       data_type >= BCH_DATA_NR);

	e->data_type	= data_type;
	e->nr_devs	= 0;
	e->nr_required	= 1;

	darray_for_each(devs, i)
		e->devs[e->nr_devs++] = *i;

	bch2_replicas_entry_sort(e);
}

static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_fs *c,
		       struct bch_replicas_cpu *old,
		       struct bch_replicas_entry_v1 *new_entry)
{
	struct bch_replicas_cpu new = {
		.nr		= old->nr + 1,
		.entry_size	= max_t(unsigned, old->entry_size,
					replicas_entry_bytes(new_entry)),
	};

	new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
	if (!new.entries)
		return new;

	for (unsigned i = 0; i < old->nr; i++)
		memcpy(cpu_replicas_entry(&new, i),
		       cpu_replicas_entry(old, i),
		       old->entry_size);

	memcpy(cpu_replicas_entry(&new, old->nr),
	       new_entry,
	       replicas_entry_bytes(new_entry));

	bch2_cpu_replicas_sort(&new);
	return new;
}

static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
				       struct bch_replicas_entry_v1 *search)
{
	int idx, entry_size = replicas_entry_bytes(search);

	if (unlikely(entry_size > r->entry_size))
		return -1;

#define entry_cmp(_l, _r)	memcmp(_l, _r, entry_size)
	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
			      entry_cmp, search);
#undef entry_cmp

	return idx < r->nr ? idx : -1;
}

int bch2_replicas_entry_idx(struct bch_fs *c,
			    struct bch_replicas_entry_v1 *search)
{
	bch2_replicas_entry_sort(search);

	return __replicas_entry_idx(&c->replicas, search);
}

static bool __replicas_has_entry(struct bch_replicas_cpu *r,
				 struct bch_replicas_entry_v1 *search)
{
	return __replicas_entry_idx(r, search) >= 0;
}

bool bch2_replicas_marked(struct bch_fs *c,
			  struct bch_replicas_entry_v1 *search)
{
	bool marked;

	if (!search->nr_devs)
		return true;

	verify_replicas_entry(search);

	percpu_down_read(&c->mark_lock);
	marked = __replicas_has_entry(&c->replicas, search) &&
		(likely((!c->replicas_gc.entries)) ||
		 __replicas_has_entry(&c->replicas_gc, search));
	percpu_up_read(&c->mark_lock);

	return marked;
}

static void __replicas_table_update(struct bch_fs_usage *dst,
				    struct bch_replicas_cpu *dst_r,
				    struct bch_fs_usage *src,
				    struct bch_replicas_cpu *src_r)
{
	int src_idx, dst_idx;

	*dst = *src;

	for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
		if (!src->replicas[src_idx])
			continue;

		dst_idx = __replicas_entry_idx(dst_r,
				cpu_replicas_entry(src_r, src_idx));
		BUG_ON(dst_idx < 0);

		dst->replicas[dst_idx] = src->replicas[src_idx];
	}
}

static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
					 struct bch_replicas_cpu *dst_r,
					 struct bch_fs_usage __percpu *src_p,
					 struct bch_replicas_cpu *src_r)
{
	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
	struct bch_fs_usage *dst, *src = (void *)
		bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr);

	preempt_disable();
	dst = this_cpu_ptr(dst_p);
	preempt_enable();

	__replicas_table_update(dst, dst_r, src, src_r);
}

/*
 * Resize filesystem accounting:
 */
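/*
 * struct bch_fs_usage ends in one u64 counter per replicas entry, so whenever
 * the replicas table changes every usage array has to be reallocated at the
 * new size and its counters remapped to the new entry indices via
 * __replicas_table_update() above.
 */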
static int replicas_table_update(struct bch_fs *c,
				 struct bch_replicas_cpu *new_r)
{
	struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
	struct bch_fs_usage_online *new_scratch = NULL;
	struct bch_fs_usage __percpu *new_gc = NULL;
	struct bch_fs_usage *new_base = NULL;
	unsigned i, bytes = sizeof(struct bch_fs_usage) +
		sizeof(u64) * new_r->nr;
	unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
		sizeof(u64) * new_r->nr;
	int ret = 0;

	memset(new_usage, 0, sizeof(new_usage));

	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
					sizeof(u64), GFP_KERNEL)))
			goto err;

	if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
	    !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) ||
	    (c->usage_gc &&
	     !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
		goto err;

	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		if (c->usage[i])
			__replicas_table_update_pcpu(new_usage[i], new_r,
						     c->usage[i], &c->replicas);
	if (c->usage_base)
		__replicas_table_update(new_base, new_r,
					c->usage_base, &c->replicas);
	if (c->usage_gc)
		__replicas_table_update_pcpu(new_gc, new_r,
					     c->usage_gc, &c->replicas);

	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		swap(c->usage[i], new_usage[i]);
	swap(c->usage_base, new_base);
	swap(c->usage_scratch, new_scratch);
	swap(c->usage_gc, new_gc);
	swap(c->replicas, *new_r);
out:
	free_percpu(new_gc);
	kfree(new_scratch);
	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		free_percpu(new_usage[i]);
	kfree(new_base);
	return ret;
err:
	bch_err(c, "error updating replicas table: memory allocation failure");
	ret = -BCH_ERR_ENOMEM_replicas_table;
	goto out;
}

static unsigned reserve_journal_replicas(struct bch_fs *c,
					 struct bch_replicas_cpu *r)
{
	struct bch_replicas_entry_v1 *e;
	unsigned journal_res_u64s = 0;

	/* nr_inodes: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

	/* key_version: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

	/* persistent_reserved: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
		BCH_REPLICAS_MAX;

	for_each_cpu_replicas_entry(r, e)
		journal_res_u64s +=
			DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
				     e->nr_devs, sizeof(u64));
	return journal_res_u64s;
}

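/*
 * Slowpath for marking a replicas entry that isn't already in the in-memory
 * table: the new entry is added to the superblock copy (and to the gc table,
 * if one is active), and the superblock is written out before the in-memory
 * tables are swapped, so we never have data referring to an entry that isn't
 * persistent.
 */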
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
				       struct bch_replicas_entry_v1 *new_entry)
{
	struct bch_replicas_cpu new_r, new_gc;
	int ret = 0;

	verify_replicas_entry(new_entry);

	memset(&new_r, 0, sizeof(new_r));
	memset(&new_gc, 0, sizeof(new_gc));

	mutex_lock(&c->sb_lock);

	if (c->replicas_gc.entries &&
	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
		new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
		if (!new_gc.entries) {
			ret = -BCH_ERR_ENOMEM_cpu_replicas;
			goto err;
		}
	}

	if (!__replicas_has_entry(&c->replicas, new_entry)) {
		new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
		if (!new_r.entries) {
			ret = -BCH_ERR_ENOMEM_cpu_replicas;
			goto err;
		}

		ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
		if (ret)
			goto err;

		bch2_journal_entry_res_resize(&c->journal,
				&c->replicas_journal_res,
				reserve_journal_replicas(c, &new_r));
	}

	if (!new_r.entries &&
	    !new_gc.entries)
		goto out;

	/* allocations done, now commit: */

	if (new_r.entries)
		bch2_write_super(c);

	/* don't update in memory replicas until changes are persistent */
	percpu_down_write(&c->mark_lock);
	if (new_r.entries)
		ret = replicas_table_update(c, &new_r);
	if (new_gc.entries)
		swap(new_gc, c->replicas_gc);
	percpu_up_write(&c->mark_lock);
out:
	mutex_unlock(&c->sb_lock);

	kfree(new_r.entries);
	kfree(new_gc.entries);

	return ret;
err:
	bch_err_msg(c, ret, "adding replicas entry");
	goto out;
}

int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
	return likely(bch2_replicas_marked(c, r))
		? 0 : bch2_mark_replicas_slowpath(c, r);
}

/* replicas delta list: */

int bch2_replicas_delta_list_mark(struct bch_fs *c,
				  struct replicas_delta_list *r)
{
	struct replicas_delta *d = r->d;
	struct replicas_delta *top = (void *) r->d + r->used;
	int ret = 0;

	for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
		ret = bch2_mark_replicas(c, &d->r);
	return ret;
}

/*
 * Old replicas_gc mechanism: only used for journal replicas entries now, should
 * die at some point:
 */

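/*
 * Rough usage of the old mechanism (a sketch, not taken verbatim from a
 * caller): with replicas_gc_lock held, gc_start() keeps every entry whose data
 * type is *not* in the typemask; entries that are in the typemask survive only
 * if they get re-marked via bch2_mark_replicas() while the gc table is active,
 * and gc_end() then commits the gc table:
 *
 *	mutex_lock(&c->replicas_gc_lock);
 *	ret = bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
 *	... re-mark replicas entries still in use ...
 *	ret = bch2_replicas_gc_end(c, ret);
 *	mutex_unlock(&c->replicas_gc_lock);
 */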
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	percpu_down_write(&c->mark_lock);

	ret = ret ?:
		bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
		replicas_table_update(c, &c->replicas_gc);

	kfree(c->replicas_gc.entries);
	c->replicas_gc.entries = NULL;

	percpu_up_write(&c->mark_lock);

	if (!ret)
		bch2_write_super(c);

	mutex_unlock(&c->sb_lock);

	return ret;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
	struct bch_replicas_entry_v1 *e;
	unsigned i = 0;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	BUG_ON(c->replicas_gc.entries);

	c->replicas_gc.nr		= 0;
	c->replicas_gc.entry_size	= 0;

	for_each_cpu_replicas_entry(&c->replicas, e) {
		/* Preserve unknown data types */
		if (e->data_type >= BCH_DATA_NR ||
		    !((1 << e->data_type) & typemask)) {
			c->replicas_gc.nr++;
			c->replicas_gc.entry_size =
				max_t(unsigned, c->replicas_gc.entry_size,
				      replicas_entry_bytes(e));
		}
	}

	c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
					 c->replicas_gc.entry_size,
					 GFP_KERNEL);
	if (!c->replicas_gc.entries) {
		mutex_unlock(&c->sb_lock);
		bch_err(c, "error allocating c->replicas_gc");
		return -BCH_ERR_ENOMEM_replicas_gc;
	}

	for_each_cpu_replicas_entry(&c->replicas, e)
		if (e->data_type >= BCH_DATA_NR ||
		    !((1 << e->data_type) & typemask))
			memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
			       e, c->replicas_gc.entry_size);

	bch2_cpu_replicas_sort(&c->replicas_gc);
	mutex_unlock(&c->sb_lock);

	return 0;
}

/*
 * New much simpler mechanism for clearing out unneeded replicas entries - drop
 * replicas entries that have 0 sectors used.
 *
 * However, we don't track sector counts for journal usage, so this doesn't drop
 * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
 * is retained for that.
 */
int bch2_replicas_gc2(struct bch_fs *c)
{
	struct bch_replicas_cpu new = { 0 };
	unsigned i, nr;
	int ret = 0;

	bch2_journal_meta(&c->journal);
retry:
	nr		= READ_ONCE(c->replicas.nr);
	new.entry_size	= READ_ONCE(c->replicas.entry_size);
	new.entries	= kcalloc(nr, new.entry_size, GFP_KERNEL);
	if (!new.entries) {
		bch_err(c, "error allocating c->replicas_gc");
		return -BCH_ERR_ENOMEM_replicas_gc;
	}

	mutex_lock(&c->sb_lock);
	percpu_down_write(&c->mark_lock);

	if (nr			!= c->replicas.nr ||
	    new.entry_size	!= c->replicas.entry_size) {
		percpu_up_write(&c->mark_lock);
		mutex_unlock(&c->sb_lock);
		kfree(new.entries);
		goto retry;
	}

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry_v1 *e =
			cpu_replicas_entry(&c->replicas, i);

		if (e->data_type == BCH_DATA_journal ||
		    c->usage_base->replicas[i] ||
		    percpu_u64_get(&c->usage[0]->replicas[i]) ||
		    percpu_u64_get(&c->usage[1]->replicas[i]) ||
		    percpu_u64_get(&c->usage[2]->replicas[i]) ||
		    percpu_u64_get(&c->usage[3]->replicas[i]))
			memcpy(cpu_replicas_entry(&new, new.nr++),
			       e, new.entry_size);
	}

	bch2_cpu_replicas_sort(&new);

	ret = bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
		replicas_table_update(c, &new);

	kfree(new.entries);

	percpu_up_write(&c->mark_lock);

	if (!ret)
		bch2_write_super(c);

	mutex_unlock(&c->sb_lock);

	return ret;
}

int bch2_replicas_set_usage(struct bch_fs *c,
			    struct bch_replicas_entry_v1 *r,
			    u64 sectors)
{
	int ret, idx = bch2_replicas_entry_idx(c, r);

	if (idx < 0) {
		struct bch_replicas_cpu n;

		n = cpu_replicas_add_entry(c, &c->replicas, r);
		if (!n.entries)
			return -BCH_ERR_ENOMEM_cpu_replicas;

		ret = replicas_table_update(c, &n);
		if (ret)
			return ret;

		kfree(n.entries);

		idx = bch2_replicas_entry_idx(c, r);
		BUG_ON(idx < 0);
	}

	c->usage_base->replicas[idx] = sectors;

	return 0;
}

/* Replicas tracking - superblock: */

static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
				   struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry_v1 *e, *dst;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
	if (!cpu_r->entries)
		return -BCH_ERR_ENOMEM_cpu_replicas;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		dst = cpu_replicas_entry(cpu_r, idx++);
		memcpy(dst, e, replicas_entry_bytes(e));
		bch2_replicas_entry_sort(dst);
	}

	return 0;
}

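/*
 * v0 superblock entries lack the nr_required field and so are one byte smaller
 * than v1: pad entry_size by the difference and default nr_required to 1 when
 * converting to the in-memory (v1) layout.
 */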
static int
__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
				      struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry_v0 *e;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	entry_size += sizeof(struct bch_replicas_entry_v1) -
		sizeof(struct bch_replicas_entry_v0);

	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
	if (!cpu_r->entries)
		return -BCH_ERR_ENOMEM_cpu_replicas;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		struct bch_replicas_entry_v1 *dst =
			cpu_replicas_entry(cpu_r, idx++);

		dst->data_type	= e->data_type;
		dst->nr_devs	= e->nr_devs;
		dst->nr_required = 1;
		memcpy(dst->devs, e->devs, e->nr_devs);
		bch2_replicas_entry_sort(dst);
	}

	return 0;
}

int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
	struct bch_sb_field_replicas *sb_v1;
	struct bch_sb_field_replicas_v0 *sb_v0;
	struct bch_replicas_cpu new_r = { 0, 0, NULL };
	int ret = 0;

	if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
		ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
	else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
		ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
	if (ret)
		return ret;

	bch2_cpu_replicas_sort(&new_r);

	percpu_down_write(&c->mark_lock);

	ret = replicas_table_update(c, &new_r);
	percpu_up_write(&c->mark_lock);

	kfree(new_r.entries);

	return 0;
}

static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
					       struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas_v0 *sb_r;
	struct bch_replicas_entry_v0 *dst;
	struct bch_replicas_entry_v1 *src;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src)
		bytes += replicas_entry_bytes(src) - 1;

	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -BCH_ERR_ENOSPC_sb_replicas;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		dst->data_type	= src->data_type;
		dst->nr_devs	= src->nr_devs;
		memcpy(dst->devs, src->devs, src->nr_devs);

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
					    struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_entry_v1 *dst, *src;
	bool need_v1 = false;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src) {
		bytes += replicas_entry_bytes(src);
		if (src->nr_required != 1)
			need_v1 = true;
	}

	if (!need_v1)
		return bch2_cpu_replicas_to_sb_replicas_v0(c, r);

	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -BCH_ERR_ENOSPC_sb_replicas;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		memcpy(dst, src, replicas_entry_bytes(src));

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}

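/*
 * Validation of superblock replicas sections: the callers below copy the
 * entries into a temporary cpu table, which is sorted here with plain sort_r()
 * (a flat sorted array rather than eytzinger order, so duplicates end up
 * adjacent) and then checked entry by entry.
 */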
static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
				      struct bch_sb *sb,
				      struct printbuf *err)
{
	unsigned i;

	sort_r(cpu_r->entries,
	       cpu_r->nr,
	       cpu_r->entry_size,
	       bch2_memcmp, NULL,
	       (void *)(size_t)cpu_r->entry_size);

	for (i = 0; i < cpu_r->nr; i++) {
		struct bch_replicas_entry_v1 *e =
			cpu_replicas_entry(cpu_r, i);

		int ret = bch2_replicas_entry_validate(e, sb, err);
		if (ret)
			return ret;

		if (i + 1 < cpu_r->nr) {
			struct bch_replicas_entry_v1 *n =
				cpu_replicas_entry(cpu_r, i + 1);

			BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);

			if (!memcmp(e, n, cpu_r->entry_size)) {
				prt_printf(err, "duplicate replicas entry ");
				bch2_replicas_entry_to_text(err, e);
				return -BCH_ERR_invalid_sb_replicas;
			}
		}
	}

	return 0;
}

static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
				     enum bch_validate_flags flags, struct printbuf *err)
{
	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
	struct bch_replicas_cpu cpu_r;
	int ret;

	ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
	if (ret)
		return ret;

	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
	kfree(cpu_r.entries);
	return ret;
}

static void bch2_sb_replicas_to_text(struct printbuf *out,
				     struct bch_sb *sb,
				     struct bch_sb_field *f)
{
	struct bch_sb_field_replicas *r = field_to_type(f, replicas);
	struct bch_replicas_entry_v1 *e;
	bool first = true;

	for_each_replicas_entry(r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_to_text(out, e);
	}
	prt_newline(out);
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
	.validate	= bch2_sb_replicas_validate,
	.to_text	= bch2_sb_replicas_to_text,
};

static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
					enum bch_validate_flags flags, struct printbuf *err)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_replicas_cpu cpu_r;
	int ret;

	ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
	if (ret)
		return ret;

	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
	kfree(cpu_r.entries);
	return ret;
}

static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
					struct bch_sb *sb,
					struct bch_sb_field *f)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_replicas_entry_v0 *e;
	bool first = true;

	for_each_replicas_entry(sb_r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_v0_to_text(out, e);
	}
	prt_newline(out);
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
	.validate	= bch2_sb_replicas_v0_validate,
	.to_text	= bch2_sb_replicas_v0_to_text,
};

/* Query replicas: */

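/*
 * Check each replicas entry against the given device mask: fewer than
 * nr_required devices online means data or metadata would be lost, fewer than
 * nr_devs means degraded; unless the corresponding
 * BCH_FORCE_IF_*_LOST/DEGRADED flags were passed in, refuse.
 */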
bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
			   unsigned flags, bool print)
{
	struct bch_replicas_entry_v1 *e;
	bool ret = true;

	percpu_down_read(&c->mark_lock);
	for_each_cpu_replicas_entry(&c->replicas, e) {
		unsigned nr_online = 0, nr_failed = 0, dflags = 0;
		bool metadata = e->data_type < BCH_DATA_user;

		if (e->data_type == BCH_DATA_cached)
			continue;

		rcu_read_lock();
		for (unsigned i = 0; i < e->nr_devs; i++) {
			nr_online += test_bit(e->devs[i], devs.d);

			struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]);
			nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed;
		}
		rcu_read_unlock();

		if (nr_failed == e->nr_devs)
			continue;

		if (nr_online < e->nr_required)
			dflags |= metadata
				? BCH_FORCE_IF_METADATA_LOST
				: BCH_FORCE_IF_DATA_LOST;

		if (nr_online < e->nr_devs)
			dflags |= metadata
				? BCH_FORCE_IF_METADATA_DEGRADED
				: BCH_FORCE_IF_DATA_DEGRADED;

		if (dflags & ~flags) {
			if (print) {
				struct printbuf buf = PRINTBUF;

				bch2_replicas_entry_to_text(&buf, e);
				bch_err(c, "insufficient devices online (%u) for replicas entry %s",
					nr_online, buf.buf);
				printbuf_exit(&buf);
			}
			ret = false;
			break;
		}

	}
	percpu_up_read(&c->mark_lock);

	return ret;
}

unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
	struct bch_sb_field_replicas *replicas;
	struct bch_sb_field_replicas_v0 *replicas_v0;
	unsigned data_has = 0;

	replicas = bch2_sb_field_get(sb, replicas);
	replicas_v0 = bch2_sb_field_get(sb, replicas_v0);

	if (replicas) {
		struct bch_replicas_entry_v1 *r;

		for_each_replicas_entry(replicas, r) {
			if (r->data_type >= sizeof(data_has) * 8)
				continue;

			for (unsigned i = 0; i < r->nr_devs; i++)
				if (r->devs[i] == dev)
					data_has |= 1 << r->data_type;
		}

	} else if (replicas_v0) {
		struct bch_replicas_entry_v0 *r;

		for_each_replicas_entry_v0(replicas_v0, r) {
			if (r->data_type >= sizeof(data_has) * 8)
				continue;

			for (unsigned i = 0; i < r->nr_devs; i++)
				if (r->devs[i] == dev)
					data_has |= 1 << r->data_type;
		}
	}

	return data_has;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned ret;

	mutex_lock(&c->sb_lock);
	ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
	mutex_unlock(&c->sb_lock);

	return ret;
}

void bch2_fs_replicas_exit(struct bch_fs *c)
{
	unsigned i;

	kfree(c->usage_scratch);
	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
		free_percpu(c->usage[i]);
	kfree(c->usage_base);
	kfree(c->replicas.entries);
	kfree(c->replicas_gc.entries);

	mempool_exit(&c->replicas_delta_pool);
}

int bch2_fs_replicas_init(struct bch_fs *c)
{
	bch2_journal_entry_res_resize(&c->journal,
			&c->replicas_journal_res,
			reserve_journal_replicas(c, &c->replicas));

	return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
					 REPLICAS_DELTA_LIST_MAX) ?:
		replicas_table_update(c, &c->replicas);
}