// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "buckets.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"

static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
					    struct bch_replicas_cpu *);

/* Replicas tracking - in memory: */

static void verify_replicas_entry(struct bch_replicas_entry *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	unsigned i;

	BUG_ON(e->data_type >= BCH_DATA_NR);
	BUG_ON(!e->nr_devs);
	BUG_ON(e->nr_required > 1 &&
	       e->nr_required >= e->nr_devs);

	for (i = 0; i + 1 < e->nr_devs; i++)
		BUG_ON(e->devs[i] >= e->devs[i + 1]);
#endif
}

void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
{
	bubble_sort(e->devs, e->nr_devs, u8_cmp);
}

static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}

static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
					   struct bch_replicas_entry_v0 *e)
{
	unsigned i;

	if (e->data_type < BCH_DATA_NR)
		prt_printf(out, "%s", bch2_data_types[e->data_type]);
	else
		prt_printf(out, "(invalid data type %u)", e->data_type);

	prt_printf(out, ": %u [", e->nr_devs);
	for (i = 0; i < e->nr_devs; i++)
		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
	prt_printf(out, "]");
}

void bch2_replicas_entry_to_text(struct printbuf *out,
				 struct bch_replicas_entry *e)
{
	unsigned i;

	if (e->data_type < BCH_DATA_NR)
		prt_printf(out, "%s", bch2_data_types[e->data_type]);
	else
		prt_printf(out, "(invalid data type %u)", e->data_type);

	prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
	for (i = 0; i < e->nr_devs; i++)
		prt_printf(out, i ? " %u" : "%u", e->devs[i]);
	prt_printf(out, "]");
}

void bch2_cpu_replicas_to_text(struct printbuf *out,
			       struct bch_replicas_cpu *r)
{
	struct bch_replicas_entry *e;
	bool first = true;

	for_each_cpu_replicas_entry(r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_to_text(out, e);
	}
}

static void extent_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry *r)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;

	r->nr_required = 1;

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		if (p.ptr.cached)
			continue;

		if (!p.has_ec)
			r->devs[r->nr_devs++] = p.ptr.dev;
		else
			r->nr_required = 0;
	}
}

static void stripe_to_replicas(struct bkey_s_c k,
			       struct bch_replicas_entry *r)
{
	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
	const struct bch_extent_ptr *ptr;

	r->nr_required = s.v->nr_blocks - s.v->nr_redundant;

	for (ptr = s.v->ptrs;
	     ptr < s.v->ptrs + s.v->nr_blocks;
	     ptr++)
		r->devs[r->nr_devs++] = ptr->dev;
}

void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
			   struct bkey_s_c k)
{
	e->nr_devs = 0;

	switch (k.k->type) {
	case KEY_TYPE_btree_ptr:
	case KEY_TYPE_btree_ptr_v2:
		e->data_type = BCH_DATA_btree;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_extent:
	case KEY_TYPE_reflink_v:
		e->data_type = BCH_DATA_user;
		extent_to_replicas(k, e);
		break;
	case KEY_TYPE_stripe:
		e->data_type = BCH_DATA_parity;
		stripe_to_replicas(k, e);
		break;
	}

	bch2_replicas_entry_sort(e);
}

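/*
 * Note: replicas entries are compared with plain memcmp() - both when the
 * table is sorted with eytzinger0_sort() and when entries are looked up with
 * eytzinger0_find() - so that only gives a meaningful ordering if every
 * entry keeps its device list sorted, which is why the constructors above
 * and below all finish with bch2_replicas_entry_sort().
 */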
void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
			      enum bch_data_type data_type,
			      struct bch_devs_list devs)
{
	unsigned i;

	BUG_ON(!data_type ||
	       data_type == BCH_DATA_sb ||
	       data_type >= BCH_DATA_NR);

	e->data_type	= data_type;
	e->nr_devs	= 0;
	e->nr_required	= 1;

	for (i = 0; i < devs.nr; i++)
		e->devs[e->nr_devs++] = devs.devs[i];

	bch2_replicas_entry_sort(e);
}

static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
		       struct bch_replicas_entry *new_entry)
{
	unsigned i;
	struct bch_replicas_cpu new = {
		.nr		= old->nr + 1,
		.entry_size	= max_t(unsigned, old->entry_size,
					replicas_entry_bytes(new_entry)),
	};

	BUG_ON(!new_entry->data_type);
	verify_replicas_entry(new_entry);

	new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
	if (!new.entries)
		return new;

	for (i = 0; i < old->nr; i++)
		memcpy(cpu_replicas_entry(&new, i),
		       cpu_replicas_entry(old, i),
		       old->entry_size);

	memcpy(cpu_replicas_entry(&new, old->nr),
	       new_entry,
	       replicas_entry_bytes(new_entry));

	bch2_cpu_replicas_sort(&new);
	return new;
}

static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
				       struct bch_replicas_entry *search)
{
	int idx, entry_size = replicas_entry_bytes(search);

	if (unlikely(entry_size > r->entry_size))
		return -1;

	verify_replicas_entry(search);

#define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
			      entry_cmp, search);
#undef entry_cmp

	return idx < r->nr ? idx : -1;
}

int bch2_replicas_entry_idx(struct bch_fs *c,
			    struct bch_replicas_entry *search)
{
	bch2_replicas_entry_sort(search);

	return __replicas_entry_idx(&c->replicas, search);
}

static bool __replicas_has_entry(struct bch_replicas_cpu *r,
				 struct bch_replicas_entry *search)
{
	return __replicas_entry_idx(r, search) >= 0;
}

bool bch2_replicas_marked(struct bch_fs *c,
			  struct bch_replicas_entry *search)
{
	bool marked;

	if (!search->nr_devs)
		return true;

	verify_replicas_entry(search);

	percpu_down_read(&c->mark_lock);
	marked = __replicas_has_entry(&c->replicas, search) &&
		(likely((!c->replicas_gc.entries)) ||
		 __replicas_has_entry(&c->replicas_gc, search));
	percpu_up_read(&c->mark_lock);

	return marked;
}

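/*
 * Filesystem usage accounting keeps one u64 sector count per replicas entry,
 * indexed by that entry's position in the eytzinger-sorted c->replicas table.
 * When the table is resized or re-sorted those positions change, so the
 * helpers below copy each counter from its old index to the index the same
 * entry occupies in the new table.
 */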
static void __replicas_table_update(struct bch_fs_usage *dst,
				    struct bch_replicas_cpu *dst_r,
				    struct bch_fs_usage *src,
				    struct bch_replicas_cpu *src_r)
{
	int src_idx, dst_idx;

	*dst = *src;

	for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
		if (!src->replicas[src_idx])
			continue;

		dst_idx = __replicas_entry_idx(dst_r,
				cpu_replicas_entry(src_r, src_idx));
		BUG_ON(dst_idx < 0);

		dst->replicas[dst_idx] = src->replicas[src_idx];
	}
}

static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
					 struct bch_replicas_cpu *dst_r,
					 struct bch_fs_usage __percpu *src_p,
					 struct bch_replicas_cpu *src_r)
{
	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
	struct bch_fs_usage *dst, *src = (void *)
		bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr);

	preempt_disable();
	dst = this_cpu_ptr(dst_p);
	preempt_enable();

	__replicas_table_update(dst, dst_r, src, src_r);
}

/*
 * Resize filesystem accounting:
 */
static int replicas_table_update(struct bch_fs *c,
				 struct bch_replicas_cpu *new_r)
{
	struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
	struct bch_fs_usage_online *new_scratch = NULL;
	struct bch_fs_usage __percpu *new_gc = NULL;
	struct bch_fs_usage *new_base = NULL;
	unsigned i, bytes = sizeof(struct bch_fs_usage) +
		sizeof(u64) * new_r->nr;
	unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
		sizeof(u64) * new_r->nr;
	int ret = 0;

	memset(new_usage, 0, sizeof(new_usage));

	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
					sizeof(u64), GFP_KERNEL)))
			goto err;

	if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
	    !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) ||
	    (c->usage_gc &&
	     !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
		goto err;

	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		if (c->usage[i])
			__replicas_table_update_pcpu(new_usage[i], new_r,
						     c->usage[i], &c->replicas);
	if (c->usage_base)
		__replicas_table_update(new_base, new_r,
					c->usage_base, &c->replicas);
	if (c->usage_gc)
		__replicas_table_update_pcpu(new_gc, new_r,
					     c->usage_gc, &c->replicas);

	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		swap(c->usage[i], new_usage[i]);
	swap(c->usage_base,	new_base);
	swap(c->usage_scratch,	new_scratch);
	swap(c->usage_gc,	new_gc);
	swap(c->replicas,	*new_r);
out:
	free_percpu(new_gc);
	kfree(new_scratch);
	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
		free_percpu(new_usage[i]);
	kfree(new_base);
	return ret;
err:
	bch_err(c, "error updating replicas table: memory allocation failure");
	ret = -BCH_ERR_ENOMEM_replicas_table;
	goto out;
}

static unsigned reserve_journal_replicas(struct bch_fs *c,
					 struct bch_replicas_cpu *r)
{
	struct bch_replicas_entry *e;
	unsigned journal_res_u64s = 0;

	/* nr_inodes: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

	/* key_version: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));

	/* persistent_reserved: */
	journal_res_u64s +=
		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) *
		BCH_REPLICAS_MAX;

	for_each_cpu_replicas_entry(r, e)
		journal_res_u64s +=
			DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
				     e->nr_devs, sizeof(u64));
	return journal_res_u64s;
}

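/*
 * Slowpath for marking a replicas entry that isn't in the table yet: the new
 * entry is first added to the superblock copy (and the journal reservation
 * resized to cover it), and the superblock is written out before the
 * in-memory tables are swapped under mark_lock - so usage should never be
 * accounted against an entry the on-disk superblock doesn't know about.
 */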
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
				       struct bch_replicas_entry *new_entry)
{
	struct bch_replicas_cpu new_r, new_gc;
	int ret = 0;

	verify_replicas_entry(new_entry);

	memset(&new_r, 0, sizeof(new_r));
	memset(&new_gc, 0, sizeof(new_gc));

	mutex_lock(&c->sb_lock);

	if (c->replicas_gc.entries &&
	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
		new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry);
		if (!new_gc.entries) {
			ret = -BCH_ERR_ENOMEM_cpu_replicas;
			goto err;
		}
	}

	if (!__replicas_has_entry(&c->replicas, new_entry)) {
		new_r = cpu_replicas_add_entry(&c->replicas, new_entry);
		if (!new_r.entries) {
			ret = -BCH_ERR_ENOMEM_cpu_replicas;
			goto err;
		}

		ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
		if (ret)
			goto err;

		bch2_journal_entry_res_resize(&c->journal,
				&c->replicas_journal_res,
				reserve_journal_replicas(c, &new_r));
	}

	if (!new_r.entries &&
	    !new_gc.entries)
		goto out;

	/* allocations done, now commit: */

	if (new_r.entries)
		bch2_write_super(c);

	/* don't update in memory replicas until changes are persistent */
	percpu_down_write(&c->mark_lock);
	if (new_r.entries)
		ret = replicas_table_update(c, &new_r);
	if (new_gc.entries)
		swap(new_gc, c->replicas_gc);
	percpu_up_write(&c->mark_lock);
out:
	mutex_unlock(&c->sb_lock);

	kfree(new_r.entries);
	kfree(new_gc.entries);

	return ret;
err:
	bch_err_msg(c, ret, "adding replicas entry");
	goto out;
}

int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
{
	return likely(bch2_replicas_marked(c, r))
		? 0 : bch2_mark_replicas_slowpath(c, r);
}

/* replicas delta list: */

int bch2_replicas_delta_list_mark(struct bch_fs *c,
				  struct replicas_delta_list *r)
{
	struct replicas_delta *d = r->d;
	struct replicas_delta *top = (void *) r->d + r->used;
	int ret = 0;

	for (d = r->d; !ret && d != top; d = replicas_delta_next(d))
		ret = bch2_mark_replicas(c, &d->r);
	return ret;
}

/*
 * Old replicas_gc mechanism: only used for journal replicas entries now, should
 * die at some point:
 */
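/*
 * (Rough sketch of the intended usage, as read from the code below:
 * bch2_replicas_gc_start(c, typemask) snapshots every entry whose data type
 * is *not* in typemask into c->replicas_gc; while gc is active,
 * bch2_mark_replicas() also re-adds to c->replicas_gc any entry that's still
 * in use; bch2_replicas_gc_end() then makes c->replicas_gc the new replicas
 * table, dropping whatever wasn't re-marked in between.)
 */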

int bch2_replicas_gc_end(struct bch_fs *c, int ret)
{
	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	percpu_down_write(&c->mark_lock);

	ret =   ret ?:
		bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc) ?:
		replicas_table_update(c, &c->replicas_gc);

	kfree(c->replicas_gc.entries);
	c->replicas_gc.entries = NULL;

	percpu_up_write(&c->mark_lock);

	if (!ret)
		bch2_write_super(c);

	mutex_unlock(&c->sb_lock);

	return ret;
}

int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
	struct bch_replicas_entry *e;
	unsigned i = 0;

	lockdep_assert_held(&c->replicas_gc_lock);

	mutex_lock(&c->sb_lock);
	BUG_ON(c->replicas_gc.entries);

	c->replicas_gc.nr		= 0;
	c->replicas_gc.entry_size	= 0;

	for_each_cpu_replicas_entry(&c->replicas, e)
		if (!((1 << e->data_type) & typemask)) {
			c->replicas_gc.nr++;
			c->replicas_gc.entry_size =
				max_t(unsigned, c->replicas_gc.entry_size,
				      replicas_entry_bytes(e));
		}

	c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
					 c->replicas_gc.entry_size,
					 GFP_KERNEL);
	if (!c->replicas_gc.entries) {
		mutex_unlock(&c->sb_lock);
		bch_err(c, "error allocating c->replicas_gc");
		return -BCH_ERR_ENOMEM_replicas_gc;
	}

	for_each_cpu_replicas_entry(&c->replicas, e)
		if (!((1 << e->data_type) & typemask))
			memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
			       e, c->replicas_gc.entry_size);

	bch2_cpu_replicas_sort(&c->replicas_gc);
	mutex_unlock(&c->sb_lock);

	return 0;
}

/*
 * New much simpler mechanism for clearing out unneeded replicas entries - drop
 * replicas entries that have 0 sectors used.
 *
 * However, we don't track sector counts for journal usage, so this doesn't drop
 * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
 * is retained for that.
 */
int bch2_replicas_gc2(struct bch_fs *c)
{
	struct bch_replicas_cpu new = { 0 };
	unsigned i, nr;
	int ret = 0;

	bch2_journal_meta(&c->journal);
retry:
	nr		= READ_ONCE(c->replicas.nr);
	new.entry_size	= READ_ONCE(c->replicas.entry_size);
	new.entries	= kcalloc(nr, new.entry_size, GFP_KERNEL);
	if (!new.entries) {
		bch_err(c, "error allocating c->replicas_gc");
		return -BCH_ERR_ENOMEM_replicas_gc;
	}

	mutex_lock(&c->sb_lock);
	percpu_down_write(&c->mark_lock);

	if (nr			!= c->replicas.nr ||
	    new.entry_size	!= c->replicas.entry_size) {
		percpu_up_write(&c->mark_lock);
		mutex_unlock(&c->sb_lock);
		kfree(new.entries);
		goto retry;
	}

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry *e =
			cpu_replicas_entry(&c->replicas, i);

		if (e->data_type == BCH_DATA_journal ||
		    c->usage_base->replicas[i] ||
		    percpu_u64_get(&c->usage[0]->replicas[i]) ||
		    percpu_u64_get(&c->usage[1]->replicas[i]) ||
		    percpu_u64_get(&c->usage[2]->replicas[i]) ||
		    percpu_u64_get(&c->usage[3]->replicas[i]))
			memcpy(cpu_replicas_entry(&new, new.nr++),
			       e, new.entry_size);
	}

	bch2_cpu_replicas_sort(&new);

	ret =   bch2_cpu_replicas_to_sb_replicas(c, &new) ?:
		replicas_table_update(c, &new);

	kfree(new.entries);

	percpu_up_write(&c->mark_lock);

	if (!ret)
		bch2_write_super(c);

	mutex_unlock(&c->sb_lock);

	return ret;
}

int bch2_replicas_set_usage(struct bch_fs *c,
			    struct bch_replicas_entry *r,
			    u64 sectors)
{
	int ret, idx = bch2_replicas_entry_idx(c, r);

	if (idx < 0) {
		struct bch_replicas_cpu n;

		n = cpu_replicas_add_entry(&c->replicas, r);
		if (!n.entries)
			return -BCH_ERR_ENOMEM_cpu_replicas;

		ret = replicas_table_update(c, &n);
		if (ret)
			return ret;

		kfree(n.entries);

		idx = bch2_replicas_entry_idx(c, r);
		BUG_ON(idx < 0);
	}

	c->usage_base->replicas[idx] = sectors;

	return 0;
}

/* Replicas tracking - superblock: */
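/*
 * Two on-disk encodings exist: bch_sb_field_replicas_v0, whose entries carry
 * no nr_required field, and bch_sb_field_replicas, whose entries do. The
 * helpers below convert either format to the in-memory bch_replicas_cpu
 * representation; v0 entries are given nr_required = 1.
 */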

static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
				   struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry *e, *dst;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
	if (!cpu_r->entries)
		return -BCH_ERR_ENOMEM_cpu_replicas;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		dst = cpu_replicas_entry(cpu_r, idx++);
		memcpy(dst, e, replicas_entry_bytes(e));
		bch2_replicas_entry_sort(dst);
	}

	return 0;
}

static int
__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
				      struct bch_replicas_cpu *cpu_r)
{
	struct bch_replicas_entry_v0 *e;
	unsigned nr = 0, entry_size = 0, idx = 0;

	for_each_replicas_entry(sb_r, e) {
		entry_size = max_t(unsigned, entry_size,
				   replicas_entry_bytes(e));
		nr++;
	}

	entry_size += sizeof(struct bch_replicas_entry) -
		sizeof(struct bch_replicas_entry_v0);

	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
	if (!cpu_r->entries)
		return -BCH_ERR_ENOMEM_cpu_replicas;

	cpu_r->nr		= nr;
	cpu_r->entry_size	= entry_size;

	for_each_replicas_entry(sb_r, e) {
		struct bch_replicas_entry *dst =
			cpu_replicas_entry(cpu_r, idx++);

		dst->data_type	= e->data_type;
		dst->nr_devs	= e->nr_devs;
		dst->nr_required = 1;
		memcpy(dst->devs, e->devs, e->nr_devs);
		bch2_replicas_entry_sort(dst);
	}

	return 0;
}

int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
	struct bch_sb_field_replicas *sb_v1;
	struct bch_sb_field_replicas_v0 *sb_v0;
	struct bch_replicas_cpu new_r = { 0, 0, NULL };
	int ret = 0;

	if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
		ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
	else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0)))
		ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
	if (ret)
		return ret;

	bch2_cpu_replicas_sort(&new_r);

	percpu_down_write(&c->mark_lock);

	ret = replicas_table_update(c, &new_r);
	percpu_up_write(&c->mark_lock);

	kfree(new_r.entries);

	return ret;
}

static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
					       struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas_v0 *sb_r;
	struct bch_replicas_entry_v0 *dst;
	struct bch_replicas_entry *src;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src)
		bytes += replicas_entry_bytes(src) - 1;

	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -BCH_ERR_ENOSPC_sb_replicas;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0);

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		dst->data_type	= src->data_type;
		dst->nr_devs	= src->nr_devs;
		memcpy(dst->devs, src->devs, src->nr_devs);

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}

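/*
 * Writes out the replicas section in the older v0 format when possible (i.e.
 * when every entry has nr_required == 1), falling back to the newer format
 * only when nr_required information actually needs to be preserved; whichever
 * format is written, the other superblock field is deleted.
 */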
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
					    struct bch_replicas_cpu *r)
{
	struct bch_sb_field_replicas *sb_r;
	struct bch_replicas_entry *dst, *src;
	bool need_v1 = false;
	size_t bytes;

	bytes = sizeof(struct bch_sb_field_replicas);

	for_each_cpu_replicas_entry(r, src) {
		bytes += replicas_entry_bytes(src);
		if (src->nr_required != 1)
			need_v1 = true;
	}

	if (!need_v1)
		return bch2_cpu_replicas_to_sb_replicas_v0(c, r);

	sb_r = bch2_sb_field_resize(&c->disk_sb, replicas,
			DIV_ROUND_UP(bytes, sizeof(u64)));
	if (!sb_r)
		return -BCH_ERR_ENOSPC_sb_replicas;

	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
	sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas);

	memset(&sb_r->entries, 0,
	       vstruct_end(&sb_r->field) -
	       (void *) &sb_r->entries);

	dst = sb_r->entries;
	for_each_cpu_replicas_entry(r, src) {
		memcpy(dst, src, replicas_entry_bytes(src));

		dst = replicas_entry_next(dst);

		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
	}

	return 0;
}

static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
				      struct bch_sb *sb,
				      struct printbuf *err)
{
	unsigned i, j;

	sort_cmp_size(cpu_r->entries,
		      cpu_r->nr,
		      cpu_r->entry_size,
		      memcmp, NULL);

	for (i = 0; i < cpu_r->nr; i++) {
		struct bch_replicas_entry *e =
			cpu_replicas_entry(cpu_r, i);

		if (e->data_type >= BCH_DATA_NR) {
			prt_printf(err, "invalid data type in entry ");
			bch2_replicas_entry_to_text(err, e);
			return -BCH_ERR_invalid_sb_replicas;
		}

		if (!e->nr_devs) {
			prt_printf(err, "no devices in entry ");
			bch2_replicas_entry_to_text(err, e);
			return -BCH_ERR_invalid_sb_replicas;
		}

		if (e->nr_required > 1 &&
		    e->nr_required >= e->nr_devs) {
			prt_printf(err, "bad nr_required in entry ");
			bch2_replicas_entry_to_text(err, e);
			return -BCH_ERR_invalid_sb_replicas;
		}

		for (j = 0; j < e->nr_devs; j++)
			if (!bch2_dev_exists(sb, e->devs[j])) {
				prt_printf(err, "invalid device %u in entry ", e->devs[j]);
				bch2_replicas_entry_to_text(err, e);
				return -BCH_ERR_invalid_sb_replicas;
			}

		if (i + 1 < cpu_r->nr) {
			struct bch_replicas_entry *n =
				cpu_replicas_entry(cpu_r, i + 1);

			BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);

			if (!memcmp(e, n, cpu_r->entry_size)) {
				prt_printf(err, "duplicate replicas entry ");
				bch2_replicas_entry_to_text(err, e);
				return -BCH_ERR_invalid_sb_replicas;
			}
		}
	}

	return 0;
}

static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
				     struct printbuf *err)
{
	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
	struct bch_replicas_cpu cpu_r;
	int ret;

	ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r);
	if (ret)
		return ret;

	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
	kfree(cpu_r.entries);
	return ret;
}

static void bch2_sb_replicas_to_text(struct printbuf *out,
				     struct bch_sb *sb,
				     struct bch_sb_field *f)
{
	struct bch_sb_field_replicas *r = field_to_type(f, replicas);
	struct bch_replicas_entry *e;
	bool first = true;

	for_each_replicas_entry(r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_to_text(out, e);
	}
	prt_newline(out);
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
	.validate	= bch2_sb_replicas_validate,
	.to_text	= bch2_sb_replicas_to_text,
};

static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f,
					struct printbuf *err)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_replicas_cpu cpu_r;
	int ret;

	ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r);
	if (ret)
		return ret;

	ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
	kfree(cpu_r.entries);
	return ret;
}

static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
					struct bch_sb *sb,
					struct bch_sb_field *f)
{
	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
	struct bch_replicas_entry_v0 *e;
	bool first = true;

	for_each_replicas_entry(sb_r, e) {
		if (!first)
			prt_printf(out, " ");
		first = false;

		bch2_replicas_entry_v0_to_text(out, e);
	}
	prt_newline(out);
}

const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
	.validate	= bch2_sb_replicas_v0_validate,
	.to_text	= bch2_sb_replicas_v0_to_text,
};

/* Query replicas: */
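/*
 * Check whether the filesystem would still have all of its data and metadata
 * available with only the devices in @devs online: for each replicas entry,
 * the number of its devices that are online is compared against nr_devs and
 * nr_required, and degraded/lost states are only tolerated if the
 * corresponding BCH_FORCE_IF_* bits in @flags permit them.
 */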

bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
			   unsigned flags, bool print)
{
	struct bch_replicas_entry *e;
	bool ret = true;

	percpu_down_read(&c->mark_lock);
	for_each_cpu_replicas_entry(&c->replicas, e) {
		unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
		bool metadata = e->data_type < BCH_DATA_user;

		if (e->data_type == BCH_DATA_cached)
			continue;

		for (i = 0; i < e->nr_devs; i++) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);

			nr_online += test_bit(e->devs[i], devs.d);
			nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
		}

		if (nr_failed == e->nr_devs)
			continue;

		if (nr_online < e->nr_required)
			dflags |= metadata
				? BCH_FORCE_IF_METADATA_LOST
				: BCH_FORCE_IF_DATA_LOST;

		if (nr_online < e->nr_devs)
			dflags |= metadata
				? BCH_FORCE_IF_METADATA_DEGRADED
				: BCH_FORCE_IF_DATA_DEGRADED;

		if (dflags & ~flags) {
			if (print) {
				struct printbuf buf = PRINTBUF;

				bch2_replicas_entry_to_text(&buf, e);
				bch_err(c, "insufficient devices online (%u) for replicas entry %s",
					nr_online, buf.buf);
				printbuf_exit(&buf);
			}
			ret = false;
			break;
		}
	}
	percpu_up_read(&c->mark_lock);

	return ret;
}

unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
	struct bch_sb_field_replicas *replicas;
	struct bch_sb_field_replicas_v0 *replicas_v0;
	unsigned i, data_has = 0;

	replicas = bch2_sb_field_get(sb, replicas);
	replicas_v0 = bch2_sb_field_get(sb, replicas_v0);

	if (replicas) {
		struct bch_replicas_entry *r;

		for_each_replicas_entry(replicas, r)
			for (i = 0; i < r->nr_devs; i++)
				if (r->devs[i] == dev)
					data_has |= 1 << r->data_type;
	} else if (replicas_v0) {
		struct bch_replicas_entry_v0 *r;

		for_each_replicas_entry_v0(replicas_v0, r)
			for (i = 0; i < r->nr_devs; i++)
				if (r->devs[i] == dev)
					data_has |= 1 << r->data_type;
	}

	return data_has;
}

unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned ret;

	mutex_lock(&c->sb_lock);
	ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx);
	mutex_unlock(&c->sb_lock);

	return ret;
}

void bch2_fs_replicas_exit(struct bch_fs *c)
{
	unsigned i;

	kfree(c->usage_scratch);
	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
		free_percpu(c->usage[i]);
	kfree(c->usage_base);
	kfree(c->replicas.entries);
	kfree(c->replicas_gc.entries);

	mempool_exit(&c->replicas_delta_pool);
}

int bch2_fs_replicas_init(struct bch_fs *c)
{
	bch2_journal_entry_res_resize(&c->journal,
			&c->replicas_journal_res,
			reserve_journal_replicas(c, &c->replicas));

	return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
					 REPLICAS_DELTA_LIST_MAX) ?:
		replicas_table_update(c, &c->replicas);
}