// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2012 Google, Inc.
 *
 * Foreground allocator code: allocate buckets from freelist, and allocate in
 * sector granularity from writepoints.
 *
 * bch2_bucket_alloc() allocates a single bucket from a specific device.
 *
 * bch2_bucket_alloc_set() allocates one or more buckets from different devices
 * in a given filesystem.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_gc.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_write.h"
#include "journal.h"
#include "movinggc.h"
#include "nocow_locking.h"
#include "trace.h"

#include <linux/math64.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>

static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
					   struct mutex *lock)
{
	if (!mutex_trylock(lock)) {
		bch2_trans_unlock(trans);
		mutex_lock(lock);
	}
}

const char * const bch2_watermarks[] = {
#define x(t) #t,
	BCH_WATERMARKS()
#undef x
	NULL
};

/*
 * Open buckets represent a bucket that's currently being allocated from. They
 * serve two purposes:
 *
 *  - They track buckets that have been partially allocated, allowing for
 *    sub-bucket sized allocations - they're used by the sector allocator below
 *
 *  - They provide a reference to the buckets they own that mark and sweep GC
 *    can find, until the new allocation has a pointer to it inserted into the
 *    btree
 *
 * When allocating some space with the sector allocator, the allocation comes
 * with a reference to an open bucket - the caller is required to put that
 * reference _after_ doing the index update that makes its allocation reachable.
 */
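
/*
 * Illustrative ordering for a caller of the sector allocator (a sketch, not
 * taken from this file; see bch2_alloc_sectors_start_trans() and friends
 * below for the real interfaces):
 *
 *   1. bch2_alloc_sectors_start_trans() - allocate space, pin the buckets with
 *      open_bucket references
 *   2. bch2_alloc_sectors_append_ptrs() - add pointers to the new extent
 *   3. bch2_alloc_sectors_done()        - unlock the write point
 *   4. do the write, then the btree update that makes the extent reachable
 *   5. bch2_open_bucket_put()           - only now drop the open_bucket refs
 */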

void bch2_reset_alloc_cursors(struct bch_fs *c)
{
	rcu_read_lock();
	for_each_member_device_rcu(c, ca, NULL)
		ca->alloc_cursor = 0;
	rcu_read_unlock();
}

static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
{
	open_bucket_idx_t idx = ob - c->open_buckets;
	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);

	ob->hash = *slot;
	*slot = idx;
}

static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
{
	open_bucket_idx_t idx = ob - c->open_buckets;
	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);

	while (*slot != idx) {
		BUG_ON(!*slot);
		slot = &c->open_buckets[*slot].hash;
	}

	*slot = ob->hash;
	ob->hash = 0;
}

void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);

	if (ob->ec) {
		ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
		return;
	}

	percpu_down_read(&c->mark_lock);
	spin_lock(&ob->lock);

	ob->valid = false;
	ob->data_type = 0;

	spin_unlock(&ob->lock);
	percpu_up_read(&c->mark_lock);

	spin_lock(&c->freelist_lock);
	bch2_open_bucket_hash_remove(c, ob);

	ob->freelist = c->open_buckets_freelist;
	c->open_buckets_freelist = ob - c->open_buckets;

	c->open_buckets_nr_free++;
	ca->nr_open_buckets--;
	spin_unlock(&c->freelist_lock);

	closure_wake_up(&c->open_buckets_wait);
}

void bch2_open_bucket_write_error(struct bch_fs *c,
				  struct open_buckets *obs,
				  unsigned dev)
{
	struct open_bucket *ob;
	unsigned i;

	open_bucket_for_each(c, obs, ob, i)
		if (ob->dev == dev && ob->ec)
			bch2_ec_bucket_cancel(c, ob);
}

static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
{
	struct open_bucket *ob;

	BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);

	ob = c->open_buckets + c->open_buckets_freelist;
	c->open_buckets_freelist = ob->freelist;
	atomic_set(&ob->pin, 1);
	ob->data_type = 0;

	c->open_buckets_nr_free--;
	return ob;
}

static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
{
	BUG_ON(c->open_buckets_partial_nr >=
	       ARRAY_SIZE(c->open_buckets_partial));

	spin_lock(&c->freelist_lock);
	ob->on_partial_list = true;
	c->open_buckets_partial[c->open_buckets_partial_nr++] =
		ob - c->open_buckets;
	spin_unlock(&c->freelist_lock);

	closure_wake_up(&c->open_buckets_wait);
	closure_wake_up(&c->freelist_wait);
}

/* _only_ for allocating the journal on a new device: */
long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
{
	while (ca->new_fs_bucket_idx < ca->mi.nbuckets) {
		u64 b = ca->new_fs_bucket_idx++;

		if (!is_superblock_bucket(ca, b) &&
		    (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse)))
			return b;
	}

	return -1;
}

static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
{
	switch (watermark) {
	case BCH_WATERMARK_reclaim:
		return 0;
	case BCH_WATERMARK_btree:
	case BCH_WATERMARK_btree_copygc:
		return OPEN_BUCKETS_COUNT / 4;
	case BCH_WATERMARK_copygc:
		return OPEN_BUCKETS_COUNT / 3;
	default:
		return OPEN_BUCKETS_COUNT / 2;
	}
}

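/*
 * __try_alloc_bucket(): attempt to turn a specific candidate bucket into an
 * open_bucket. The bucket is skipped (NULL is returned) if it's marked nouse,
 * already open, still waiting on a journal commit, or nocow locked;
 * -BCH_ERR_open_buckets_empty is returned if taking an open_bucket slot would
 * dip into the reserve for this watermark.
 */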
static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
					      u64 bucket,
					      enum bch_watermark watermark,
					      const struct bch_alloc_v4 *a,
					      struct bucket_alloc_state *s,
					      struct closure *cl)
{
	struct open_bucket *ob;

	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
		s->skipped_nouse++;
		return NULL;
	}

	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
		s->skipped_open++;
		return NULL;
	}

	if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
			c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
		s->skipped_need_journal_commit++;
		return NULL;
	}

	if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) {
		s->skipped_nocow++;
		return NULL;
	}

	spin_lock(&c->freelist_lock);

	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
		if (cl)
			closure_wait(&c->open_buckets_wait, cl);

		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
				   &c->blocked_allocate_open_bucket, true);
		spin_unlock(&c->freelist_lock);
		return ERR_PTR(-BCH_ERR_open_buckets_empty);
	}

	/* Recheck under lock: */
	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
		spin_unlock(&c->freelist_lock);
		s->skipped_open++;
		return NULL;
	}

	ob = bch2_open_bucket_alloc(c);

	spin_lock(&ob->lock);

	ob->valid = true;
	ob->sectors_free = ca->mi.bucket_size;
	ob->dev = ca->dev_idx;
	ob->gen = a->gen;
	ob->bucket = bucket;
	spin_unlock(&ob->lock);

	ca->nr_open_buckets++;
	bch2_open_bucket_hash_add(c, ob);

	track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
			   &c->blocked_allocate_open_bucket, false);

	track_event_change(&c->times[BCH_TIME_blocked_allocate],
			   &c->blocked_allocate, false);

	spin_unlock(&c->freelist_lock);
	return ob;
}

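/*
 * try_alloc_bucket(): given an entry from the freespace btree - the bucket
 * number lives in the low 56 bits of the key offset, generation bits in the
 * high 8 - cross check it against the alloc btree (and, until fsck has
 * finished, against backpointers) before handing it to __try_alloc_bucket().
 */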
static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
					    enum bch_watermark watermark, u64 free_entry,
					    struct bucket_alloc_state *s,
					    struct bkey_s_c freespace_k,
					    struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter = { NULL };
	struct bkey_s_c k;
	struct open_bucket *ob;
	struct bch_alloc_v4 a_convert;
	const struct bch_alloc_v4 *a;
	u64 b = free_entry & ~(~0ULL << 56);
	unsigned genbits = free_entry >> 56;
	struct printbuf buf = PRINTBUF;
	int ret;

	if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
		prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
			   "  freespace key ",
			   ca->mi.first_bucket, ca->mi.nbuckets);
		bch2_bkey_val_to_text(&buf, c, freespace_k);
		bch2_trans_inconsistent(trans, "%s", buf.buf);
		ob = ERR_PTR(-EIO);
		goto err;
	}

	k = bch2_bkey_get_iter(trans, &iter,
			       BTREE_ID_alloc, POS(ca->dev_idx, b),
			       BTREE_ITER_CACHED);
	ret = bkey_err(k);
	if (ret) {
		ob = ERR_PTR(ret);
		goto err;
	}

	a = bch2_alloc_to_v4(k, &a_convert);

	if (a->data_type != BCH_DATA_free) {
		if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
			ob = NULL;
			goto err;
		}

		prt_printf(&buf, "non free bucket in freespace btree\n"
			   "  freespace key ");
		bch2_bkey_val_to_text(&buf, c, freespace_k);
		prt_printf(&buf, "\n  ");
		bch2_bkey_val_to_text(&buf, c, k);
		bch2_trans_inconsistent(trans, "%s", buf.buf);
		ob = ERR_PTR(-EIO);
		goto err;
	}

	if (genbits != (alloc_freespace_genbits(*a) >> 56) &&
	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
		prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
			   "  freespace key ",
			   genbits, alloc_freespace_genbits(*a) >> 56);
		bch2_bkey_val_to_text(&buf, c, freespace_k);
		prt_printf(&buf, "\n  ");
		bch2_bkey_val_to_text(&buf, c, k);
		bch2_trans_inconsistent(trans, "%s", buf.buf);
		ob = ERR_PTR(-EIO);
		goto err;
	}

	if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) {
		struct bch_backpointer bp;
		struct bpos bp_pos = POS_MIN;

		ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1,
						&bp_pos, &bp,
						BTREE_ITER_NOPRESERVE);
		if (ret) {
			ob = ERR_PTR(ret);
			goto err;
		}

		if (!bkey_eq(bp_pos, POS_MAX)) {
			/*
			 * Bucket may have data in it - we don't call
			 * bch2_trans_inconsistent() because fsck hasn't
			 * finished yet
			 */
			ob = NULL;
			goto err;
		}
	}

	ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
	if (!ob)
		set_btree_iter_dontneed(&iter);
err:
	if (iter.path)
		set_btree_iter_dontneed(&iter);
	bch2_trans_iter_exit(trans, &iter);
	printbuf_exit(&buf);
	return ob;
}

/*
 * This path is for before the freespace btree is initialized:
 *
 * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
 * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
 */
static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,
			struct bch_dev *ca,
			enum bch_watermark watermark,
			struct bucket_alloc_state *s,
			struct closure *cl)
{
	struct btree_iter iter, citer;
	struct bkey_s_c k, ck;
	struct open_bucket *ob = NULL;
	u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
	u64 alloc_start = max(first_bucket, READ_ONCE(ca->alloc_cursor));
	u64 alloc_cursor = alloc_start;
	int ret;

	/*
	 * Scan with an uncached iterator to avoid polluting the key cache. An
	 * uncached iter will return a cached key if one exists, but if not
	 * there is no other underlying protection for the associated key cache
	 * slot. To avoid racing bucket allocations, look up the cached key slot
	 * of any likely allocation candidate before attempting to proceed with
	 * the allocation. This provides proper exclusion on the associated
	 * bucket.
	 */
again:
	for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
				     BTREE_ITER_SLOTS, k, ret) {
		struct bch_alloc_v4 a_convert;
		const struct bch_alloc_v4 *a;

		if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
			break;

		if (ca->new_fs_bucket_idx &&
		    is_superblock_bucket(ca, k.k->p.offset))
			continue;

		a = bch2_alloc_to_v4(k, &a_convert);
		if (a->data_type != BCH_DATA_free)
			continue;

		/* now check the cached key to serialize concurrent allocs of the bucket */
		ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_CACHED);
		ret = bkey_err(ck);
		if (ret)
			break;

		a = bch2_alloc_to_v4(ck, &a_convert);
		if (a->data_type != BCH_DATA_free)
			goto next;

		s->buckets_seen++;

		ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
next:
		set_btree_iter_dontneed(&citer);
		bch2_trans_iter_exit(trans, &citer);
		if (ob)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	alloc_cursor = iter.pos.offset;
	ca->alloc_cursor = alloc_cursor;

	if (!ob && ret)
		ob = ERR_PTR(ret);

	if (!ob && alloc_start > first_bucket) {
		alloc_cursor = alloc_start = first_bucket;
		goto again;
	}

	return ob;
}

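/*
 * bch2_bucket_alloc_freelist(): the normal allocation path once the freespace
 * btree exists. Each freespace key covers a range of free buckets; we resume
 * scanning from the per-device alloc_cursor, and if nothing is found from
 * there to the end of the device we wrap around and retry from the first
 * bucket.
 */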
static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
						      struct bch_dev *ca,
						      enum bch_watermark watermark,
						      struct bucket_alloc_state *s,
						      struct closure *cl)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct open_bucket *ob = NULL;
	u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
	u64 alloc_cursor = alloc_start;
	int ret;

	BUG_ON(ca->new_fs_bucket_idx);
again:
	for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
				     POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
		if (k.k->p.inode != ca->dev_idx)
			break;

		for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
		     alloc_cursor < k.k->p.offset;
		     alloc_cursor++) {
			ret = btree_trans_too_many_iters(trans);
			if (ret) {
				ob = ERR_PTR(ret);
				break;
			}

			s->buckets_seen++;

			ob = try_alloc_bucket(trans, ca, watermark,
					      alloc_cursor, s, k, cl);
			if (ob) {
				set_btree_iter_dontneed(&iter);
				break;
			}
		}

		if (ob || ret)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	ca->alloc_cursor = alloc_cursor;

	if (!ob && ret)
		ob = ERR_PTR(ret);

	if (!ob && alloc_start > ca->mi.first_bucket) {
		alloc_cursor = alloc_start = ca->mi.first_bucket;
		goto again;
	}

	return ob;
}

/**
 * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
 * @trans:	transaction object
 * @ca:		device to allocate from
 * @watermark:	how important is this allocation?
 * @cl:		if not NULL, closure to be used to wait if buckets not available
 * @usage:	also returns the current device usage
 *
 * Returns: an open_bucket on success, or an ERR_PTR() on failure.
 */
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
						   struct bch_dev *ca,
						   enum bch_watermark watermark,
						   struct closure *cl,
						   struct bch_dev_usage *usage)
{
	struct bch_fs *c = trans->c;
	struct open_bucket *ob = NULL;
	bool freespace = READ_ONCE(ca->mi.freespace_initialized);
	u64 avail;
	struct bucket_alloc_state s = { 0 };
	bool waiting = false;
again:
	bch2_dev_usage_read_fast(ca, usage);
	avail = dev_buckets_free(ca, *usage, watermark);

	if (usage->d[BCH_DATA_need_discard].buckets > avail)
		bch2_do_discards(c);

	if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
		bch2_do_gc_gens(c);

	if (should_invalidate_buckets(ca, *usage))
		bch2_do_invalidates(c);

	if (!avail) {
		if (cl && !waiting) {
			closure_wait(&c->freelist_wait, cl);
			waiting = true;
			goto again;
		}

		track_event_change(&c->times[BCH_TIME_blocked_allocate],
				   &c->blocked_allocate, true);

		ob = ERR_PTR(-BCH_ERR_freelist_empty);
		goto err;
	}

	if (waiting)
		closure_wake_up(&c->freelist_wait);
alloc:
	ob = likely(freespace)
		? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
		: bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);

	if (s.skipped_need_journal_commit * 2 > avail)
		bch2_journal_flush_async(&c->journal, NULL);

	if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
		freespace = false;
		goto alloc;
	}
err:
	if (!ob)
		ob = ERR_PTR(-BCH_ERR_no_buckets_found);

	if (!IS_ERR(ob))
		trace_and_count(c, bucket_alloc, ca,
				bch2_watermarks[watermark],
				ob->bucket,
				usage->d[BCH_DATA_free].buckets,
				avail,
				bch2_copygc_wait_amount(c),
				c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
				&s,
				cl == NULL,
				"");
	else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
		trace_and_count(c, bucket_alloc_fail, ca,
				bch2_watermarks[watermark],
				0,
				usage->d[BCH_DATA_free].buckets,
				avail,
				bch2_copygc_wait_amount(c),
				c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now),
				&s,
				cl == NULL,
				bch2_err_str(PTR_ERR(ob)));

	return ob;
}

struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
				      enum bch_watermark watermark,
				      struct closure *cl)
{
	struct bch_dev_usage usage;
	struct open_bucket *ob;

	bch2_trans_do(c, NULL, NULL, 0,
		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
								   cl, &usage)));
	return ob;
}

static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
			    unsigned l, unsigned r)
{
	return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
		(stripe->next_alloc[l] < stripe->next_alloc[r]));
}

#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)

struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
					  struct dev_stripe_state *stripe,
					  struct bch_devs_mask *devs)
{
	struct dev_alloc_list ret = { .nr = 0 };
	unsigned i;

	for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
		ret.devs[ret.nr++] = i;

	bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
	return ret;
}

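/*
 * Devices are striped by keeping a per-device counter, next_alloc:
 * bch2_dev_alloc_list() tries devices in order of increasing next_alloc, and
 * after each successful allocation the chosen device's counter is bumped by an
 * amount inversely proportional to its free space (with all counters
 * periodically scaled down). The net effect is that writes are spread across
 * devices roughly in proportion to the free space each one has.
 */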
static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
						     struct dev_stripe_state *stripe,
						     struct bch_dev_usage *usage)
{
	u64 *v = stripe->next_alloc + ca->dev_idx;
	u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
	u64 free_space_inv = free_space
		? div64_u64(1ULL << 48, free_space)
		: 1ULL << 48;
	u64 scale = *v / 4;

	if (*v + free_space_inv >= *v)
		*v += free_space_inv;
	else
		*v = U64_MAX;

	for (v = stripe->next_alloc;
	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
		*v = *v < scale ? 0 : *v - scale;
}

void bch2_dev_stripe_increment(struct bch_dev *ca,
			       struct dev_stripe_state *stripe)
{
	struct bch_dev_usage usage;

	bch2_dev_usage_read_fast(ca, &usage);
	bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
}

static int add_new_bucket(struct bch_fs *c,
			  struct open_buckets *ptrs,
			  struct bch_devs_mask *devs_may_alloc,
			  unsigned nr_replicas,
			  unsigned *nr_effective,
			  bool *have_cache,
			  unsigned flags,
			  struct open_bucket *ob)
{
	unsigned durability =
		bch_dev_bkey_exists(c, ob->dev)->mi.durability;

	BUG_ON(*nr_effective >= nr_replicas);

	__clear_bit(ob->dev, devs_may_alloc->d);
	*nr_effective += durability;
	*have_cache |= !durability;

	ob_push(c, ptrs, ob);

	if (*nr_effective >= nr_replicas)
		return 1;
	if (ob->ec)
		return 1;
	return 0;
}

int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
				struct open_buckets *ptrs,
				struct dev_stripe_state *stripe,
				struct bch_devs_mask *devs_may_alloc,
				unsigned nr_replicas,
				unsigned *nr_effective,
				bool *have_cache,
				unsigned flags,
				enum bch_data_type data_type,
				enum bch_watermark watermark,
				struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct dev_alloc_list devs_sorted =
		bch2_dev_alloc_list(c, stripe, devs_may_alloc);
	unsigned dev;
	struct bch_dev *ca;
	int ret = -BCH_ERR_insufficient_devices;
	unsigned i;

	BUG_ON(*nr_effective >= nr_replicas);

	for (i = 0; i < devs_sorted.nr; i++) {
		struct bch_dev_usage usage;
		struct open_bucket *ob;

		dev = devs_sorted.devs[i];

		rcu_read_lock();
		ca = rcu_dereference(c->devs[dev]);
		if (ca)
			percpu_ref_get(&ca->ref);
		rcu_read_unlock();

		if (!ca)
			continue;

		if (!ca->mi.durability && *have_cache) {
			percpu_ref_put(&ca->ref);
			continue;
		}

		ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage);
		if (!IS_ERR(ob))
			bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
		percpu_ref_put(&ca->ref);

		if (IS_ERR(ob)) {
			ret = PTR_ERR(ob);
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
				break;
			continue;
		}

		ob->data_type = data_type;

		if (add_new_bucket(c, ptrs, devs_may_alloc,
				   nr_replicas, nr_effective,
				   have_cache, flags, ob)) {
			ret = 0;
			break;
		}
	}

	return ret;
}

/* Allocate from stripes: */

/*
 * if we can't allocate a new stripe because there are already too many
 * partially filled stripes, force allocating from an existing stripe even when
 * it's to a device we don't want:
 */

static int bucket_alloc_from_stripe(struct btree_trans *trans,
				    struct open_buckets *ptrs,
				    struct write_point *wp,
				    struct bch_devs_mask *devs_may_alloc,
				    u16 target,
				    unsigned nr_replicas,
				    unsigned *nr_effective,
				    bool *have_cache,
				    enum bch_watermark watermark,
				    unsigned flags,
				    struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct dev_alloc_list devs_sorted;
	struct ec_stripe_head *h;
	struct open_bucket *ob;
	unsigned i, ec_idx;
	int ret = 0;

	if (nr_replicas < 2)
		return 0;

	if (ec_open_bucket(c, ptrs))
		return 0;

	h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
	if (IS_ERR(h))
		return PTR_ERR(h);
	if (!h)
		return 0;

	devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);

	for (i = 0; i < devs_sorted.nr; i++)
		for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
			if (!h->s->blocks[ec_idx])
				continue;

			ob = c->open_buckets + h->s->blocks[ec_idx];
			if (ob->dev == devs_sorted.devs[i] &&
			    !test_and_set_bit(ec_idx, h->s->blocks_allocated))
				goto got_bucket;
		}
	goto out_put_head;
got_bucket:
	ob->ec_idx = ec_idx;
	ob->ec = h->s;
	ec_stripe_new_get(h->s, STRIPE_REF_io);

	ret = add_new_bucket(c, ptrs, devs_may_alloc,
			     nr_replicas, nr_effective,
			     have_cache, flags, ob);
out_put_head:
	bch2_ec_stripe_head_put(c, h);
	return ret;
}

/* Sector allocator */

static bool want_bucket(struct bch_fs *c,
			struct write_point *wp,
			struct bch_devs_mask *devs_may_alloc,
			bool *have_cache, bool ec,
			struct open_bucket *ob)
{
	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);

	if (!test_bit(ob->dev, devs_may_alloc->d))
		return false;

	if (ob->data_type != wp->data_type)
		return false;

	if (!ca->mi.durability &&
	    (wp->data_type == BCH_DATA_btree || ec || *have_cache))
		return false;

	if (ec != (ob->ec != NULL))
		return false;

	return true;
}

static int bucket_alloc_set_writepoint(struct bch_fs *c,
				       struct open_buckets *ptrs,
				       struct write_point *wp,
				       struct bch_devs_mask *devs_may_alloc,
				       unsigned nr_replicas,
				       unsigned *nr_effective,
				       bool *have_cache,
				       bool ec, unsigned flags)
{
	struct open_buckets ptrs_skip = { .nr = 0 };
	struct open_bucket *ob;
	unsigned i;
	int ret = 0;

	open_bucket_for_each(c, &wp->ptrs, ob, i) {
		if (!ret && want_bucket(c, wp, devs_may_alloc,
					have_cache, ec, ob))
			ret = add_new_bucket(c, ptrs, devs_may_alloc,
					     nr_replicas, nr_effective,
					     have_cache, flags, ob);
		else
			ob_push(c, &ptrs_skip, ob);
	}
	wp->ptrs = ptrs_skip;

	return ret;
}

static int bucket_alloc_set_partial(struct bch_fs *c,
				    struct open_buckets *ptrs,
				    struct write_point *wp,
				    struct bch_devs_mask *devs_may_alloc,
				    unsigned nr_replicas,
				    unsigned *nr_effective,
				    bool *have_cache, bool ec,
				    enum bch_watermark watermark,
				    unsigned flags)
{
	int i, ret = 0;

	if (!c->open_buckets_partial_nr)
		return 0;

	spin_lock(&c->freelist_lock);

	if (!c->open_buckets_partial_nr)
		goto unlock;

	for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
		struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];

		if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
			struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
			struct bch_dev_usage usage;
			u64 avail;

			bch2_dev_usage_read_fast(ca, &usage);
			avail = dev_buckets_free(ca, usage, watermark);
			if (!avail)
				continue;

			array_remove_item(c->open_buckets_partial,
					  c->open_buckets_partial_nr,
					  i);
			ob->on_partial_list = false;

			ret = add_new_bucket(c, ptrs, devs_may_alloc,
					     nr_replicas, nr_effective,
					     have_cache, flags, ob);
			if (ret)
				break;
		}
	}
unlock:
	spin_unlock(&c->freelist_lock);
	return ret;
}

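/*
 * __open_bucket_add_buckets(): add buckets to @ptrs until we have enough
 * replicas, preferring (in order) buckets already on the write point, then
 * partially used open_buckets, then - for erasure coded allocations - a bucket
 * from a stripe, and finally fresh buckets from the allowed devices.
 */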
static int __open_bucket_add_buckets(struct btree_trans *trans,
				     struct open_buckets *ptrs,
				     struct write_point *wp,
				     struct bch_devs_list *devs_have,
				     u16 target,
				     bool erasure_code,
				     unsigned nr_replicas,
				     unsigned *nr_effective,
				     bool *have_cache,
				     enum bch_watermark watermark,
				     unsigned flags,
				     struct closure *_cl)
{
	struct bch_fs *c = trans->c;
	struct bch_devs_mask devs;
	struct open_bucket *ob;
	struct closure *cl = NULL;
	unsigned i;
	int ret;

	devs = target_rw_devs(c, wp->data_type, target);

	/* Don't allocate from devices we already have pointers to: */
	darray_for_each(*devs_have, i)
		__clear_bit(*i, devs.d);

	open_bucket_for_each(c, ptrs, ob, i)
		__clear_bit(ob->dev, devs.d);

	if (erasure_code && ec_open_bucket(c, ptrs))
		return 0;

	ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
					  nr_replicas, nr_effective,
					  have_cache, erasure_code, flags);
	if (ret)
		return ret;

	ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
				       nr_replicas, nr_effective,
				       have_cache, erasure_code, watermark, flags);
	if (ret)
		return ret;

	if (erasure_code) {
		ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
					       target,
					       nr_replicas, nr_effective,
					       have_cache,
					       watermark, flags, _cl);
	} else {
retry_blocking:
		/*
		 * Try nonblocking first, so that if one device is full we'll try from
		 * other devices:
		 */
		ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
						  nr_replicas, nr_effective, have_cache,
						  flags, wp->data_type, watermark, cl);
		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
		    !cl && _cl) {
			cl = _cl;
			goto retry_blocking;
		}
	}

	return ret;
}

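/*
 * open_bucket_add_buckets(): if erasure coding was requested, try an erasure
 * coded allocation first; unless that attempt blocked or needs a transaction
 * restart, fall back to filling out the remaining replicas with ordinary
 * buckets.
 */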
static int open_bucket_add_buckets(struct btree_trans *trans,
				   struct open_buckets *ptrs,
				   struct write_point *wp,
				   struct bch_devs_list *devs_have,
				   u16 target,
				   unsigned erasure_code,
				   unsigned nr_replicas,
				   unsigned *nr_effective,
				   bool *have_cache,
				   enum bch_watermark watermark,
				   unsigned flags,
				   struct closure *cl)
{
	int ret;

	if (erasure_code) {
		ret = __open_bucket_add_buckets(trans, ptrs, wp,
				devs_have, target, erasure_code,
				nr_replicas, nr_effective, have_cache,
				watermark, flags, cl);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
		    bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
		    bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
		    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
			return ret;
		if (*nr_effective >= nr_replicas)
			return 0;
	}

	ret = __open_bucket_add_buckets(trans, ptrs, wp,
			devs_have, target, false,
			nr_replicas, nr_effective, have_cache,
			watermark, flags, cl);
	return ret < 0 ? ret : 0;
}

/**
 * should_drop_bucket - check if this open_bucket should go away
 * @ob:		open_bucket to predicate on
 * @c:		filesystem handle
 * @ca:		if set, we're killing buckets for a particular device
 * @ec:		if true, we're shutting down erasure coding and killing all ec
 *		open_buckets
 * Returns:	true if we should kill this open_bucket
 *
 * We're killing open_buckets because we're shutting down a device, erasure
 * coding, or the entire filesystem - check if this open_bucket matches. If
 * neither @ca nor @ec is given, every open_bucket matches.
 */
static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
			       struct bch_dev *ca, bool ec)
{
	if (ec) {
		return ob->ec != NULL;
	} else if (ca) {
		bool drop = ob->dev == ca->dev_idx;
		struct open_bucket *ob2;
		unsigned i;

		if (!drop && ob->ec) {
			unsigned nr_blocks;

			mutex_lock(&ob->ec->lock);
			nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;

			for (i = 0; i < nr_blocks; i++) {
				if (!ob->ec->blocks[i])
					continue;

				ob2 = c->open_buckets + ob->ec->blocks[i];
				drop |= ob2->dev == ca->dev_idx;
			}
			mutex_unlock(&ob->ec->lock);
		}

		return drop;
	} else {
		return true;
	}
}

static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
				 bool ec, struct write_point *wp)
{
	struct open_buckets ptrs = { .nr = 0 };
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&wp->lock);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		if (should_drop_bucket(ob, c, ca, ec))
			bch2_open_bucket_put(c, ob);
		else
			ob_push(c, &ptrs, ob);
	wp->ptrs = ptrs;
	mutex_unlock(&wp->lock);
}

void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
			    bool ec)
{
	unsigned i;

	/* Next, close write points that point to this device... */
	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
		bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);

	bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
	bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
	bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);

	mutex_lock(&c->btree_reserve_cache_lock);
	while (c->btree_reserve_cache_nr) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		bch2_open_buckets_put(c, &a->ob);
	}
	mutex_unlock(&c->btree_reserve_cache_lock);

	spin_lock(&c->freelist_lock);
	i = 0;
	while (i < c->open_buckets_partial_nr) {
		struct open_bucket *ob =
			c->open_buckets + c->open_buckets_partial[i];

		if (should_drop_bucket(ob, c, ca, ec)) {
			--c->open_buckets_partial_nr;
			swap(c->open_buckets_partial[i],
			     c->open_buckets_partial[c->open_buckets_partial_nr]);
			ob->on_partial_list = false;
			spin_unlock(&c->freelist_lock);
			bch2_open_bucket_put(c, ob);
			spin_lock(&c->freelist_lock);
		} else {
			i++;
		}
	}
	spin_unlock(&c->freelist_lock);

	bch2_ec_stop_dev(c, ca);
}

static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
						 unsigned long write_point)
{
	unsigned hash =
		hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));

	return &c->write_points_hash[hash];
}

static struct write_point *__writepoint_find(struct hlist_head *head,
					     unsigned long write_point)
{
	struct write_point *wp;

	rcu_read_lock();
	hlist_for_each_entry_rcu(wp, head, node)
		if (wp->write_point == write_point)
			goto out;
	wp = NULL;
out:
	rcu_read_unlock();
	return wp;
}

static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
{
	u64 stranded = c->write_points_nr * c->bucket_size_max;
	u64 free = bch2_fs_usage_read_short(c).free;

	return stranded * factor > free;
}

static bool try_increase_writepoints(struct bch_fs *c)
{
	struct write_point *wp;

	if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
	    too_many_writepoints(c, 32))
		return false;

	wp = c->write_points + c->write_points_nr++;
	hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
	return true;
}

static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp;
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&c->write_points_hash_lock);
	if (c->write_points_nr < old_nr) {
		mutex_unlock(&c->write_points_hash_lock);
		return true;
	}

	if (c->write_points_nr == 1 ||
	    !too_many_writepoints(c, 8)) {
		mutex_unlock(&c->write_points_hash_lock);
		return false;
	}

	wp = c->write_points + --c->write_points_nr;

	hlist_del_rcu(&wp->node);
	mutex_unlock(&c->write_points_hash_lock);

	bch2_trans_mutex_lock_norelock(trans, &wp->lock);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		open_bucket_free_unused(c, ob);
	wp->ptrs.nr = 0;
	mutex_unlock(&wp->lock);
	return true;
}

static struct write_point *writepoint_find(struct btree_trans *trans,
					   unsigned long write_point)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp, *oldest;
	struct hlist_head *head;

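	/*
	 * A write_point specifier with the low bit clear is a direct pointer
	 * to a struct write_point; otherwise it's a hashed identifier that we
	 * look up in write_points_hash, stealing the least recently used write
	 * point (and possibly growing or shrinking the set) on a miss.
	 */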
	if (!(write_point & 1UL)) {
		wp = (struct write_point *) write_point;
		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
		return wp;
	}

	head = writepoint_hash(c, write_point);
restart_find:
	wp = __writepoint_find(head, write_point);
	if (wp) {
lock_wp:
		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
		if (wp->write_point == write_point)
			goto out;
		mutex_unlock(&wp->lock);
		goto restart_find;
	}
restart_find_oldest:
	oldest = NULL;
	for (wp = c->write_points;
	     wp < c->write_points + c->write_points_nr; wp++)
		if (!oldest || time_before64(wp->last_used, oldest->last_used))
			oldest = wp;

	bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
	bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
	if (oldest >= c->write_points + c->write_points_nr ||
	    try_increase_writepoints(c)) {
		mutex_unlock(&c->write_points_hash_lock);
		mutex_unlock(&oldest->lock);
		goto restart_find_oldest;
	}

	wp = __writepoint_find(head, write_point);
	if (wp && wp != oldest) {
		mutex_unlock(&c->write_points_hash_lock);
		mutex_unlock(&oldest->lock);
		goto lock_wp;
	}

	wp = oldest;
	hlist_del_rcu(&wp->node);
	wp->write_point = write_point;
	hlist_add_head_rcu(&wp->node, head);
	mutex_unlock(&c->write_points_hash_lock);
out:
	wp->last_used = local_clock();
	return wp;
}

static noinline void
deallocate_extra_replicas(struct bch_fs *c,
			  struct open_buckets *ptrs,
			  struct open_buckets *ptrs_no_use,
			  unsigned extra_replicas)
{
	struct open_buckets ptrs2 = { 0 };
	struct open_bucket *ob;
	unsigned i;

	open_bucket_for_each(c, ptrs, ob, i) {
		unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;

		if (d && d <= extra_replicas) {
			extra_replicas -= d;
			ob_push(c, ptrs_no_use, ob);
		} else {
			ob_push(c, &ptrs2, ob);
		}
	}

	*ptrs = ptrs2;
}

/*
 * Get us an open_bucket we can allocate from, return with it locked:
 */
int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
				   unsigned target,
				   unsigned erasure_code,
				   struct write_point_specifier write_point,
				   struct bch_devs_list *devs_have,
				   unsigned nr_replicas,
				   unsigned nr_replicas_required,
				   enum bch_watermark watermark,
				   unsigned flags,
				   struct closure *cl,
				   struct write_point **wp_ret)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp;
	struct open_bucket *ob;
	struct open_buckets ptrs;
	unsigned nr_effective, write_points_nr;
	bool have_cache;
	int ret;
	int i;

	if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
		erasure_code = false;

	BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);

	BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
	ptrs.nr = 0;
	nr_effective = 0;
	write_points_nr = c->write_points_nr;
	have_cache = false;

	*wp_ret = wp = writepoint_find(trans, write_point.v);

	/* metadata may not allocate on cache devices: */
	if (wp->data_type != BCH_DATA_user)
		have_cache = true;

	if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      target, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, NULL);
		if (!ret ||
		    bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto alloc_done;

		/* Don't retry from all devices if we're out of open buckets: */
		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
			int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      target, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, cl);
			if (!ret ||
			    bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
			    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
				goto alloc_done;
		}

		/*
		 * Only try to allocate cache (durability = 0 devices) from the
		 * specified target:
		 */
		have_cache = true;

		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      0, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, cl);
	} else {
		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      target, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, cl);
	}
alloc_done:
	BUG_ON(!ret && nr_effective < nr_replicas);

	if (erasure_code && !ec_open_bucket(c, &ptrs))
		pr_debug("failed to get ec bucket: ret %u", ret);

	if (ret == -BCH_ERR_insufficient_devices &&
	    nr_effective >= nr_replicas_required)
		ret = 0;

	if (ret)
		goto err;

	if (nr_effective > nr_replicas)
		deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);

	/* Free buckets we didn't use: */
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		open_bucket_free_unused(c, ob);

	wp->ptrs = ptrs;

	wp->sectors_free = UINT_MAX;

	open_bucket_for_each(c, &wp->ptrs, ob, i)
		wp->sectors_free = min(wp->sectors_free, ob->sectors_free);

	BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);

	return 0;
err:
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		if (ptrs.nr < ARRAY_SIZE(ptrs.v))
			ob_push(c, &ptrs, ob);
		else
			open_bucket_free_unused(c, ob);
	wp->ptrs = ptrs;

	mutex_unlock(&wp->lock);

	if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
	    try_decrease_writepoints(trans, write_points_nr))
		goto retry;

	if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) ||
	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
		return cl
			? -BCH_ERR_bucket_alloc_blocked
			: -BCH_ERR_ENOSPC_bucket_alloc;

	return ret;
}

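/*
 * bch2_ob_ptr(): construct the extent pointer for the next allocation from
 * @ob - the offset points at the first not-yet-allocated sector of the bucket.
 */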
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);

	return (struct bch_extent_ptr) {
		.type = 1 << BCH_EXTENT_ENTRY_ptr,
		.gen = ob->gen,
		.dev = ob->dev,
		.offset = bucket_to_sector(ca, ob->bucket) +
			ca->mi.bucket_size -
			ob->sectors_free,
	};
}

/*
 * Append pointers to the space we just allocated to @k, and mark @sectors space
 * as allocated out of @ob
 */
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
				    struct bkey_i *k, unsigned sectors,
				    bool cached)
{
	bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
}

void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
{
	bch2_alloc_sectors_done_inlined(c, wp);
}

static inline void writepoint_init(struct write_point *wp,
				   enum bch_data_type type)
{
	mutex_init(&wp->lock);
	wp->data_type = type;

	INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
	INIT_LIST_HEAD(&wp->writes);
	spin_lock_init(&wp->writes_lock);
}

void bch2_fs_allocator_foreground_init(struct bch_fs *c)
{
	struct open_bucket *ob;
	struct write_point *wp;

	mutex_init(&c->write_points_hash_lock);
	c->write_points_nr = ARRAY_SIZE(c->write_points);

	/* open bucket 0 is a sentinel NULL: */
	spin_lock_init(&c->open_buckets[0].lock);

	for (ob = c->open_buckets + 1;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
		spin_lock_init(&ob->lock);
		c->open_buckets_nr_free++;

		ob->freelist = c->open_buckets_freelist;
		c->open_buckets_freelist = ob - c->open_buckets;
	}

	writepoint_init(&c->btree_write_point, BCH_DATA_btree);
	writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
	writepoint_init(&c->copygc_write_point, BCH_DATA_user);

	for (wp = c->write_points;
	     wp < c->write_points + c->write_points_nr; wp++) {
		writepoint_init(wp, BCH_DATA_user);

		wp->last_used = local_clock();
		wp->write_point = (unsigned long) wp;
		hlist_add_head_rcu(&wp->node,
				   writepoint_hash(c, wp->write_point));
	}
}

static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
	unsigned data_type = ob->data_type;
	barrier(); /* READ_ONCE() doesn't work on bitfields */

	prt_printf(out, "%zu ref %u ",
		   ob - c->open_buckets,
		   atomic_read(&ob->pin));
	bch2_prt_data_type(out, data_type);
	prt_printf(out, " %u:%llu gen %u allocated %u/%u",
		   ob->dev, ob->bucket, ob->gen,
		   ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
	if (ob->ec)
		prt_printf(out, " ec idx %llu", ob->ec->idx);
	if (ob->on_partial_list)
		prt_str(out, " partial");
	prt_newline(out);
}

void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct open_bucket *ob;

	out->atomic++;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && !ob->on_partial_list)
			bch2_open_bucket_to_text(out, c, ob);
		spin_unlock(&ob->lock);
	}

	--out->atomic;
}

void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
{
	unsigned i;

	out->atomic++;
	spin_lock(&c->freelist_lock);

	for (i = 0; i < c->open_buckets_partial_nr; i++)
		bch2_open_bucket_to_text(out, c,
				c->open_buckets + c->open_buckets_partial[i]);

	spin_unlock(&c->freelist_lock);
	--out->atomic;
}

static const char * const bch2_write_point_states[] = {
#define x(n) #n,
	WRITE_POINT_STATES()
#undef x
	NULL
};

static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
				     struct write_point *wp)
{
	struct open_bucket *ob;
	unsigned i;

	prt_printf(out, "%lu: ", wp->write_point);
	prt_human_readable_u64(out, wp->sectors_allocated);

	prt_printf(out, " last wrote: ");
	bch2_pr_time_units(out, sched_clock() - wp->last_used);

	for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
		prt_printf(out, " %s: ", bch2_write_point_states[i]);
		bch2_pr_time_units(out, wp->time[i]);
	}

	prt_newline(out);

	printbuf_indent_add(out, 2);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		bch2_open_bucket_to_text(out, c, ob);
	printbuf_indent_sub(out, 2);
}

void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct write_point *wp;

	prt_str(out, "Foreground write points\n");
	for (wp = c->write_points;
	     wp < c->write_points + ARRAY_SIZE(c->write_points);
	     wp++)
		bch2_write_point_to_text(out, c, wp);

	prt_str(out, "Copygc write point\n");
	bch2_write_point_to_text(out, c, &c->copygc_write_point);

	prt_str(out, "Rebalance write point\n");
	bch2_write_point_to_text(out, c, &c->rebalance_write_point);

	prt_str(out, "Btree write point\n");
	bch2_write_point_to_text(out, c, &c->btree_write_point);
}