// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2012 Google, Inc.
 *
 * Foreground allocator code: allocate buckets from freelist, and allocate in
 * sector granularity from writepoints.
 *
 * bch2_bucket_alloc() allocates a single bucket from a specific device.
 *
 * bch2_bucket_alloc_set() allocates one or more buckets from different devices
 * in a given filesystem.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_gc.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_write.h"
#include "journal.h"
#include "movinggc.h"
#include "nocow_locking.h"
#include "trace.h"

#include <linux/math64.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>

static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
					   struct mutex *lock)
{
	if (!mutex_trylock(lock)) {
		bch2_trans_unlock(trans);
		mutex_lock(lock);
	}
}

const char * const bch2_watermarks[] = {
#define x(t) #t,
	BCH_WATERMARKS()
#undef x
	NULL
};

/*
 * Open buckets represent a bucket that's currently being allocated from. They
 * serve two purposes:
 *
 * - They track buckets that have been partially allocated, allowing for
 *   sub-bucket sized allocations - they're used by the sector allocator below
 *
 * - They provide a reference to the buckets they own that mark and sweep GC
 *   can find, until the new allocation has a pointer to it inserted into the
 *   btree
 *
 * When allocating some space with the sector allocator, the allocation comes
 * with a reference to an open bucket - the caller is required to put that
 * reference _after_ doing the index update that makes its allocation reachable.
 */
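
/*
 * Illustrative sketch of the reference discipline described above (not taken
 * from a real caller; error handling and the actual write are elided):
 *
 *	struct open_bucket *ob = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal,
 *						   BCH_DATA_user, cl);
 *	if (!IS_ERR(ob)) {
 *		... write to the bucket, then do the btree index update that
 *		    makes the new allocation reachable ...
 *		bch2_open_bucket_put(c, ob);
 *	}
 */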

void bch2_reset_alloc_cursors(struct bch_fs *c)
{
	rcu_read_lock();
	for_each_member_device_rcu(c, ca, NULL)
		memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
	rcu_read_unlock();
}

static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
{
	open_bucket_idx_t idx = ob - c->open_buckets;
	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);

	ob->hash = *slot;
	*slot = idx;
}

static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
{
	open_bucket_idx_t idx = ob - c->open_buckets;
	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);

	while (*slot != idx) {
		BUG_ON(!*slot);
		slot = &c->open_buckets[*slot].hash;
	}

	*slot = ob->hash;
	ob->hash = 0;
}

void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = ob_dev(c, ob);

	if (ob->ec) {
		ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
		return;
	}

	spin_lock(&ob->lock);
	ob->valid = false;
	ob->data_type = 0;
	spin_unlock(&ob->lock);

	spin_lock(&c->freelist_lock);
	bch2_open_bucket_hash_remove(c, ob);

	ob->freelist = c->open_buckets_freelist;
	c->open_buckets_freelist = ob - c->open_buckets;

	c->open_buckets_nr_free++;
	ca->nr_open_buckets--;
	spin_unlock(&c->freelist_lock);

	closure_wake_up(&c->open_buckets_wait);
}

void bch2_open_bucket_write_error(struct bch_fs *c,
				  struct open_buckets *obs,
				  unsigned dev)
{
	struct open_bucket *ob;
	unsigned i;

	open_bucket_for_each(c, obs, ob, i)
		if (ob->dev == dev && ob->ec)
			bch2_ec_bucket_cancel(c, ob);
}

static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
{
	struct open_bucket *ob;

	BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);

	ob = c->open_buckets + c->open_buckets_freelist;
	c->open_buckets_freelist = ob->freelist;
	atomic_set(&ob->pin, 1);
	ob->data_type = 0;

	c->open_buckets_nr_free--;
	return ob;
}

static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{
	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs)
		return false;

	return bch2_is_superblock_bucket(ca, b);
}

static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
{
	BUG_ON(c->open_buckets_partial_nr >=
	       ARRAY_SIZE(c->open_buckets_partial));

	spin_lock(&c->freelist_lock);
	rcu_read_lock();
	bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
	rcu_read_unlock();

	ob->on_partial_list = true;
	c->open_buckets_partial[c->open_buckets_partial_nr++] =
		ob - c->open_buckets;
	spin_unlock(&c->freelist_lock);

	closure_wake_up(&c->open_buckets_wait);
	closure_wake_up(&c->freelist_wait);
}

static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
{
	switch (watermark) {
	case BCH_WATERMARK_interior_updates:
		return 0;
	case BCH_WATERMARK_reclaim:
		return OPEN_BUCKETS_COUNT / 6;
	case BCH_WATERMARK_btree:
	case BCH_WATERMARK_btree_copygc:
		return OPEN_BUCKETS_COUNT / 4;
	case BCH_WATERMARK_copygc:
		return OPEN_BUCKETS_COUNT / 3;
	default:
		return OPEN_BUCKETS_COUNT / 2;
	}
}
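
/*
 * Rough illustration of the reserve tiers above, assuming (hypothetically)
 * that OPEN_BUCKETS_COUNT is 1024: interior updates may use every open
 * bucket, journal reclaim may not dip into the last ~170, btree allocations
 * stop with ~256 remaining, copygc with ~341, and normal user writes with
 * 512 - so the mechanisms that free up space can always make forward
 * progress.
 */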

static inline bool may_alloc_bucket(struct bch_fs *c,
				    struct bpos bucket,
				    struct bucket_alloc_state *s)
{
	if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) {
		s->skipped_open++;
		return false;
	}

	u64 journal_seq_ready =
		bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
					      bucket.inode, bucket.offset);
	if (journal_seq_ready > c->journal.flushed_seq_ondisk) {
		if (journal_seq_ready > c->journal.flushing_seq)
			s->need_journal_commit++;
		s->skipped_need_journal_commit++;
		return false;
	}

	if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) {
		s->skipped_nocow++;
		return false;
	}

	return true;
}

static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
					      u64 bucket, u8 gen,
					      enum bch_watermark watermark,
					      struct bucket_alloc_state *s,
					      struct closure *cl)
{
	if (unlikely(is_superblock_bucket(c, ca, bucket)))
		return NULL;

	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
		s->skipped_nouse++;
		return NULL;
	}

	spin_lock(&c->freelist_lock);

	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) {
		if (cl)
			closure_wait(&c->open_buckets_wait, cl);

		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
		spin_unlock(&c->freelist_lock);
		return ERR_PTR(-BCH_ERR_open_buckets_empty);
	}

	/* Recheck under lock: */
	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
		spin_unlock(&c->freelist_lock);
		s->skipped_open++;
		return NULL;
	}

	struct open_bucket *ob = bch2_open_bucket_alloc(c);

	spin_lock(&ob->lock);
	ob->valid = true;
	ob->sectors_free = ca->mi.bucket_size;
	ob->dev = ca->dev_idx;
	ob->gen = gen;
	ob->bucket = bucket;
	spin_unlock(&ob->lock);

	ca->nr_open_buckets++;
	bch2_open_bucket_hash_add(c, ob);

	track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
	track_event_change(&c->times[BCH_TIME_blocked_allocate], false);

	spin_unlock(&c->freelist_lock);
	return ob;
}
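
/*
 * Informational note (see alloc_background.h for the authoritative encoding):
 * in freespace btree keys the bucket number lives in the low 56 bits of the
 * key's offset, with the high bits carrying additional per-bucket state -
 * which is why try_alloc_bucket() and bch2_bucket_alloc_freelist() below mask
 * with ~(~0ULL << 56) before treating the offset as a bucket index.
 */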
static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
					    enum bch_watermark watermark,
					    struct bucket_alloc_state *s,
					    struct btree_iter *freespace_iter,
					    struct closure *cl)
{
	struct bch_fs *c = trans->c;
	u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);

	if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s))
		return NULL;

	u8 gen;
	int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret)
		return NULL;

	return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl);
}

/*
 * This path is for before the freespace btree is initialized:
 */
static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,
			struct bch_dev *ca,
			enum bch_watermark watermark,
			struct bucket_alloc_state *s,
			struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter, citer;
	struct bkey_s_c k, ck;
	struct open_bucket *ob = NULL;
	u64 first_bucket = ca->mi.first_bucket;
	u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
	u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
	u64 alloc_cursor = alloc_start;
	int ret;

	/*
	 * Scan with an uncached iterator to avoid polluting the key cache. An
	 * uncached iter will return a cached key if one exists, but if not
	 * there is no other underlying protection for the associated key cache
	 * slot. To avoid racing bucket allocations, look up the cached key slot
	 * of any likely allocation candidate before attempting to proceed with
	 * the allocation. This provides proper exclusion on the associated
	 * bucket.
	 */
again:
	for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
				     BTREE_ITER_slots, k, ret) {
		u64 bucket = k.k->p.offset;

		if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
			break;

		if (s->btree_bitmap != BTREE_BITMAP_ANY &&
		    s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
				bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
			if (s->btree_bitmap == BTREE_BITMAP_YES &&
			    bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
				break;

			bucket = sector_to_bucket(ca,
					round_up(bucket_to_sector(ca, bucket) + 1,
						 1ULL << ca->mi.btree_bitmap_shift));
			bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket));
			s->buckets_seen++;
			s->skipped_mi_btree_bitmap++;
			continue;
		}

		struct bch_alloc_v4 a_convert;
		const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
		if (a->data_type != BCH_DATA_free)
			continue;

		/* now check the cached key to serialize concurrent allocs of the bucket */
		ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached);
		ret = bkey_err(ck);
		if (ret)
			break;

		a = bch2_alloc_to_v4(ck, &a_convert);
		if (a->data_type != BCH_DATA_free)
			goto next;

		s->buckets_seen++;

		ob = may_alloc_bucket(c, k.k->p, s)
			? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen,
					     watermark, s, cl)
			: NULL;
next:
		bch2_set_btree_iter_dontneed(&citer);
		bch2_trans_iter_exit(trans, &citer);
		if (ob)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	alloc_cursor = iter.pos.offset;

	if (!ob && ret)
		ob = ERR_PTR(ret);

	if (!ob && alloc_start > first_bucket) {
		alloc_cursor = alloc_start = first_bucket;
		goto again;
	}

	*dev_alloc_cursor = alloc_cursor;

	return ob;
}
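
/*
 * The btree_bitmap skip logic above (and in bch2_bucket_alloc_freelist()
 * below) advances the scan in units of the device's btree bitmap granularity
 * rather than bucket by bucket. Purely as an illustration, with a
 * hypothetical btree_bitmap_shift of 16 each bitmap region covers
 * 1 << 16 = 65536 sectors; when a bucket's region doesn't match the wanted
 * BTREE_BITMAP_* class, the cursor is rounded up to the start of the next
 * 65536-sector region instead of testing every bucket in between.
 */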

static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
						      struct bch_dev *ca,
						      enum bch_watermark watermark,
						      struct bucket_alloc_state *s,
						      struct closure *cl)
{
	struct btree_iter iter;
	struct bkey_s_c k;
	struct open_bucket *ob = NULL;
	u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap];
	u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
	u64 alloc_cursor = alloc_start;
	int ret;
again:
	for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace,
					 POS(ca->dev_idx, alloc_cursor),
					 POS(ca->dev_idx, U64_MAX),
					 0, k, ret) {
		/*
		 * peek normally doesn't trim extents - they can span iter.pos,
		 * which is not what we want here:
		 */
		iter.k.size = iter.k.p.offset - iter.pos.offset;

		while (iter.k.size) {
			s->buckets_seen++;

			u64 bucket = iter.pos.offset & ~(~0ULL << 56);
			if (s->btree_bitmap != BTREE_BITMAP_ANY &&
			    s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
					bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
				if (s->btree_bitmap == BTREE_BITMAP_YES &&
				    bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
					goto fail;

				bucket = sector_to_bucket(ca,
						round_up(bucket_to_sector(ca, bucket + 1),
							 1ULL << ca->mi.btree_bitmap_shift));
				alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));

				bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor));
				s->skipped_mi_btree_bitmap++;
				goto next;
			}

			ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl);
			if (ob) {
				if (!IS_ERR(ob))
					*dev_alloc_cursor = iter.pos.offset;
				bch2_set_btree_iter_dontneed(&iter);
				break;
			}

			iter.k.size--;
			iter.pos.offset++;
		}
next:
		if (ob || ret)
			break;
	}
fail:
	bch2_trans_iter_exit(trans, &iter);

	BUG_ON(ob && ret);

	if (ret)
		ob = ERR_PTR(ret);

	if (!ob && alloc_start > ca->mi.first_bucket) {
		alloc_cursor = alloc_start = ca->mi.first_bucket;
		goto again;
	}

	return ob;
}
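
/*
 * Both allocators above remember where they stopped in ca->alloc_cursor[]
 * (one cursor per btree-bitmap class), so the next allocation resumes the
 * scan instead of rescanning from the start; a scan that began past the
 * first bucket and found nothing wraps around and retries once from
 * ca->mi.first_bucket. bch2_reset_alloc_cursors() clears the cursors.
 */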

static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca,
					 enum bch_watermark watermark,
					 enum bch_data_type data_type,
					 struct closure *cl,
					 struct bch_dev_usage *usage,
					 struct bucket_alloc_state *s,
					 struct open_bucket *ob)
{
	struct printbuf buf = PRINTBUF;

	printbuf_tabstop_push(&buf, 24);

	prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx);
	prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]);
	prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]);
	prt_printf(&buf, "blocking\t%u\n", cl != NULL);
	prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets);
	prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark));
	prt_printf(&buf, "copygc_wait\t%lu/%lli\n",
		   bch2_copygc_wait_amount(c),
		   c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
	prt_printf(&buf, "seen\t%llu\n", s->buckets_seen);
	prt_printf(&buf, "open\t%llu\n", s->skipped_open);
	prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit);
	prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow);
	prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse);
	prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap);

	if (!IS_ERR(ob)) {
		prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
		trace_bucket_alloc(c, buf.buf);
	} else {
		prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob)));
		trace_bucket_alloc_fail(c, buf.buf);
	}

	printbuf_exit(&buf);
}

/**
 * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
 * @trans:	transaction object
 * @ca:		device to allocate from
 * @watermark:	how important is this allocation?
 * @data_type:	BCH_DATA_journal, btree, user...
 * @cl:		if not NULL, closure to be used to wait if buckets not available
 * @nowait:	if true, do not wait for buckets to become available
 * @usage:	for secondarily also returning the current device usage
 *
 * Returns:	an open_bucket on success, or an ERR_PTR() on failure.
 */
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
						   struct bch_dev *ca,
						   enum bch_watermark watermark,
						   enum bch_data_type data_type,
						   struct closure *cl,
						   bool nowait,
						   struct bch_dev_usage *usage)
{
	struct bch_fs *c = trans->c;
	struct open_bucket *ob = NULL;
	bool freespace = READ_ONCE(ca->mi.freespace_initialized);
	u64 avail;
	struct bucket_alloc_state s = {
		.btree_bitmap = data_type == BCH_DATA_btree,
	};
	bool waiting = nowait;
again:
	bch2_dev_usage_read_fast(ca, usage);
	avail = dev_buckets_free(ca, *usage, watermark);

	if (usage->d[BCH_DATA_need_discard].buckets > avail)
		bch2_dev_do_discards(ca);

	if (usage->d[BCH_DATA_need_gc_gens].buckets > avail)
		bch2_gc_gens_async(c);

	if (should_invalidate_buckets(ca, *usage))
		bch2_dev_do_invalidates(ca);

	if (!avail) {
		if (watermark > BCH_WATERMARK_normal &&
		    c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations)
			goto alloc;

		if (cl && !waiting) {
			closure_wait(&c->freelist_wait, cl);
			waiting = true;
			goto again;
		}

		track_event_change(&c->times[BCH_TIME_blocked_allocate], true);

		ob = ERR_PTR(-BCH_ERR_freelist_empty);
		goto err;
	}

	if (waiting)
		closure_wake_up(&c->freelist_wait);
alloc:
	ob = likely(freespace)
		? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl)
		: bch2_bucket_alloc_early(trans, ca, watermark, &s, cl);

	if (s.need_journal_commit * 2 > avail)
		bch2_journal_flush_async(&c->journal, NULL);

	if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) {
		s.btree_bitmap = BTREE_BITMAP_ANY;
		goto alloc;
	}

	if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) {
		freespace = false;
		goto alloc;
	}
err:
	if (!ob)
		ob = ERR_PTR(-BCH_ERR_no_buckets_found);

	if (!IS_ERR(ob))
		ob->data_type = data_type;

	if (!IS_ERR(ob))
		count_event(c, bucket_alloc);
	else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
		count_event(c, bucket_alloc_fail);

	if (!IS_ERR(ob)
	    ? trace_bucket_alloc_enabled()
	    : trace_bucket_alloc_fail_enabled())
		trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob);

	return ob;
}

struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
				      enum bch_watermark watermark,
				      enum bch_data_type data_type,
				      struct closure *cl)
{
	struct bch_dev_usage usage;
	struct open_bucket *ob;

	bch2_trans_do(c,
		      PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
								   data_type, cl, false, &usage)));
	return ob;
}
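
/*
 * How the dev_stripe_state balancing below works, as a rough worked example
 * (numbers illustrative only): every time a bucket is allocated from a
 * device, free_space_inv = 2^48 / free_space is added to its entry in
 * stripe->next_alloc, and bch2_dev_alloc_list() sorts devices by next_alloc,
 * lowest first. A device with twice the free space of another accumulates
 * half as much per allocation, so over time it receives roughly twice the
 * allocations. The final loop in bch2_dev_stripe_increment_inlined()
 * subtracts a quarter of the chosen device's counter from every device,
 * which keeps the counters from growing without bound.
 */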
static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
			    unsigned l, unsigned r)
{
	return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
		(stripe->next_alloc[l] < stripe->next_alloc[r]));
}

#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)

struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
					  struct dev_stripe_state *stripe,
					  struct bch_devs_mask *devs)
{
	struct dev_alloc_list ret = { .nr = 0 };
	unsigned i;

	for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
		ret.data[ret.nr++] = i;

	bubble_sort(ret.data, ret.nr, dev_stripe_cmp);
	return ret;
}

static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
						     struct dev_stripe_state *stripe,
						     struct bch_dev_usage *usage)
{
	u64 *v = stripe->next_alloc + ca->dev_idx;
	u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal);
	u64 free_space_inv = free_space
		? div64_u64(1ULL << 48, free_space)
		: 1ULL << 48;
	u64 scale = *v / 4;

	if (*v + free_space_inv >= *v)
		*v += free_space_inv;
	else
		*v = U64_MAX;

	for (v = stripe->next_alloc;
	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
		*v = *v < scale ? 0 : *v - scale;
}

void bch2_dev_stripe_increment(struct bch_dev *ca,
			       struct dev_stripe_state *stripe)
{
	struct bch_dev_usage usage;

	bch2_dev_usage_read_fast(ca, &usage);
	bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
}
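
/*
 * add_new_bucket() below counts replicas by durability: a durability-2
 * device satisfies two replicas with a single bucket, and a durability-0
 * (cache) device satisfies none but sets *have_cache. As an illustrative
 * example, a request for nr_replicas = 2 can be satisfied either by two
 * buckets on durability-1 devices or by one bucket on a durability-2 device.
 */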
static int add_new_bucket(struct bch_fs *c,
			  struct open_buckets *ptrs,
			  struct bch_devs_mask *devs_may_alloc,
			  unsigned nr_replicas,
			  unsigned *nr_effective,
			  bool *have_cache,
			  struct open_bucket *ob)
{
	unsigned durability = ob_dev(c, ob)->mi.durability;

	BUG_ON(*nr_effective >= nr_replicas);

	__clear_bit(ob->dev, devs_may_alloc->d);
	*nr_effective += durability;
	*have_cache |= !durability;

	ob_push(c, ptrs, ob);

	if (*nr_effective >= nr_replicas)
		return 1;
	if (ob->ec)
		return 1;
	return 0;
}
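
/*
 * bch2_bucket_alloc_set_trans(): walk @devs_may_alloc in stripe order and
 * allocate one bucket per device until *nr_effective reaches @nr_replicas.
 * Returns 0 once enough replicas have been allocated; otherwise an error
 * (-BCH_ERR_insufficient_devices if there was nothing to allocate from, or
 * the most recent bucket allocation error). A transaction restart, or any
 * failure while a blocking closure is supplied, stops the walk immediately.
 */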
int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
				struct open_buckets *ptrs,
				struct dev_stripe_state *stripe,
				struct bch_devs_mask *devs_may_alloc,
				unsigned nr_replicas,
				unsigned *nr_effective,
				bool *have_cache,
				enum bch_write_flags flags,
				enum bch_data_type data_type,
				enum bch_watermark watermark,
				struct closure *cl)
{
	struct bch_fs *c = trans->c;
	int ret = -BCH_ERR_insufficient_devices;

	BUG_ON(*nr_effective >= nr_replicas);

	struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc);
	darray_for_each(devs_sorted, i) {
		struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i);
		if (!ca)
			continue;

		if (!ca->mi.durability && *have_cache) {
			bch2_dev_put(ca);
			continue;
		}

		struct bch_dev_usage usage;
		struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type,
								 cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage);
		if (!IS_ERR(ob))
			bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
		bch2_dev_put(ca);

		if (IS_ERR(ob)) {
			ret = PTR_ERR(ob);
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
				break;
			continue;
		}

		if (add_new_bucket(c, ptrs, devs_may_alloc,
				   nr_replicas, nr_effective,
				   have_cache, ob)) {
			ret = 0;
			break;
		}
	}

	return ret;
}

/* Allocate from stripes: */

/*
 * if we can't allocate a new stripe because there are already too many
 * partially filled stripes, force allocating from an existing stripe even when
 * it's to a device we don't want:
 */

static int bucket_alloc_from_stripe(struct btree_trans *trans,
				    struct open_buckets *ptrs,
				    struct write_point *wp,
				    struct bch_devs_mask *devs_may_alloc,
				    u16 target,
				    unsigned nr_replicas,
				    unsigned *nr_effective,
				    bool *have_cache,
				    enum bch_watermark watermark,
				    enum bch_write_flags flags,
				    struct closure *cl)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	if (nr_replicas < 2)
		return 0;

	if (ec_open_bucket(c, ptrs))
		return 0;

	struct ec_stripe_head *h =
		bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl);
	if (IS_ERR(h))
		return PTR_ERR(h);
	if (!h)
		return 0;

	struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
	darray_for_each(devs_sorted, i)
		for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
			if (!h->s->blocks[ec_idx])
				continue;

			struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx];
			if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) {
				ob->ec_idx = ec_idx;
				ob->ec = h->s;
				ec_stripe_new_get(h->s, STRIPE_REF_io);

				ret = add_new_bucket(c, ptrs, devs_may_alloc,
						     nr_replicas, nr_effective,
						     have_cache, ob);
				goto out;
			}
		}
out:
	bch2_ec_stripe_head_put(c, h);
	return ret;
}

/* Sector allocator */

static bool want_bucket(struct bch_fs *c,
			struct write_point *wp,
			struct bch_devs_mask *devs_may_alloc,
			bool *have_cache, bool ec,
			struct open_bucket *ob)
{
	struct bch_dev *ca = ob_dev(c, ob);

	if (!test_bit(ob->dev, devs_may_alloc->d))
		return false;

	if (ob->data_type != wp->data_type)
		return false;

	if (!ca->mi.durability &&
	    (wp->data_type == BCH_DATA_btree || ec || *have_cache))
		return false;

	if (ec != (ob->ec != NULL))
		return false;

	return true;
}

static int bucket_alloc_set_writepoint(struct bch_fs *c,
				       struct open_buckets *ptrs,
				       struct write_point *wp,
				       struct bch_devs_mask *devs_may_alloc,
				       unsigned nr_replicas,
				       unsigned *nr_effective,
				       bool *have_cache,
				       bool ec)
{
	struct open_buckets ptrs_skip = { .nr = 0 };
	struct open_bucket *ob;
	unsigned i;
	int ret = 0;

	open_bucket_for_each(c, &wp->ptrs, ob, i) {
		if (!ret && want_bucket(c, wp, devs_may_alloc,
					have_cache, ec, ob))
			ret = add_new_bucket(c, ptrs, devs_may_alloc,
					     nr_replicas, nr_effective,
					     have_cache, ob);
		else
			ob_push(c, &ptrs_skip, ob);
	}
	wp->ptrs = ptrs_skip;

	return ret;
}

static int bucket_alloc_set_partial(struct bch_fs *c,
				    struct open_buckets *ptrs,
				    struct write_point *wp,
				    struct bch_devs_mask *devs_may_alloc,
				    unsigned nr_replicas,
				    unsigned *nr_effective,
				    bool *have_cache, bool ec,
				    enum bch_watermark watermark)
{
	int i, ret = 0;

	if (!c->open_buckets_partial_nr)
		return 0;

	spin_lock(&c->freelist_lock);

	if (!c->open_buckets_partial_nr)
		goto unlock;

	for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
		struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];

		if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) {
			struct bch_dev *ca = ob_dev(c, ob);
			struct bch_dev_usage usage;
			u64 avail;

			bch2_dev_usage_read_fast(ca, &usage);
			avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets;
			if (!avail)
				continue;

			array_remove_item(c->open_buckets_partial,
					  c->open_buckets_partial_nr,
					  i);
			ob->on_partial_list = false;

			rcu_read_lock();
			bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
			rcu_read_unlock();

			ret = add_new_bucket(c, ptrs, devs_may_alloc,
					     nr_replicas, nr_effective,
					     have_cache, ob);
			if (ret)
				break;
		}
	}
unlock:
	spin_unlock(&c->freelist_lock);
	return ret;
}
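
/*
 * __open_bucket_add_buckets() fills out @ptrs in order of preference: first
 * reuse buckets already attached to the write point, then buckets on the
 * partial list, and only then allocate new buckets - from an erasure coded
 * stripe if @erasure_code is set, otherwise directly from the sorted device
 * list (nonblocking across all devices first, then retried with the blocking
 * closure if that wasn't enough).
 */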
static int __open_bucket_add_buckets(struct btree_trans *trans,
				     struct open_buckets *ptrs,
				     struct write_point *wp,
				     struct bch_devs_list *devs_have,
				     u16 target,
				     bool erasure_code,
				     unsigned nr_replicas,
				     unsigned *nr_effective,
				     bool *have_cache,
				     enum bch_watermark watermark,
				     enum bch_write_flags flags,
				     struct closure *_cl)
{
	struct bch_fs *c = trans->c;
	struct bch_devs_mask devs;
	struct open_bucket *ob;
	struct closure *cl = NULL;
	unsigned i;
	int ret;

	devs = target_rw_devs(c, wp->data_type, target);

	/* Don't allocate from devices we already have pointers to: */
	darray_for_each(*devs_have, i)
		__clear_bit(*i, devs.d);

	open_bucket_for_each(c, ptrs, ob, i)
		__clear_bit(ob->dev, devs.d);

	ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs,
					  nr_replicas, nr_effective,
					  have_cache, erasure_code);
	if (ret)
		return ret;

	ret = bucket_alloc_set_partial(c, ptrs, wp, &devs,
				       nr_replicas, nr_effective,
				       have_cache, erasure_code, watermark);
	if (ret)
		return ret;

	if (erasure_code) {
		ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs,
					       target,
					       nr_replicas, nr_effective,
					       have_cache,
					       watermark, flags, _cl);
	} else {
retry_blocking:
		/*
		 * Try nonblocking first, so that if one device is full we'll try from
		 * other devices:
		 */
		ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
						  nr_replicas, nr_effective, have_cache,
						  flags, wp->data_type, watermark, cl);
		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
		    !cl && _cl) {
			cl = _cl;
			goto retry_blocking;
		}
	}

	return ret;
}
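
/*
 * open_bucket_add_buckets(): if erasure coding was requested, try to satisfy
 * the allocation from a stripe first, then fall back to ordinary buckets for
 * whatever is still missing; only "hard" failures from the EC attempt
 * (transaction restart, blocked, freelist/open-buckets empty) are returned
 * directly to the caller.
 */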
static int open_bucket_add_buckets(struct btree_trans *trans,
				   struct open_buckets *ptrs,
				   struct write_point *wp,
				   struct bch_devs_list *devs_have,
				   u16 target,
				   unsigned erasure_code,
				   unsigned nr_replicas,
				   unsigned *nr_effective,
				   bool *have_cache,
				   enum bch_watermark watermark,
				   enum bch_write_flags flags,
				   struct closure *cl)
{
	int ret;

	if (erasure_code && !ec_open_bucket(trans->c, ptrs)) {
		ret = __open_bucket_add_buckets(trans, ptrs, wp,
						devs_have, target, erasure_code,
						nr_replicas, nr_effective, have_cache,
						watermark, flags, cl);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
		    bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
		    bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
		    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
			return ret;
		if (*nr_effective >= nr_replicas)
			return 0;
	}

	ret = __open_bucket_add_buckets(trans, ptrs, wp,
					devs_have, target, false,
					nr_replicas, nr_effective, have_cache,
					watermark, flags, cl);
	return ret < 0 ? ret : 0;
}

/**
 * should_drop_bucket - check if this open_bucket should go away
 * @ob:		open_bucket to predicate on
 * @c:		filesystem handle
 * @ca:		if set, we're killing buckets for a particular device
 * @ec:		if true, we're shutting down erasure coding and killing all ec
 *		open_buckets; otherwise, return true
 * Returns: true if we should kill this open_bucket
 *
 * We're killing open_buckets because we're shutting down a device, erasure
 * coding, or the entire filesystem - check if this open_bucket matches:
 */
static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
			       struct bch_dev *ca, bool ec)
{
	if (ec) {
		return ob->ec != NULL;
	} else if (ca) {
		bool drop = ob->dev == ca->dev_idx;
		struct open_bucket *ob2;
		unsigned i;

		if (!drop && ob->ec) {
			unsigned nr_blocks;

			mutex_lock(&ob->ec->lock);
			nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;

			for (i = 0; i < nr_blocks; i++) {
				if (!ob->ec->blocks[i])
					continue;

				ob2 = c->open_buckets + ob->ec->blocks[i];
				drop |= ob2->dev == ca->dev_idx;
			}
			mutex_unlock(&ob->ec->lock);
		}

		return drop;
	} else {
		return true;
	}
}

static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
				 bool ec, struct write_point *wp)
{
	struct open_buckets ptrs = { .nr = 0 };
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&wp->lock);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		if (should_drop_bucket(ob, c, ca, ec))
			bch2_open_bucket_put(c, ob);
		else
			ob_push(c, &ptrs, ob);
	wp->ptrs = ptrs;
	mutex_unlock(&wp->lock);
}
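
/*
 * bch2_open_buckets_stop(): release every open_bucket that matches @ca (or,
 * with @ec, every erasure coded open_bucket) - from the write points, the
 * btree reserve cache, and the partial list - so the device (or the EC
 * subsystem) can be shut down without open_buckets still pinning its buckets.
 */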
void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
			    bool ec)
{
	unsigned i;

	/* Next, close write points that point to this device... */
	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
		bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);

	bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
	bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
	bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);

	mutex_lock(&c->btree_reserve_cache_lock);
	while (c->btree_reserve_cache_nr) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		bch2_open_buckets_put(c, &a->ob);
	}
	mutex_unlock(&c->btree_reserve_cache_lock);

	spin_lock(&c->freelist_lock);
	i = 0;
	while (i < c->open_buckets_partial_nr) {
		struct open_bucket *ob =
			c->open_buckets + c->open_buckets_partial[i];

		if (should_drop_bucket(ob, c, ca, ec)) {
			--c->open_buckets_partial_nr;
			swap(c->open_buckets_partial[i],
			     c->open_buckets_partial[c->open_buckets_partial_nr]);

			ob->on_partial_list = false;

			rcu_read_lock();
			bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
			rcu_read_unlock();

			spin_unlock(&c->freelist_lock);
			bch2_open_bucket_put(c, ob);
			spin_lock(&c->freelist_lock);
		} else {
			i++;
		}
	}
	spin_unlock(&c->freelist_lock);

	bch2_ec_stop_dev(c, ca);
}

static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
						 unsigned long write_point)
{
	unsigned hash =
		hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));

	return &c->write_points_hash[hash];
}

static struct write_point *__writepoint_find(struct hlist_head *head,
					     unsigned long write_point)
{
	struct write_point *wp;

	rcu_read_lock();
	hlist_for_each_entry_rcu(wp, head, node)
		if (wp->write_point == write_point)
			goto out;
	wp = NULL;
out:
	rcu_read_unlock();
	return wp;
}

static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
{
	u64 stranded = c->write_points_nr * c->bucket_size_max;
	u64 free = bch2_fs_usage_read_short(c).free;

	return stranded * factor > free;
}

static bool try_increase_writepoints(struct bch_fs *c)
{
	struct write_point *wp;

	if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
	    too_many_writepoints(c, 32))
		return false;

	wp = c->write_points + c->write_points_nr++;
	hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
	return true;
}

static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp;
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&c->write_points_hash_lock);
	if (c->write_points_nr < old_nr) {
		mutex_unlock(&c->write_points_hash_lock);
		return true;
	}

	if (c->write_points_nr == 1 ||
	    !too_many_writepoints(c, 8)) {
		mutex_unlock(&c->write_points_hash_lock);
		return false;
	}

	wp = c->write_points + --c->write_points_nr;

	hlist_del_rcu(&wp->node);
	mutex_unlock(&c->write_points_hash_lock);

	bch2_trans_mutex_lock_norelock(trans, &wp->lock);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		open_bucket_free_unused(c, ob);
	wp->ptrs.nr = 0;
	mutex_unlock(&wp->lock);
	return true;
}
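
/*
 * Write point specifiers are overloaded: an even value is a direct pointer to
 * a struct write_point (pointers have the low bit clear), while an odd value
 * is an opaque hashed identifier looked up in write_points_hash - which is
 * why writepoint_find() below tests the low bit before deciding how to
 * interpret it.
 */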
static struct write_point *writepoint_find(struct btree_trans *trans,
					   unsigned long write_point)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp, *oldest;
	struct hlist_head *head;

	if (!(write_point & 1UL)) {
		wp = (struct write_point *) write_point;
		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
		return wp;
	}

	head = writepoint_hash(c, write_point);
restart_find:
	wp = __writepoint_find(head, write_point);
	if (wp) {
lock_wp:
		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
		if (wp->write_point == write_point)
			goto out;
		mutex_unlock(&wp->lock);
		goto restart_find;
	}
restart_find_oldest:
	oldest = NULL;
	for (wp = c->write_points;
	     wp < c->write_points + c->write_points_nr; wp++)
		if (!oldest || time_before64(wp->last_used, oldest->last_used))
			oldest = wp;

	bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
	bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
	if (oldest >= c->write_points + c->write_points_nr ||
	    try_increase_writepoints(c)) {
		mutex_unlock(&c->write_points_hash_lock);
		mutex_unlock(&oldest->lock);
		goto restart_find_oldest;
	}

	wp = __writepoint_find(head, write_point);
	if (wp && wp != oldest) {
		mutex_unlock(&c->write_points_hash_lock);
		mutex_unlock(&oldest->lock);
		goto lock_wp;
	}

	wp = oldest;
	hlist_del_rcu(&wp->node);
	wp->write_point = write_point;
	hlist_add_head_rcu(&wp->node, head);
	mutex_unlock(&c->write_points_hash_lock);
out:
	wp->last_used = local_clock();
	return wp;
}

static noinline void
deallocate_extra_replicas(struct bch_fs *c,
			  struct open_buckets *ptrs,
			  struct open_buckets *ptrs_no_use,
			  unsigned extra_replicas)
{
	struct open_buckets ptrs2 = { 0 };
	struct open_bucket *ob;
	unsigned i;

	open_bucket_for_each(c, ptrs, ob, i) {
		unsigned d = ob_dev(c, ob)->mi.durability;

		if (d && d <= extra_replicas) {
			extra_replicas -= d;
			ob_push(c, ptrs_no_use, ob);
		} else {
			ob_push(c, &ptrs2, ob);
		}
	}

	*ptrs = ptrs2;
}

/*
 * Get us an open_bucket we can allocate from, return with it locked:
 */
int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
				   unsigned target,
				   unsigned erasure_code,
				   struct write_point_specifier write_point,
				   struct bch_devs_list *devs_have,
				   unsigned nr_replicas,
				   unsigned nr_replicas_required,
				   enum bch_watermark watermark,
				   enum bch_write_flags flags,
				   struct closure *cl,
				   struct write_point **wp_ret)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp;
	struct open_bucket *ob;
	struct open_buckets ptrs;
	unsigned nr_effective, write_points_nr;
	bool have_cache;
	int ret;
	int i;

	if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
		erasure_code = false;

	BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
	ptrs.nr = 0;
	nr_effective = 0;
	write_points_nr = c->write_points_nr;
	have_cache = false;

	*wp_ret = wp = writepoint_find(trans, write_point.v);

	ret = bch2_trans_relock(trans);
	if (ret)
		goto err;

	/* metadata may not allocate on cache devices: */
	if (wp->data_type != BCH_DATA_user)
		have_cache = true;

	if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      target, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, NULL);
		if (!ret ||
		    bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto alloc_done;

		/* Don't retry from all devices if we're out of open buckets: */
		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
			int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
							   target, erasure_code,
							   nr_replicas, &nr_effective,
							   &have_cache, watermark,
							   flags, cl);
			if (!ret2 ||
			    bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
			    bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
				ret = ret2;
				goto alloc_done;
			}
		}

		/*
		 * Only try to allocate cache (durability = 0 devices) from the
		 * specified target:
		 */
		have_cache = true;

		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      0, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, cl);
	} else {
		ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
					      target, erasure_code,
					      nr_replicas, &nr_effective,
					      &have_cache, watermark,
					      flags, cl);
	}
alloc_done:
	BUG_ON(!ret && nr_effective < nr_replicas);

	if (erasure_code && !ec_open_bucket(c, &ptrs))
		pr_debug("failed to get ec bucket: ret %u", ret);

	if (ret == -BCH_ERR_insufficient_devices &&
	    nr_effective >= nr_replicas_required)
		ret = 0;

	if (ret)
		goto err;

	if (nr_effective > nr_replicas)
		deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);

	/* Free buckets we didn't use: */
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		open_bucket_free_unused(c, ob);

	wp->ptrs = ptrs;

	wp->sectors_free = UINT_MAX;

	open_bucket_for_each(c, &wp->ptrs, ob, i)
		wp->sectors_free = min(wp->sectors_free, ob->sectors_free);

	BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);

	return 0;
err:
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		if (ptrs.nr < ARRAY_SIZE(ptrs.v))
			ob_push(c, &ptrs, ob);
		else
			open_bucket_free_unused(c, ob);
	wp->ptrs = ptrs;

	mutex_unlock(&wp->lock);

	if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
	    try_decrease_writepoints(trans, write_points_nr))
		goto retry;

	if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
		ret = -BCH_ERR_bucket_alloc_blocked;

	if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) &&
	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
		ret = -BCH_ERR_bucket_alloc_blocked;

	return ret;
}

struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = ob_dev(c, ob);

	return (struct bch_extent_ptr) {
		.type	= 1 << BCH_EXTENT_ENTRY_ptr,
		.gen	= ob->gen,
		.dev	= ob->dev,
		.offset	= bucket_to_sector(ca, ob->bucket) +
			ca->mi.bucket_size -
			ob->sectors_free,
	};
}

/*
 * Append pointers to the space we just allocated to @k, and mark @sectors space
 * as allocated out of @ob
 */
void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
				    struct bkey_i *k, unsigned sectors,
				    bool cached)
{
	bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
}

void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
{
	bch2_alloc_sectors_done_inlined(c, wp);
}
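
/*
 * Sketch of how the sector allocator above is typically driven (illustrative
 * only - real callers, e.g. the data write path, also handle errors and
 * transaction restarts):
 *
 *	struct write_point *wp;
 *
 *	ret = bch2_alloc_sectors_start_trans(trans, target, false,
 *					     writepoint, devs_have,
 *					     nr_replicas, nr_replicas_required,
 *					     watermark, flags, cl, &wp);
 *	if (!ret) {
 *		sectors = min(sectors, wp->sectors_free);
 *		bch2_alloc_sectors_append_ptrs(c, wp, k, sectors, false);
 *		bch2_alloc_sectors_done(c, wp);
 *	}
 */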

static inline void writepoint_init(struct write_point *wp,
				   enum bch_data_type type)
{
	mutex_init(&wp->lock);
	wp->data_type = type;

	INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
	INIT_LIST_HEAD(&wp->writes);
	spin_lock_init(&wp->writes_lock);
}

void bch2_fs_allocator_foreground_init(struct bch_fs *c)
{
	struct open_bucket *ob;
	struct write_point *wp;

	mutex_init(&c->write_points_hash_lock);
	c->write_points_nr = ARRAY_SIZE(c->write_points);

	/* open bucket 0 is a sentinel NULL: */
	spin_lock_init(&c->open_buckets[0].lock);

	for (ob = c->open_buckets + 1;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
		spin_lock_init(&ob->lock);
		c->open_buckets_nr_free++;

		ob->freelist = c->open_buckets_freelist;
		c->open_buckets_freelist = ob - c->open_buckets;
	}

	writepoint_init(&c->btree_write_point, BCH_DATA_btree);
	writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
	writepoint_init(&c->copygc_write_point, BCH_DATA_user);

	for (wp = c->write_points;
	     wp < c->write_points + c->write_points_nr; wp++) {
		writepoint_init(wp, BCH_DATA_user);

		wp->last_used = local_clock();
		wp->write_point = (unsigned long) wp;
		hlist_add_head_rcu(&wp->node,
				   writepoint_hash(c, wp->write_point));
	}
}

void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = ob_dev(c, ob);
	unsigned data_type = ob->data_type;
	barrier(); /* READ_ONCE() doesn't work on bitfields */

	prt_printf(out, "%zu ref %u ",
		   ob - c->open_buckets,
		   atomic_read(&ob->pin));
	bch2_prt_data_type(out, data_type);
	prt_printf(out, " %u:%llu gen %u allocated %u/%u",
		   ob->dev, ob->bucket, ob->gen,
		   ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
	if (ob->ec)
		prt_printf(out, " ec idx %llu", ob->ec->idx);
	if (ob->on_partial_list)
		prt_str(out, " partial");
	prt_newline(out);
}

void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
			       struct bch_dev *ca)
{
	struct open_bucket *ob;

	out->atomic++;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && (!ca || ob->dev == ca->dev_idx))
			bch2_open_bucket_to_text(out, c, ob);
		spin_unlock(&ob->lock);
	}

	--out->atomic;
}

void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
{
	unsigned i;

	out->atomic++;
	spin_lock(&c->freelist_lock);

	for (i = 0; i < c->open_buckets_partial_nr; i++)
		bch2_open_bucket_to_text(out, c,
				c->open_buckets + c->open_buckets_partial[i]);

	spin_unlock(&c->freelist_lock);
	--out->atomic;
}

static const char * const bch2_write_point_states[] = {
#define x(n) #n,
	WRITE_POINT_STATES()
#undef x
	NULL
};

static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
				     struct write_point *wp)
{
	struct open_bucket *ob;
	unsigned i;

	prt_printf(out, "%lu: ", wp->write_point);
	prt_human_readable_u64(out, wp->sectors_allocated);

	prt_printf(out, " last wrote: ");
	bch2_pr_time_units(out, sched_clock() - wp->last_used);

	for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
		prt_printf(out, " %s: ", bch2_write_point_states[i]);
		bch2_pr_time_units(out, wp->time[i]);
	}

	prt_newline(out);

	printbuf_indent_add(out, 2);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		bch2_open_bucket_to_text(out, c, ob);
	printbuf_indent_sub(out, 2);
}

void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct write_point *wp;

	prt_str(out, "Foreground write points\n");
	for (wp = c->write_points;
	     wp < c->write_points + ARRAY_SIZE(c->write_points);
	     wp++)
		bch2_write_point_to_text(out, c, wp);

	prt_str(out, "Copygc write point\n");
	bch2_write_point_to_text(out, c, &c->copygc_write_point);

	prt_str(out, "Rebalance write point\n");
	bch2_write_point_to_text(out, c, &c->rebalance_write_point);

	prt_str(out, "Btree write point\n");
	bch2_write_point_to_text(out, c, &c->btree_write_point);
}

void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
{
	unsigned nr[BCH_DATA_NR];

	memset(nr, 0, sizeof(nr));

	for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
		nr[c->open_buckets[i].data_type]++;

	printbuf_tabstops_reset(out);
	printbuf_tabstop_push(out, 24);

	prt_printf(out, "capacity\t%llu\n", c->capacity);
	prt_printf(out, "reserved\t%llu\n", c->reserved);
	prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden));
	prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree));
	prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data));
	prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached));
	prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved));
	prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
	prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes));

	prt_newline(out);
	prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty");
	prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
	prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT);
	prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? "waiting" : "empty");
	prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]);
	prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]);
	prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr);
}

void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
{
	struct bch_fs *c = ca->fs;
	struct bch_dev_usage stats = bch2_dev_usage_read(ca);
	unsigned nr[BCH_DATA_NR];

	memset(nr, 0, sizeof(nr));

	for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
		nr[c->open_buckets[i].data_type]++;

	bch2_dev_usage_to_text(out, ca, &stats);

	prt_newline(out);

	prt_printf(out, "reserves:\n");
	for (unsigned i = 0; i < BCH_WATERMARK_NR; i++)
		prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i));

	prt_newline(out);

	printbuf_tabstops_reset(out);
	printbuf_tabstop_push(out, 12);
	printbuf_tabstop_push(out, 16);

	prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
	prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats));
}

static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
{
	struct printbuf buf = PRINTBUF;

	prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n",
		   c->opts.allocator_stuck_timeout);

	prt_printf(&buf, "Allocator debug:\n");
	printbuf_indent_add(&buf, 2);
	bch2_fs_alloc_debug_to_text(&buf, c);
	printbuf_indent_sub(&buf, 2);
	prt_newline(&buf);

	for_each_online_member(c, ca) {
		prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
		printbuf_indent_add(&buf, 2);
		bch2_dev_alloc_debug_to_text(&buf, ca);
		printbuf_indent_sub(&buf, 2);
		prt_newline(&buf);
	}

	prt_printf(&buf, "Copygc debug:\n");
	printbuf_indent_add(&buf, 2);
	bch2_copygc_wait_to_text(&buf, c);
	printbuf_indent_sub(&buf, 2);
	prt_newline(&buf);

	prt_printf(&buf, "Journal debug:\n");
	printbuf_indent_add(&buf, 2);
	bch2_journal_debug_to_text(&buf, &c->journal);
	printbuf_indent_sub(&buf, 2);

	bch2_print_string_as_lines(KERN_ERR, buf.buf);
	printbuf_exit(&buf);
}

static inline unsigned allocator_wait_timeout(struct bch_fs *c)
{
	if (c->allocator_last_stuck &&
	    time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies))
		return 0;

	return c->opts.allocator_stuck_timeout * HZ;
}

void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
{
	unsigned t = allocator_wait_timeout(c);

	if (t && closure_sync_timeout(cl, t)) {
		c->allocator_last_stuck = jiffies;
		bch2_print_allocator_stuck(c);
	}

	closure_sync(cl);
}