// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2012 Google, Inc.
 *
 * Foreground allocator code: allocate buckets from freelist, and allocate in
 * sector granularity from writepoints.
 *
 * bch2_bucket_alloc() allocates a single bucket from a specific device.
 *
 * bch2_bucket_alloc_set() allocates one or more buckets from different devices
 * in a given filesystem.
 */

#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_gc.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io_write.h"
#include "journal.h"
#include "movinggc.h"
#include "nocow_locking.h"
#include "trace.h"

#include <linux/math64.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>

static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans,
					   struct mutex *lock)
{
	if (!mutex_trylock(lock)) {
		bch2_trans_unlock(trans);
		mutex_lock(lock);
	}
}

const char * const bch2_watermarks[] = {
#define x(t) #t,
	BCH_WATERMARKS()
#undef x
	NULL
};

/*
 * Open buckets represent a bucket that's currently being allocated from. They
 * serve two purposes:
 *
 * - They track buckets that have been partially allocated, allowing for
 *   sub-bucket sized allocations - they're used by the sector allocator below
 *
 * - They provide a reference to the buckets they own that mark and sweep GC
 *   can find, until the new allocation has a pointer to it inserted into the
 *   btree
 *
 * When allocating some space with the sector allocator, the allocation comes
 * with a reference to an open bucket - the caller is required to put that
 * reference _after_ doing the index update that makes its allocation reachable.
 */
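
/*
 * Rough sketch of the expected calling sequence (illustrative only, not a
 * contract; see e.g. the write path in io_write.c for a real caller):
 *
 *   bch2_alloc_sectors_start_trans()  - get a locked write_point with space
 *   bch2_alloc_sectors_append_ptrs()  - append pointers to the allocated
 *                                       space to the key being written
 *   bch2_open_bucket_get()            - take refs on the write_point's
 *                                       open_buckets for the write
 *   bch2_alloc_sectors_done()         - unlock the write_point
 *     ... do the write and the btree index update ...
 *   bch2_open_buckets_put()           - drop the refs, only after the index
 *                                       update makes the allocation reachable
 */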

void bch2_reset_alloc_cursors(struct bch_fs *c)
{
	guard(rcu)();
	for_each_member_device_rcu(c, ca, NULL)
		memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor));
}

static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
{
	open_bucket_idx_t idx = ob - c->open_buckets;
	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);

	ob->hash = *slot;
	*slot = idx;
}

static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob)
{
	open_bucket_idx_t idx = ob - c->open_buckets;
	open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket);

	while (*slot != idx) {
		BUG_ON(!*slot);
		slot = &c->open_buckets[*slot].hash;
	}

	*slot = ob->hash;
	ob->hash = 0;
}

void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = ob_dev(c, ob);

	if (ob->ec) {
		ec_stripe_new_put(c, ob->ec, STRIPE_REF_io);
		return;
	}

	spin_lock(&ob->lock);
	ob->valid = false;
	ob->data_type = 0;
	spin_unlock(&ob->lock);

	spin_lock(&c->freelist_lock);
	bch2_open_bucket_hash_remove(c, ob);

	ob->freelist = c->open_buckets_freelist;
	c->open_buckets_freelist = ob - c->open_buckets;

	c->open_buckets_nr_free++;
	ca->nr_open_buckets--;
	spin_unlock(&c->freelist_lock);

	closure_wake_up(&c->open_buckets_wait);
}

void bch2_open_bucket_write_error(struct bch_fs *c,
				  struct open_buckets *obs,
				  unsigned dev, int err)
{
	struct open_bucket *ob;
	unsigned i;

	open_bucket_for_each(c, obs, ob, i)
		if (ob->dev == dev && ob->ec)
			bch2_ec_bucket_cancel(c, ob, err);
}

static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
{
	struct open_bucket *ob;

	BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);

	ob = c->open_buckets + c->open_buckets_freelist;
	c->open_buckets_freelist = ob->freelist;
	atomic_set(&ob->pin, 1);
	ob->data_type = 0;

	c->open_buckets_nr_free--;
	return ob;
}

static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{
	if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs))
		return false;

	return bch2_is_superblock_bucket(ca, b);
}

static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
{
	BUG_ON(c->open_buckets_partial_nr >=
	       ARRAY_SIZE(c->open_buckets_partial));

	spin_lock(&c->freelist_lock);
	scoped_guard(rcu)
		bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;

	ob->on_partial_list = true;
	c->open_buckets_partial[c->open_buckets_partial_nr++] =
		ob - c->open_buckets;
	spin_unlock(&c->freelist_lock);

	closure_wake_up(&c->open_buckets_wait);
	closure_wake_up(&c->freelist_wait);
}

static inline bool may_alloc_bucket(struct bch_fs *c,
				    struct alloc_request *req,
				    struct bpos bucket)
{
	if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) {
		req->counters.skipped_open++;
		return false;
	}

	u64 journal_seq_ready =
		bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal,
					      bucket.inode, bucket.offset);
	if (journal_seq_ready > c->journal.flushed_seq_ondisk) {
		if (journal_seq_ready > c->journal.flushing_seq)
			req->counters.need_journal_commit++;
		req->counters.skipped_need_journal_commit++;
		return false;
	}

	if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) {
		req->counters.skipped_nocow++;
		return false;
	}

	return true;
}

static struct open_bucket *__try_alloc_bucket(struct bch_fs *c,
					      struct alloc_request *req,
					      u64 bucket, u8 gen,
					      struct closure *cl)
{
	struct bch_dev *ca = req->ca;

	if (unlikely(is_superblock_bucket(c, ca, bucket)))
		return NULL;

	if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
		req->counters.skipped_nouse++;
		return NULL;
	}

	spin_lock(&c->freelist_lock);

	if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) {
		if (cl)
			closure_wait(&c->open_buckets_wait, cl);

		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
		spin_unlock(&c->freelist_lock);
		return ERR_PTR(bch_err_throw(c, open_buckets_empty));
	}

	/* Recheck under lock: */
	if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
		spin_unlock(&c->freelist_lock);
		req->counters.skipped_open++;
		return NULL;
	}

	struct open_bucket *ob = bch2_open_bucket_alloc(c);

	spin_lock(&ob->lock);
	ob->valid = true;
	ob->sectors_free = ca->mi.bucket_size;
	ob->dev = ca->dev_idx;
	ob->gen = gen;
	ob->bucket = bucket;
	spin_unlock(&ob->lock);

	ca->nr_open_buckets++;
	bch2_open_bucket_hash_add(c, ob);

	track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
	track_event_change(&c->times[BCH_TIME_blocked_allocate], false);

	spin_unlock(&c->freelist_lock);
	return ob;
}

static struct open_bucket *try_alloc_bucket(struct btree_trans *trans,
					    struct alloc_request *req,
					    struct btree_iter *freespace_iter,
					    struct closure *cl)
{
	struct bch_fs *c = trans->c;
	/* the bucket number is in the low 56 bits of the freespace key offset: */
	u64 b = freespace_iter->pos.offset & ~(~0ULL << 56);

	if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b)))
		return NULL;

	u8 gen;
	int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true);
	if (ret < 0)
		return ERR_PTR(ret);
	if (ret)
		return NULL;

	return __try_alloc_bucket(c, req, b, gen, cl);
}

/*
 * This path is for before the freespace btree is initialized:
 */
static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,
			struct alloc_request *req,
			struct closure *cl)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = req->ca;
	struct btree_iter iter, citer;
	struct bkey_s_c k, ck;
	struct open_bucket *ob = NULL;
	u64 first_bucket = ca->mi.first_bucket;
	u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
	u64 alloc_start = max(first_bucket, *dev_alloc_cursor);
	u64 alloc_cursor = alloc_start;
	int ret;

	/*
	 * Scan with an uncached iterator to avoid polluting the key cache. An
	 * uncached iter will return a cached key if one exists, but if not
	 * there is no other underlying protection for the associated key cache
	 * slot. To avoid racing bucket allocations, look up the cached key slot
	 * of any likely allocation candidate before attempting to proceed with
	 * the allocation. This provides proper exclusion on the associated
	 * bucket.
	 */
again:
	for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
				     BTREE_ITER_slots, k, ret) {
		u64 bucket = k.k->p.offset;

		if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)))
			break;

		if (req->btree_bitmap != BTREE_BITMAP_ANY &&
		    req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
				bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
			if (req->btree_bitmap == BTREE_BITMAP_YES &&
			    bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
				break;

			bucket = sector_to_bucket(ca,
					round_up(bucket_to_sector(ca, bucket) + 1,
						 1ULL << ca->mi.btree_bitmap_shift));
			bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket));
			req->counters.buckets_seen++;
			req->counters.skipped_mi_btree_bitmap++;
			continue;
		}

		struct bch_alloc_v4 a_convert;
		const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
		if (a->data_type != BCH_DATA_free)
			continue;

		/* now check the cached key to serialize concurrent allocs of the bucket */
		ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached);
		ret = bkey_err(ck);
		if (ret)
			break;

		a = bch2_alloc_to_v4(ck, &a_convert);
		if (a->data_type != BCH_DATA_free)
			goto next;

		req->counters.buckets_seen++;

		ob = may_alloc_bucket(c, req, k.k->p)
			? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl)
			: NULL;
next:
		bch2_set_btree_iter_dontneed(trans, &citer);
		bch2_trans_iter_exit(trans, &citer);
		if (ob)
			break;
	}
	bch2_trans_iter_exit(trans, &iter);

	alloc_cursor = iter.pos.offset;

	if (!ob && ret)
		ob = ERR_PTR(ret);

	if (!ob && alloc_start > first_bucket) {
		alloc_cursor = alloc_start = first_bucket;
		goto again;
	}

	*dev_alloc_cursor = alloc_cursor;

	return ob;
}

static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
						      struct alloc_request *req,
						      struct closure *cl)
{
	struct bch_dev *ca = req->ca;
	struct btree_iter iter;
	struct bkey_s_c k;
	struct open_bucket *ob = NULL;
	u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap];
	u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor));
	u64 alloc_cursor = alloc_start;
	int ret;
again:
	for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace,
					 POS(ca->dev_idx, alloc_cursor),
					 POS(ca->dev_idx, U64_MAX),
					 0, k, ret) {
		/*
		 * peek normally doesn't trim extents - they can span iter.pos,
		 * which is not what we want here:
		 */
		iter.k.size = iter.k.p.offset - iter.pos.offset;

		while (iter.k.size) {
			req->counters.buckets_seen++;

			u64 bucket = iter.pos.offset & ~(~0ULL << 56);
			if (req->btree_bitmap != BTREE_BITMAP_ANY &&
			    req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca,
					bucket_to_sector(ca, bucket), ca->mi.bucket_size)) {
				if (req->btree_bitmap == BTREE_BITMAP_YES &&
				    bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift)
					goto fail;

				bucket = sector_to_bucket(ca,
						round_up(bucket_to_sector(ca, bucket + 1),
							 1ULL << ca->mi.btree_bitmap_shift));
				alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56));

				bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor));
				req->counters.skipped_mi_btree_bitmap++;
				goto next;
			}

			ob = try_alloc_bucket(trans, req, &iter, cl);
			if (ob) {
				if (!IS_ERR(ob))
					*dev_alloc_cursor = iter.pos.offset;
				bch2_set_btree_iter_dontneed(trans, &iter);
				break;
			}

			iter.k.size--;
			iter.pos.offset++;
		}
next:
		if (ob || ret)
			break;
	}
fail:
	bch2_trans_iter_exit(trans, &iter);

	BUG_ON(ob && ret);

	if (ret)
		ob = ERR_PTR(ret);

	if (!ob && alloc_start > ca->mi.first_bucket) {
		alloc_cursor = alloc_start = ca->mi.first_bucket;
		goto again;
	}

	return ob;
}

static noinline void trace_bucket_alloc2(struct bch_fs *c,
					 struct alloc_request *req,
					 struct closure *cl,
					 struct open_bucket *ob)
{
	struct printbuf buf = PRINTBUF;

	printbuf_tabstop_push(&buf, 24);

	prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx);
	prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]);
	prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]);
	prt_printf(&buf, "blocking\t%u\n", cl != NULL);
	prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]);
	prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark));
	prt_printf(&buf, "copygc_wait\t%llu/%lli\n",
		   bch2_copygc_wait_amount(c),
		   c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now));
	prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen);
	prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open);
	prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit);
	prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow);
	prt_printf(&buf, "nouse\t%llu\n", req->counters.skipped_nouse);
	prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap);

	if (!IS_ERR(ob)) {
		prt_printf(&buf, "allocated\t%llu\n", ob->bucket);
		trace_bucket_alloc(c, buf.buf);
	} else {
		prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob)));
		trace_bucket_alloc_fail(c, buf.buf);
	}

	printbuf_exit(&buf);
}

/**
 * bch2_bucket_alloc_trans - allocate a single bucket from a specific device
 * @trans: transaction object
 * @req: state for the entire allocation
 * @cl: if not NULL, closure to be used to wait if buckets not available
 * @nowait: if true, do not wait for buckets to become available
 *
 * Returns: an open_bucket on success, or an ERR_PTR() on failure.
 */
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
						   struct alloc_request *req,
						   struct closure *cl,
						   bool nowait)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca = req->ca;
	struct open_bucket *ob = NULL;
	bool freespace = READ_ONCE(ca->mi.freespace_initialized);
	u64 avail;
	bool waiting = nowait;

	req->btree_bitmap = req->data_type == BCH_DATA_btree;
	memset(&req->counters, 0, sizeof(req->counters));
again:
	bch2_dev_usage_read_fast(ca, &req->usage);
	avail = dev_buckets_free(ca, req->usage, req->watermark);

	if (req->usage.buckets[BCH_DATA_need_discard] > avail)
		bch2_dev_do_discards(ca);

	if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail)
		bch2_gc_gens_async(c);

	if (should_invalidate_buckets(ca, req->usage))
		bch2_dev_do_invalidates(ca);

	if (!avail) {
		if (req->watermark > BCH_WATERMARK_normal &&
		    c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations)
			goto alloc;

		if (cl && !waiting) {
			closure_wait(&c->freelist_wait, cl);
			waiting = true;
			goto again;
		}

		track_event_change(&c->times[BCH_TIME_blocked_allocate], true);

		ob = ERR_PTR(bch_err_throw(c, freelist_empty));
		goto err;
	}

	if (waiting)
		closure_wake_up(&c->freelist_wait);
alloc:
	ob = likely(freespace)
		? bch2_bucket_alloc_freelist(trans, req, cl)
		: bch2_bucket_alloc_early(trans, req, cl);

	if (req->counters.need_journal_commit * 2 > avail)
		bch2_journal_flush_async(&c->journal, NULL);

	if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) {
		req->btree_bitmap = BTREE_BITMAP_ANY;
		goto alloc;
	}

	if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) {
		freespace = false;
		goto alloc;
	}
err:
	if (!ob)
		ob = ERR_PTR(bch_err_throw(c, no_buckets_found));

	if (!IS_ERR(ob))
		ob->data_type = req->data_type;

	if (!IS_ERR(ob))
		count_event(c, bucket_alloc);
	else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart))
		count_event(c, bucket_alloc_fail);

	if (!IS_ERR(ob)
	    ? trace_bucket_alloc_enabled()
	    : trace_bucket_alloc_fail_enabled())
		trace_bucket_alloc2(c, req, cl, ob);

	return ob;
}

struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
				      enum bch_watermark watermark,
				      enum bch_data_type data_type,
				      struct closure *cl)
{
	struct open_bucket *ob;
	struct alloc_request req = {
		.watermark = watermark,
		.data_type = data_type,
		.ca = ca,
	};

	bch2_trans_do(c,
		PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false)));
	return ob;
}

static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
			    unsigned l, unsigned r)
{
	return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]);
}

#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)

void bch2_dev_alloc_list(struct bch_fs *c,
			 struct dev_stripe_state *stripe,
			 struct bch_devs_mask *devs,
			 struct dev_alloc_list *ret)
{
	ret->nr = 0;

	unsigned i;
	for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
		ret->data[ret->nr++] = i;

	bubble_sort(ret->data, ret->nr, dev_stripe_cmp);
}

static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */
static const u64 stripe_clock_hand_max = 1ULL << 56; /* max after rescale */
static const u64 stripe_clock_hand_inv = 1ULL << 52; /* max increment, if a device is empty */

static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe)
{
	/*
	 * Avoid underflowing clock hands if at all possible: if clock hands go
	 * to 0 then we lose information. Clock hands can be in a wide range if
	 * we have devices we rarely try to allocate from, e.g. if we generally
	 * allocate from a specified target but only sometimes have to fall back
	 * to the whole filesystem.
	 */
	u64 scale_max = U64_MAX; /* maximum we can subtract without underflow */
	u64 scale_min = 0; /* minimum we must subtract to avoid overflow */

	for (u64 *v = stripe->next_alloc;
	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) {
		if (*v)
			scale_max = min(scale_max, *v);
		if (*v > stripe_clock_hand_max)
			scale_min = max(scale_min, *v - stripe_clock_hand_max);
	}

	u64 scale = max(scale_min, scale_max);

	for (u64 *v = stripe->next_alloc;
	     v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
		*v = *v < scale ? 0 : *v - scale;
}
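
/*
 * Worked example for the clock hand increment below (numbers purely
 * illustrative): with stripe_clock_hand_inv = 1ULL << 52, a device with 2^20
 * free buckets has its hand advanced by 2^32 per allocation, while a device
 * with 2^21 free buckets advances by only 2^31. Since the allocator prefers
 * the device with the smallest hand, devices end up being picked roughly in
 * proportion to their free space.
 */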
static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca,
				struct dev_stripe_state *stripe,
				struct bch_dev_usage *usage)
{
	/*
	 * Stripe state has a per device clock hand: we allocate from the device
	 * with the smallest clock hand.
	 *
	 * When we allocate, we don't do a simple increment; we add the inverse
	 * of the device's free space. This results in round robin behavior that
	 * biases in favor of the device(s) with more free space.
	 */

	u64 *v = stripe->next_alloc + ca->dev_idx;
	u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal);
	u64 free_space_inv = free_space
		? div64_u64(stripe_clock_hand_inv, free_space)
		: stripe_clock_hand_inv;

	/* Saturating add, avoid overflow: */
	u64 sum = *v + free_space_inv;
	*v = sum >= *v ? sum : U64_MAX;

	if (unlikely(*v > stripe_clock_hand_rescale))
		bch2_stripe_state_rescale(stripe);
}

void bch2_dev_stripe_increment(struct bch_dev *ca,
			       struct dev_stripe_state *stripe)
{
	struct bch_dev_usage usage;

	bch2_dev_usage_read_fast(ca, &usage);
	bch2_dev_stripe_increment_inlined(ca, stripe, &usage);
}

static int add_new_bucket(struct bch_fs *c,
			  struct alloc_request *req,
			  struct open_bucket *ob)
{
	unsigned durability = ob_dev(c, ob)->mi.durability;

	BUG_ON(req->nr_effective >= req->nr_replicas);

	__clear_bit(ob->dev, req->devs_may_alloc.d);
	req->nr_effective += durability;
	req->have_cache |= !durability;

	ob_push(c, &req->ptrs, ob);

	if (req->nr_effective >= req->nr_replicas)
		return 1;
	if (ob->ec)
		return 1;
	return 0;
}

inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
				       struct alloc_request *req,
				       struct dev_stripe_state *stripe,
				       struct closure *cl)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	BUG_ON(req->nr_effective >= req->nr_replicas);

	bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted);

	darray_for_each(req->devs_sorted, i) {
		req->ca = bch2_dev_tryget_noerror(c, *i);
		if (!req->ca)
			continue;

		if (!req->ca->mi.durability && req->have_cache) {
			bch2_dev_put(req->ca);
			continue;
		}

		struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl,
					req->flags & BCH_WRITE_alloc_nowait);
		if (!IS_ERR(ob))
			bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage);
		bch2_dev_put(req->ca);

		if (IS_ERR(ob)) {
			ret = PTR_ERR(ob);
			if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl)
				break;
			continue;
		}

		ret = add_new_bucket(c, req, ob);
		if (ret)
			break;
	}

	if (ret == 1)
		return 0;
	if (ret)
		return ret;
	return bch_err_throw(c, insufficient_devices);
}

/* Allocate from stripes: */

/*
 * if we can't allocate a new stripe because there are already too many
 * partially filled stripes, force allocating from an existing stripe even when
 * it's to a device we don't want:
 */

static int bucket_alloc_from_stripe(struct btree_trans *trans,
				    struct alloc_request *req,
				    struct closure *cl)
{
	struct bch_fs *c = trans->c;
	int ret = 0;

	if (req->nr_replicas < 2)
		return 0;

	if (ec_open_bucket(c, &req->ptrs))
		return 0;

	struct ec_stripe_head *h =
		bch2_ec_stripe_head_get(trans, req, 0, cl);
	if (IS_ERR(h))
		return PTR_ERR(h);
	if (!h)
		return 0;

	bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted);

	darray_for_each(req->devs_sorted, i)
		for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
			if (!h->s->blocks[ec_idx])
				continue;

			struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx];
			if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) {
				ob->ec_idx = ec_idx;
				ob->ec = h->s;
				ec_stripe_new_get(h->s, STRIPE_REF_io);

				ret = add_new_bucket(c, req, ob);
				goto out;
			}
		}
out:
	bch2_ec_stripe_head_put(c, h);
	return ret;
}

/* Sector allocator */

static bool want_bucket(struct bch_fs *c,
			struct alloc_request *req,
			struct open_bucket *ob)
{
	struct bch_dev *ca = ob_dev(c, ob);

	if (!test_bit(ob->dev, req->devs_may_alloc.d))
		return false;

	if (ob->data_type != req->wp->data_type)
		return false;

	if (!ca->mi.durability &&
	    (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache))
		return false;

	if (req->ec != (ob->ec != NULL))
		return false;

	return true;
}

static int bucket_alloc_set_writepoint(struct bch_fs *c,
				       struct alloc_request *req)
{
	struct open_bucket *ob;
	unsigned i;
	int ret = 0;

	req->scratch_ptrs.nr = 0;

	open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
		if (!ret && want_bucket(c, req, ob))
			ret = add_new_bucket(c, req, ob);
		else
			ob_push(c, &req->scratch_ptrs, ob);
	}
	req->wp->ptrs = req->scratch_ptrs;

	return ret;
}

static int bucket_alloc_set_partial(struct bch_fs *c,
				    struct alloc_request *req)
{
	int i, ret = 0;

	if (!c->open_buckets_partial_nr)
		return 0;

	spin_lock(&c->freelist_lock);

	if (!c->open_buckets_partial_nr)
		goto unlock;

	for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) {
		struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i];

		if (want_bucket(c, req, ob)) {
			struct bch_dev *ca = ob_dev(c, ob);
			u64 avail;

			bch2_dev_usage_read_fast(ca, &req->usage);
			avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets;
			if (!avail)
				continue;

			array_remove_item(c->open_buckets_partial,
					  c->open_buckets_partial_nr,
					  i);
			ob->on_partial_list = false;

			scoped_guard(rcu)
				bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;

			ret = add_new_bucket(c, req, ob);
			if (ret)
				break;
		}
	}
unlock:
	spin_unlock(&c->freelist_lock);
	return ret;
}

static int __open_bucket_add_buckets(struct btree_trans *trans,
				     struct alloc_request *req,
				     struct closure *_cl)
{
	struct bch_fs *c = trans->c;
	struct open_bucket *ob;
	struct closure *cl = NULL;
	unsigned i;
	int ret;

	req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target);

	/* Don't allocate from devices we already have pointers to: */
	darray_for_each(*req->devs_have, i)
		__clear_bit(*i, req->devs_may_alloc.d);

	open_bucket_for_each(c, &req->ptrs, ob, i)
		__clear_bit(ob->dev, req->devs_may_alloc.d);

	ret = bucket_alloc_set_writepoint(c, req);
	if (ret)
		return ret;

	ret = bucket_alloc_set_partial(c, req);
	if (ret)
		return ret;

	if (req->ec) {
		ret = bucket_alloc_from_stripe(trans, req, _cl);
	} else {
retry_blocking:
		/*
		 * Try nonblocking first, so that if one device is full we'll try from
		 * other devices:
		 */
		ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl);
		if (ret &&
		    !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
		    !bch2_err_matches(ret, BCH_ERR_insufficient_devices) &&
		    !cl && _cl) {
			cl = _cl;
			goto retry_blocking;
		}
	}

	return ret;
}

static int open_bucket_add_buckets(struct btree_trans *trans,
				   struct alloc_request *req,
				   struct closure *cl)
{
	int ret;

	if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) {
		ret = __open_bucket_add_buckets(trans, req, cl);
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
		    bch2_err_matches(ret, BCH_ERR_operation_blocked) ||
		    bch2_err_matches(ret, BCH_ERR_freelist_empty) ||
		    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
			return ret;
		if (req->nr_effective >= req->nr_replicas)
			return 0;
	}

	bool ec = false;
	swap(ec, req->ec);
	ret = __open_bucket_add_buckets(trans, req, cl);
	swap(ec, req->ec);

	return ret < 0 ? ret : 0;
}

/**
 * should_drop_bucket - check if this open_bucket should go away
 * @ob: open_bucket to predicate on
 * @c: filesystem handle
 * @ca: if set, we're killing buckets for a particular device
 * @ec: if true, we're shutting down erasure coding and killing all ec
 *      open_buckets
 *      otherwise, return true
 * Returns: true if we should kill this open_bucket
 *
 * We're killing open_buckets because we're shutting down a device, erasure
 * coding, or the entire filesystem - check if this open_bucket matches:
 */
static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c,
			       struct bch_dev *ca, bool ec)
{
	if (ec) {
		return ob->ec != NULL;
	} else if (ca) {
		bool drop = ob->dev == ca->dev_idx;
		struct open_bucket *ob2;
		unsigned i;

		if (!drop && ob->ec) {
			unsigned nr_blocks;

			mutex_lock(&ob->ec->lock);
			nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks;

			for (i = 0; i < nr_blocks; i++) {
				if (!ob->ec->blocks[i])
					continue;

				ob2 = c->open_buckets + ob->ec->blocks[i];
				drop |= ob2->dev == ca->dev_idx;
			}
			mutex_unlock(&ob->ec->lock);
		}

		return drop;
	} else {
		return true;
	}
}

static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
				 bool ec, struct write_point *wp)
{
	struct open_buckets ptrs = { .nr = 0 };
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&wp->lock);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		if (should_drop_bucket(ob, c, ca, ec))
			bch2_open_bucket_put(c, ob);
		else
			ob_push(c, &ptrs, ob);
	wp->ptrs = ptrs;
	mutex_unlock(&wp->lock);
}

void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
			    bool ec)
{
	unsigned i;

	/* Next, close write points that point to this device... */
	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
		bch2_writepoint_stop(c, ca, ec, &c->write_points[i]);

	bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point);
	bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point);
	bch2_writepoint_stop(c, ca, ec, &c->btree_write_point);

	mutex_lock(&c->btree_reserve_cache_lock);
	while (c->btree_reserve_cache_nr) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		bch2_open_buckets_put(c, &a->ob);
	}
	mutex_unlock(&c->btree_reserve_cache_lock);

	spin_lock(&c->freelist_lock);
	i = 0;
	while (i < c->open_buckets_partial_nr) {
		struct open_bucket *ob =
			c->open_buckets + c->open_buckets_partial[i];

		if (should_drop_bucket(ob, c, ca, ec)) {
			--c->open_buckets_partial_nr;
			swap(c->open_buckets_partial[i],
			     c->open_buckets_partial[c->open_buckets_partial_nr]);

			ob->on_partial_list = false;

			scoped_guard(rcu)
				bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;

			spin_unlock(&c->freelist_lock);
			bch2_open_bucket_put(c, ob);
			spin_lock(&c->freelist_lock);
		} else {
			i++;
		}
	}
	spin_unlock(&c->freelist_lock);

	bch2_ec_stop_dev(c, ca);
}

static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
						 unsigned long write_point)
{
	unsigned hash =
		hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));

	return &c->write_points_hash[hash];
}

static struct write_point *__writepoint_find(struct hlist_head *head,
					     unsigned long write_point)
{
	struct write_point *wp;

	guard(rcu)();
	hlist_for_each_entry_rcu(wp, head, node)
		if (wp->write_point == write_point)
			return wp;
	return NULL;
}

static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
{
	u64 stranded = c->write_points_nr * c->bucket_size_max;
	u64 free = bch2_fs_usage_read_short(c).free;

	return stranded * factor > free;
}

static noinline bool try_increase_writepoints(struct bch_fs *c)
{
	struct write_point *wp;

	if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
	    too_many_writepoints(c, 32))
		return false;

	wp = c->write_points + c->write_points_nr++;
	hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
	return true;
}

static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp;
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&c->write_points_hash_lock);
	if (c->write_points_nr < old_nr) {
		mutex_unlock(&c->write_points_hash_lock);
		return true;
	}

	if (c->write_points_nr == 1 ||
	    !too_many_writepoints(c, 8)) {
		mutex_unlock(&c->write_points_hash_lock);
		return false;
	}

	wp = c->write_points + --c->write_points_nr;

	hlist_del_rcu(&wp->node);
	mutex_unlock(&c->write_points_hash_lock);

	bch2_trans_mutex_lock_norelock(trans, &wp->lock);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		open_bucket_free_unused(c, ob);
	wp->ptrs.nr = 0;
	mutex_unlock(&wp->lock);
	return true;
}
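
/*
 * Note on the write_point_specifier values handled below: one with the low bit
 * clear is a direct pointer to a struct write_point, while one with the low
 * bit set is a hashed cookie that gets mapped (via the hash table above) to a
 * slot in c->write_points, stealing the least recently used slot if the cookie
 * isn't already present.
 */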
static struct write_point *writepoint_find(struct btree_trans *trans,
					   unsigned long write_point)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp, *oldest;
	struct hlist_head *head;

	if (!(write_point & 1UL)) {
		wp = (struct write_point *) write_point;
		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
		return wp;
	}

	head = writepoint_hash(c, write_point);
restart_find:
	wp = __writepoint_find(head, write_point);
	if (wp) {
lock_wp:
		bch2_trans_mutex_lock_norelock(trans, &wp->lock);
		if (wp->write_point == write_point)
			goto out;
		mutex_unlock(&wp->lock);
		goto restart_find;
	}
restart_find_oldest:
	oldest = NULL;
	for (wp = c->write_points;
	     wp < c->write_points + c->write_points_nr; wp++)
		if (!oldest || time_before64(wp->last_used, oldest->last_used))
			oldest = wp;

	bch2_trans_mutex_lock_norelock(trans, &oldest->lock);
	bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock);
	if (oldest >= c->write_points + c->write_points_nr ||
	    try_increase_writepoints(c)) {
		mutex_unlock(&c->write_points_hash_lock);
		mutex_unlock(&oldest->lock);
		goto restart_find_oldest;
	}

	wp = __writepoint_find(head, write_point);
	if (wp && wp != oldest) {
		mutex_unlock(&c->write_points_hash_lock);
		mutex_unlock(&oldest->lock);
		goto lock_wp;
	}

	wp = oldest;
	hlist_del_rcu(&wp->node);
	wp->write_point = write_point;
	hlist_add_head_rcu(&wp->node, head);
	mutex_unlock(&c->write_points_hash_lock);
out:
	wp->last_used = local_clock();
	return wp;
}

static noinline void
deallocate_extra_replicas(struct bch_fs *c,
			  struct alloc_request *req)
{
	struct open_bucket *ob;
	unsigned extra_replicas = req->nr_effective - req->nr_replicas;
	unsigned i;

	req->scratch_ptrs.nr = 0;

	open_bucket_for_each(c, &req->ptrs, ob, i) {
		unsigned d = ob_dev(c, ob)->mi.durability;

		if (d && d <= extra_replicas) {
			extra_replicas -= d;
			ob_push(c, &req->wp->ptrs, ob);
		} else {
			ob_push(c, &req->scratch_ptrs, ob);
		}
	}

	req->ptrs = req->scratch_ptrs;
}

/*
 * Get us an open_bucket we can allocate from, return with it locked:
 */
int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
				   unsigned target,
				   unsigned erasure_code,
				   struct write_point_specifier write_point,
				   struct bch_devs_list *devs_have,
				   unsigned nr_replicas,
				   unsigned nr_replicas_required,
				   enum bch_watermark watermark,
				   enum bch_write_flags flags,
				   struct closure *cl,
				   struct write_point **wp_ret)
{
	struct bch_fs *c = trans->c;
	struct open_bucket *ob;
	unsigned write_points_nr;
	int i;

	struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req));
	int ret = PTR_ERR_OR_ZERO(req);
	if (unlikely(ret))
		return ret;

	if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
		erasure_code = false;

	req->nr_replicas = nr_replicas;
	req->target = target;
	req->ec = erasure_code;
	req->watermark = watermark;
	req->flags = flags;
	req->devs_have = devs_have;

	BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
	req->ptrs.nr = 0;
	req->nr_effective = 0;
	req->have_cache = false;
	write_points_nr = c->write_points_nr;

	*wp_ret = req->wp = writepoint_find(trans, write_point.v);

	req->data_type = req->wp->data_type;

	ret = bch2_trans_relock(trans);
	if (ret)
		goto err;

	/* metadata may not allocate on cache devices: */
	if (req->data_type != BCH_DATA_user)
		req->have_cache = true;

	if (target && !(flags & BCH_WRITE_only_specified_devs)) {
		ret = open_bucket_add_buckets(trans, req, NULL);
		if (!ret ||
		    bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto alloc_done;

		/* Don't retry from all devices if we're out of open buckets: */
		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
			int ret2 = open_bucket_add_buckets(trans, req, cl);
			if (!ret2 ||
			    bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
			    bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
				ret = ret2;
				goto alloc_done;
			}
		}

		/*
		 * Only try to allocate cache (durability = 0 devices) from the
		 * specified target:
		 */
		req->have_cache = true;
		req->target = 0;

		ret = open_bucket_add_buckets(trans, req, cl);
	} else {
		ret = open_bucket_add_buckets(trans, req, cl);
	}
alloc_done:
	BUG_ON(!ret && req->nr_effective < req->nr_replicas);

	if (erasure_code && !ec_open_bucket(c, &req->ptrs))
		pr_debug("failed to get ec bucket: ret %u", ret);

	if (ret == -BCH_ERR_insufficient_devices &&
	    req->nr_effective >= nr_replicas_required)
		ret = 0;

	if (ret)
		goto err;

	if (req->nr_effective > req->nr_replicas)
		deallocate_extra_replicas(c, req);

	/* Free buckets we didn't use: */
	open_bucket_for_each(c, &req->wp->ptrs, ob, i)
		open_bucket_free_unused(c, ob);

	req->wp->ptrs = req->ptrs;

	req->wp->sectors_free = UINT_MAX;

	open_bucket_for_each(c, &req->wp->ptrs, ob, i) {
		/*
		 * Ensure proper write alignment - either due to misaligned
		 * bucket sizes (from buggy bcachefs-tools), or writes that mix
		 * logical/physical alignment:
		 */
		struct bch_dev *ca = ob_dev(c, ob);
		u64 offset = bucket_to_sector(ca, ob->bucket) +
			ca->mi.bucket_size -
			ob->sectors_free;
		unsigned align = round_up(offset, block_sectors(c)) - offset;

		ob->sectors_free = max_t(int, 0, ob->sectors_free - align);

		req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free);
	}

	req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c));

	/* Did alignment use up space in an open_bucket? */
	if (unlikely(!req->wp->sectors_free)) {
		bch2_alloc_sectors_done(c, req->wp);
		goto retry;
	}

	BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX);

	return 0;
err:
	open_bucket_for_each(c, &req->wp->ptrs, ob, i)
		if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v))
			ob_push(c, &req->ptrs, ob);
		else
			open_bucket_free_unused(c, ob);
	req->wp->ptrs = req->ptrs;

	mutex_unlock(&req->wp->lock);

	if (bch2_err_matches(ret, BCH_ERR_freelist_empty) &&
	    try_decrease_writepoints(trans, write_points_nr))
		goto retry;

	if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
		ret = bch_err_throw(c, bucket_alloc_blocked);

	if (cl && !(flags & BCH_WRITE_alloc_nowait) &&
	    bch2_err_matches(ret, BCH_ERR_freelist_empty))
		ret = bch_err_throw(c, bucket_alloc_blocked);

	return ret;
}

void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
				    struct bkey_i *k, unsigned sectors,
				    bool cached)
{
	bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached);
}

/*
 * Append pointers to the space we just allocated to @k, and mark @sectors space
 * as allocated out of @ob
 */
void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
{
	bch2_alloc_sectors_done_inlined(c, wp);
}

static inline void writepoint_init(struct write_point *wp,
				   enum bch_data_type type)
{
	mutex_init(&wp->lock);
	wp->data_type = type;

	INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates);
	INIT_LIST_HEAD(&wp->writes);
	spin_lock_init(&wp->writes_lock);
}

void bch2_fs_allocator_foreground_init(struct bch_fs *c)
{
	struct open_bucket *ob;
	struct write_point *wp;

	mutex_init(&c->write_points_hash_lock);
	c->write_points_nr = ARRAY_SIZE(c->write_points);

	/* open bucket 0 is a sentinel NULL: */
	spin_lock_init(&c->open_buckets[0].lock);

	for (ob = c->open_buckets + 1;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
		spin_lock_init(&ob->lock);
		c->open_buckets_nr_free++;

		ob->freelist = c->open_buckets_freelist;
		c->open_buckets_freelist = ob - c->open_buckets;
	}

	writepoint_init(&c->btree_write_point, BCH_DATA_btree);
	writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
	writepoint_init(&c->copygc_write_point, BCH_DATA_user);

	for (wp = c->write_points;
	     wp < c->write_points + c->write_points_nr; wp++) {
		writepoint_init(wp, BCH_DATA_user);

		wp->last_used = local_clock();
		wp->write_point = (unsigned long) wp;
		hlist_add_head_rcu(&wp->node,
				   writepoint_hash(c, wp->write_point));
	}
}

void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob)
{
	struct bch_dev *ca = ob_dev(c, ob);
	unsigned data_type = ob->data_type;
	barrier(); /* READ_ONCE() doesn't work on bitfields */

	prt_printf(out, "%zu ref %u ",
		   ob - c->open_buckets,
		   atomic_read(&ob->pin));
	bch2_prt_data_type(out, data_type);
	prt_printf(out, " %u:%llu gen %u allocated %u/%u",
		   ob->dev, ob->bucket, ob->gen,
		   ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
	if (ob->ec)
		prt_printf(out, " ec idx %llu", ob->ec->idx);
	if (ob->on_partial_list)
		prt_str(out, " partial");
	prt_newline(out);
}

void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
			       struct bch_dev *ca)
{
	struct open_bucket *ob;

	out->atomic++;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && (!ca || ob->dev == ca->dev_idx))
			bch2_open_bucket_to_text(out, c, ob);
		spin_unlock(&ob->lock);
	}

	--out->atomic;
}

void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c)
{
	unsigned i;

	out->atomic++;
	spin_lock(&c->freelist_lock);

	for (i = 0; i < c->open_buckets_partial_nr; i++)
		bch2_open_bucket_to_text(out, c,
					 c->open_buckets + c->open_buckets_partial[i]);

	spin_unlock(&c->freelist_lock);
	--out->atomic;
}

static const char * const bch2_write_point_states[] = {
#define x(n) #n,
	WRITE_POINT_STATES()
#undef x
	NULL
};

static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c,
				     struct write_point *wp)
{
	struct open_bucket *ob;
	unsigned i;

	mutex_lock(&wp->lock);

	prt_printf(out, "%lu: ", wp->write_point);
	prt_human_readable_u64(out, wp->sectors_allocated << 9);

	prt_printf(out, " last wrote: ");
	bch2_pr_time_units(out, sched_clock() - wp->last_used);

	for (i = 0; i < WRITE_POINT_STATE_NR; i++) {
		prt_printf(out, " %s: ", bch2_write_point_states[i]);
		bch2_pr_time_units(out, wp->time[i]);
	}

	prt_newline(out);

	printbuf_indent_add(out, 2);
	open_bucket_for_each(c, &wp->ptrs, ob, i)
		bch2_open_bucket_to_text(out, c, ob);
	printbuf_indent_sub(out, 2);

	mutex_unlock(&wp->lock);
}

void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct write_point *wp;

	prt_str(out, "Foreground write points\n");
	for (wp = c->write_points;
	     wp < c->write_points + ARRAY_SIZE(c->write_points);
	     wp++)
		bch2_write_point_to_text(out, c, wp);

	prt_str(out, "Copygc write point\n");
	bch2_write_point_to_text(out, c, &c->copygc_write_point);

	prt_str(out, "Rebalance write point\n");
	bch2_write_point_to_text(out, c, &c->rebalance_write_point);

	prt_str(out, "Btree write point\n");
	bch2_write_point_to_text(out, c, &c->btree_write_point);
}

void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
{
	unsigned nr[BCH_DATA_NR];

	memset(nr, 0, sizeof(nr));

	for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
		nr[c->open_buckets[i].data_type]++;

	printbuf_tabstops_reset(out);
	printbuf_tabstop_push(out, 24);

	prt_printf(out, "capacity\t%llu\n", c->capacity);
	prt_printf(out, "reserved\t%llu\n", c->reserved);
	prt_printf(out, "hidden\t%llu\n", percpu_u64_get(&c->usage->hidden));
	prt_printf(out, "btree\t%llu\n", percpu_u64_get(&c->usage->btree));
	prt_printf(out, "data\t%llu\n", percpu_u64_get(&c->usage->data));
	prt_printf(out, "cached\t%llu\n", percpu_u64_get(&c->usage->cached));
	prt_printf(out, "reserved\t%llu\n", percpu_u64_get(&c->usage->reserved));
	prt_printf(out, "online_reserved\t%llu\n", percpu_u64_get(c->online_reserved));
	prt_printf(out, "nr_inodes\t%llu\n", percpu_u64_get(&c->usage->nr_inodes));

	prt_newline(out);
	prt_printf(out, "freelist_wait\t%s\n", c->freelist_wait.list.first ? "waiting" : "empty");
	prt_printf(out, "open buckets allocated\t%i\n", OPEN_BUCKETS_COUNT - c->open_buckets_nr_free);
	prt_printf(out, "open buckets total\t%u\n", OPEN_BUCKETS_COUNT);
	prt_printf(out, "open_buckets_wait\t%s\n", c->open_buckets_wait.list.first ? "waiting" : "empty");
	prt_printf(out, "open_buckets_btree\t%u\n", nr[BCH_DATA_btree]);
	prt_printf(out, "open_buckets_user\t%u\n", nr[BCH_DATA_user]);
	prt_printf(out, "btree reserve cache\t%u\n", c->btree_reserve_cache_nr);
}

void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
{
	struct bch_fs *c = ca->fs;
	struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca);
	unsigned nr[BCH_DATA_NR];

	memset(nr, 0, sizeof(nr));

	for (unsigned i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
		nr[c->open_buckets[i].data_type]++;

	bch2_dev_usage_to_text(out, ca, &stats);

	prt_newline(out);

	prt_printf(out, "reserves:\n");
	for (unsigned i = 0; i < BCH_WATERMARK_NR; i++)
		prt_printf(out, "%s\t%llu\r\n", bch2_watermarks[i], bch2_dev_buckets_reserved(ca, i));

	prt_newline(out);

	printbuf_tabstops_reset(out);
	printbuf_tabstop_push(out, 12);
	printbuf_tabstop_push(out, 16);

	prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets);
	prt_printf(out, "buckets to invalidate\t%llu\r\n",
		   should_invalidate_buckets(ca, bch2_dev_usage_read(ca)));
}

static noinline void bch2_print_allocator_stuck(struct bch_fs *c)
{
	struct printbuf buf = PRINTBUF;

	prt_printf(&buf, "Allocator stuck? Waited for %u seconds\n",
		   c->opts.allocator_stuck_timeout);

	prt_printf(&buf, "Allocator debug:\n");
	printbuf_indent_add(&buf, 2);
	bch2_fs_alloc_debug_to_text(&buf, c);
	printbuf_indent_sub(&buf, 2);
	prt_newline(&buf);

	bch2_printbuf_make_room(&buf, 4096);

	buf.atomic++;
	scoped_guard(rcu)
		for_each_online_member_rcu(c, ca) {
			prt_printf(&buf, "Dev %u:\n", ca->dev_idx);
			printbuf_indent_add(&buf, 2);
			bch2_dev_alloc_debug_to_text(&buf, ca);
			printbuf_indent_sub(&buf, 2);
			prt_newline(&buf);
		}
	--buf.atomic;

	prt_printf(&buf, "Copygc debug:\n");
	printbuf_indent_add(&buf, 2);
	bch2_copygc_wait_to_text(&buf, c);
	printbuf_indent_sub(&buf, 2);
	prt_newline(&buf);

	prt_printf(&buf, "Journal debug:\n");
	printbuf_indent_add(&buf, 2);
	bch2_journal_debug_to_text(&buf, &c->journal);
	printbuf_indent_sub(&buf, 2);

	bch2_print_str(c, KERN_ERR, buf.buf);
	printbuf_exit(&buf);
}

static inline unsigned allocator_wait_timeout(struct bch_fs *c)
{
	if (c->allocator_last_stuck &&
	    time_after(c->allocator_last_stuck + HZ * 60 * 2, jiffies))
		return 0;

	return c->opts.allocator_stuck_timeout * HZ;
}

void __bch2_wait_on_allocator(struct bch_fs *c, struct closure *cl)
{
	unsigned t = allocator_wait_timeout(c);

	if (t && closure_sync_timeout(cl, t)) {
		c->allocator_last_stuck = jiffies;
		bch2_print_allocator_stuck(c);
	}

	closure_sync(cl);
}