1 /* 2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> 3 * 4 * Uses a block device as cache for other block devices; optimized for SSDs. 5 * All allocation is done in buckets, which should match the erase block size 6 * of the device. 7 * 8 * Buckets containing cached data are kept on a heap sorted by priority; 9 * bucket priority is increased on cache hit, and periodically all the buckets 10 * on the heap have their priority scaled down. This currently is just used as 11 * an LRU but in the future should allow for more intelligent heuristics. 12 * 13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the 14 * counter. Garbage collection is used to remove stale pointers. 15 * 16 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather 17 * as keys are inserted we only sort the pages that have not yet been written. 18 * When garbage collection is run, we resort the entire node. 19 * 20 * All configuration is done via sysfs; see Documentation/bcache.txt. 21 */ 22 23 #include "bcache.h" 24 #include "btree.h" 25 #include "debug.h" 26 #include "extents.h" 27 28 #include <linux/slab.h> 29 #include <linux/bitops.h> 30 #include <linux/hash.h> 31 #include <linux/kthread.h> 32 #include <linux/prefetch.h> 33 #include <linux/random.h> 34 #include <linux/rcupdate.h> 35 #include <linux/sched/clock.h> 36 #include <linux/rculist.h> 37 38 #include <trace/events/bcache.h> 39 40 /* 41 * Todo: 42 * register_bcache: Return errors out to userspace correctly 43 * 44 * Writeback: don't undirty key until after a cache flush 45 * 46 * Create an iterator for key pointers 47 * 48 * On btree write error, mark bucket such that it won't be freed from the cache 49 * 50 * Journalling: 51 * Check for bad keys in replay 52 * Propagate barriers 53 * Refcount journal entries in journal_replay 54 * 55 * Garbage collection: 56 * Finish incremental gc 57 * Gc should free old UUIDs, data for invalid UUIDs 58 * 59 * Provide a way to list backing device UUIDs we have data cached for, and 60 * probably how long it's been since we've seen them, and a way to invalidate 61 * dirty data for devices that will never be attached again 62 * 63 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so 64 * that based on that and how much dirty data we have we can keep writeback 65 * from being starved 66 * 67 * Add a tracepoint or somesuch to watch for writeback starvation 68 * 69 * When btree depth > 1 and splitting an interior node, we have to make sure 70 * alloc_bucket() cannot fail. This should be true but is not completely 71 * obvious. 72 * 73 * Plugging? 74 * 75 * If data write is less than hard sector size of ssd, round up offset in open 76 * bucket to the next whole sector 77 * 78 * Superblock needs to be fleshed out for multiple cache devices 79 * 80 * Add a sysfs tunable for the number of writeback IOs in flight 81 * 82 * Add a sysfs tunable for the number of open data buckets 83 * 84 * IO tracking: Can we track when one process is doing io on behalf of another? 
85 * IO tracking: Don't use just an average, weigh more recent stuff higher 86 * 87 * Test module load/unload 88 */ 89 90 #define MAX_NEED_GC 64 91 #define MAX_SAVE_PRIO 72 92 93 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) 94 95 #define PTR_HASH(c, k) \ 96 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) 97 98 #define insert_lock(s, b) ((b)->level <= (s)->lock) 99 100 /* 101 * These macros are for recursing down the btree - they handle the details of 102 * locking and looking up nodes in the cache for you. They're best treated as 103 * mere syntax when reading code that uses them. 104 * 105 * op->lock determines whether we take a read or a write lock at a given depth. 106 * If you've got a read lock and find that you need a write lock (i.e. you're 107 * going to have to split), set op->lock and return -EINTR; btree_root() will 108 * call you again and you'll have the correct lock. 109 */ 110 111 /** 112 * btree - recurse down the btree on a specified key 113 * @fn: function to call, which will be passed the child node 114 * @key: key to recurse on 115 * @b: parent btree node 116 * @op: pointer to struct btree_op 117 */ 118 #define btree(fn, key, b, op, ...) \ 119 ({ \ 120 int _r, l = (b)->level - 1; \ 121 bool _w = l <= (op)->lock; \ 122 struct btree *_child = bch_btree_node_get((b)->c, op, key, l, \ 123 _w, b); \ 124 if (!IS_ERR(_child)) { \ 125 _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__); \ 126 rw_unlock(_w, _child); \ 127 } else \ 128 _r = PTR_ERR(_child); \ 129 _r; \ 130 }) 131 132 /** 133 * btree_root - call a function on the root of the btree 134 * @fn: function to call, which will be passed the child node 135 * @c: cache set 136 * @op: pointer to struct btree_op 137 */ 138 #define btree_root(fn, c, op, ...) \ 139 ({ \ 140 int _r = -EINTR; \ 141 do { \ 142 struct btree *_b = (c)->root; \ 143 bool _w = insert_lock(op, _b); \ 144 rw_lock(_w, _b, _b->level); \ 145 if (_b == (c)->root && \ 146 _w == insert_lock(op, _b)) { \ 147 _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ 148 } \ 149 rw_unlock(_w, _b); \ 150 bch_cannibalize_unlock(c); \ 151 if (_r == -EINTR) \ 152 schedule(); \ 153 } while (_r == -EINTR); \ 154 \ 155 finish_wait(&(c)->btree_cache_wait, &(op)->wait); \ 156 _r; \ 157 }) 158 159 static inline struct bset *write_block(struct btree *b) 160 { 161 return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c); 162 } 163 164 static void bch_btree_init_next(struct btree *b) 165 { 166 /* If not a leaf node, always sort */ 167 if (b->level && b->keys.nsets) 168 bch_btree_sort(&b->keys, &b->c->sort); 169 else 170 bch_btree_sort_lazy(&b->keys, &b->c->sort); 171 172 if (b->written < btree_blocks(b)) 173 bch_bset_init_next(&b->keys, write_block(b), 174 bset_magic(&b->c->sb)); 175 176 } 177 178 /* Btree key manipulation */ 179 180 void bkey_put(struct cache_set *c, struct bkey *k) 181 { 182 unsigned i; 183 184 for (i = 0; i < KEY_PTRS(k); i++) 185 if (ptr_available(c, k, i)) 186 atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); 187 } 188 189 /* Btree IO */ 190 191 static uint64_t btree_csum_set(struct btree *b, struct bset *i) 192 { 193 uint64_t crc = b->key.ptr[0]; 194 void *data = (void *) i + 8, *end = bset_bkey_last(i); 195 196 crc = bch_crc64_update(crc, data, end - data); 197 return crc ^ 0xffffffffffffffffULL; 198 } 199 200 void bch_btree_node_read_done(struct btree *b) 201 { 202 const char *err = "bad btree header"; 203 struct bset *i = btree_bset_first(b); 204 struct btree_iter *iter; 205 206 iter = mempool_alloc(b->c->fill_iter, GFP_NOIO); 207 iter->size = 
b->c->sb.bucket_size / b->c->sb.block_size; 208 iter->used = 0; 209 210 #ifdef CONFIG_BCACHE_DEBUG 211 iter->b = &b->keys; 212 #endif 213 214 if (!i->seq) 215 goto err; 216 217 for (; 218 b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; 219 i = write_block(b)) { 220 err = "unsupported bset version"; 221 if (i->version > BCACHE_BSET_VERSION) 222 goto err; 223 224 err = "bad btree header"; 225 if (b->written + set_blocks(i, block_bytes(b->c)) > 226 btree_blocks(b)) 227 goto err; 228 229 err = "bad magic"; 230 if (i->magic != bset_magic(&b->c->sb)) 231 goto err; 232 233 err = "bad checksum"; 234 switch (i->version) { 235 case 0: 236 if (i->csum != csum_set(i)) 237 goto err; 238 break; 239 case BCACHE_BSET_VERSION: 240 if (i->csum != btree_csum_set(b, i)) 241 goto err; 242 break; 243 } 244 245 err = "empty set"; 246 if (i != b->keys.set[0].data && !i->keys) 247 goto err; 248 249 bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); 250 251 b->written += set_blocks(i, block_bytes(b->c)); 252 } 253 254 err = "corrupted btree"; 255 for (i = write_block(b); 256 bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); 257 i = ((void *) i) + block_bytes(b->c)) 258 if (i->seq == b->keys.set[0].data->seq) 259 goto err; 260 261 bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); 262 263 i = b->keys.set[0].data; 264 err = "short btree key"; 265 if (b->keys.set[0].size && 266 bkey_cmp(&b->key, &b->keys.set[0].end) < 0) 267 goto err; 268 269 if (b->written < btree_blocks(b)) 270 bch_bset_init_next(&b->keys, write_block(b), 271 bset_magic(&b->c->sb)); 272 out: 273 mempool_free(iter, b->c->fill_iter); 274 return; 275 err: 276 set_btree_node_io_error(b); 277 bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", 278 err, PTR_BUCKET_NR(b->c, &b->key, 0), 279 bset_block_offset(b, i), i->keys); 280 goto out; 281 } 282 283 static void btree_node_read_endio(struct bio *bio) 284 { 285 struct closure *cl = bio->bi_private; 286 closure_put(cl); 287 } 288 289 static void bch_btree_node_read(struct btree *b) 290 { 291 uint64_t start_time = local_clock(); 292 struct closure cl; 293 struct bio *bio; 294 295 trace_bcache_btree_read(b); 296 297 closure_init_stack(&cl); 298 299 bio = bch_bbio_alloc(b->c); 300 bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9; 301 bio->bi_end_io = btree_node_read_endio; 302 bio->bi_private = &cl; 303 bio->bi_opf = REQ_OP_READ | REQ_META; 304 305 bch_bio_map(bio, b->keys.set[0].data); 306 307 bch_submit_bbio(bio, b->c, &b->key, 0); 308 closure_sync(&cl); 309 310 if (bio->bi_error) 311 set_btree_node_io_error(b); 312 313 bch_bbio_free(bio, b->c); 314 315 if (btree_node_io_error(b)) 316 goto err; 317 318 bch_btree_node_read_done(b); 319 bch_time_stats_update(&b->c->btree_read_time, start_time); 320 321 return; 322 err: 323 bch_cache_set_error(b->c, "io error reading bucket %zu", 324 PTR_BUCKET_NR(b->c, &b->key, 0)); 325 } 326 327 static void btree_complete_write(struct btree *b, struct btree_write *w) 328 { 329 if (w->prio_blocked && 330 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) 331 wake_up_allocators(b->c); 332 333 if (w->journal) { 334 atomic_dec_bug(w->journal); 335 __closure_wake_up(&b->c->journal.wait); 336 } 337 338 w->prio_blocked = 0; 339 w->journal = NULL; 340 } 341 342 static void btree_node_write_unlock(struct closure *cl) 343 { 344 struct btree *b = container_of(cl, struct btree, io); 345 346 up(&b->io_mutex); 347 } 348 349 static void __btree_node_write_done(struct closure *cl) 350 { 351 struct btree *b = container_of(cl, struct 
btree, io); 352 struct btree_write *w = btree_prev_write(b); 353 354 bch_bbio_free(b->bio, b->c); 355 b->bio = NULL; 356 btree_complete_write(b, w); 357 358 if (btree_node_dirty(b)) 359 schedule_delayed_work(&b->work, 30 * HZ); 360 361 closure_return_with_destructor(cl, btree_node_write_unlock); 362 } 363 364 static void btree_node_write_done(struct closure *cl) 365 { 366 struct btree *b = container_of(cl, struct btree, io); 367 368 bio_free_pages(b->bio); 369 __btree_node_write_done(cl); 370 } 371 372 static void btree_node_write_endio(struct bio *bio) 373 { 374 struct closure *cl = bio->bi_private; 375 struct btree *b = container_of(cl, struct btree, io); 376 377 if (bio->bi_error) 378 set_btree_node_io_error(b); 379 380 bch_bbio_count_io_errors(b->c, bio, bio->bi_error, "writing btree"); 381 closure_put(cl); 382 } 383 384 static void do_btree_node_write(struct btree *b) 385 { 386 struct closure *cl = &b->io; 387 struct bset *i = btree_bset_last(b); 388 BKEY_PADDED(key) k; 389 390 i->version = BCACHE_BSET_VERSION; 391 i->csum = btree_csum_set(b, i); 392 393 BUG_ON(b->bio); 394 b->bio = bch_bbio_alloc(b->c); 395 396 b->bio->bi_end_io = btree_node_write_endio; 397 b->bio->bi_private = cl; 398 b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c)); 399 b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA; 400 bch_bio_map(b->bio, i); 401 402 /* 403 * If we're appending to a leaf node, we don't technically need FUA - 404 * this write just needs to be persisted before the next journal write, 405 * which will be marked FLUSH|FUA. 406 * 407 * Similarly if we're writing a new btree root - the pointer is going to 408 * be in the next journal entry. 409 * 410 * But if we're writing a new btree node (that isn't a root) or 411 * appending to a non leaf btree node, we need either FUA or a flush 412 * when we write the parent with the new pointer. FUA is cheaper than a 413 * flush, and writes appending to leaf nodes aren't blocking anything so 414 * just make all btree node writes FUA to keep things sane. 
415 */ 416 417 bkey_copy(&k.key, &b->key); 418 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + 419 bset_sector_offset(&b->keys, i)); 420 421 if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { 422 int j; 423 struct bio_vec *bv; 424 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); 425 426 bio_for_each_segment_all(bv, b->bio, j) 427 memcpy(page_address(bv->bv_page), 428 base + j * PAGE_SIZE, PAGE_SIZE); 429 430 bch_submit_bbio(b->bio, b->c, &k.key, 0); 431 432 continue_at(cl, btree_node_write_done, NULL); 433 } else { 434 b->bio->bi_vcnt = 0; 435 bch_bio_map(b->bio, i); 436 437 bch_submit_bbio(b->bio, b->c, &k.key, 0); 438 439 closure_sync(cl); 440 continue_at_nobarrier(cl, __btree_node_write_done, NULL); 441 } 442 } 443 444 void __bch_btree_node_write(struct btree *b, struct closure *parent) 445 { 446 struct bset *i = btree_bset_last(b); 447 448 lockdep_assert_held(&b->write_lock); 449 450 trace_bcache_btree_write(b); 451 452 BUG_ON(current->bio_list); 453 BUG_ON(b->written >= btree_blocks(b)); 454 BUG_ON(b->written && !i->keys); 455 BUG_ON(btree_bset_first(b)->seq != i->seq); 456 bch_check_keys(&b->keys, "writing"); 457 458 cancel_delayed_work(&b->work); 459 460 /* If caller isn't waiting for write, parent refcount is cache set */ 461 down(&b->io_mutex); 462 closure_init(&b->io, parent ?: &b->c->cl); 463 464 clear_bit(BTREE_NODE_dirty, &b->flags); 465 change_bit(BTREE_NODE_write_idx, &b->flags); 466 467 do_btree_node_write(b); 468 469 atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size, 470 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); 471 472 b->written += set_blocks(i, block_bytes(b->c)); 473 } 474 475 void bch_btree_node_write(struct btree *b, struct closure *parent) 476 { 477 unsigned nsets = b->keys.nsets; 478 479 lockdep_assert_held(&b->lock); 480 481 __bch_btree_node_write(b, parent); 482 483 /* 484 * do verify if there was more than one set initially (i.e. 
we did a 485 * sort) and we sorted down to a single set: 486 */ 487 if (nsets && !b->keys.nsets) 488 bch_btree_verify(b); 489 490 bch_btree_init_next(b); 491 } 492 493 static void bch_btree_node_write_sync(struct btree *b) 494 { 495 struct closure cl; 496 497 closure_init_stack(&cl); 498 499 mutex_lock(&b->write_lock); 500 bch_btree_node_write(b, &cl); 501 mutex_unlock(&b->write_lock); 502 503 closure_sync(&cl); 504 } 505 506 static void btree_node_write_work(struct work_struct *w) 507 { 508 struct btree *b = container_of(to_delayed_work(w), struct btree, work); 509 510 mutex_lock(&b->write_lock); 511 if (btree_node_dirty(b)) 512 __bch_btree_node_write(b, NULL); 513 mutex_unlock(&b->write_lock); 514 } 515 516 static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) 517 { 518 struct bset *i = btree_bset_last(b); 519 struct btree_write *w = btree_current_write(b); 520 521 lockdep_assert_held(&b->write_lock); 522 523 BUG_ON(!b->written); 524 BUG_ON(!i->keys); 525 526 if (!btree_node_dirty(b)) 527 schedule_delayed_work(&b->work, 30 * HZ); 528 529 set_btree_node_dirty(b); 530 531 if (journal_ref) { 532 if (w->journal && 533 journal_pin_cmp(b->c, w->journal, journal_ref)) { 534 atomic_dec_bug(w->journal); 535 w->journal = NULL; 536 } 537 538 if (!w->journal) { 539 w->journal = journal_ref; 540 atomic_inc(w->journal); 541 } 542 } 543 544 /* Force write if set is too big */ 545 if (set_bytes(i) > PAGE_SIZE - 48 && 546 !current->bio_list) 547 bch_btree_node_write(b, NULL); 548 } 549 550 /* 551 * Btree in memory cache - allocation/freeing 552 * mca -> memory cache 553 */ 554 555 #define mca_reserve(c) (((c->root && c->root->level) \ 556 ? c->root->level : 1) * 8 + 16) 557 #define mca_can_free(c) \ 558 max_t(int, 0, c->btree_cache_used - mca_reserve(c)) 559 560 static void mca_data_free(struct btree *b) 561 { 562 BUG_ON(b->io_mutex.count != 1); 563 564 bch_btree_keys_free(&b->keys); 565 566 b->c->btree_cache_used--; 567 list_move(&b->list, &b->c->btree_cache_freed); 568 } 569 570 static void mca_bucket_free(struct btree *b) 571 { 572 BUG_ON(btree_node_dirty(b)); 573 574 b->key.ptr[0] = 0; 575 hlist_del_init_rcu(&b->hash); 576 list_move(&b->list, &b->c->btree_cache_freeable); 577 } 578 579 static unsigned btree_order(struct bkey *k) 580 { 581 return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); 582 } 583 584 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) 585 { 586 if (!bch_btree_keys_alloc(&b->keys, 587 max_t(unsigned, 588 ilog2(b->c->btree_pages), 589 btree_order(k)), 590 gfp)) { 591 b->c->btree_cache_used++; 592 list_move(&b->list, &b->c->btree_cache); 593 } else { 594 list_move(&b->list, &b->c->btree_cache_freed); 595 } 596 } 597 598 static struct btree *mca_bucket_alloc(struct cache_set *c, 599 struct bkey *k, gfp_t gfp) 600 { 601 struct btree *b = kzalloc(sizeof(struct btree), gfp); 602 if (!b) 603 return NULL; 604 605 init_rwsem(&b->lock); 606 lockdep_set_novalidate_class(&b->lock); 607 mutex_init(&b->write_lock); 608 lockdep_set_novalidate_class(&b->write_lock); 609 INIT_LIST_HEAD(&b->list); 610 INIT_DELAYED_WORK(&b->work, btree_node_write_work); 611 b->c = c; 612 sema_init(&b->io_mutex, 1); 613 614 mca_data_alloc(b, k, gfp); 615 return b; 616 } 617 618 static int mca_reap(struct btree *b, unsigned min_order, bool flush) 619 { 620 struct closure cl; 621 622 closure_init_stack(&cl); 623 lockdep_assert_held(&b->c->bucket_lock); 624 625 if (!down_write_trylock(&b->lock)) 626 return -ENOMEM; 627 628 BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); 629 630 if 
(b->keys.page_order < min_order) 631 goto out_unlock; 632 633 if (!flush) { 634 if (btree_node_dirty(b)) 635 goto out_unlock; 636 637 if (down_trylock(&b->io_mutex)) 638 goto out_unlock; 639 up(&b->io_mutex); 640 } 641 642 mutex_lock(&b->write_lock); 643 if (btree_node_dirty(b)) 644 __bch_btree_node_write(b, &cl); 645 mutex_unlock(&b->write_lock); 646 647 closure_sync(&cl); 648 649 /* wait for any in flight btree write */ 650 down(&b->io_mutex); 651 up(&b->io_mutex); 652 653 return 0; 654 out_unlock: 655 rw_unlock(true, b); 656 return -ENOMEM; 657 } 658 659 static unsigned long bch_mca_scan(struct shrinker *shrink, 660 struct shrink_control *sc) 661 { 662 struct cache_set *c = container_of(shrink, struct cache_set, shrink); 663 struct btree *b, *t; 664 unsigned long i, nr = sc->nr_to_scan; 665 unsigned long freed = 0; 666 667 if (c->shrinker_disabled) 668 return SHRINK_STOP; 669 670 if (c->btree_cache_alloc_lock) 671 return SHRINK_STOP; 672 673 /* Return -1 if we can't do anything right now */ 674 if (sc->gfp_mask & __GFP_IO) 675 mutex_lock(&c->bucket_lock); 676 else if (!mutex_trylock(&c->bucket_lock)) 677 return -1; 678 679 /* 680 * It's _really_ critical that we don't free too many btree nodes - we 681 * have to always leave ourselves a reserve. The reserve is how we 682 * guarantee that allocating memory for a new btree node can always 683 * succeed, so that inserting keys into the btree can always succeed and 684 * IO can always make forward progress: 685 */ 686 nr /= c->btree_pages; 687 nr = min_t(unsigned long, nr, mca_can_free(c)); 688 689 i = 0; 690 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { 691 if (freed >= nr) 692 break; 693 694 if (++i > 3 && 695 !mca_reap(b, 0, false)) { 696 mca_data_free(b); 697 rw_unlock(true, b); 698 freed++; 699 } 700 } 701 702 for (i = 0; (nr--) && i < c->btree_cache_used; i++) { 703 if (list_empty(&c->btree_cache)) 704 goto out; 705 706 b = list_first_entry(&c->btree_cache, struct btree, list); 707 list_rotate_left(&c->btree_cache); 708 709 if (!b->accessed && 710 !mca_reap(b, 0, false)) { 711 mca_bucket_free(b); 712 mca_data_free(b); 713 rw_unlock(true, b); 714 freed++; 715 } else 716 b->accessed = 0; 717 } 718 out: 719 mutex_unlock(&c->bucket_lock); 720 return freed; 721 } 722 723 static unsigned long bch_mca_count(struct shrinker *shrink, 724 struct shrink_control *sc) 725 { 726 struct cache_set *c = container_of(shrink, struct cache_set, shrink); 727 728 if (c->shrinker_disabled) 729 return 0; 730 731 if (c->btree_cache_alloc_lock) 732 return 0; 733 734 return mca_can_free(c) * c->btree_pages; 735 } 736 737 void bch_btree_cache_free(struct cache_set *c) 738 { 739 struct btree *b; 740 struct closure cl; 741 closure_init_stack(&cl); 742 743 if (c->shrink.list.next) 744 unregister_shrinker(&c->shrink); 745 746 mutex_lock(&c->bucket_lock); 747 748 #ifdef CONFIG_BCACHE_DEBUG 749 if (c->verify_data) 750 list_move(&c->verify_data->list, &c->btree_cache); 751 752 free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c))); 753 #endif 754 755 list_splice(&c->btree_cache_freeable, 756 &c->btree_cache); 757 758 while (!list_empty(&c->btree_cache)) { 759 b = list_first_entry(&c->btree_cache, struct btree, list); 760 761 if (btree_node_dirty(b)) 762 btree_complete_write(b, btree_current_write(b)); 763 clear_bit(BTREE_NODE_dirty, &b->flags); 764 765 mca_data_free(b); 766 } 767 768 while (!list_empty(&c->btree_cache_freed)) { 769 b = list_first_entry(&c->btree_cache_freed, 770 struct btree, list); 771 list_del(&b->list); 772 
cancel_delayed_work_sync(&b->work); 773 kfree(b); 774 } 775 776 mutex_unlock(&c->bucket_lock); 777 } 778 779 int bch_btree_cache_alloc(struct cache_set *c) 780 { 781 unsigned i; 782 783 for (i = 0; i < mca_reserve(c); i++) 784 if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) 785 return -ENOMEM; 786 787 list_splice_init(&c->btree_cache, 788 &c->btree_cache_freeable); 789 790 #ifdef CONFIG_BCACHE_DEBUG 791 mutex_init(&c->verify_lock); 792 793 c->verify_ondisk = (void *) 794 __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c))); 795 796 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 797 798 if (c->verify_data && 799 c->verify_data->keys.set->data) 800 list_del_init(&c->verify_data->list); 801 else 802 c->verify_data = NULL; 803 #endif 804 805 c->shrink.count_objects = bch_mca_count; 806 c->shrink.scan_objects = bch_mca_scan; 807 c->shrink.seeks = 4; 808 c->shrink.batch = c->btree_pages * 2; 809 register_shrinker(&c->shrink); 810 811 return 0; 812 } 813 814 /* Btree in memory cache - hash table */ 815 816 static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) 817 { 818 return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; 819 } 820 821 static struct btree *mca_find(struct cache_set *c, struct bkey *k) 822 { 823 struct btree *b; 824 825 rcu_read_lock(); 826 hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) 827 if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) 828 goto out; 829 b = NULL; 830 out: 831 rcu_read_unlock(); 832 return b; 833 } 834 835 static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) 836 { 837 struct task_struct *old; 838 839 old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current); 840 if (old && old != current) { 841 if (op) 842 prepare_to_wait(&c->btree_cache_wait, &op->wait, 843 TASK_UNINTERRUPTIBLE); 844 return -EINTR; 845 } 846 847 return 0; 848 } 849 850 static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, 851 struct bkey *k) 852 { 853 struct btree *b; 854 855 trace_bcache_btree_cache_cannibalize(c); 856 857 if (mca_cannibalize_lock(c, op)) 858 return ERR_PTR(-EINTR); 859 860 list_for_each_entry_reverse(b, &c->btree_cache, list) 861 if (!mca_reap(b, btree_order(k), false)) 862 return b; 863 864 list_for_each_entry_reverse(b, &c->btree_cache, list) 865 if (!mca_reap(b, btree_order(k), true)) 866 return b; 867 868 WARN(1, "btree cache cannibalize failed\n"); 869 return ERR_PTR(-ENOMEM); 870 } 871 872 /* 873 * We can only have one thread cannibalizing other cached btree nodes at a time, 874 * or we'll deadlock. We use an open coded mutex to ensure that, which a 875 * cannibalize_bucket() will take. This means every time we unlock the root of 876 * the btree, we need to release this lock if we have it held. 877 */ 878 static void bch_cannibalize_unlock(struct cache_set *c) 879 { 880 if (c->btree_cache_alloc_lock == current) { 881 c->btree_cache_alloc_lock = NULL; 882 wake_up(&c->btree_cache_wait); 883 } 884 } 885 886 static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, 887 struct bkey *k, int level) 888 { 889 struct btree *b; 890 891 BUG_ON(current->bio_list); 892 893 lockdep_assert_held(&c->bucket_lock); 894 895 if (mca_find(c, k)) 896 return NULL; 897 898 /* btree_free() doesn't free memory; it sticks the node on the end of 899 * the list. 
Check if there's any freed nodes there: 900 */ 901 list_for_each_entry(b, &c->btree_cache_freeable, list) 902 if (!mca_reap(b, btree_order(k), false)) 903 goto out; 904 905 /* We never free struct btree itself, just the memory that holds the on 906 * disk node. Check the freed list before allocating a new one: 907 */ 908 list_for_each_entry(b, &c->btree_cache_freed, list) 909 if (!mca_reap(b, 0, false)) { 910 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 911 if (!b->keys.set[0].data) 912 goto err; 913 else 914 goto out; 915 } 916 917 b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); 918 if (!b) 919 goto err; 920 921 BUG_ON(!down_write_trylock(&b->lock)); 922 if (!b->keys.set->data) 923 goto err; 924 out: 925 BUG_ON(b->io_mutex.count != 1); 926 927 bkey_copy(&b->key, k); 928 list_move(&b->list, &c->btree_cache); 929 hlist_del_init_rcu(&b->hash); 930 hlist_add_head_rcu(&b->hash, mca_hash(c, k)); 931 932 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 933 b->parent = (void *) ~0UL; 934 b->flags = 0; 935 b->written = 0; 936 b->level = level; 937 938 if (!b->level) 939 bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, 940 &b->c->expensive_debug_checks); 941 else 942 bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, 943 &b->c->expensive_debug_checks); 944 945 return b; 946 err: 947 if (b) 948 rw_unlock(true, b); 949 950 b = mca_cannibalize(c, op, k); 951 if (!IS_ERR(b)) 952 goto out; 953 954 return b; 955 } 956 957 /** 958 * bch_btree_node_get - find a btree node in the cache and lock it, reading it 959 * in from disk if necessary. 960 * 961 * If IO is necessary and running under generic_make_request, returns -EAGAIN. 962 * 963 * The btree node will have either a read or a write lock held, depending on 964 * level and op->lock. 965 */ 966 struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, 967 struct bkey *k, int level, bool write, 968 struct btree *parent) 969 { 970 int i = 0; 971 struct btree *b; 972 973 BUG_ON(level < 0); 974 retry: 975 b = mca_find(c, k); 976 977 if (!b) { 978 if (current->bio_list) 979 return ERR_PTR(-EAGAIN); 980 981 mutex_lock(&c->bucket_lock); 982 b = mca_alloc(c, op, k, level); 983 mutex_unlock(&c->bucket_lock); 984 985 if (!b) 986 goto retry; 987 if (IS_ERR(b)) 988 return b; 989 990 bch_btree_node_read(b); 991 992 if (!write) 993 downgrade_write(&b->lock); 994 } else { 995 rw_lock(write, b, level); 996 if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { 997 rw_unlock(write, b); 998 goto retry; 999 } 1000 BUG_ON(b->level != level); 1001 } 1002 1003 b->parent = parent; 1004 b->accessed = 1; 1005 1006 for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { 1007 prefetch(b->keys.set[i].tree); 1008 prefetch(b->keys.set[i].data); 1009 } 1010 1011 for (; i <= b->keys.nsets; i++) 1012 prefetch(b->keys.set[i].data); 1013 1014 if (btree_node_io_error(b)) { 1015 rw_unlock(write, b); 1016 return ERR_PTR(-EIO); 1017 } 1018 1019 BUG_ON(!b->written); 1020 1021 return b; 1022 } 1023 1024 static void btree_node_prefetch(struct btree *parent, struct bkey *k) 1025 { 1026 struct btree *b; 1027 1028 mutex_lock(&parent->c->bucket_lock); 1029 b = mca_alloc(parent->c, NULL, k, parent->level - 1); 1030 mutex_unlock(&parent->c->bucket_lock); 1031 1032 if (!IS_ERR_OR_NULL(b)) { 1033 b->parent = parent; 1034 bch_btree_node_read(b); 1035 rw_unlock(true, b); 1036 } 1037 } 1038 1039 /* Btree alloc */ 1040 1041 static void btree_node_free(struct btree *b) 1042 { 1043 trace_bcache_btree_node_free(b); 1044 1045 BUG_ON(b == b->c->root); 1046 1047 
mutex_lock(&b->write_lock); 1048 1049 if (btree_node_dirty(b)) 1050 btree_complete_write(b, btree_current_write(b)); 1051 clear_bit(BTREE_NODE_dirty, &b->flags); 1052 1053 mutex_unlock(&b->write_lock); 1054 1055 cancel_delayed_work(&b->work); 1056 1057 mutex_lock(&b->c->bucket_lock); 1058 bch_bucket_free(b->c, &b->key); 1059 mca_bucket_free(b); 1060 mutex_unlock(&b->c->bucket_lock); 1061 } 1062 1063 struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, 1064 int level, bool wait, 1065 struct btree *parent) 1066 { 1067 BKEY_PADDED(key) k; 1068 struct btree *b = ERR_PTR(-EAGAIN); 1069 1070 mutex_lock(&c->bucket_lock); 1071 retry: 1072 if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait)) 1073 goto err; 1074 1075 bkey_put(c, &k.key); 1076 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); 1077 1078 b = mca_alloc(c, op, &k.key, level); 1079 if (IS_ERR(b)) 1080 goto err_free; 1081 1082 if (!b) { 1083 cache_bug(c, 1084 "Tried to allocate bucket that was in btree cache"); 1085 goto retry; 1086 } 1087 1088 b->accessed = 1; 1089 b->parent = parent; 1090 bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb)); 1091 1092 mutex_unlock(&c->bucket_lock); 1093 1094 trace_bcache_btree_node_alloc(b); 1095 return b; 1096 err_free: 1097 bch_bucket_free(c, &k.key); 1098 err: 1099 mutex_unlock(&c->bucket_lock); 1100 1101 trace_bcache_btree_node_alloc_fail(c); 1102 return b; 1103 } 1104 1105 static struct btree *bch_btree_node_alloc(struct cache_set *c, 1106 struct btree_op *op, int level, 1107 struct btree *parent) 1108 { 1109 return __bch_btree_node_alloc(c, op, level, op != NULL, parent); 1110 } 1111 1112 static struct btree *btree_node_alloc_replacement(struct btree *b, 1113 struct btree_op *op) 1114 { 1115 struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); 1116 if (!IS_ERR_OR_NULL(n)) { 1117 mutex_lock(&n->write_lock); 1118 bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); 1119 bkey_copy_key(&n->key, &b->key); 1120 mutex_unlock(&n->write_lock); 1121 } 1122 1123 return n; 1124 } 1125 1126 static void make_btree_freeing_key(struct btree *b, struct bkey *k) 1127 { 1128 unsigned i; 1129 1130 mutex_lock(&b->c->bucket_lock); 1131 1132 atomic_inc(&b->c->prio_blocked); 1133 1134 bkey_copy(k, &b->key); 1135 bkey_copy_key(k, &ZERO_KEY); 1136 1137 for (i = 0; i < KEY_PTRS(k); i++) 1138 SET_PTR_GEN(k, i, 1139 bch_inc_gen(PTR_CACHE(b->c, &b->key, i), 1140 PTR_BUCKET(b->c, &b->key, i))); 1141 1142 mutex_unlock(&b->c->bucket_lock); 1143 } 1144 1145 static int btree_check_reserve(struct btree *b, struct btree_op *op) 1146 { 1147 struct cache_set *c = b->c; 1148 struct cache *ca; 1149 unsigned i, reserve = (c->root->level - b->level) * 2 + 1; 1150 1151 mutex_lock(&c->bucket_lock); 1152 1153 for_each_cache(ca, c, i) 1154 if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { 1155 if (op) 1156 prepare_to_wait(&c->btree_cache_wait, &op->wait, 1157 TASK_UNINTERRUPTIBLE); 1158 mutex_unlock(&c->bucket_lock); 1159 return -EINTR; 1160 } 1161 1162 mutex_unlock(&c->bucket_lock); 1163 1164 return mca_cannibalize_lock(b->c, op); 1165 } 1166 1167 /* Garbage collection */ 1168 1169 static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, 1170 struct bkey *k) 1171 { 1172 uint8_t stale = 0; 1173 unsigned i; 1174 struct bucket *g; 1175 1176 /* 1177 * ptr_invalid() can't return true for the keys that mark btree nodes as 1178 * freed, but since ptr_bad() returns true we'll never actually use them 1179 * for anything and thus we don't want mark their pointers 
here 1180 */ 1181 if (!bkey_cmp(k, &ZERO_KEY)) 1182 return stale; 1183 1184 for (i = 0; i < KEY_PTRS(k); i++) { 1185 if (!ptr_available(c, k, i)) 1186 continue; 1187 1188 g = PTR_BUCKET(c, k, i); 1189 1190 if (gen_after(g->last_gc, PTR_GEN(k, i))) 1191 g->last_gc = PTR_GEN(k, i); 1192 1193 if (ptr_stale(c, k, i)) { 1194 stale = max(stale, ptr_stale(c, k, i)); 1195 continue; 1196 } 1197 1198 cache_bug_on(GC_MARK(g) && 1199 (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), 1200 c, "inconsistent ptrs: mark = %llu, level = %i", 1201 GC_MARK(g), level); 1202 1203 if (level) 1204 SET_GC_MARK(g, GC_MARK_METADATA); 1205 else if (KEY_DIRTY(k)) 1206 SET_GC_MARK(g, GC_MARK_DIRTY); 1207 else if (!GC_MARK(g)) 1208 SET_GC_MARK(g, GC_MARK_RECLAIMABLE); 1209 1210 /* guard against overflow */ 1211 SET_GC_SECTORS_USED(g, min_t(unsigned, 1212 GC_SECTORS_USED(g) + KEY_SIZE(k), 1213 MAX_GC_SECTORS_USED)); 1214 1215 BUG_ON(!GC_SECTORS_USED(g)); 1216 } 1217 1218 return stale; 1219 } 1220 1221 #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) 1222 1223 void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) 1224 { 1225 unsigned i; 1226 1227 for (i = 0; i < KEY_PTRS(k); i++) 1228 if (ptr_available(c, k, i) && 1229 !ptr_stale(c, k, i)) { 1230 struct bucket *b = PTR_BUCKET(c, k, i); 1231 1232 b->gen = PTR_GEN(k, i); 1233 1234 if (level && bkey_cmp(k, &ZERO_KEY)) 1235 b->prio = BTREE_PRIO; 1236 else if (!level && b->prio == BTREE_PRIO) 1237 b->prio = INITIAL_PRIO; 1238 } 1239 1240 __bch_btree_mark_key(c, level, k); 1241 } 1242 1243 static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) 1244 { 1245 uint8_t stale = 0; 1246 unsigned keys = 0, good_keys = 0; 1247 struct bkey *k; 1248 struct btree_iter iter; 1249 struct bset_tree *t; 1250 1251 gc->nodes++; 1252 1253 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { 1254 stale = max(stale, btree_mark_key(b, k)); 1255 keys++; 1256 1257 if (bch_ptr_bad(&b->keys, k)) 1258 continue; 1259 1260 gc->key_bytes += bkey_u64s(k); 1261 gc->nkeys++; 1262 good_keys++; 1263 1264 gc->data += KEY_SIZE(k); 1265 } 1266 1267 for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) 1268 btree_bug_on(t->size && 1269 bset_written(&b->keys, t) && 1270 bkey_cmp(&b->key, &t->end) < 0, 1271 b, "found short btree key in gc"); 1272 1273 if (b->c->gc_always_rewrite) 1274 return true; 1275 1276 if (stale > 10) 1277 return true; 1278 1279 if ((keys - good_keys) * 2 > keys) 1280 return true; 1281 1282 return false; 1283 } 1284 1285 #define GC_MERGE_NODES 4U 1286 1287 struct gc_merge_info { 1288 struct btree *b; 1289 unsigned keys; 1290 }; 1291 1292 static int bch_btree_insert_node(struct btree *, struct btree_op *, 1293 struct keylist *, atomic_t *, struct bkey *); 1294 1295 static int btree_gc_coalesce(struct btree *b, struct btree_op *op, 1296 struct gc_stat *gc, struct gc_merge_info *r) 1297 { 1298 unsigned i, nodes = 0, keys = 0, blocks; 1299 struct btree *new_nodes[GC_MERGE_NODES]; 1300 struct keylist keylist; 1301 struct closure cl; 1302 struct bkey *k; 1303 1304 bch_keylist_init(&keylist); 1305 1306 if (btree_check_reserve(b, NULL)) 1307 return 0; 1308 1309 memset(new_nodes, 0, sizeof(new_nodes)); 1310 closure_init_stack(&cl); 1311 1312 while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) 1313 keys += r[nodes++].keys; 1314 1315 blocks = btree_default_blocks(b->c) * 2 / 3; 1316 1317 if (nodes < 2 || 1318 __set_blocks(b->keys.set[0].data, keys, 1319 block_bytes(b->c)) > blocks * (nodes - 1)) 1320 return 0; 1321 1322 for (i = 0; i 
< nodes; i++) { 1323 new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); 1324 if (IS_ERR_OR_NULL(new_nodes[i])) 1325 goto out_nocoalesce; 1326 } 1327 1328 /* 1329 * We have to check the reserve here, after we've allocated our new 1330 * nodes, to make sure the insert below will succeed - we also check 1331 * before as an optimization to potentially avoid a bunch of expensive 1332 * allocs/sorts 1333 */ 1334 if (btree_check_reserve(b, NULL)) 1335 goto out_nocoalesce; 1336 1337 for (i = 0; i < nodes; i++) 1338 mutex_lock(&new_nodes[i]->write_lock); 1339 1340 for (i = nodes - 1; i > 0; --i) { 1341 struct bset *n1 = btree_bset_first(new_nodes[i]); 1342 struct bset *n2 = btree_bset_first(new_nodes[i - 1]); 1343 struct bkey *k, *last = NULL; 1344 1345 keys = 0; 1346 1347 if (i > 1) { 1348 for (k = n2->start; 1349 k < bset_bkey_last(n2); 1350 k = bkey_next(k)) { 1351 if (__set_blocks(n1, n1->keys + keys + 1352 bkey_u64s(k), 1353 block_bytes(b->c)) > blocks) 1354 break; 1355 1356 last = k; 1357 keys += bkey_u64s(k); 1358 } 1359 } else { 1360 /* 1361 * Last node we're not getting rid of - we're getting 1362 * rid of the node at r[0]. Have to try and fit all of 1363 * the remaining keys into this node; we can't ensure 1364 * they will always fit due to rounding and variable 1365 * length keys (shouldn't be possible in practice, 1366 * though) 1367 */ 1368 if (__set_blocks(n1, n1->keys + n2->keys, 1369 block_bytes(b->c)) > 1370 btree_blocks(new_nodes[i])) 1371 goto out_nocoalesce; 1372 1373 keys = n2->keys; 1374 /* Take the key of the node we're getting rid of */ 1375 last = &r->b->key; 1376 } 1377 1378 BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) > 1379 btree_blocks(new_nodes[i])); 1380 1381 if (last) 1382 bkey_copy_key(&new_nodes[i]->key, last); 1383 1384 memcpy(bset_bkey_last(n1), 1385 n2->start, 1386 (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); 1387 1388 n1->keys += keys; 1389 r[i].keys = n1->keys; 1390 1391 memmove(n2->start, 1392 bset_bkey_idx(n2, keys), 1393 (void *) bset_bkey_last(n2) - 1394 (void *) bset_bkey_idx(n2, keys)); 1395 1396 n2->keys -= keys; 1397 1398 if (__bch_keylist_realloc(&keylist, 1399 bkey_u64s(&new_nodes[i]->key))) 1400 goto out_nocoalesce; 1401 1402 bch_btree_node_write(new_nodes[i], &cl); 1403 bch_keylist_add(&keylist, &new_nodes[i]->key); 1404 } 1405 1406 for (i = 0; i < nodes; i++) 1407 mutex_unlock(&new_nodes[i]->write_lock); 1408 1409 closure_sync(&cl); 1410 1411 /* We emptied out this node */ 1412 BUG_ON(btree_bset_first(new_nodes[0])->keys); 1413 btree_node_free(new_nodes[0]); 1414 rw_unlock(true, new_nodes[0]); 1415 new_nodes[0] = NULL; 1416 1417 for (i = 0; i < nodes; i++) { 1418 if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) 1419 goto out_nocoalesce; 1420 1421 make_btree_freeing_key(r[i].b, keylist.top); 1422 bch_keylist_push(&keylist); 1423 } 1424 1425 bch_btree_insert_node(b, op, &keylist, NULL, NULL); 1426 BUG_ON(!bch_keylist_empty(&keylist)); 1427 1428 for (i = 0; i < nodes; i++) { 1429 btree_node_free(r[i].b); 1430 rw_unlock(true, r[i].b); 1431 1432 r[i].b = new_nodes[i]; 1433 } 1434 1435 memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); 1436 r[nodes - 1].b = ERR_PTR(-EINTR); 1437 1438 trace_bcache_btree_gc_coalesce(nodes); 1439 gc->nodes--; 1440 1441 bch_keylist_free(&keylist); 1442 1443 /* Invalidated our iterator */ 1444 return -EINTR; 1445 1446 out_nocoalesce: 1447 closure_sync(&cl); 1448 bch_keylist_free(&keylist); 1449 1450 while ((k = bch_keylist_pop(&keylist))) 1451 if (!bkey_cmp(k, &ZERO_KEY)) 1452 
atomic_dec(&b->c->prio_blocked); 1453 1454 for (i = 0; i < nodes; i++) 1455 if (!IS_ERR_OR_NULL(new_nodes[i])) { 1456 btree_node_free(new_nodes[i]); 1457 rw_unlock(true, new_nodes[i]); 1458 } 1459 return 0; 1460 } 1461 1462 static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, 1463 struct btree *replace) 1464 { 1465 struct keylist keys; 1466 struct btree *n; 1467 1468 if (btree_check_reserve(b, NULL)) 1469 return 0; 1470 1471 n = btree_node_alloc_replacement(replace, NULL); 1472 1473 /* recheck reserve after allocating replacement node */ 1474 if (btree_check_reserve(b, NULL)) { 1475 btree_node_free(n); 1476 rw_unlock(true, n); 1477 return 0; 1478 } 1479 1480 bch_btree_node_write_sync(n); 1481 1482 bch_keylist_init(&keys); 1483 bch_keylist_add(&keys, &n->key); 1484 1485 make_btree_freeing_key(replace, keys.top); 1486 bch_keylist_push(&keys); 1487 1488 bch_btree_insert_node(b, op, &keys, NULL, NULL); 1489 BUG_ON(!bch_keylist_empty(&keys)); 1490 1491 btree_node_free(replace); 1492 rw_unlock(true, n); 1493 1494 /* Invalidated our iterator */ 1495 return -EINTR; 1496 } 1497 1498 static unsigned btree_gc_count_keys(struct btree *b) 1499 { 1500 struct bkey *k; 1501 struct btree_iter iter; 1502 unsigned ret = 0; 1503 1504 for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) 1505 ret += bkey_u64s(k); 1506 1507 return ret; 1508 } 1509 1510 static int btree_gc_recurse(struct btree *b, struct btree_op *op, 1511 struct closure *writes, struct gc_stat *gc) 1512 { 1513 int ret = 0; 1514 bool should_rewrite; 1515 struct bkey *k; 1516 struct btree_iter iter; 1517 struct gc_merge_info r[GC_MERGE_NODES]; 1518 struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; 1519 1520 bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); 1521 1522 for (i = r; i < r + ARRAY_SIZE(r); i++) 1523 i->b = ERR_PTR(-EINTR); 1524 1525 while (1) { 1526 k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); 1527 if (k) { 1528 r->b = bch_btree_node_get(b->c, op, k, b->level - 1, 1529 true, b); 1530 if (IS_ERR(r->b)) { 1531 ret = PTR_ERR(r->b); 1532 break; 1533 } 1534 1535 r->keys = btree_gc_count_keys(r->b); 1536 1537 ret = btree_gc_coalesce(b, op, gc, r); 1538 if (ret) 1539 break; 1540 } 1541 1542 if (!last->b) 1543 break; 1544 1545 if (!IS_ERR(last->b)) { 1546 should_rewrite = btree_gc_mark_node(last->b, gc); 1547 if (should_rewrite) { 1548 ret = btree_gc_rewrite_node(b, op, last->b); 1549 if (ret) 1550 break; 1551 } 1552 1553 if (last->b->level) { 1554 ret = btree_gc_recurse(last->b, op, writes, gc); 1555 if (ret) 1556 break; 1557 } 1558 1559 bkey_copy_key(&b->c->gc_done, &last->b->key); 1560 1561 /* 1562 * Must flush leaf nodes before gc ends, since replace 1563 * operations aren't journalled 1564 */ 1565 mutex_lock(&last->b->write_lock); 1566 if (btree_node_dirty(last->b)) 1567 bch_btree_node_write(last->b, writes); 1568 mutex_unlock(&last->b->write_lock); 1569 rw_unlock(true, last->b); 1570 } 1571 1572 memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); 1573 r->b = NULL; 1574 1575 if (need_resched()) { 1576 ret = -EAGAIN; 1577 break; 1578 } 1579 } 1580 1581 for (i = r; i < r + ARRAY_SIZE(r); i++) 1582 if (!IS_ERR_OR_NULL(i->b)) { 1583 mutex_lock(&i->b->write_lock); 1584 if (btree_node_dirty(i->b)) 1585 bch_btree_node_write(i->b, writes); 1586 mutex_unlock(&i->b->write_lock); 1587 rw_unlock(true, i->b); 1588 } 1589 1590 return ret; 1591 } 1592 1593 static int bch_btree_gc_root(struct btree *b, struct btree_op *op, 1594 struct closure *writes, struct gc_stat *gc) 1595 { 1596 struct btree *n = NULL; 
1597 int ret = 0; 1598 bool should_rewrite; 1599 1600 should_rewrite = btree_gc_mark_node(b, gc); 1601 if (should_rewrite) { 1602 n = btree_node_alloc_replacement(b, NULL); 1603 1604 if (!IS_ERR_OR_NULL(n)) { 1605 bch_btree_node_write_sync(n); 1606 1607 bch_btree_set_root(n); 1608 btree_node_free(b); 1609 rw_unlock(true, n); 1610 1611 return -EINTR; 1612 } 1613 } 1614 1615 __bch_btree_mark_key(b->c, b->level + 1, &b->key); 1616 1617 if (b->level) { 1618 ret = btree_gc_recurse(b, op, writes, gc); 1619 if (ret) 1620 return ret; 1621 } 1622 1623 bkey_copy_key(&b->c->gc_done, &b->key); 1624 1625 return ret; 1626 } 1627 1628 static void btree_gc_start(struct cache_set *c) 1629 { 1630 struct cache *ca; 1631 struct bucket *b; 1632 unsigned i; 1633 1634 if (!c->gc_mark_valid) 1635 return; 1636 1637 mutex_lock(&c->bucket_lock); 1638 1639 c->gc_mark_valid = 0; 1640 c->gc_done = ZERO_KEY; 1641 1642 for_each_cache(ca, c, i) 1643 for_each_bucket(b, ca) { 1644 b->last_gc = b->gen; 1645 if (!atomic_read(&b->pin)) { 1646 SET_GC_MARK(b, 0); 1647 SET_GC_SECTORS_USED(b, 0); 1648 } 1649 } 1650 1651 mutex_unlock(&c->bucket_lock); 1652 } 1653 1654 static size_t bch_btree_gc_finish(struct cache_set *c) 1655 { 1656 size_t available = 0; 1657 struct bucket *b; 1658 struct cache *ca; 1659 unsigned i; 1660 1661 mutex_lock(&c->bucket_lock); 1662 1663 set_gc_sectors(c); 1664 c->gc_mark_valid = 1; 1665 c->need_gc = 0; 1666 1667 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) 1668 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), 1669 GC_MARK_METADATA); 1670 1671 /* don't reclaim buckets to which writeback keys point */ 1672 rcu_read_lock(); 1673 for (i = 0; i < c->nr_uuids; i++) { 1674 struct bcache_device *d = c->devices[i]; 1675 struct cached_dev *dc; 1676 struct keybuf_key *w, *n; 1677 unsigned j; 1678 1679 if (!d || UUID_FLASH_ONLY(&c->uuids[i])) 1680 continue; 1681 dc = container_of(d, struct cached_dev, disk); 1682 1683 spin_lock(&dc->writeback_keys.lock); 1684 rbtree_postorder_for_each_entry_safe(w, n, 1685 &dc->writeback_keys.keys, node) 1686 for (j = 0; j < KEY_PTRS(&w->key); j++) 1687 SET_GC_MARK(PTR_BUCKET(c, &w->key, j), 1688 GC_MARK_DIRTY); 1689 spin_unlock(&dc->writeback_keys.lock); 1690 } 1691 rcu_read_unlock(); 1692 1693 for_each_cache(ca, c, i) { 1694 uint64_t *i; 1695 1696 ca->invalidate_needs_gc = 0; 1697 1698 for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) 1699 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); 1700 1701 for (i = ca->prio_buckets; 1702 i < ca->prio_buckets + prio_buckets(ca) * 2; i++) 1703 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); 1704 1705 for_each_bucket(b, ca) { 1706 c->need_gc = max(c->need_gc, bucket_gc_gen(b)); 1707 1708 if (atomic_read(&b->pin)) 1709 continue; 1710 1711 BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); 1712 1713 if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) 1714 available++; 1715 } 1716 } 1717 1718 mutex_unlock(&c->bucket_lock); 1719 return available; 1720 } 1721 1722 static void bch_btree_gc(struct cache_set *c) 1723 { 1724 int ret; 1725 unsigned long available; 1726 struct gc_stat stats; 1727 struct closure writes; 1728 struct btree_op op; 1729 uint64_t start_time = local_clock(); 1730 1731 trace_bcache_gc_start(c); 1732 1733 memset(&stats, 0, sizeof(struct gc_stat)); 1734 closure_init_stack(&writes); 1735 bch_btree_op_init(&op, SHRT_MAX); 1736 1737 btree_gc_start(c); 1738 1739 do { 1740 ret = btree_root(gc_root, c, &op, &writes, &stats); 1741 closure_sync(&writes); 1742 cond_resched(); 1743 1744 if (ret && ret != -EAGAIN) 1745 pr_warn("gc failed!"); 
1746 } while (ret); 1747 1748 available = bch_btree_gc_finish(c); 1749 wake_up_allocators(c); 1750 1751 bch_time_stats_update(&c->btree_gc_time, start_time); 1752 1753 stats.key_bytes *= sizeof(uint64_t); 1754 stats.data <<= 9; 1755 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; 1756 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); 1757 1758 trace_bcache_gc_end(c); 1759 1760 bch_moving_gc(c); 1761 } 1762 1763 static bool gc_should_run(struct cache_set *c) 1764 { 1765 struct cache *ca; 1766 unsigned i; 1767 1768 for_each_cache(ca, c, i) 1769 if (ca->invalidate_needs_gc) 1770 return true; 1771 1772 if (atomic_read(&c->sectors_to_gc) < 0) 1773 return true; 1774 1775 return false; 1776 } 1777 1778 static int bch_gc_thread(void *arg) 1779 { 1780 struct cache_set *c = arg; 1781 1782 while (1) { 1783 wait_event_interruptible(c->gc_wait, 1784 kthread_should_stop() || gc_should_run(c)); 1785 1786 if (kthread_should_stop()) 1787 break; 1788 1789 set_gc_sectors(c); 1790 bch_btree_gc(c); 1791 } 1792 1793 return 0; 1794 } 1795 1796 int bch_gc_thread_start(struct cache_set *c) 1797 { 1798 c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); 1799 if (IS_ERR(c->gc_thread)) 1800 return PTR_ERR(c->gc_thread); 1801 1802 return 0; 1803 } 1804 1805 /* Initial partial gc */ 1806 1807 static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) 1808 { 1809 int ret = 0; 1810 struct bkey *k, *p = NULL; 1811 struct btree_iter iter; 1812 1813 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) 1814 bch_initial_mark_key(b->c, b->level, k); 1815 1816 bch_initial_mark_key(b->c, b->level + 1, &b->key); 1817 1818 if (b->level) { 1819 bch_btree_iter_init(&b->keys, &iter, NULL); 1820 1821 do { 1822 k = bch_btree_iter_next_filter(&iter, &b->keys, 1823 bch_ptr_bad); 1824 if (k) 1825 btree_node_prefetch(b, k); 1826 1827 if (p) 1828 ret = btree(check_recurse, p, b, op); 1829 1830 p = k; 1831 } while (p && !ret); 1832 } 1833 1834 return ret; 1835 } 1836 1837 int bch_btree_check(struct cache_set *c) 1838 { 1839 struct btree_op op; 1840 1841 bch_btree_op_init(&op, SHRT_MAX); 1842 1843 return btree_root(check_recurse, c, &op); 1844 } 1845 1846 void bch_initial_gc_finish(struct cache_set *c) 1847 { 1848 struct cache *ca; 1849 struct bucket *b; 1850 unsigned i; 1851 1852 bch_btree_gc_finish(c); 1853 1854 mutex_lock(&c->bucket_lock); 1855 1856 /* 1857 * We need to put some unused buckets directly on the prio freelist in 1858 * order to get the allocator thread started - it needs freed buckets in 1859 * order to rewrite the prios and gens, and it needs to rewrite prios 1860 * and gens in order to free buckets. 1861 * 1862 * This is only safe for buckets that have no live data in them, which 1863 * there should always be some of. 1864 */ 1865 for_each_cache(ca, c, i) { 1866 for_each_bucket(b, ca) { 1867 if (fifo_full(&ca->free[RESERVE_PRIO])) 1868 break; 1869 1870 if (bch_can_invalidate_bucket(ca, b) && 1871 !GC_MARK(b)) { 1872 __bch_invalidate_one_bucket(ca, b); 1873 fifo_push(&ca->free[RESERVE_PRIO], 1874 b - ca->buckets); 1875 } 1876 } 1877 } 1878 1879 mutex_unlock(&c->bucket_lock); 1880 } 1881 1882 /* Btree insertion */ 1883 1884 static bool btree_insert_key(struct btree *b, struct bkey *k, 1885 struct bkey *replace_key) 1886 { 1887 unsigned status; 1888 1889 BUG_ON(bkey_cmp(k, &b->key) > 0); 1890 1891 status = bch_btree_insert_key(&b->keys, k, replace_key); 1892 if (status != BTREE_INSERT_STATUS_NO_INSERT) { 1893 bch_check_keys(&b->keys, "%u for %s", status, 1894 replace_key ? 
"replace" : "insert"); 1895 1896 trace_bcache_btree_insert_key(b, k, replace_key != NULL, 1897 status); 1898 return true; 1899 } else 1900 return false; 1901 } 1902 1903 static size_t insert_u64s_remaining(struct btree *b) 1904 { 1905 long ret = bch_btree_keys_u64s_remaining(&b->keys); 1906 1907 /* 1908 * Might land in the middle of an existing extent and have to split it 1909 */ 1910 if (b->keys.ops->is_extents) 1911 ret -= KEY_MAX_U64S; 1912 1913 return max(ret, 0L); 1914 } 1915 1916 static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, 1917 struct keylist *insert_keys, 1918 struct bkey *replace_key) 1919 { 1920 bool ret = false; 1921 int oldsize = bch_count_data(&b->keys); 1922 1923 while (!bch_keylist_empty(insert_keys)) { 1924 struct bkey *k = insert_keys->keys; 1925 1926 if (bkey_u64s(k) > insert_u64s_remaining(b)) 1927 break; 1928 1929 if (bkey_cmp(k, &b->key) <= 0) { 1930 if (!b->level) 1931 bkey_put(b->c, k); 1932 1933 ret |= btree_insert_key(b, k, replace_key); 1934 bch_keylist_pop_front(insert_keys); 1935 } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { 1936 BKEY_PADDED(key) temp; 1937 bkey_copy(&temp.key, insert_keys->keys); 1938 1939 bch_cut_back(&b->key, &temp.key); 1940 bch_cut_front(&b->key, insert_keys->keys); 1941 1942 ret |= btree_insert_key(b, &temp.key, replace_key); 1943 break; 1944 } else { 1945 break; 1946 } 1947 } 1948 1949 if (!ret) 1950 op->insert_collision = true; 1951 1952 BUG_ON(!bch_keylist_empty(insert_keys) && b->level); 1953 1954 BUG_ON(bch_count_data(&b->keys) < oldsize); 1955 return ret; 1956 } 1957 1958 static int btree_split(struct btree *b, struct btree_op *op, 1959 struct keylist *insert_keys, 1960 struct bkey *replace_key) 1961 { 1962 bool split; 1963 struct btree *n1, *n2 = NULL, *n3 = NULL; 1964 uint64_t start_time = local_clock(); 1965 struct closure cl; 1966 struct keylist parent_keys; 1967 1968 closure_init_stack(&cl); 1969 bch_keylist_init(&parent_keys); 1970 1971 if (btree_check_reserve(b, op)) { 1972 if (!b->level) 1973 return -EINTR; 1974 else 1975 WARN(1, "insufficient reserve for split\n"); 1976 } 1977 1978 n1 = btree_node_alloc_replacement(b, op); 1979 if (IS_ERR(n1)) 1980 goto err; 1981 1982 split = set_blocks(btree_bset_first(n1), 1983 block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5; 1984 1985 if (split) { 1986 unsigned keys = 0; 1987 1988 trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); 1989 1990 n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent); 1991 if (IS_ERR(n2)) 1992 goto err_free1; 1993 1994 if (!b->parent) { 1995 n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL); 1996 if (IS_ERR(n3)) 1997 goto err_free2; 1998 } 1999 2000 mutex_lock(&n1->write_lock); 2001 mutex_lock(&n2->write_lock); 2002 2003 bch_btree_insert_keys(n1, op, insert_keys, replace_key); 2004 2005 /* 2006 * Has to be a linear search because we don't have an auxiliary 2007 * search tree yet 2008 */ 2009 2010 while (keys < (btree_bset_first(n1)->keys * 3) / 5) 2011 keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), 2012 keys)); 2013 2014 bkey_copy_key(&n1->key, 2015 bset_bkey_idx(btree_bset_first(n1), keys)); 2016 keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); 2017 2018 btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; 2019 btree_bset_first(n1)->keys = keys; 2020 2021 memcpy(btree_bset_first(n2)->start, 2022 bset_bkey_last(btree_bset_first(n1)), 2023 btree_bset_first(n2)->keys * sizeof(uint64_t)); 2024 2025 bkey_copy_key(&n2->key, &b->key); 2026 2027 bch_keylist_add(&parent_keys, &n2->key); 
2028 bch_btree_node_write(n2, &cl); 2029 mutex_unlock(&n2->write_lock); 2030 rw_unlock(true, n2); 2031 } else { 2032 trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); 2033 2034 mutex_lock(&n1->write_lock); 2035 bch_btree_insert_keys(n1, op, insert_keys, replace_key); 2036 } 2037 2038 bch_keylist_add(&parent_keys, &n1->key); 2039 bch_btree_node_write(n1, &cl); 2040 mutex_unlock(&n1->write_lock); 2041 2042 if (n3) { 2043 /* Depth increases, make a new root */ 2044 mutex_lock(&n3->write_lock); 2045 bkey_copy_key(&n3->key, &MAX_KEY); 2046 bch_btree_insert_keys(n3, op, &parent_keys, NULL); 2047 bch_btree_node_write(n3, &cl); 2048 mutex_unlock(&n3->write_lock); 2049 2050 closure_sync(&cl); 2051 bch_btree_set_root(n3); 2052 rw_unlock(true, n3); 2053 } else if (!b->parent) { 2054 /* Root filled up but didn't need to be split */ 2055 closure_sync(&cl); 2056 bch_btree_set_root(n1); 2057 } else { 2058 /* Split a non root node */ 2059 closure_sync(&cl); 2060 make_btree_freeing_key(b, parent_keys.top); 2061 bch_keylist_push(&parent_keys); 2062 2063 bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); 2064 BUG_ON(!bch_keylist_empty(&parent_keys)); 2065 } 2066 2067 btree_node_free(b); 2068 rw_unlock(true, n1); 2069 2070 bch_time_stats_update(&b->c->btree_split_time, start_time); 2071 2072 return 0; 2073 err_free2: 2074 bkey_put(b->c, &n2->key); 2075 btree_node_free(n2); 2076 rw_unlock(true, n2); 2077 err_free1: 2078 bkey_put(b->c, &n1->key); 2079 btree_node_free(n1); 2080 rw_unlock(true, n1); 2081 err: 2082 WARN(1, "bcache: btree split failed (level %u)", b->level); 2083 2084 if (n3 == ERR_PTR(-EAGAIN) || 2085 n2 == ERR_PTR(-EAGAIN) || 2086 n1 == ERR_PTR(-EAGAIN)) 2087 return -EAGAIN; 2088 2089 return -ENOMEM; 2090 } 2091 2092 static int bch_btree_insert_node(struct btree *b, struct btree_op *op, 2093 struct keylist *insert_keys, 2094 atomic_t *journal_ref, 2095 struct bkey *replace_key) 2096 { 2097 struct closure cl; 2098 2099 BUG_ON(b->level && replace_key); 2100 2101 closure_init_stack(&cl); 2102 2103 mutex_lock(&b->write_lock); 2104 2105 if (write_block(b) != btree_bset_last(b) && 2106 b->keys.last_set_unwritten) 2107 bch_btree_init_next(b); /* just wrote a set */ 2108 2109 if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { 2110 mutex_unlock(&b->write_lock); 2111 goto split; 2112 } 2113 2114 BUG_ON(write_block(b) != btree_bset_last(b)); 2115 2116 if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { 2117 if (!b->level) 2118 bch_btree_leaf_dirty(b, journal_ref); 2119 else 2120 bch_btree_node_write(b, &cl); 2121 } 2122 2123 mutex_unlock(&b->write_lock); 2124 2125 /* wait for btree node write if necessary, after unlock */ 2126 closure_sync(&cl); 2127 2128 return 0; 2129 split: 2130 if (current->bio_list) { 2131 op->lock = b->c->root->level + 1; 2132 return -EAGAIN; 2133 } else if (op->lock <= b->c->root->level) { 2134 op->lock = b->c->root->level + 1; 2135 return -EINTR; 2136 } else { 2137 /* Invalidated all iterators */ 2138 int ret = btree_split(b, op, insert_keys, replace_key); 2139 2140 if (bch_keylist_empty(insert_keys)) 2141 return 0; 2142 else if (!ret) 2143 return -EINTR; 2144 return ret; 2145 } 2146 } 2147 2148 int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, 2149 struct bkey *check_key) 2150 { 2151 int ret = -EINTR; 2152 uint64_t btree_ptr = b->key.ptr[0]; 2153 unsigned long seq = b->seq; 2154 struct keylist insert; 2155 bool upgrade = op->lock == -1; 2156 2157 bch_keylist_init(&insert); 2158 2159 if (upgrade) { 2160 
rw_unlock(false, b); 2161 rw_lock(true, b, b->level); 2162 2163 if (b->key.ptr[0] != btree_ptr || 2164 b->seq != seq + 1) { 2165 op->lock = b->level; 2166 goto out; 2167 } 2168 } 2169 2170 SET_KEY_PTRS(check_key, 1); 2171 get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); 2172 2173 SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); 2174 2175 bch_keylist_add(&insert, check_key); 2176 2177 ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); 2178 2179 BUG_ON(!ret && !bch_keylist_empty(&insert)); 2180 out: 2181 if (upgrade) 2182 downgrade_write(&b->lock); 2183 return ret; 2184 } 2185 2186 struct btree_insert_op { 2187 struct btree_op op; 2188 struct keylist *keys; 2189 atomic_t *journal_ref; 2190 struct bkey *replace_key; 2191 }; 2192 2193 static int btree_insert_fn(struct btree_op *b_op, struct btree *b) 2194 { 2195 struct btree_insert_op *op = container_of(b_op, 2196 struct btree_insert_op, op); 2197 2198 int ret = bch_btree_insert_node(b, &op->op, op->keys, 2199 op->journal_ref, op->replace_key); 2200 if (ret && !bch_keylist_empty(op->keys)) 2201 return ret; 2202 else 2203 return MAP_DONE; 2204 } 2205 2206 int bch_btree_insert(struct cache_set *c, struct keylist *keys, 2207 atomic_t *journal_ref, struct bkey *replace_key) 2208 { 2209 struct btree_insert_op op; 2210 int ret = 0; 2211 2212 BUG_ON(current->bio_list); 2213 BUG_ON(bch_keylist_empty(keys)); 2214 2215 bch_btree_op_init(&op.op, 0); 2216 op.keys = keys; 2217 op.journal_ref = journal_ref; 2218 op.replace_key = replace_key; 2219 2220 while (!ret && !bch_keylist_empty(keys)) { 2221 op.op.lock = 0; 2222 ret = bch_btree_map_leaf_nodes(&op.op, c, 2223 &START_KEY(keys->keys), 2224 btree_insert_fn); 2225 } 2226 2227 if (ret) { 2228 struct bkey *k; 2229 2230 pr_err("error %i", ret); 2231 2232 while ((k = bch_keylist_pop(keys))) 2233 bkey_put(c, k); 2234 } else if (op.op.insert_collision) 2235 ret = -ESRCH; 2236 2237 return ret; 2238 } 2239 2240 void bch_btree_set_root(struct btree *b) 2241 { 2242 unsigned i; 2243 struct closure cl; 2244 2245 closure_init_stack(&cl); 2246 2247 trace_bcache_btree_set_root(b); 2248 2249 BUG_ON(!b->written); 2250 2251 for (i = 0; i < KEY_PTRS(&b->key); i++) 2252 BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); 2253 2254 mutex_lock(&b->c->bucket_lock); 2255 list_del_init(&b->list); 2256 mutex_unlock(&b->c->bucket_lock); 2257 2258 b->c->root = b; 2259 2260 bch_journal_meta(b->c, &cl); 2261 closure_sync(&cl); 2262 } 2263 2264 /* Map across nodes or keys */ 2265 2266 static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, 2267 struct bkey *from, 2268 btree_map_nodes_fn *fn, int flags) 2269 { 2270 int ret = MAP_CONTINUE; 2271 2272 if (b->level) { 2273 struct bkey *k; 2274 struct btree_iter iter; 2275 2276 bch_btree_iter_init(&b->keys, &iter, from); 2277 2278 while ((k = bch_btree_iter_next_filter(&iter, &b->keys, 2279 bch_ptr_bad))) { 2280 ret = btree(map_nodes_recurse, k, b, 2281 op, from, fn, flags); 2282 from = NULL; 2283 2284 if (ret != MAP_CONTINUE) 2285 return ret; 2286 } 2287 } 2288 2289 if (!b->level || flags == MAP_ALL_NODES) 2290 ret = fn(op, b); 2291 2292 return ret; 2293 } 2294 2295 int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, 2296 struct bkey *from, btree_map_nodes_fn *fn, int flags) 2297 { 2298 return btree_root(map_nodes_recurse, c, op, from, fn, flags); 2299 } 2300 2301 static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, 2302 struct bkey *from, btree_map_keys_fn *fn, 2303 int flags) 2304 { 2305 int ret = MAP_CONTINUE; 2306 
static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
				      struct bkey *from, btree_map_keys_fn *fn,
				      int flags)
{
	int ret = MAP_CONTINUE;
	struct bkey *k;
	struct btree_iter iter;

	bch_btree_iter_init(&b->keys, &iter, from);

	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
		ret = !b->level
			? fn(op, b, k)
			: btree(map_keys_recurse, k, b, op, from, fn, flags);
		from = NULL;

		if (ret != MAP_CONTINUE)
			return ret;
	}

	if (!b->level && (flags & MAP_END_KEY))
		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
				     KEY_OFFSET(&b->key), 0));

	return ret;
}

int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
		       struct bkey *from, btree_map_keys_fn *fn, int flags)
{
	return btree_root(map_keys_recurse, c, op, from, fn, flags);
}

/* Keybuf code */

static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
{
	/* Overlapping keys compare equal */
	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
		return -1;
	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
		return 1;
	return 0;
}

static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
					    struct keybuf_key *r)
{
	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
}

struct refill {
	struct btree_op	op;
	unsigned nr_found;
	struct keybuf *buf;
	struct bkey *end;
	keybuf_pred_fn *pred;
};

static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
			    struct bkey *k)
{
	struct refill *refill = container_of(op, struct refill, op);
	struct keybuf *buf = refill->buf;
	int ret = MAP_CONTINUE;

	if (bkey_cmp(k, refill->end) >= 0) {
		ret = MAP_DONE;
		goto out;
	}

	if (!KEY_SIZE(k)) /* end key */
		goto out;

	if (refill->pred(buf, k)) {
		struct keybuf_key *w;

		spin_lock(&buf->lock);

		w = array_alloc(&buf->freelist);
		if (!w) {
			spin_unlock(&buf->lock);
			return MAP_DONE;
		}

		w->private = NULL;
		bkey_copy(&w->key, k);

		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
			array_free(&buf->freelist, w);
		else
			refill->nr_found++;

		if (array_freelist_empty(&buf->freelist))
			ret = MAP_DONE;

		spin_unlock(&buf->lock);
	}
out:
	buf->last_scanned = *k;
	return ret;
}

void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
		       struct bkey *end, keybuf_pred_fn *pred)
{
	struct bkey start = buf->last_scanned;
	struct refill refill;

	cond_resched();

	bch_btree_op_init(&refill.op, -1);
	refill.nr_found = 0;
	refill.buf = buf;
	refill.end = end;
	refill.pred = pred;

	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
			   refill_keybuf_fn, MAP_END_KEY);

	trace_bcache_keyscan(refill.nr_found,
			     KEY_INODE(&start), KEY_OFFSET(&start),
			     KEY_INODE(&buf->last_scanned),
			     KEY_OFFSET(&buf->last_scanned));

	spin_lock(&buf->lock);

	if (!RB_EMPTY_ROOT(&buf->keys)) {
		struct keybuf_key *w;
		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
		buf->start = START_KEY(&w->key);

		w = RB_LAST(&buf->keys, struct keybuf_key, node);
		buf->end = w->key;
	} else {
		buf->start = MAX_KEY;
		buf->end = MAX_KEY;
	}

	spin_unlock(&buf->lock);
}

static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	rb_erase(&w->node, &buf->keys);
	array_free(&buf->freelist, w);
}
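
/*
 * Same as __bch_keybuf_del() above, except that this takes buf->lock itself;
 * the __ version is only for callers already holding the lock.
 */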
void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	spin_lock(&buf->lock);
	__bch_keybuf_del(buf, w);
	spin_unlock(&buf->lock);
}

bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
				  struct bkey *end)
{
	bool ret = false;
	struct keybuf_key *p, *w, s;
	s.key = *start;

	if (bkey_cmp(end, &buf->start) <= 0 ||
	    bkey_cmp(start, &buf->end) >= 0)
		return false;

	spin_lock(&buf->lock);
	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);

	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
		p = w;
		w = RB_NEXT(w, node);

		if (p->private)
			ret = true;
		else
			__bch_keybuf_del(buf, p);
	}

	spin_unlock(&buf->lock);
	return ret;
}

struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
{
	struct keybuf_key *w;
	spin_lock(&buf->lock);

	w = RB_FIRST(&buf->keys, struct keybuf_key, node);

	while (w && w->private)
		w = RB_NEXT(w, node);

	if (w)
		w->private = ERR_PTR(-EINTR);

	spin_unlock(&buf->lock);
	return w;
}

struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
					  struct keybuf *buf,
					  struct bkey *end,
					  keybuf_pred_fn *pred)
{
	struct keybuf_key *ret;

	while (1) {
		ret = bch_keybuf_next(buf);
		if (ret)
			break;

		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
			pr_debug("scan finished");
			break;
		}

		bch_refill_keybuf(c, buf, end, pred);
	}

	return ret;
}

void bch_keybuf_init(struct keybuf *buf)
{
	buf->last_scanned = MAX_KEY;
	buf->keys = RB_ROOT;

	spin_lock_init(&buf->lock);
	array_allocator_init(&buf->freelist);
}
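
/*
 * Rough sketch of how a consumer drives the keybuf machinery (illustrative
 * only, loosely modelled on the writeback and moving-gc users; my_pred and
 * the io issuing step are placeholders):
 *
 *	struct keybuf buf;
 *	struct keybuf_key *w;
 *
 *	bch_keybuf_init(&buf);
 *	buf.last_scanned = ZERO_KEY;
 *
 *	while ((w = bch_keybuf_next_rescan(c, &buf, &MAX_KEY, my_pred))) {
 *		... issue io for w->key, stashing state in w->private ...
 *		bch_keybuf_del(&buf, w);
 *	}
 *
 * bch_keybuf_next_rescan() refills the buffer from the btree whenever it runs
 * dry and returns NULL once last_scanned has passed the end key.  Callers use
 * bch_keybuf_check_overlapping() to drop buffered keys that a new write
 * overlaps; keys already claimed (->private set) are left alone and reported
 * via the return value.
 */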