1 /* 2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com> 3 * 4 * Uses a block device as cache for other block devices; optimized for SSDs. 5 * All allocation is done in buckets, which should match the erase block size 6 * of the device. 7 * 8 * Buckets containing cached data are kept on a heap sorted by priority; 9 * bucket priority is increased on cache hit, and periodically all the buckets 10 * on the heap have their priority scaled down. This currently is just used as 11 * an LRU but in the future should allow for more intelligent heuristics. 12 * 13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the 14 * counter. Garbage collection is used to remove stale pointers. 15 * 16 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather 17 * as keys are inserted we only sort the pages that have not yet been written. 18 * When garbage collection is run, we resort the entire node. 19 * 20 * All configuration is done via sysfs; see Documentation/bcache.txt. 21 */ 22 23 #include "bcache.h" 24 #include "btree.h" 25 #include "debug.h" 26 #include "request.h" 27 #include "writeback.h" 28 29 #include <linux/slab.h> 30 #include <linux/bitops.h> 31 #include <linux/hash.h> 32 #include <linux/prefetch.h> 33 #include <linux/random.h> 34 #include <linux/rcupdate.h> 35 #include <trace/events/bcache.h> 36 37 /* 38 * Todo: 39 * register_bcache: Return errors out to userspace correctly 40 * 41 * Writeback: don't undirty key until after a cache flush 42 * 43 * Create an iterator for key pointers 44 * 45 * On btree write error, mark bucket such that it won't be freed from the cache 46 * 47 * Journalling: 48 * Check for bad keys in replay 49 * Propagate barriers 50 * Refcount journal entries in journal_replay 51 * 52 * Garbage collection: 53 * Finish incremental gc 54 * Gc should free old UUIDs, data for invalid UUIDs 55 * 56 * Provide a way to list backing device UUIDs we have data cached for, and 57 * probably how long it's been since we've seen them, and a way to invalidate 58 * dirty data for devices that will never be attached again 59 * 60 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so 61 * that based on that and how much dirty data we have we can keep writeback 62 * from being starved 63 * 64 * Add a tracepoint or somesuch to watch for writeback starvation 65 * 66 * When btree depth > 1 and splitting an interior node, we have to make sure 67 * alloc_bucket() cannot fail. This should be true but is not completely 68 * obvious. 69 * 70 * Make sure all allocations get charged to the root cgroup 71 * 72 * Plugging? 73 * 74 * If data write is less than hard sector size of ssd, round up offset in open 75 * bucket to the next whole sector 76 * 77 * Also lookup by cgroup in get_open_bucket() 78 * 79 * Superblock needs to be fleshed out for multiple cache devices 80 * 81 * Add a sysfs tunable for the number of writeback IOs in flight 82 * 83 * Add a sysfs tunable for the number of open data buckets 84 * 85 * IO tracking: Can we track when one process is doing io on behalf of another? 86 * IO tracking: Don't use just an average, weigh more recent stuff higher 87 * 88 * Test module load/unload 89 */ 90 91 static const char * const op_types[] = { 92 "insert", "replace" 93 }; 94 95 static const char *op_type(struct btree_op *op) 96 { 97 return op_types[op->type]; 98 } 99 100 #define MAX_NEED_GC 64 101 #define MAX_SAVE_PRIO 72 102 103 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) 104 105 #define PTR_HASH(c, k) \ 106 (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) 107 108 struct workqueue_struct *bch_gc_wq; 109 static struct workqueue_struct *btree_io_wq; 110 111 void bch_btree_op_init_stack(struct btree_op *op) 112 { 113 memset(op, 0, sizeof(struct btree_op)); 114 closure_init_stack(&op->cl); 115 op->lock = -1; 116 bch_keylist_init(&op->keys); 117 } 118 119 /* Btree key manipulation */ 120 121 static void bkey_put(struct cache_set *c, struct bkey *k, int level) 122 { 123 if ((level && KEY_OFFSET(k)) || !level) 124 __bkey_put(c, k); 125 } 126 127 /* Btree IO */ 128 129 static uint64_t btree_csum_set(struct btree *b, struct bset *i) 130 { 131 uint64_t crc = b->key.ptr[0]; 132 void *data = (void *) i + 8, *end = end(i); 133 134 crc = bch_crc64_update(crc, data, end - data); 135 return crc ^ 0xffffffffffffffffULL; 136 } 137 138 static void bch_btree_node_read_done(struct btree *b) 139 { 140 const char *err = "bad btree header"; 141 struct bset *i = b->sets[0].data; 142 struct btree_iter *iter; 143 144 iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT); 145 iter->size = b->c->sb.bucket_size / b->c->sb.block_size; 146 iter->used = 0; 147 148 if (!i->seq) 149 goto err; 150 151 for (; 152 b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; 153 i = write_block(b)) { 154 err = "unsupported bset version"; 155 if (i->version > BCACHE_BSET_VERSION) 156 goto err; 157 158 err = "bad btree header"; 159 if (b->written + set_blocks(i, b->c) > btree_blocks(b)) 160 goto err; 161 162 err = "bad magic"; 163 if (i->magic != bset_magic(b->c)) 164 goto err; 165 166 err = "bad checksum"; 167 switch (i->version) { 168 case 0: 169 if (i->csum != csum_set(i)) 170 goto err; 171 break; 172 case BCACHE_BSET_VERSION: 173 if (i->csum != btree_csum_set(b, i)) 174 goto err; 175 break; 176 } 177 178 err = "empty set"; 179 if (i != b->sets[0].data && !i->keys) 180 goto err; 181 182 bch_btree_iter_push(iter, i->start, end(i)); 183 184 b->written += set_blocks(i, b->c); 185 } 186 187 err = "corrupted btree"; 188 for (i = write_block(b); 189 index(i, b) < btree_blocks(b); 190 i = ((void *) i) + block_bytes(b->c)) 191 if (i->seq == b->sets[0].data->seq) 192 goto err; 193 194 bch_btree_sort_and_fix_extents(b, iter); 195 196 i = b->sets[0].data; 197 err = "short btree key"; 198 if (b->sets[0].size && 199 bkey_cmp(&b->key, &b->sets[0].end) < 0) 200 goto err; 201 202 if (b->written < btree_blocks(b)) 203 bch_bset_init_next(b); 204 out: 205 mempool_free(iter, b->c->fill_iter); 206 return; 207 err: 208 set_btree_node_io_error(b); 209 bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys", 210 err, PTR_BUCKET_NR(b->c, &b->key, 0), 211 index(i, b), i->keys); 212 goto out; 213 } 214 215 static void btree_node_read_endio(struct bio *bio, int error) 216 { 217 struct closure *cl = bio->bi_private; 218 closure_put(cl); 219 } 220 221 void bch_btree_node_read(struct btree *b) 222 { 223 uint64_t start_time = local_clock(); 224 struct closure cl; 225 struct bio *bio; 226 227 trace_bcache_btree_read(b); 228 229 closure_init_stack(&cl); 230 231 bio = bch_bbio_alloc(b->c); 232 bio->bi_rw = REQ_META|READ_SYNC; 233 bio->bi_size = KEY_SIZE(&b->key) << 9; 234 bio->bi_end_io = btree_node_read_endio; 235 bio->bi_private = &cl; 236 237 bch_bio_map(bio, b->sets[0].data); 238 239 bch_submit_bbio(bio, b->c, &b->key, 0); 240 closure_sync(&cl); 241 242 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 243 set_btree_node_io_error(b); 244 245 bch_bbio_free(bio, b->c); 246 247 if (btree_node_io_error(b)) 248 goto err; 249 250 bch_btree_node_read_done(b); 251 252 spin_lock(&b->c->btree_read_time_lock); 253 bch_time_stats_update(&b->c->btree_read_time, start_time); 254 spin_unlock(&b->c->btree_read_time_lock); 255 256 return; 257 err: 258 bch_cache_set_error(b->c, "io error reading bucket %lu", 259 PTR_BUCKET_NR(b->c, &b->key, 0)); 260 } 261 262 static void btree_complete_write(struct btree *b, struct btree_write *w) 263 { 264 if (w->prio_blocked && 265 !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) 266 wake_up_allocators(b->c); 267 268 if (w->journal) { 269 atomic_dec_bug(w->journal); 270 __closure_wake_up(&b->c->journal.wait); 271 } 272 273 w->prio_blocked = 0; 274 w->journal = NULL; 275 } 276 277 static void __btree_node_write_done(struct closure *cl) 278 { 279 struct btree *b = container_of(cl, struct btree, io.cl); 280 struct btree_write *w = btree_prev_write(b); 281 282 bch_bbio_free(b->bio, b->c); 283 b->bio = NULL; 284 btree_complete_write(b, w); 285 286 if (btree_node_dirty(b)) 287 queue_delayed_work(btree_io_wq, &b->work, 288 msecs_to_jiffies(30000)); 289 290 closure_return(cl); 291 } 292 293 static void btree_node_write_done(struct closure *cl) 294 { 295 struct btree *b = container_of(cl, struct btree, io.cl); 296 struct bio_vec *bv; 297 int n; 298 299 __bio_for_each_segment(bv, b->bio, n, 0) 300 __free_page(bv->bv_page); 301 302 __btree_node_write_done(cl); 303 } 304 305 static void btree_node_write_endio(struct bio *bio, int error) 306 { 307 struct closure *cl = bio->bi_private; 308 struct btree *b = container_of(cl, struct btree, io.cl); 309 310 if (error) 311 set_btree_node_io_error(b); 312 313 bch_bbio_count_io_errors(b->c, bio, error, "writing btree"); 314 closure_put(cl); 315 } 316 317 static void do_btree_node_write(struct btree *b) 318 { 319 struct closure *cl = &b->io.cl; 320 struct bset *i = b->sets[b->nsets].data; 321 BKEY_PADDED(key) k; 322 323 i->version = BCACHE_BSET_VERSION; 324 i->csum = btree_csum_set(b, i); 325 326 BUG_ON(b->bio); 327 b->bio = bch_bbio_alloc(b->c); 328 329 b->bio->bi_end_io = btree_node_write_endio; 330 b->bio->bi_private = &b->io.cl; 331 b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA; 332 b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); 333 bch_bio_map(b->bio, i); 334 335 /* 336 * If we're appending to a leaf node, we don't technically need FUA - 337 * this write just needs to be persisted before the next journal write, 338 * which will be marked FLUSH|FUA. 339 * 340 * Similarly if we're writing a new btree root - the pointer is going to 341 * be in the next journal entry. 342 * 343 * But if we're writing a new btree node (that isn't a root) or 344 * appending to a non leaf btree node, we need either FUA or a flush 345 * when we write the parent with the new pointer. FUA is cheaper than a 346 * flush, and writes appending to leaf nodes aren't blocking anything so 347 * just make all btree node writes FUA to keep things sane. 348 */ 349 350 bkey_copy(&k.key, &b->key); 351 SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); 352 353 if (!bio_alloc_pages(b->bio, GFP_NOIO)) { 354 int j; 355 struct bio_vec *bv; 356 void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); 357 358 bio_for_each_segment(bv, b->bio, j) 359 memcpy(page_address(bv->bv_page), 360 base + j * PAGE_SIZE, PAGE_SIZE); 361 362 bch_submit_bbio(b->bio, b->c, &k.key, 0); 363 364 continue_at(cl, btree_node_write_done, NULL); 365 } else { 366 b->bio->bi_vcnt = 0; 367 bch_bio_map(b->bio, i); 368 369 bch_submit_bbio(b->bio, b->c, &k.key, 0); 370 371 closure_sync(cl); 372 __btree_node_write_done(cl); 373 } 374 } 375 376 void bch_btree_node_write(struct btree *b, struct closure *parent) 377 { 378 struct bset *i = b->sets[b->nsets].data; 379 380 trace_bcache_btree_write(b); 381 382 BUG_ON(current->bio_list); 383 BUG_ON(b->written >= btree_blocks(b)); 384 BUG_ON(b->written && !i->keys); 385 BUG_ON(b->sets->data->seq != i->seq); 386 bch_check_key_order(b, i); 387 388 cancel_delayed_work(&b->work); 389 390 /* If caller isn't waiting for write, parent refcount is cache set */ 391 closure_lock(&b->io, parent ?: &b->c->cl); 392 393 clear_bit(BTREE_NODE_dirty, &b->flags); 394 change_bit(BTREE_NODE_write_idx, &b->flags); 395 396 do_btree_node_write(b); 397 398 b->written += set_blocks(i, b->c); 399 atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, 400 &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); 401 402 bch_btree_sort_lazy(b); 403 404 if (b->written < btree_blocks(b)) 405 bch_bset_init_next(b); 406 } 407 408 static void btree_node_write_work(struct work_struct *w) 409 { 410 struct btree *b = container_of(to_delayed_work(w), struct btree, work); 411 412 rw_lock(true, b, b->level); 413 414 if (btree_node_dirty(b)) 415 bch_btree_node_write(b, NULL); 416 rw_unlock(true, b); 417 } 418 419 static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op) 420 { 421 struct bset *i = b->sets[b->nsets].data; 422 struct btree_write *w = btree_current_write(b); 423 424 BUG_ON(!b->written); 425 BUG_ON(!i->keys); 426 427 if (!btree_node_dirty(b)) 428 queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); 429 430 set_btree_node_dirty(b); 431 432 if (op && op->journal) { 433 if (w->journal && 434 journal_pin_cmp(b->c, w, op)) { 435 atomic_dec_bug(w->journal); 436 w->journal = NULL; 437 } 438 439 if (!w->journal) { 440 w->journal = op->journal; 441 atomic_inc(w->journal); 442 } 443 } 444 445 /* Force write if set is too big */ 446 if (set_bytes(i) > PAGE_SIZE - 48 && 447 !current->bio_list) 448 bch_btree_node_write(b, NULL); 449 } 450 451 /* 452 * Btree in memory cache - allocation/freeing 453 * mca -> memory cache 454 */ 455 456 static void mca_reinit(struct btree *b) 457 { 458 unsigned i; 459 460 b->flags = 0; 461 b->written = 0; 462 b->nsets = 0; 463 464 for (i = 0; i < MAX_BSETS; i++) 465 b->sets[i].size = 0; 466 /* 467 * Second loop starts at 1 because b->sets[0]->data is the memory we 468 * allocated 469 */ 470 for (i = 1; i < MAX_BSETS; i++) 471 b->sets[i].data = NULL; 472 } 473 474 #define mca_reserve(c) (((c->root && c->root->level) \ 475 ? c->root->level : 1) * 8 + 16) 476 #define mca_can_free(c) \ 477 max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) 478 479 static void mca_data_free(struct btree *b) 480 { 481 struct bset_tree *t = b->sets; 482 BUG_ON(!closure_is_unlocked(&b->io.cl)); 483 484 if (bset_prev_bytes(b) < PAGE_SIZE) 485 kfree(t->prev); 486 else 487 free_pages((unsigned long) t->prev, 488 get_order(bset_prev_bytes(b))); 489 490 if (bset_tree_bytes(b) < PAGE_SIZE) 491 kfree(t->tree); 492 else 493 free_pages((unsigned long) t->tree, 494 get_order(bset_tree_bytes(b))); 495 496 free_pages((unsigned long) t->data, b->page_order); 497 498 t->prev = NULL; 499 t->tree = NULL; 500 t->data = NULL; 501 list_move(&b->list, &b->c->btree_cache_freed); 502 b->c->bucket_cache_used--; 503 } 504 505 static void mca_bucket_free(struct btree *b) 506 { 507 BUG_ON(btree_node_dirty(b)); 508 509 b->key.ptr[0] = 0; 510 hlist_del_init_rcu(&b->hash); 511 list_move(&b->list, &b->c->btree_cache_freeable); 512 } 513 514 static unsigned btree_order(struct bkey *k) 515 { 516 return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); 517 } 518 519 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) 520 { 521 struct bset_tree *t = b->sets; 522 BUG_ON(t->data); 523 524 b->page_order = max_t(unsigned, 525 ilog2(b->c->btree_pages), 526 btree_order(k)); 527 528 t->data = (void *) __get_free_pages(gfp, b->page_order); 529 if (!t->data) 530 goto err; 531 532 t->tree = bset_tree_bytes(b) < PAGE_SIZE 533 ? kmalloc(bset_tree_bytes(b), gfp) 534 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); 535 if (!t->tree) 536 goto err; 537 538 t->prev = bset_prev_bytes(b) < PAGE_SIZE 539 ? kmalloc(bset_prev_bytes(b), gfp) 540 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); 541 if (!t->prev) 542 goto err; 543 544 list_move(&b->list, &b->c->btree_cache); 545 b->c->bucket_cache_used++; 546 return; 547 err: 548 mca_data_free(b); 549 } 550 551 static struct btree *mca_bucket_alloc(struct cache_set *c, 552 struct bkey *k, gfp_t gfp) 553 { 554 struct btree *b = kzalloc(sizeof(struct btree), gfp); 555 if (!b) 556 return NULL; 557 558 init_rwsem(&b->lock); 559 lockdep_set_novalidate_class(&b->lock); 560 INIT_LIST_HEAD(&b->list); 561 INIT_DELAYED_WORK(&b->work, btree_node_write_work); 562 b->c = c; 563 closure_init_unlocked(&b->io); 564 565 mca_data_alloc(b, k, gfp); 566 return b; 567 } 568 569 static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) 570 { 571 lockdep_assert_held(&b->c->bucket_lock); 572 573 if (!down_write_trylock(&b->lock)) 574 return -ENOMEM; 575 576 if (b->page_order < min_order) { 577 rw_unlock(true, b); 578 return -ENOMEM; 579 } 580 581 BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 582 583 if (cl && btree_node_dirty(b)) 584 bch_btree_node_write(b, NULL); 585 586 if (cl) 587 closure_wait_event_async(&b->io.wait, cl, 588 atomic_read(&b->io.cl.remaining) == -1); 589 590 if (btree_node_dirty(b) || 591 !closure_is_unlocked(&b->io.cl) || 592 work_pending(&b->work.work)) { 593 rw_unlock(true, b); 594 return -EAGAIN; 595 } 596 597 return 0; 598 } 599 600 static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) 601 { 602 struct cache_set *c = container_of(shrink, struct cache_set, shrink); 603 struct btree *b, *t; 604 unsigned long i, nr = sc->nr_to_scan; 605 606 if (c->shrinker_disabled) 607 return 0; 608 609 if (c->try_harder) 610 return 0; 611 612 /* 613 * If nr == 0, we're supposed to return the number of items we have 614 * cached. Not allowed to return -1. 615 */ 616 if (!nr) 617 return mca_can_free(c) * c->btree_pages; 618 619 /* Return -1 if we can't do anything right now */ 620 if (sc->gfp_mask & __GFP_WAIT) 621 mutex_lock(&c->bucket_lock); 622 else if (!mutex_trylock(&c->bucket_lock)) 623 return -1; 624 625 /* 626 * It's _really_ critical that we don't free too many btree nodes - we 627 * have to always leave ourselves a reserve. The reserve is how we 628 * guarantee that allocating memory for a new btree node can always 629 * succeed, so that inserting keys into the btree can always succeed and 630 * IO can always make forward progress: 631 */ 632 nr /= c->btree_pages; 633 nr = min_t(unsigned long, nr, mca_can_free(c)); 634 635 i = 0; 636 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { 637 if (!nr) 638 break; 639 640 if (++i > 3 && 641 !mca_reap(b, NULL, 0)) { 642 mca_data_free(b); 643 rw_unlock(true, b); 644 --nr; 645 } 646 } 647 648 /* 649 * Can happen right when we first start up, before we've read in any 650 * btree nodes 651 */ 652 if (list_empty(&c->btree_cache)) 653 goto out; 654 655 for (i = 0; nr && i < c->bucket_cache_used; i++) { 656 b = list_first_entry(&c->btree_cache, struct btree, list); 657 list_rotate_left(&c->btree_cache); 658 659 if (!b->accessed && 660 !mca_reap(b, NULL, 0)) { 661 mca_bucket_free(b); 662 mca_data_free(b); 663 rw_unlock(true, b); 664 --nr; 665 } else 666 b->accessed = 0; 667 } 668 out: 669 nr = mca_can_free(c) * c->btree_pages; 670 mutex_unlock(&c->bucket_lock); 671 return nr; 672 } 673 674 void bch_btree_cache_free(struct cache_set *c) 675 { 676 struct btree *b; 677 struct closure cl; 678 closure_init_stack(&cl); 679 680 if (c->shrink.list.next) 681 unregister_shrinker(&c->shrink); 682 683 mutex_lock(&c->bucket_lock); 684 685 #ifdef CONFIG_BCACHE_DEBUG 686 if (c->verify_data) 687 list_move(&c->verify_data->list, &c->btree_cache); 688 #endif 689 690 list_splice(&c->btree_cache_freeable, 691 &c->btree_cache); 692 693 while (!list_empty(&c->btree_cache)) { 694 b = list_first_entry(&c->btree_cache, struct btree, list); 695 696 if (btree_node_dirty(b)) 697 btree_complete_write(b, btree_current_write(b)); 698 clear_bit(BTREE_NODE_dirty, &b->flags); 699 700 mca_data_free(b); 701 } 702 703 while (!list_empty(&c->btree_cache_freed)) { 704 b = list_first_entry(&c->btree_cache_freed, 705 struct btree, list); 706 list_del(&b->list); 707 cancel_delayed_work_sync(&b->work); 708 kfree(b); 709 } 710 711 mutex_unlock(&c->bucket_lock); 712 } 713 714 int bch_btree_cache_alloc(struct cache_set *c) 715 { 716 unsigned i; 717 718 /* XXX: doesn't check for errors */ 719 720 closure_init_unlocked(&c->gc); 721 722 for (i = 0; i < mca_reserve(c); i++) 723 mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 724 725 list_splice_init(&c->btree_cache, 726 &c->btree_cache_freeable); 727 728 #ifdef CONFIG_BCACHE_DEBUG 729 mutex_init(&c->verify_lock); 730 731 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 732 733 if (c->verify_data && 734 c->verify_data->sets[0].data) 735 list_del_init(&c->verify_data->list); 736 else 737 c->verify_data = NULL; 738 #endif 739 740 c->shrink.shrink = bch_mca_shrink; 741 c->shrink.seeks = 4; 742 c->shrink.batch = c->btree_pages * 2; 743 register_shrinker(&c->shrink); 744 745 return 0; 746 } 747 748 /* Btree in memory cache - hash table */ 749 750 static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) 751 { 752 return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; 753 } 754 755 static struct btree *mca_find(struct cache_set *c, struct bkey *k) 756 { 757 struct btree *b; 758 759 rcu_read_lock(); 760 hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) 761 if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) 762 goto out; 763 b = NULL; 764 out: 765 rcu_read_unlock(); 766 return b; 767 } 768 769 static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, 770 int level, struct closure *cl) 771 { 772 int ret = -ENOMEM; 773 struct btree *i; 774 775 trace_bcache_btree_cache_cannibalize(c); 776 777 if (!cl) 778 return ERR_PTR(-ENOMEM); 779 780 /* 781 * Trying to free up some memory - i.e. reuse some btree nodes - may 782 * require initiating IO to flush the dirty part of the node. If we're 783 * running under generic_make_request(), that IO will never finish and 784 * we would deadlock. Returning -EAGAIN causes the cache lookup code to 785 * punt to workqueue and retry. 786 */ 787 if (current->bio_list) 788 return ERR_PTR(-EAGAIN); 789 790 if (c->try_harder && c->try_harder != cl) { 791 closure_wait_event_async(&c->try_wait, cl, !c->try_harder); 792 return ERR_PTR(-EAGAIN); 793 } 794 795 c->try_harder = cl; 796 c->try_harder_start = local_clock(); 797 retry: 798 list_for_each_entry_reverse(i, &c->btree_cache, list) { 799 int r = mca_reap(i, cl, btree_order(k)); 800 if (!r) 801 return i; 802 if (r != -ENOMEM) 803 ret = r; 804 } 805 806 if (ret == -EAGAIN && 807 closure_blocking(cl)) { 808 mutex_unlock(&c->bucket_lock); 809 closure_sync(cl); 810 mutex_lock(&c->bucket_lock); 811 goto retry; 812 } 813 814 return ERR_PTR(ret); 815 } 816 817 /* 818 * We can only have one thread cannibalizing other cached btree nodes at a time, 819 * or we'll deadlock. We use an open coded mutex to ensure that, which a 820 * cannibalize_bucket() will take. This means every time we unlock the root of 821 * the btree, we need to release this lock if we have it held. 822 */ 823 void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) 824 { 825 if (c->try_harder == cl) { 826 bch_time_stats_update(&c->try_harder_time, c->try_harder_start); 827 c->try_harder = NULL; 828 __closure_wake_up(&c->try_wait); 829 } 830 } 831 832 static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, 833 int level, struct closure *cl) 834 { 835 struct btree *b; 836 837 lockdep_assert_held(&c->bucket_lock); 838 839 if (mca_find(c, k)) 840 return NULL; 841 842 /* btree_free() doesn't free memory; it sticks the node on the end of 843 * the list. Check if there's any freed nodes there: 844 */ 845 list_for_each_entry(b, &c->btree_cache_freeable, list) 846 if (!mca_reap(b, NULL, btree_order(k))) 847 goto out; 848 849 /* We never free struct btree itself, just the memory that holds the on 850 * disk node. Check the freed list before allocating a new one: 851 */ 852 list_for_each_entry(b, &c->btree_cache_freed, list) 853 if (!mca_reap(b, NULL, 0)) { 854 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 855 if (!b->sets[0].data) 856 goto err; 857 else 858 goto out; 859 } 860 861 b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); 862 if (!b) 863 goto err; 864 865 BUG_ON(!down_write_trylock(&b->lock)); 866 if (!b->sets->data) 867 goto err; 868 out: 869 BUG_ON(!closure_is_unlocked(&b->io.cl)); 870 871 bkey_copy(&b->key, k); 872 list_move(&b->list, &c->btree_cache); 873 hlist_del_init_rcu(&b->hash); 874 hlist_add_head_rcu(&b->hash, mca_hash(c, k)); 875 876 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 877 b->level = level; 878 879 mca_reinit(b); 880 881 return b; 882 err: 883 if (b) 884 rw_unlock(true, b); 885 886 b = mca_cannibalize(c, k, level, cl); 887 if (!IS_ERR(b)) 888 goto out; 889 890 return b; 891 } 892 893 /** 894 * bch_btree_node_get - find a btree node in the cache and lock it, reading it 895 * in from disk if necessary. 896 * 897 * If IO is necessary, it uses the closure embedded in struct btree_op to wait; 898 * if that closure is in non blocking mode, will return -EAGAIN. 899 * 900 * The btree node will have either a read or a write lock held, depending on 901 * level and op->lock. 902 */ 903 struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, 904 int level, struct btree_op *op) 905 { 906 int i = 0; 907 bool write = level <= op->lock; 908 struct btree *b; 909 910 BUG_ON(level < 0); 911 retry: 912 b = mca_find(c, k); 913 914 if (!b) { 915 if (current->bio_list) 916 return ERR_PTR(-EAGAIN); 917 918 mutex_lock(&c->bucket_lock); 919 b = mca_alloc(c, k, level, &op->cl); 920 mutex_unlock(&c->bucket_lock); 921 922 if (!b) 923 goto retry; 924 if (IS_ERR(b)) 925 return b; 926 927 bch_btree_node_read(b); 928 929 if (!write) 930 downgrade_write(&b->lock); 931 } else { 932 rw_lock(write, b, level); 933 if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { 934 rw_unlock(write, b); 935 goto retry; 936 } 937 BUG_ON(b->level != level); 938 } 939 940 b->accessed = 1; 941 942 for (; i <= b->nsets && b->sets[i].size; i++) { 943 prefetch(b->sets[i].tree); 944 prefetch(b->sets[i].data); 945 } 946 947 for (; i <= b->nsets; i++) 948 prefetch(b->sets[i].data); 949 950 if (btree_node_io_error(b)) { 951 rw_unlock(write, b); 952 return ERR_PTR(-EIO); 953 } 954 955 BUG_ON(!b->written); 956 957 return b; 958 } 959 960 static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) 961 { 962 struct btree *b; 963 964 mutex_lock(&c->bucket_lock); 965 b = mca_alloc(c, k, level, NULL); 966 mutex_unlock(&c->bucket_lock); 967 968 if (!IS_ERR_OR_NULL(b)) { 969 bch_btree_node_read(b); 970 rw_unlock(true, b); 971 } 972 } 973 974 /* Btree alloc */ 975 976 static void btree_node_free(struct btree *b, struct btree_op *op) 977 { 978 unsigned i; 979 980 trace_bcache_btree_node_free(b); 981 982 /* 983 * The BUG_ON() in btree_node_get() implies that we must have a write 984 * lock on parent to free or even invalidate a node 985 */ 986 BUG_ON(op->lock <= b->level); 987 BUG_ON(b == b->c->root); 988 989 if (btree_node_dirty(b)) 990 btree_complete_write(b, btree_current_write(b)); 991 clear_bit(BTREE_NODE_dirty, &b->flags); 992 993 cancel_delayed_work(&b->work); 994 995 mutex_lock(&b->c->bucket_lock); 996 997 for (i = 0; i < KEY_PTRS(&b->key); i++) { 998 BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); 999 1000 bch_inc_gen(PTR_CACHE(b->c, &b->key, i), 1001 PTR_BUCKET(b->c, &b->key, i)); 1002 } 1003 1004 bch_bucket_free(b->c, &b->key); 1005 mca_bucket_free(b); 1006 mutex_unlock(&b->c->bucket_lock); 1007 } 1008 1009 struct btree *bch_btree_node_alloc(struct cache_set *c, int level, 1010 struct closure *cl) 1011 { 1012 BKEY_PADDED(key) k; 1013 struct btree *b = ERR_PTR(-EAGAIN); 1014 1015 mutex_lock(&c->bucket_lock); 1016 retry: 1017 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) 1018 goto err; 1019 1020 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); 1021 1022 b = mca_alloc(c, &k.key, level, cl); 1023 if (IS_ERR(b)) 1024 goto err_free; 1025 1026 if (!b) { 1027 cache_bug(c, 1028 "Tried to allocate bucket that was in btree cache"); 1029 __bkey_put(c, &k.key); 1030 goto retry; 1031 } 1032 1033 b->accessed = 1; 1034 bch_bset_init_next(b); 1035 1036 mutex_unlock(&c->bucket_lock); 1037 1038 trace_bcache_btree_node_alloc(b); 1039 return b; 1040 err_free: 1041 bch_bucket_free(c, &k.key); 1042 __bkey_put(c, &k.key); 1043 err: 1044 mutex_unlock(&c->bucket_lock); 1045 1046 trace_bcache_btree_node_alloc_fail(b); 1047 return b; 1048 } 1049 1050 static struct btree *btree_node_alloc_replacement(struct btree *b, 1051 struct closure *cl) 1052 { 1053 struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); 1054 if (!IS_ERR_OR_NULL(n)) 1055 bch_btree_sort_into(b, n); 1056 1057 return n; 1058 } 1059 1060 /* Garbage collection */ 1061 1062 uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) 1063 { 1064 uint8_t stale = 0; 1065 unsigned i; 1066 struct bucket *g; 1067 1068 /* 1069 * ptr_invalid() can't return true for the keys that mark btree nodes as 1070 * freed, but since ptr_bad() returns true we'll never actually use them 1071 * for anything and thus we don't want mark their pointers here 1072 */ 1073 if (!bkey_cmp(k, &ZERO_KEY)) 1074 return stale; 1075 1076 for (i = 0; i < KEY_PTRS(k); i++) { 1077 if (!ptr_available(c, k, i)) 1078 continue; 1079 1080 g = PTR_BUCKET(c, k, i); 1081 1082 if (gen_after(g->gc_gen, PTR_GEN(k, i))) 1083 g->gc_gen = PTR_GEN(k, i); 1084 1085 if (ptr_stale(c, k, i)) { 1086 stale = max(stale, ptr_stale(c, k, i)); 1087 continue; 1088 } 1089 1090 cache_bug_on(GC_MARK(g) && 1091 (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), 1092 c, "inconsistent ptrs: mark = %llu, level = %i", 1093 GC_MARK(g), level); 1094 1095 if (level) 1096 SET_GC_MARK(g, GC_MARK_METADATA); 1097 else if (KEY_DIRTY(k)) 1098 SET_GC_MARK(g, GC_MARK_DIRTY); 1099 1100 /* guard against overflow */ 1101 SET_GC_SECTORS_USED(g, min_t(unsigned, 1102 GC_SECTORS_USED(g) + KEY_SIZE(k), 1103 (1 << 14) - 1)); 1104 1105 BUG_ON(!GC_SECTORS_USED(g)); 1106 } 1107 1108 return stale; 1109 } 1110 1111 #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) 1112 1113 static int btree_gc_mark_node(struct btree *b, unsigned *keys, 1114 struct gc_stat *gc) 1115 { 1116 uint8_t stale = 0; 1117 unsigned last_dev = -1; 1118 struct bcache_device *d = NULL; 1119 struct bkey *k; 1120 struct btree_iter iter; 1121 struct bset_tree *t; 1122 1123 gc->nodes++; 1124 1125 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1126 if (last_dev != KEY_INODE(k)) { 1127 last_dev = KEY_INODE(k); 1128 1129 d = KEY_INODE(k) < b->c->nr_uuids 1130 ? b->c->devices[last_dev] 1131 : NULL; 1132 } 1133 1134 stale = max(stale, btree_mark_key(b, k)); 1135 1136 if (bch_ptr_bad(b, k)) 1137 continue; 1138 1139 *keys += bkey_u64s(k); 1140 1141 gc->key_bytes += bkey_u64s(k); 1142 gc->nkeys++; 1143 1144 gc->data += KEY_SIZE(k); 1145 if (KEY_DIRTY(k)) 1146 gc->dirty += KEY_SIZE(k); 1147 } 1148 1149 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1150 btree_bug_on(t->size && 1151 bset_written(b, t) && 1152 bkey_cmp(&b->key, &t->end) < 0, 1153 b, "found short btree key in gc"); 1154 1155 return stale; 1156 } 1157 1158 static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, 1159 struct btree_op *op) 1160 { 1161 /* 1162 * We block priorities from being written for the duration of garbage 1163 * collection, so we can't sleep in btree_alloc() -> 1164 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it 1165 * our closure. 1166 */ 1167 struct btree *n = btree_node_alloc_replacement(b, NULL); 1168 1169 if (!IS_ERR_OR_NULL(n)) { 1170 swap(b, n); 1171 __bkey_put(b->c, &b->key); 1172 1173 memcpy(k->ptr, b->key.ptr, 1174 sizeof(uint64_t) * KEY_PTRS(&b->key)); 1175 1176 btree_node_free(n, op); 1177 up_write(&n->lock); 1178 } 1179 1180 return b; 1181 } 1182 1183 /* 1184 * Leaving this at 2 until we've got incremental garbage collection done; it 1185 * could be higher (and has been tested with 4) except that garbage collection 1186 * could take much longer, adversely affecting latency. 1187 */ 1188 #define GC_MERGE_NODES 2U 1189 1190 struct gc_merge_info { 1191 struct btree *b; 1192 struct bkey *k; 1193 unsigned keys; 1194 }; 1195 1196 static void btree_gc_coalesce(struct btree *b, struct btree_op *op, 1197 struct gc_stat *gc, struct gc_merge_info *r) 1198 { 1199 unsigned nodes = 0, keys = 0, blocks; 1200 int i; 1201 1202 while (nodes < GC_MERGE_NODES && r[nodes].b) 1203 keys += r[nodes++].keys; 1204 1205 blocks = btree_default_blocks(b->c) * 2 / 3; 1206 1207 if (nodes < 2 || 1208 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) 1209 return; 1210 1211 for (i = nodes - 1; i >= 0; --i) { 1212 if (r[i].b->written) 1213 r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); 1214 1215 if (r[i].b->written) 1216 return; 1217 } 1218 1219 for (i = nodes - 1; i > 0; --i) { 1220 struct bset *n1 = r[i].b->sets->data; 1221 struct bset *n2 = r[i - 1].b->sets->data; 1222 struct bkey *k, *last = NULL; 1223 1224 keys = 0; 1225 1226 if (i == 1) { 1227 /* 1228 * Last node we're not getting rid of - we're getting 1229 * rid of the node at r[0]. Have to try and fit all of 1230 * the remaining keys into this node; we can't ensure 1231 * they will always fit due to rounding and variable 1232 * length keys (shouldn't be possible in practice, 1233 * though) 1234 */ 1235 if (__set_blocks(n1, n1->keys + r->keys, 1236 b->c) > btree_blocks(r[i].b)) 1237 return; 1238 1239 keys = n2->keys; 1240 last = &r->b->key; 1241 } else 1242 for (k = n2->start; 1243 k < end(n2); 1244 k = bkey_next(k)) { 1245 if (__set_blocks(n1, n1->keys + keys + 1246 bkey_u64s(k), b->c) > blocks) 1247 break; 1248 1249 last = k; 1250 keys += bkey_u64s(k); 1251 } 1252 1253 BUG_ON(__set_blocks(n1, n1->keys + keys, 1254 b->c) > btree_blocks(r[i].b)); 1255 1256 if (last) { 1257 bkey_copy_key(&r[i].b->key, last); 1258 bkey_copy_key(r[i].k, last); 1259 } 1260 1261 memcpy(end(n1), 1262 n2->start, 1263 (void *) node(n2, keys) - (void *) n2->start); 1264 1265 n1->keys += keys; 1266 1267 memmove(n2->start, 1268 node(n2, keys), 1269 (void *) end(n2) - (void *) node(n2, keys)); 1270 1271 n2->keys -= keys; 1272 1273 r[i].keys = n1->keys; 1274 r[i - 1].keys = n2->keys; 1275 } 1276 1277 btree_node_free(r->b, op); 1278 up_write(&r->b->lock); 1279 1280 trace_bcache_btree_gc_coalesce(nodes); 1281 1282 gc->nodes--; 1283 nodes--; 1284 1285 memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); 1286 memset(&r[nodes], 0, sizeof(struct gc_merge_info)); 1287 } 1288 1289 static int btree_gc_recurse(struct btree *b, struct btree_op *op, 1290 struct closure *writes, struct gc_stat *gc) 1291 { 1292 void write(struct btree *r) 1293 { 1294 if (!r->written) 1295 bch_btree_node_write(r, &op->cl); 1296 else if (btree_node_dirty(r)) 1297 bch_btree_node_write(r, writes); 1298 1299 up_write(&r->lock); 1300 } 1301 1302 int ret = 0, stale; 1303 unsigned i; 1304 struct gc_merge_info r[GC_MERGE_NODES]; 1305 1306 memset(r, 0, sizeof(r)); 1307 1308 while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { 1309 r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); 1310 1311 if (IS_ERR(r->b)) { 1312 ret = PTR_ERR(r->b); 1313 break; 1314 } 1315 1316 r->keys = 0; 1317 stale = btree_gc_mark_node(r->b, &r->keys, gc); 1318 1319 if (!b->written && 1320 (r->b->level || stale > 10 || 1321 b->c->gc_always_rewrite)) 1322 r->b = btree_gc_alloc(r->b, r->k, op); 1323 1324 if (r->b->level) 1325 ret = btree_gc_recurse(r->b, op, writes, gc); 1326 1327 if (ret) { 1328 write(r->b); 1329 break; 1330 } 1331 1332 bkey_copy_key(&b->c->gc_done, r->k); 1333 1334 if (!b->written) 1335 btree_gc_coalesce(b, op, gc, r); 1336 1337 if (r[GC_MERGE_NODES - 1].b) 1338 write(r[GC_MERGE_NODES - 1].b); 1339 1340 memmove(&r[1], &r[0], 1341 sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); 1342 1343 /* When we've got incremental GC working, we'll want to do 1344 * if (should_resched()) 1345 * return -EAGAIN; 1346 */ 1347 cond_resched(); 1348 #if 0 1349 if (need_resched()) { 1350 ret = -EAGAIN; 1351 break; 1352 } 1353 #endif 1354 } 1355 1356 for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) 1357 write(r[i].b); 1358 1359 /* Might have freed some children, must remove their keys */ 1360 if (!b->written) 1361 bch_btree_sort(b); 1362 1363 return ret; 1364 } 1365 1366 static int bch_btree_gc_root(struct btree *b, struct btree_op *op, 1367 struct closure *writes, struct gc_stat *gc) 1368 { 1369 struct btree *n = NULL; 1370 unsigned keys = 0; 1371 int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); 1372 1373 if (b->level || stale > 10) 1374 n = btree_node_alloc_replacement(b, NULL); 1375 1376 if (!IS_ERR_OR_NULL(n)) 1377 swap(b, n); 1378 1379 if (b->level) 1380 ret = btree_gc_recurse(b, op, writes, gc); 1381 1382 if (!b->written || btree_node_dirty(b)) { 1383 bch_btree_node_write(b, n ? &op->cl : NULL); 1384 } 1385 1386 if (!IS_ERR_OR_NULL(n)) { 1387 closure_sync(&op->cl); 1388 bch_btree_set_root(b); 1389 btree_node_free(n, op); 1390 rw_unlock(true, b); 1391 } 1392 1393 return ret; 1394 } 1395 1396 static void btree_gc_start(struct cache_set *c) 1397 { 1398 struct cache *ca; 1399 struct bucket *b; 1400 unsigned i; 1401 1402 if (!c->gc_mark_valid) 1403 return; 1404 1405 mutex_lock(&c->bucket_lock); 1406 1407 c->gc_mark_valid = 0; 1408 c->gc_done = ZERO_KEY; 1409 1410 for_each_cache(ca, c, i) 1411 for_each_bucket(b, ca) { 1412 b->gc_gen = b->gen; 1413 if (!atomic_read(&b->pin)) { 1414 SET_GC_MARK(b, GC_MARK_RECLAIMABLE); 1415 SET_GC_SECTORS_USED(b, 0); 1416 } 1417 } 1418 1419 mutex_unlock(&c->bucket_lock); 1420 } 1421 1422 size_t bch_btree_gc_finish(struct cache_set *c) 1423 { 1424 size_t available = 0; 1425 struct bucket *b; 1426 struct cache *ca; 1427 unsigned i; 1428 1429 mutex_lock(&c->bucket_lock); 1430 1431 set_gc_sectors(c); 1432 c->gc_mark_valid = 1; 1433 c->need_gc = 0; 1434 1435 if (c->root) 1436 for (i = 0; i < KEY_PTRS(&c->root->key); i++) 1437 SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), 1438 GC_MARK_METADATA); 1439 1440 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) 1441 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), 1442 GC_MARK_METADATA); 1443 1444 for_each_cache(ca, c, i) { 1445 uint64_t *i; 1446 1447 ca->invalidate_needs_gc = 0; 1448 1449 for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) 1450 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); 1451 1452 for (i = ca->prio_buckets; 1453 i < ca->prio_buckets + prio_buckets(ca) * 2; i++) 1454 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); 1455 1456 for_each_bucket(b, ca) { 1457 b->last_gc = b->gc_gen; 1458 c->need_gc = max(c->need_gc, bucket_gc_gen(b)); 1459 1460 if (!atomic_read(&b->pin) && 1461 GC_MARK(b) == GC_MARK_RECLAIMABLE) { 1462 available++; 1463 if (!GC_SECTORS_USED(b)) 1464 bch_bucket_add_unused(ca, b); 1465 } 1466 } 1467 } 1468 1469 mutex_unlock(&c->bucket_lock); 1470 return available; 1471 } 1472 1473 static void bch_btree_gc(struct closure *cl) 1474 { 1475 struct cache_set *c = container_of(cl, struct cache_set, gc.cl); 1476 int ret; 1477 unsigned long available; 1478 struct gc_stat stats; 1479 struct closure writes; 1480 struct btree_op op; 1481 uint64_t start_time = local_clock(); 1482 1483 trace_bcache_gc_start(c); 1484 1485 memset(&stats, 0, sizeof(struct gc_stat)); 1486 closure_init_stack(&writes); 1487 bch_btree_op_init_stack(&op); 1488 op.lock = SHRT_MAX; 1489 1490 btree_gc_start(c); 1491 1492 atomic_inc(&c->prio_blocked); 1493 1494 ret = btree_root(gc_root, c, &op, &writes, &stats); 1495 closure_sync(&op.cl); 1496 closure_sync(&writes); 1497 1498 if (ret) { 1499 pr_warn("gc failed!"); 1500 continue_at(cl, bch_btree_gc, bch_gc_wq); 1501 } 1502 1503 /* Possibly wait for new UUIDs or whatever to hit disk */ 1504 bch_journal_meta(c, &op.cl); 1505 closure_sync(&op.cl); 1506 1507 available = bch_btree_gc_finish(c); 1508 1509 atomic_dec(&c->prio_blocked); 1510 wake_up_allocators(c); 1511 1512 bch_time_stats_update(&c->btree_gc_time, start_time); 1513 1514 stats.key_bytes *= sizeof(uint64_t); 1515 stats.dirty <<= 9; 1516 stats.data <<= 9; 1517 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; 1518 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); 1519 1520 trace_bcache_gc_end(c); 1521 1522 continue_at(cl, bch_moving_gc, bch_gc_wq); 1523 } 1524 1525 void bch_queue_gc(struct cache_set *c) 1526 { 1527 closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); 1528 } 1529 1530 /* Initial partial gc */ 1531 1532 static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, 1533 unsigned long **seen) 1534 { 1535 int ret; 1536 unsigned i; 1537 struct bkey *k; 1538 struct bucket *g; 1539 struct btree_iter iter; 1540 1541 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1542 for (i = 0; i < KEY_PTRS(k); i++) { 1543 if (!ptr_available(b->c, k, i)) 1544 continue; 1545 1546 g = PTR_BUCKET(b->c, k, i); 1547 1548 if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), 1549 seen[PTR_DEV(k, i)]) || 1550 !ptr_stale(b->c, k, i)) { 1551 g->gen = PTR_GEN(k, i); 1552 1553 if (b->level) 1554 g->prio = BTREE_PRIO; 1555 else if (g->prio == BTREE_PRIO) 1556 g->prio = INITIAL_PRIO; 1557 } 1558 } 1559 1560 btree_mark_key(b, k); 1561 } 1562 1563 if (b->level) { 1564 k = bch_next_recurse_key(b, &ZERO_KEY); 1565 1566 while (k) { 1567 struct bkey *p = bch_next_recurse_key(b, k); 1568 if (p) 1569 btree_node_prefetch(b->c, p, b->level - 1); 1570 1571 ret = btree(check_recurse, k, b, op, seen); 1572 if (ret) 1573 return ret; 1574 1575 k = p; 1576 } 1577 } 1578 1579 return 0; 1580 } 1581 1582 int bch_btree_check(struct cache_set *c, struct btree_op *op) 1583 { 1584 int ret = -ENOMEM; 1585 unsigned i; 1586 unsigned long *seen[MAX_CACHES_PER_SET]; 1587 1588 memset(seen, 0, sizeof(seen)); 1589 1590 for (i = 0; c->cache[i]; i++) { 1591 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); 1592 seen[i] = kmalloc(n, GFP_KERNEL); 1593 if (!seen[i]) 1594 goto err; 1595 1596 /* Disables the seen array until prio_read() uses it too */ 1597 memset(seen[i], 0xFF, n); 1598 } 1599 1600 ret = btree_root(check_recurse, c, op, seen); 1601 err: 1602 for (i = 0; i < MAX_CACHES_PER_SET; i++) 1603 kfree(seen[i]); 1604 return ret; 1605 } 1606 1607 /* Btree insertion */ 1608 1609 static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) 1610 { 1611 struct bset *i = b->sets[b->nsets].data; 1612 1613 memmove((uint64_t *) where + bkey_u64s(insert), 1614 where, 1615 (void *) end(i) - (void *) where); 1616 1617 i->keys += bkey_u64s(insert); 1618 bkey_copy(where, insert); 1619 bch_bset_fix_lookup_table(b, where); 1620 } 1621 1622 static bool fix_overlapping_extents(struct btree *b, 1623 struct bkey *insert, 1624 struct btree_iter *iter, 1625 struct btree_op *op) 1626 { 1627 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) 1628 { 1629 if (KEY_DIRTY(k)) 1630 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 1631 offset, -sectors); 1632 } 1633 1634 uint64_t old_offset; 1635 unsigned old_size, sectors_found = 0; 1636 1637 while (1) { 1638 struct bkey *k = bch_btree_iter_next(iter); 1639 if (!k || 1640 bkey_cmp(&START_KEY(k), insert) >= 0) 1641 break; 1642 1643 if (bkey_cmp(k, &START_KEY(insert)) <= 0) 1644 continue; 1645 1646 old_offset = KEY_START(k); 1647 old_size = KEY_SIZE(k); 1648 1649 /* 1650 * We might overlap with 0 size extents; we can't skip these 1651 * because if they're in the set we're inserting to we have to 1652 * adjust them so they don't overlap with the key we're 1653 * inserting. But we don't want to check them for BTREE_REPLACE 1654 * operations. 1655 */ 1656 1657 if (op->type == BTREE_REPLACE && 1658 KEY_SIZE(k)) { 1659 /* 1660 * k might have been split since we inserted/found the 1661 * key we're replacing 1662 */ 1663 unsigned i; 1664 uint64_t offset = KEY_START(k) - 1665 KEY_START(&op->replace); 1666 1667 /* But it must be a subset of the replace key */ 1668 if (KEY_START(k) < KEY_START(&op->replace) || 1669 KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) 1670 goto check_failed; 1671 1672 /* We didn't find a key that we were supposed to */ 1673 if (KEY_START(k) > KEY_START(insert) + sectors_found) 1674 goto check_failed; 1675 1676 if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) 1677 goto check_failed; 1678 1679 /* skip past gen */ 1680 offset <<= 8; 1681 1682 BUG_ON(!KEY_PTRS(&op->replace)); 1683 1684 for (i = 0; i < KEY_PTRS(&op->replace); i++) 1685 if (k->ptr[i] != op->replace.ptr[i] + offset) 1686 goto check_failed; 1687 1688 sectors_found = KEY_OFFSET(k) - KEY_START(insert); 1689 } 1690 1691 if (bkey_cmp(insert, k) < 0 && 1692 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { 1693 /* 1694 * We overlapped in the middle of an existing key: that 1695 * means we have to split the old key. But we have to do 1696 * slightly different things depending on whether the 1697 * old key has been written out yet. 1698 */ 1699 1700 struct bkey *top; 1701 1702 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); 1703 1704 if (bkey_written(b, k)) { 1705 /* 1706 * We insert a new key to cover the top of the 1707 * old key, and the old key is modified in place 1708 * to represent the bottom split. 1709 * 1710 * It's completely arbitrary whether the new key 1711 * is the top or the bottom, but it has to match 1712 * up with what btree_sort_fixup() does - it 1713 * doesn't check for this kind of overlap, it 1714 * depends on us inserting a new key for the top 1715 * here. 1716 */ 1717 top = bch_bset_search(b, &b->sets[b->nsets], 1718 insert); 1719 shift_keys(b, top, k); 1720 } else { 1721 BKEY_PADDED(key) temp; 1722 bkey_copy(&temp.key, k); 1723 shift_keys(b, k, &temp.key); 1724 top = bkey_next(k); 1725 } 1726 1727 bch_cut_front(insert, top); 1728 bch_cut_back(&START_KEY(insert), k); 1729 bch_bset_fix_invalidated_key(b, k); 1730 return false; 1731 } 1732 1733 if (bkey_cmp(insert, k) < 0) { 1734 bch_cut_front(insert, k); 1735 } else { 1736 if (bkey_written(b, k) && 1737 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { 1738 /* 1739 * Completely overwrote, so we don't have to 1740 * invalidate the binary search tree 1741 */ 1742 bch_cut_front(k, k); 1743 } else { 1744 __bch_cut_back(&START_KEY(insert), k); 1745 bch_bset_fix_invalidated_key(b, k); 1746 } 1747 } 1748 1749 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); 1750 } 1751 1752 check_failed: 1753 if (op->type == BTREE_REPLACE) { 1754 if (!sectors_found) { 1755 op->insert_collision = true; 1756 return true; 1757 } else if (sectors_found < KEY_SIZE(insert)) { 1758 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - 1759 (KEY_SIZE(insert) - sectors_found)); 1760 SET_KEY_SIZE(insert, sectors_found); 1761 } 1762 } 1763 1764 return false; 1765 } 1766 1767 static bool btree_insert_key(struct btree *b, struct btree_op *op, 1768 struct bkey *k) 1769 { 1770 struct bset *i = b->sets[b->nsets].data; 1771 struct bkey *m, *prev; 1772 unsigned status = BTREE_INSERT_STATUS_INSERT; 1773 1774 BUG_ON(bkey_cmp(k, &b->key) > 0); 1775 BUG_ON(b->level && !KEY_PTRS(k)); 1776 BUG_ON(!b->level && !KEY_OFFSET(k)); 1777 1778 if (!b->level) { 1779 struct btree_iter iter; 1780 struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); 1781 1782 /* 1783 * bset_search() returns the first key that is strictly greater 1784 * than the search key - but for back merging, we want to find 1785 * the first key that is greater than or equal to KEY_START(k) - 1786 * unless KEY_START(k) is 0. 1787 */ 1788 if (KEY_OFFSET(&search)) 1789 SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); 1790 1791 prev = NULL; 1792 m = bch_btree_iter_init(b, &iter, &search); 1793 1794 if (fix_overlapping_extents(b, k, &iter, op)) 1795 return false; 1796 1797 while (m != end(i) && 1798 bkey_cmp(k, &START_KEY(m)) > 0) 1799 prev = m, m = bkey_next(m); 1800 1801 if (key_merging_disabled(b->c)) 1802 goto insert; 1803 1804 /* prev is in the tree, if we merge we're done */ 1805 status = BTREE_INSERT_STATUS_BACK_MERGE; 1806 if (prev && 1807 bch_bkey_try_merge(b, prev, k)) 1808 goto merged; 1809 1810 status = BTREE_INSERT_STATUS_OVERWROTE; 1811 if (m != end(i) && 1812 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) 1813 goto copy; 1814 1815 status = BTREE_INSERT_STATUS_FRONT_MERGE; 1816 if (m != end(i) && 1817 bch_bkey_try_merge(b, k, m)) 1818 goto copy; 1819 } else 1820 m = bch_bset_search(b, &b->sets[b->nsets], k); 1821 1822 insert: shift_keys(b, m, k); 1823 copy: bkey_copy(m, k); 1824 merged: 1825 if (KEY_DIRTY(k)) 1826 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 1827 KEY_START(k), KEY_SIZE(k)); 1828 1829 bch_check_keys(b, "%u for %s", status, op_type(op)); 1830 1831 if (b->level && !KEY_OFFSET(k)) 1832 btree_current_write(b)->prio_blocked++; 1833 1834 trace_bcache_btree_insert_key(b, k, op->type, status); 1835 1836 return true; 1837 } 1838 1839 static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) 1840 { 1841 bool ret = false; 1842 struct bkey *k; 1843 unsigned oldsize = bch_count_data(b); 1844 1845 while ((k = bch_keylist_pop(&op->keys))) { 1846 bkey_put(b->c, k, b->level); 1847 ret |= btree_insert_key(b, op, k); 1848 } 1849 1850 BUG_ON(bch_count_data(b) < oldsize); 1851 return ret; 1852 } 1853 1854 bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, 1855 struct bio *bio) 1856 { 1857 bool ret = false; 1858 uint64_t btree_ptr = b->key.ptr[0]; 1859 unsigned long seq = b->seq; 1860 BKEY_PADDED(k) tmp; 1861 1862 rw_unlock(false, b); 1863 rw_lock(true, b, b->level); 1864 1865 if (b->key.ptr[0] != btree_ptr || 1866 b->seq != seq + 1 || 1867 should_split(b)) 1868 goto out; 1869 1870 op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); 1871 1872 SET_KEY_PTRS(&op->replace, 1); 1873 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); 1874 1875 SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); 1876 1877 bkey_copy(&tmp.k, &op->replace); 1878 1879 BUG_ON(op->type != BTREE_INSERT); 1880 BUG_ON(!btree_insert_key(b, op, &tmp.k)); 1881 ret = true; 1882 out: 1883 downgrade_write(&b->lock); 1884 return ret; 1885 } 1886 1887 static int btree_split(struct btree *b, struct btree_op *op) 1888 { 1889 bool split, root = b == b->c->root; 1890 struct btree *n1, *n2 = NULL, *n3 = NULL; 1891 uint64_t start_time = local_clock(); 1892 1893 if (b->level) 1894 set_closure_blocking(&op->cl); 1895 1896 n1 = btree_node_alloc_replacement(b, &op->cl); 1897 if (IS_ERR(n1)) 1898 goto err; 1899 1900 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; 1901 1902 if (split) { 1903 unsigned keys = 0; 1904 1905 trace_bcache_btree_node_split(b, n1->sets[0].data->keys); 1906 1907 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); 1908 if (IS_ERR(n2)) 1909 goto err_free1; 1910 1911 if (root) { 1912 n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); 1913 if (IS_ERR(n3)) 1914 goto err_free2; 1915 } 1916 1917 bch_btree_insert_keys(n1, op); 1918 1919 /* Has to be a linear search because we don't have an auxiliary 1920 * search tree yet 1921 */ 1922 1923 while (keys < (n1->sets[0].data->keys * 3) / 5) 1924 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1925 1926 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); 1927 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1928 1929 n2->sets[0].data->keys = n1->sets[0].data->keys - keys; 1930 n1->sets[0].data->keys = keys; 1931 1932 memcpy(n2->sets[0].data->start, 1933 end(n1->sets[0].data), 1934 n2->sets[0].data->keys * sizeof(uint64_t)); 1935 1936 bkey_copy_key(&n2->key, &b->key); 1937 1938 bch_keylist_add(&op->keys, &n2->key); 1939 bch_btree_node_write(n2, &op->cl); 1940 rw_unlock(true, n2); 1941 } else { 1942 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); 1943 1944 bch_btree_insert_keys(n1, op); 1945 } 1946 1947 bch_keylist_add(&op->keys, &n1->key); 1948 bch_btree_node_write(n1, &op->cl); 1949 1950 if (n3) { 1951 bkey_copy_key(&n3->key, &MAX_KEY); 1952 bch_btree_insert_keys(n3, op); 1953 bch_btree_node_write(n3, &op->cl); 1954 1955 closure_sync(&op->cl); 1956 bch_btree_set_root(n3); 1957 rw_unlock(true, n3); 1958 } else if (root) { 1959 op->keys.top = op->keys.bottom; 1960 closure_sync(&op->cl); 1961 bch_btree_set_root(n1); 1962 } else { 1963 unsigned i; 1964 1965 bkey_copy(op->keys.top, &b->key); 1966 bkey_copy_key(op->keys.top, &ZERO_KEY); 1967 1968 for (i = 0; i < KEY_PTRS(&b->key); i++) { 1969 uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; 1970 1971 SET_PTR_GEN(op->keys.top, i, g); 1972 } 1973 1974 bch_keylist_push(&op->keys); 1975 closure_sync(&op->cl); 1976 atomic_inc(&b->c->prio_blocked); 1977 } 1978 1979 rw_unlock(true, n1); 1980 btree_node_free(b, op); 1981 1982 bch_time_stats_update(&b->c->btree_split_time, start_time); 1983 1984 return 0; 1985 err_free2: 1986 __bkey_put(n2->c, &n2->key); 1987 btree_node_free(n2, op); 1988 rw_unlock(true, n2); 1989 err_free1: 1990 __bkey_put(n1->c, &n1->key); 1991 btree_node_free(n1, op); 1992 rw_unlock(true, n1); 1993 err: 1994 if (n3 == ERR_PTR(-EAGAIN) || 1995 n2 == ERR_PTR(-EAGAIN) || 1996 n1 == ERR_PTR(-EAGAIN)) 1997 return -EAGAIN; 1998 1999 pr_warn("couldn't split"); 2000 return -ENOMEM; 2001 } 2002 2003 static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, 2004 struct keylist *stack_keys) 2005 { 2006 if (b->level) { 2007 int ret; 2008 struct bkey *insert = op->keys.bottom; 2009 struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); 2010 2011 if (!k) { 2012 btree_bug(b, "no key to recurse on at level %i/%i", 2013 b->level, b->c->root->level); 2014 2015 op->keys.top = op->keys.bottom; 2016 return -EIO; 2017 } 2018 2019 if (bkey_cmp(insert, k) > 0) { 2020 unsigned i; 2021 2022 if (op->type == BTREE_REPLACE) { 2023 __bkey_put(b->c, insert); 2024 op->keys.top = op->keys.bottom; 2025 op->insert_collision = true; 2026 return 0; 2027 } 2028 2029 for (i = 0; i < KEY_PTRS(insert); i++) 2030 atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); 2031 2032 bkey_copy(stack_keys->top, insert); 2033 2034 bch_cut_back(k, insert); 2035 bch_cut_front(k, stack_keys->top); 2036 2037 bch_keylist_push(stack_keys); 2038 } 2039 2040 ret = btree(insert_recurse, k, b, op, stack_keys); 2041 if (ret) 2042 return ret; 2043 } 2044 2045 if (!bch_keylist_empty(&op->keys)) { 2046 if (should_split(b)) { 2047 if (op->lock <= b->c->root->level) { 2048 BUG_ON(b->level); 2049 op->lock = b->c->root->level + 1; 2050 return -EINTR; 2051 } 2052 return btree_split(b, op); 2053 } 2054 2055 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2056 2057 if (bch_btree_insert_keys(b, op)) { 2058 if (!b->level) 2059 bch_btree_leaf_dirty(b, op); 2060 else 2061 bch_btree_node_write(b, &op->cl); 2062 } 2063 } 2064 2065 return 0; 2066 } 2067 2068 int bch_btree_insert(struct btree_op *op, struct cache_set *c) 2069 { 2070 int ret = 0; 2071 struct keylist stack_keys; 2072 2073 /* 2074 * Don't want to block with the btree locked unless we have to, 2075 * otherwise we get deadlocks with try_harder and between split/gc 2076 */ 2077 clear_closure_blocking(&op->cl); 2078 2079 BUG_ON(bch_keylist_empty(&op->keys)); 2080 bch_keylist_copy(&stack_keys, &op->keys); 2081 bch_keylist_init(&op->keys); 2082 2083 while (!bch_keylist_empty(&stack_keys) || 2084 !bch_keylist_empty(&op->keys)) { 2085 if (bch_keylist_empty(&op->keys)) { 2086 bch_keylist_add(&op->keys, 2087 bch_keylist_pop(&stack_keys)); 2088 op->lock = 0; 2089 } 2090 2091 ret = btree_root(insert_recurse, c, op, &stack_keys); 2092 2093 if (ret == -EAGAIN) { 2094 ret = 0; 2095 closure_sync(&op->cl); 2096 } else if (ret) { 2097 struct bkey *k; 2098 2099 pr_err("error %i trying to insert key for %s", 2100 ret, op_type(op)); 2101 2102 while ((k = bch_keylist_pop(&stack_keys) ?: 2103 bch_keylist_pop(&op->keys))) 2104 bkey_put(c, k, 0); 2105 } 2106 } 2107 2108 bch_keylist_free(&stack_keys); 2109 2110 if (op->journal) 2111 atomic_dec_bug(op->journal); 2112 op->journal = NULL; 2113 return ret; 2114 } 2115 2116 void bch_btree_set_root(struct btree *b) 2117 { 2118 unsigned i; 2119 struct closure cl; 2120 2121 closure_init_stack(&cl); 2122 2123 trace_bcache_btree_set_root(b); 2124 2125 BUG_ON(!b->written); 2126 2127 for (i = 0; i < KEY_PTRS(&b->key); i++) 2128 BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); 2129 2130 mutex_lock(&b->c->bucket_lock); 2131 list_del_init(&b->list); 2132 mutex_unlock(&b->c->bucket_lock); 2133 2134 b->c->root = b; 2135 __bkey_put(b->c, &b->key); 2136 2137 bch_journal_meta(b->c, &cl); 2138 closure_sync(&cl); 2139 } 2140 2141 /* Cache lookup */ 2142 2143 static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, 2144 struct bkey *k) 2145 { 2146 struct search *s = container_of(op, struct search, op); 2147 struct bio *bio = &s->bio.bio; 2148 int ret = 0; 2149 2150 while (!ret && 2151 !op->lookup_done) { 2152 unsigned sectors = INT_MAX; 2153 2154 if (KEY_INODE(k) == op->inode) { 2155 if (KEY_START(k) <= bio->bi_sector) 2156 break; 2157 2158 sectors = min_t(uint64_t, sectors, 2159 KEY_START(k) - bio->bi_sector); 2160 } 2161 2162 ret = s->d->cache_miss(b, s, bio, sectors); 2163 } 2164 2165 return ret; 2166 } 2167 2168 /* 2169 * Read from a single key, handling the initial cache miss if the key starts in 2170 * the middle of the bio 2171 */ 2172 static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, 2173 struct bkey *k) 2174 { 2175 struct search *s = container_of(op, struct search, op); 2176 struct bio *bio = &s->bio.bio; 2177 unsigned ptr; 2178 struct bio *n; 2179 2180 int ret = submit_partial_cache_miss(b, op, k); 2181 if (ret || op->lookup_done) 2182 return ret; 2183 2184 /* XXX: figure out best pointer - for multiple cache devices */ 2185 ptr = 0; 2186 2187 PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; 2188 2189 while (!op->lookup_done && 2190 KEY_INODE(k) == op->inode && 2191 bio->bi_sector < KEY_OFFSET(k)) { 2192 struct bkey *bio_key; 2193 sector_t sector = PTR_OFFSET(k, ptr) + 2194 (bio->bi_sector - KEY_START(k)); 2195 unsigned sectors = min_t(uint64_t, INT_MAX, 2196 KEY_OFFSET(k) - bio->bi_sector); 2197 2198 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 2199 if (n == bio) 2200 op->lookup_done = true; 2201 2202 bio_key = &container_of(n, struct bbio, bio)->key; 2203 2204 /* 2205 * The bucket we're reading from might be reused while our bio 2206 * is in flight, and we could then end up reading the wrong 2207 * data. 2208 * 2209 * We guard against this by checking (in cache_read_endio()) if 2210 * the pointer is stale again; if so, we treat it as an error 2211 * and reread from the backing device (but we don't pass that 2212 * error up anywhere). 2213 */ 2214 2215 bch_bkey_copy_single_ptr(bio_key, k, ptr); 2216 SET_PTR_OFFSET(bio_key, 0, sector); 2217 2218 n->bi_end_io = bch_cache_read_endio; 2219 n->bi_private = &s->cl; 2220 2221 __bch_submit_bbio(n, b->c); 2222 } 2223 2224 return 0; 2225 } 2226 2227 int bch_btree_search_recurse(struct btree *b, struct btree_op *op) 2228 { 2229 struct search *s = container_of(op, struct search, op); 2230 struct bio *bio = &s->bio.bio; 2231 2232 int ret = 0; 2233 struct bkey *k; 2234 struct btree_iter iter; 2235 bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); 2236 2237 do { 2238 k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); 2239 if (!k) { 2240 /* 2241 * b->key would be exactly what we want, except that 2242 * pointers to btree nodes have nonzero size - we 2243 * wouldn't go far enough 2244 */ 2245 2246 ret = submit_partial_cache_miss(b, op, 2247 &KEY(KEY_INODE(&b->key), 2248 KEY_OFFSET(&b->key), 0)); 2249 break; 2250 } 2251 2252 ret = b->level 2253 ? btree(search_recurse, k, b, op) 2254 : submit_partial_cache_hit(b, op, k); 2255 } while (!ret && 2256 !op->lookup_done); 2257 2258 return ret; 2259 } 2260 2261 /* Keybuf code */ 2262 2263 static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) 2264 { 2265 /* Overlapping keys compare equal */ 2266 if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) 2267 return -1; 2268 if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) 2269 return 1; 2270 return 0; 2271 } 2272 2273 static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, 2274 struct keybuf_key *r) 2275 { 2276 return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); 2277 } 2278 2279 static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, 2280 struct keybuf *buf, struct bkey *end, 2281 keybuf_pred_fn *pred) 2282 { 2283 struct btree_iter iter; 2284 bch_btree_iter_init(b, &iter, &buf->last_scanned); 2285 2286 while (!array_freelist_empty(&buf->freelist)) { 2287 struct bkey *k = bch_btree_iter_next_filter(&iter, b, 2288 bch_ptr_bad); 2289 2290 if (!b->level) { 2291 if (!k) { 2292 buf->last_scanned = b->key; 2293 break; 2294 } 2295 2296 buf->last_scanned = *k; 2297 if (bkey_cmp(&buf->last_scanned, end) >= 0) 2298 break; 2299 2300 if (pred(buf, k)) { 2301 struct keybuf_key *w; 2302 2303 spin_lock(&buf->lock); 2304 2305 w = array_alloc(&buf->freelist); 2306 2307 w->private = NULL; 2308 bkey_copy(&w->key, k); 2309 2310 if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) 2311 array_free(&buf->freelist, w); 2312 2313 spin_unlock(&buf->lock); 2314 } 2315 } else { 2316 if (!k) 2317 break; 2318 2319 btree(refill_keybuf, k, b, op, buf, end, pred); 2320 /* 2321 * Might get an error here, but can't really do anything 2322 * and it'll get logged elsewhere. Just read what we 2323 * can. 2324 */ 2325 2326 if (bkey_cmp(&buf->last_scanned, end) >= 0) 2327 break; 2328 2329 cond_resched(); 2330 } 2331 } 2332 2333 return 0; 2334 } 2335 2336 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, 2337 struct bkey *end, keybuf_pred_fn *pred) 2338 { 2339 struct bkey start = buf->last_scanned; 2340 struct btree_op op; 2341 bch_btree_op_init_stack(&op); 2342 2343 cond_resched(); 2344 2345 btree_root(refill_keybuf, c, &op, buf, end, pred); 2346 closure_sync(&op.cl); 2347 2348 pr_debug("found %s keys from %llu:%llu to %llu:%llu", 2349 RB_EMPTY_ROOT(&buf->keys) ? "no" : 2350 array_freelist_empty(&buf->freelist) ? "some" : "a few", 2351 KEY_INODE(&start), KEY_OFFSET(&start), 2352 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); 2353 2354 spin_lock(&buf->lock); 2355 2356 if (!RB_EMPTY_ROOT(&buf->keys)) { 2357 struct keybuf_key *w; 2358 w = RB_FIRST(&buf->keys, struct keybuf_key, node); 2359 buf->start = START_KEY(&w->key); 2360 2361 w = RB_LAST(&buf->keys, struct keybuf_key, node); 2362 buf->end = w->key; 2363 } else { 2364 buf->start = MAX_KEY; 2365 buf->end = MAX_KEY; 2366 } 2367 2368 spin_unlock(&buf->lock); 2369 } 2370 2371 static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) 2372 { 2373 rb_erase(&w->node, &buf->keys); 2374 array_free(&buf->freelist, w); 2375 } 2376 2377 void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) 2378 { 2379 spin_lock(&buf->lock); 2380 __bch_keybuf_del(buf, w); 2381 spin_unlock(&buf->lock); 2382 } 2383 2384 bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, 2385 struct bkey *end) 2386 { 2387 bool ret = false; 2388 struct keybuf_key *p, *w, s; 2389 s.key = *start; 2390 2391 if (bkey_cmp(end, &buf->start) <= 0 || 2392 bkey_cmp(start, &buf->end) >= 0) 2393 return false; 2394 2395 spin_lock(&buf->lock); 2396 w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); 2397 2398 while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { 2399 p = w; 2400 w = RB_NEXT(w, node); 2401 2402 if (p->private) 2403 ret = true; 2404 else 2405 __bch_keybuf_del(buf, p); 2406 } 2407 2408 spin_unlock(&buf->lock); 2409 return ret; 2410 } 2411 2412 struct keybuf_key *bch_keybuf_next(struct keybuf *buf) 2413 { 2414 struct keybuf_key *w; 2415 spin_lock(&buf->lock); 2416 2417 w = RB_FIRST(&buf->keys, struct keybuf_key, node); 2418 2419 while (w && w->private) 2420 w = RB_NEXT(w, node); 2421 2422 if (w) 2423 w->private = ERR_PTR(-EINTR); 2424 2425 spin_unlock(&buf->lock); 2426 return w; 2427 } 2428 2429 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, 2430 struct keybuf *buf, 2431 struct bkey *end, 2432 keybuf_pred_fn *pred) 2433 { 2434 struct keybuf_key *ret; 2435 2436 while (1) { 2437 ret = bch_keybuf_next(buf); 2438 if (ret) 2439 break; 2440 2441 if (bkey_cmp(&buf->last_scanned, end) >= 0) { 2442 pr_debug("scan finished"); 2443 break; 2444 } 2445 2446 bch_refill_keybuf(c, buf, end, pred); 2447 } 2448 2449 return ret; 2450 } 2451 2452 void bch_keybuf_init(struct keybuf *buf) 2453 { 2454 buf->last_scanned = MAX_KEY; 2455 buf->keys = RB_ROOT; 2456 2457 spin_lock_init(&buf->lock); 2458 array_allocator_init(&buf->freelist); 2459 } 2460 2461 void bch_btree_exit(void) 2462 { 2463 if (btree_io_wq) 2464 destroy_workqueue(btree_io_wq); 2465 if (bch_gc_wq) 2466 destroy_workqueue(bch_gc_wq); 2467 } 2468 2469 int __init bch_btree_init(void) 2470 { 2471 if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || 2472 !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) 2473 return -ENOMEM; 2474 2475 return 0; 2476 } 2477