/*
 * Main bcache entry point - handle a read or a write request and decide what to
 * do with it; the make_request functions are called by the block layer.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "request.h"
#include "writeback.h"

#include <linux/module.h>
#include <linux/hash.h>
#include <linux/random.h>
#include <linux/backing-dev.h>

#include <trace/events/bcache.h>

#define CUTOFF_CACHE_ADD	95
#define CUTOFF_CACHE_READA	90

struct kmem_cache *bch_search_cache;

static void bch_data_insert_start(struct closure *);

static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
{
	return BDEV_CACHE_MODE(&dc->sb);
}

static bool verify(struct cached_dev *dc, struct bio *bio)
{
	return dc->verify;
}

static void bio_csum(struct bio *bio, struct bkey *k)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	uint64_t csum = 0;

	bio_for_each_segment(bv, bio, iter) {
		void *d = kmap(bv.bv_page) + bv.bv_offset;
		csum = bch_crc64_update(csum, d, bv.bv_len);
		kunmap(bv.bv_page);
	}

	k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
}

/* Insert data into cache */

static void bch_data_insert_keys(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
	atomic_t *journal_ref = NULL;
	struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
	int ret;

	/*
	 * If we're looping, might already be waiting on
	 * another journal write - can't wait on more than one journal write at
	 * a time
	 *
	 * XXX: this looks wrong
	 */
#if 0
	while (atomic_read(&s->cl.remaining) & CLOSURE_WAITING)
		closure_sync(&s->cl);
#endif

	if (!op->replace)
		journal_ref = bch_journal(op->c, &op->insert_keys,
					  op->flush_journal ? cl : NULL);

	ret = bch_btree_insert(op->c, &op->insert_keys,
			       journal_ref, replace_key);
	if (ret == -ESRCH) {
		op->replace_collision = true;
	} else if (ret) {
		op->error = -ENOMEM;
		op->insert_data_done = true;
	}

	if (journal_ref)
		atomic_dec_bug(journal_ref);

	if (!op->insert_data_done)
		continue_at(cl, bch_data_insert_start, op->wq);

	bch_keylist_free(&op->insert_keys);
	closure_return(cl);
}

static int bch_keylist_realloc(struct keylist *l, unsigned u64s,
			       struct cache_set *c)
{
	size_t oldsize = bch_keylist_nkeys(l);
	size_t newsize = oldsize + u64s;

	/*
	 * The journalling code doesn't handle the case where the keys to insert
	 * are bigger than an empty write: if we just return -ENOMEM here,
	 * bch_data_insert_start() and bch_data_invalidate() will insert the
	 * keys created so far and finish the rest when the keylist is empty.
	 */
	if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
		return -ENOMEM;

	return __bch_keylist_realloc(l, u64s);
}

static void bch_data_invalidate(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
	struct bio *bio = op->bio;

	pr_debug("invalidating %i sectors from %llu",
		 bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);

	while (bio_sectors(bio)) {
		unsigned sectors = min(bio_sectors(bio),
				       1U << (KEY_SIZE_BITS - 1));

		if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
			goto out;

		bio->bi_iter.bi_sector += sectors;
		bio->bi_iter.bi_size -= sectors << 9;

		bch_keylist_add(&op->insert_keys,
				&KEY(op->inode, bio->bi_iter.bi_sector, sectors));
	}

	op->insert_data_done = true;
	bio_put(bio);
out:
	continue_at(cl, bch_data_insert_keys, op->wq);
}

static void bch_data_insert_error(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);

	/*
	 * Our data write just errored, which means we've got a bunch of keys to
	 * insert that point to data that wasn't successfully written.
	 *
	 * We don't have to insert those keys but we still have to invalidate
	 * that region of the cache - so, if we just strip off all the pointers
	 * from the keys we'll accomplish just that.
	 */

	struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;

	while (src != op->insert_keys.top) {
		struct bkey *n = bkey_next(src);

		SET_KEY_PTRS(src, 0);
		memmove(dst, src, bkey_bytes(src));

		dst = bkey_next(dst);
		src = n;
	}

	op->insert_keys.top = dst;

	bch_data_insert_keys(cl);
}

static void bch_data_insert_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);

	if (error) {
		/* TODO: We could try to recover from this. */
		if (op->writeback)
			op->error = error;
		else if (!op->replace)
			set_closure_fn(cl, bch_data_insert_error, op->wq);
		else
			set_closure_fn(cl, NULL, NULL);
	}

	bch_bbio_endio(op->c, bio, error, "writing data to cache");
}

static void bch_data_insert_start(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
	struct bio *bio = op->bio, *n;

	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) {
		set_gc_sectors(op->c);
		wake_up_gc(op->c);
	}

	if (op->bypass)
		return bch_data_invalidate(cl);

	/*
	 * Journal writes are marked REQ_FLUSH; if the original write was a
	 * flush, it'll wait on the journal write.
	 */
	bio->bi_rw &= ~(REQ_FLUSH|REQ_FUA);
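
	/*
	 * Split the write up as needed: each pass through the loop below asks
	 * bch_alloc_sectors() for space (which fills in the key's pointers and
	 * size), carves the matching fragment off the front of the bio with
	 * bio_next_split(), submits it as a write to the cache device and
	 * pushes the key onto op->insert_keys for btree insertion. If the
	 * keylist grows to a full journal write's worth of keys,
	 * bch_keylist_realloc() fails and we insert what we have so far before
	 * carrying on.
	 */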
	do {
		unsigned i;
		struct bkey *k;
		struct bio_set *split = op->c->bio_split;

		/* 1 for the device pointer and 1 for the checksum */
		if (bch_keylist_realloc(&op->insert_keys,
					3 + (op->csum ? 1 : 0),
					op->c))
			continue_at(cl, bch_data_insert_keys, op->wq);

		k = op->insert_keys.top;
		bkey_init(k);
		SET_KEY_INODE(k, op->inode);
		SET_KEY_OFFSET(k, bio->bi_iter.bi_sector);

		if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
				       op->write_point, op->write_prio,
				       op->writeback))
			goto err;

		n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);

		n->bi_end_io = bch_data_insert_endio;
		n->bi_private = cl;

		if (op->writeback) {
			SET_KEY_DIRTY(k, true);

			for (i = 0; i < KEY_PTRS(k); i++)
				SET_GC_MARK(PTR_BUCKET(op->c, k, i),
					    GC_MARK_DIRTY);
		}

		SET_KEY_CSUM(k, op->csum);
		if (KEY_CSUM(k))
			bio_csum(n, k);

		trace_bcache_cache_insert(k);
		bch_keylist_push(&op->insert_keys);

		n->bi_rw |= REQ_WRITE;
		bch_submit_bbio(n, op->c, k, 0);
	} while (n != bio);

	op->insert_data_done = true;
	continue_at(cl, bch_data_insert_keys, op->wq);
err:
	/* bch_alloc_sectors() blocks if s->writeback = true */
	BUG_ON(op->writeback);

	/*
	 * But if it's not a writeback write we'd rather just bail out if
	 * there aren't any buckets ready to write to - it might take a while
	 * and we might be starving btree writes for gc or something.
	 */

	if (!op->replace) {
		/*
		 * Writethrough write: We can't complete the write until we've
		 * updated the index. But we don't want to delay the write while
		 * we wait for buckets to be freed up, so just invalidate the
		 * rest of the write.
		 */
		op->bypass = true;
		return bch_data_invalidate(cl);
	} else {
		/*
		 * From a cache miss, we can just insert the keys for the data
		 * we have written or bail out if we didn't do anything.
		 */
		op->insert_data_done = true;
		bio_put(bio);

		if (!bch_keylist_empty(&op->insert_keys))
			continue_at(cl, bch_data_insert_keys, op->wq);
		else
			closure_return(cl);
	}
}

/**
 * bch_data_insert - stick some data in the cache
 *
 * This is the starting point for any data to end up in a cache device; it could
 * be from a normal write, or a writeback write, or a write to a flash only
 * volume - it's also used by the moving garbage collector to compact data in
 * mostly empty buckets.
 *
 * It first writes the data to the cache, creating a list of keys to be inserted
 * (if the data had to be fragmented there will be multiple keys); after the
 * data is written it calls bch_journal, and after the keys have been added to
 * the next journal write they're inserted into the btree.
 *
 * It inserts the data in op->bio; bi_sector is used for the key offset,
 * and op->inode is used for the key inode.
 *
 * If op->bypass is true, instead of inserting the data it invalidates the
 * region of the cache represented by op->bio and op->inode.
 */
void bch_data_insert(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);

	trace_bcache_write(op->c, op->inode, op->bio,
			   op->writeback, op->bypass);

	bch_keylist_init(&op->insert_keys);
	bio_get(op->bio);
	bch_data_insert_start(cl);
}

/* Congested? */
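
/*
 * bch_get_congested() turns the cache set's recent congestion history into a
 * sector threshold. It returns 0 when congestion tracking is disabled or the
 * cache doesn't currently look congested; otherwise it returns a small,
 * randomly jittered value that check_should_bypass() compares against the
 * task's recent sequential I/O. c->congested is driven negative by the I/O
 * completion path whenever cache I/O exceeds the configured
 * congested_read_threshold_us / congested_write_threshold_us, so the heavier
 * the recent congestion, the smaller the returned threshold and the more I/O
 * ends up bypassing the cache.
 */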

unsigned bch_get_congested(struct cache_set *c)
{
	int i;
	long rand;

	if (!c->congested_read_threshold_us &&
	    !c->congested_write_threshold_us)
		return 0;

	i = (local_clock_us() - c->congested_last_us) / 1024;
	if (i < 0)
		return 0;

	i += atomic_read(&c->congested);
	if (i >= 0)
		return 0;

	i += CONGESTED_MAX;

	if (i > 0)
		i = fract_exp_two(i, 6);

	rand = get_random_int();
	i -= bitmap_weight(&rand, BITS_PER_LONG);

	return i > 0 ? i : 1;
}

static void add_sequential(struct task_struct *t)
{
	ewma_add(t->sequential_io_avg,
		 t->sequential_io, 8, 0);

	t->sequential_io = 0;
}

static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
{
	return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
}

static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
{
	struct cache_set *c = dc->disk.c;
	unsigned mode = cache_mode(dc, bio);
	unsigned sectors, congested = bch_get_congested(c);
	struct task_struct *task = current;
	struct io *i;

	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
	    (bio->bi_rw & REQ_DISCARD))
		goto skip;

	if (mode == CACHE_MODE_NONE ||
	    (mode == CACHE_MODE_WRITEAROUND &&
	     (bio->bi_rw & REQ_WRITE)))
		goto skip;

	if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
	    bio_sectors(bio) & (c->sb.block_size - 1)) {
		pr_debug("skipping unaligned io");
		goto skip;
	}

	if (bypass_torture_test(dc)) {
		if ((get_random_int() & 3) == 3)
			goto skip;
		else
			goto rescale;
	}

	if (!congested && !dc->sequential_cutoff)
		goto rescale;

	if (!congested &&
	    mode == CACHE_MODE_WRITEBACK &&
	    (bio->bi_rw & REQ_WRITE) &&
	    (bio->bi_rw & REQ_SYNC))
		goto rescale;

	spin_lock(&dc->io_lock);

	hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
		if (i->last == bio->bi_iter.bi_sector &&
		    time_before(jiffies, i->jiffies))
			goto found;

	i = list_first_entry(&dc->io_lru, struct io, lru);

	add_sequential(task);
	i->sequential = 0;
found:
	if (i->sequential + bio->bi_iter.bi_size > i->sequential)
		i->sequential += bio->bi_iter.bi_size;

	i->last = bio_end_sector(bio);
	i->jiffies = jiffies + msecs_to_jiffies(5000);
	task->sequential_io = i->sequential;

	hlist_del(&i->hash);
	hlist_add_head(&i->hash, iohash(dc, i->last));
	list_move_tail(&i->lru, &dc->io_lru);

	spin_unlock(&dc->io_lock);

	sectors = max(task->sequential_io,
		      task->sequential_io_avg) >> 9;

	if (dc->sequential_cutoff &&
	    sectors >= dc->sequential_cutoff >> 9) {
		trace_bcache_bypass_sequential(bio);
		goto skip;
	}

	if (congested && sectors >= congested) {
		trace_bcache_bypass_congested(bio);
		goto skip;
	}

rescale:
	bch_rescale_priorities(c, bio_sectors(bio));
	return false;
skip:
	bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
	return true;
}

/* Cache lookup */

struct search {
	/* Stack frame for bio_complete */
	struct closure		cl;

	struct bbio		bio;
	struct bio		*orig_bio;
	struct bio		*cache_miss;
	struct bcache_device	*d;

	unsigned		insert_bio_sectors;
	unsigned		recoverable:1;
	unsigned		write:1;
	unsigned		read_dirty_data:1;

	unsigned long		start_time;

	struct btree_op		op;
	struct data_insert_op	iop;
};
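
/*
 * One struct search is allocated from the cache set's mempool in
 * search_alloc() for every request and freed in search_free() once everything
 * hanging off the closure has completed. s->bio.bio is a clone of orig_bio
 * (set up by do_bio_hook()) that the lookup and insert code is free to split
 * and advance; the original bio is only completed in bio_complete().
 */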

static void bch_cache_read_endio(struct bio *bio, int error)
{
	struct bbio *b = container_of(bio, struct bbio, bio);
	struct closure *cl = bio->bi_private;
	struct search *s = container_of(cl, struct search, cl);

	/*
	 * If the bucket was reused while our bio was in flight, we might have
	 * read the wrong data. Set s->iop.error but not error so it doesn't get
	 * counted against the cache device, but we'll still reread the data
	 * from the backing device.
	 */

	if (error)
		s->iop.error = error;
	else if (!KEY_DIRTY(&b->key) &&
		 ptr_stale(s->iop.c, &b->key, 0)) {
		atomic_long_inc(&s->iop.c->cache_read_races);
		s->iop.error = -EINTR;
	}

	bch_bbio_endio(s->iop.c, bio, error, "reading from cache");
}

/*
 * Read from a single key, handling the initial cache miss if the key starts in
 * the middle of the bio
 */
static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
{
	struct search *s = container_of(op, struct search, op);
	struct bio *n, *bio = &s->bio.bio;
	struct bkey *bio_key;
	unsigned ptr;

	if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
		return MAP_CONTINUE;

	if (KEY_INODE(k) != s->iop.inode ||
	    KEY_START(k) > bio->bi_iter.bi_sector) {
		unsigned bio_sectors = bio_sectors(bio);
		unsigned sectors = KEY_INODE(k) == s->iop.inode
			? min_t(uint64_t, INT_MAX,
				KEY_START(k) - bio->bi_iter.bi_sector)
			: INT_MAX;

		int ret = s->d->cache_miss(b, s, bio, sectors);
		if (ret != MAP_CONTINUE)
			return ret;

		/* if this was a complete miss we shouldn't get here */
		BUG_ON(bio_sectors <= sectors);
	}

	if (!KEY_SIZE(k))
		return MAP_CONTINUE;

	/* XXX: figure out best pointer - for multiple cache devices */
	ptr = 0;

	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;

	if (KEY_DIRTY(k))
		s->read_dirty_data = true;

	n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
				      KEY_OFFSET(k) - bio->bi_iter.bi_sector),
			   GFP_NOIO, s->d->bio_split);

	bio_key = &container_of(n, struct bbio, bio)->key;
	bch_bkey_copy_single_ptr(bio_key, k, ptr);

	bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
	bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);

	n->bi_end_io = bch_cache_read_endio;
	n->bi_private = &s->cl;

	/*
	 * The bucket we're reading from might be reused while our bio
	 * is in flight, and we could then end up reading the wrong
	 * data.
	 *
	 * We guard against this by checking (in bch_cache_read_endio()) if
	 * the pointer is stale again; if so, we treat it as an error
	 * and reread from the backing device (but we don't pass that
	 * error up anywhere).
	 */

	__bch_submit_bbio(n, b->c);
	return n == bio ? MAP_DONE : MAP_CONTINUE;
}
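
/*
 * Walk the extents in the btree that cover the bio, calling cache_lookup_fn()
 * for each one. If bch_btree_map_keys() bails out with -EAGAIN the lookup
 * simply reschedules itself on bcache_wq; the cloned bio's iterator has
 * already been advanced past everything that was submitted, so the retry
 * resumes from the current sector.
 */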

static void cache_lookup(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, iop.cl);
	struct bio *bio = &s->bio.bio;
	int ret;

	bch_btree_op_init(&s->op, -1);

	ret = bch_btree_map_keys(&s->op, s->iop.c,
				 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
				 cache_lookup_fn, MAP_END_KEY);
	if (ret == -EAGAIN)
		continue_at(cl, cache_lookup, bcache_wq);

	closure_return(cl);
}

/* Common code for the make_request functions */

static void request_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;

	if (error) {
		struct search *s = container_of(cl, struct search, cl);
		s->iop.error = error;
		/* Only cache read errors are recoverable */
		s->recoverable = false;
	}

	bio_put(bio);
	closure_put(cl);
}

static void bio_complete(struct search *s)
{
	if (s->orig_bio) {
		generic_end_io_acct(bio_data_dir(s->orig_bio),
				    &s->d->disk->part0, s->start_time);

		trace_bcache_request_end(s->d, s->orig_bio);
		bio_endio(s->orig_bio, s->iop.error);
		s->orig_bio = NULL;
	}
}

static void do_bio_hook(struct search *s, struct bio *orig_bio)
{
	struct bio *bio = &s->bio.bio;

	bio_init(bio);
	__bio_clone_fast(bio, orig_bio);
	bio->bi_end_io = request_endio;
	bio->bi_private = &s->cl;

	bio_cnt_set(bio, 3);
}

static void search_free(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	bio_complete(s);

	if (s->iop.bio)
		bio_put(s->iop.bio);

	closure_debug_destroy(cl);
	mempool_free(s, s->d->c->search);
}

static inline struct search *search_alloc(struct bio *bio,
					  struct bcache_device *d)
{
	struct search *s;

	s = mempool_alloc(d->c->search, GFP_NOIO);

	closure_init(&s->cl, NULL);
	do_bio_hook(s, bio);

	s->orig_bio = bio;
	s->cache_miss = NULL;
	s->d = d;
	s->recoverable = 1;
	s->write = (bio->bi_rw & REQ_WRITE) != 0;
	s->read_dirty_data = 0;
	s->start_time = jiffies;

	s->iop.c = d->c;
	s->iop.bio = NULL;
	s->iop.inode = d->id;
	s->iop.write_point = hash_long((unsigned long) current, 16);
	s->iop.write_prio = 0;
	s->iop.error = 0;
	s->iop.flags = 0;
	s->iop.flush_journal = (bio->bi_rw & (REQ_FLUSH|REQ_FUA)) != 0;
	s->iop.wq = bcache_wq;

	return s;
}

/* Cached devices */

static void cached_dev_bio_complete(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	search_free(cl);
	cached_dev_put(dc);
}

/* Process reads */

static void cached_dev_cache_miss_done(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);

	if (s->iop.replace_collision)
		bch_mark_cache_miss_collision(s->iop.c, s->d);

	if (s->iop.bio) {
		int i;
		struct bio_vec *bv;

		bio_for_each_segment_all(bv, s->iop.bio, i)
			__free_page(bv->bv_page);
	}

	cached_dev_bio_complete(cl);
}
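
/*
 * Errors on reads from the cache (including the -EINTR generated when
 * bch_cache_read_endio() notices a stale pointer) leave s->recoverable set,
 * so the whole request can be retried: clear the error, re-clone the
 * original bio and resubmit it to the backing device. Errors from the
 * backing device itself clear s->recoverable in request_endio() and are
 * passed up to the caller.
 */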

static void cached_dev_read_error(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct bio *bio = &s->bio.bio;

	if (s->recoverable) {
		/* Retry from the backing device: */
		trace_bcache_read_retry(s->orig_bio);

		s->iop.error = 0;
		do_bio_hook(s, s->orig_bio);

		/* XXX: invalidate cache */

		closure_bio_submit(bio, cl, s->d);
	}

	continue_at(cl, cached_dev_cache_miss_done, NULL);
}

static void cached_dev_read_done(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	/*
	 * We had a cache miss; cache_bio now contains data ready to be inserted
	 * into the cache.
	 *
	 * First, we copy the data we just read from cache_bio's bounce buffers
	 * to the buffers the original bio pointed to:
	 */

	if (s->iop.bio) {
		bio_reset(s->iop.bio);
		s->iop.bio->bi_iter.bi_sector = s->cache_miss->bi_iter.bi_sector;
		s->iop.bio->bi_bdev = s->cache_miss->bi_bdev;
		s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
		bch_bio_map(s->iop.bio, NULL);

		bio_copy_data(s->cache_miss, s->iop.bio);

		bio_put(s->cache_miss);
		s->cache_miss = NULL;
	}

	if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data)
		bch_data_verify(dc, s->orig_bio);

	bio_complete(s);

	if (s->iop.bio &&
	    !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
		BUG_ON(!s->iop.replace);
		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
	}

	continue_at(cl, cached_dev_cache_miss_done, NULL);
}

static void cached_dev_read_done_bh(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	bch_mark_cache_accounting(s->iop.c, s->d,
				  !s->cache_miss, s->iop.bypass);
	trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);

	if (s->iop.error)
		continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
	else if (s->iop.bio || verify(dc, &s->bio.bio))
		continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
	else
		continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
}

static int cached_dev_cache_miss(struct btree *b, struct search *s,
				 struct bio *bio, unsigned sectors)
{
	int ret = MAP_CONTINUE;
	unsigned reada = 0;
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
	struct bio *miss, *cache_bio;

	if (s->cache_miss || s->iop.bypass) {
		miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
		ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
		goto out_submit;
	}

	if (!(bio->bi_rw & REQ_RAHEAD) &&
	    !(bio->bi_rw & REQ_META) &&
	    s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
		reada = min_t(sector_t, dc->readahead >> 9,
			      bdev_sectors(bio->bi_bdev) - bio_end_sector(bio));

	s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);

	s->iop.replace_key = KEY(s->iop.inode,
				 bio->bi_iter.bi_sector + s->insert_bio_sectors,
				 s->insert_bio_sectors);

	ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
	if (ret)
		return ret;

	s->iop.replace = true;

	miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);

	/* btree_search_recurse()'s btree iterator is no good anymore */
	ret = miss == bio ? MAP_DONE : -EINTR;
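
	/*
	 * Try to allocate a bounce bio to read the missed range (plus any
	 * readahead) into the cache. This is strictly best effort: if the
	 * bio or its pages can't be allocated, we simply read from the
	 * backing device without populating the cache.
	 */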
	cache_bio = bio_alloc_bioset(GFP_NOWAIT,
			DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
			dc->disk.bio_split);
	if (!cache_bio)
		goto out_submit;

	cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector;
	cache_bio->bi_bdev = miss->bi_bdev;
	cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9;

	cache_bio->bi_end_io = request_endio;
	cache_bio->bi_private = &s->cl;

	bch_bio_map(cache_bio, NULL);
	if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
		goto out_put;

	if (reada)
		bch_mark_cache_readahead(s->iop.c, s->d);

	s->cache_miss = miss;
	s->iop.bio = cache_bio;
	bio_get(cache_bio);
	closure_bio_submit(cache_bio, &s->cl, s->d);

	return ret;
out_put:
	bio_put(cache_bio);
out_submit:
	miss->bi_end_io = request_endio;
	miss->bi_private = &s->cl;
	closure_bio_submit(miss, &s->cl, s->d);
	return ret;
}

static void cached_dev_read(struct cached_dev *dc, struct search *s)
{
	struct closure *cl = &s->cl;

	closure_call(&s->iop.cl, cache_lookup, NULL, cl);
	continue_at(cl, cached_dev_read_done_bh, NULL);
}

/* Process writes */

static void cached_dev_write_complete(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	up_read_non_owner(&dc->writeback_lock);
	cached_dev_bio_complete(cl);
}

static void cached_dev_write(struct cached_dev *dc, struct search *s)
{
	struct closure *cl = &s->cl;
	struct bio *bio = &s->bio.bio;
	struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0);
	struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);

	bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);

	down_read_non_owner(&dc->writeback_lock);
	if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
		/*
		 * We overlap with some dirty data undergoing background
		 * writeback, force this write to writeback
		 */
		s->iop.bypass = false;
		s->iop.writeback = true;
	}

	/*
	 * Discards aren't _required_ to do anything, so skipping if
	 * check_overlapping returned true is ok
	 *
	 * But check_overlapping drops dirty keys for which io hasn't started,
	 * so we still want to call it.
893 */ 894 if (bio->bi_rw & REQ_DISCARD) 895 s->iop.bypass = true; 896 897 if (should_writeback(dc, s->orig_bio, 898 cache_mode(dc, bio), 899 s->iop.bypass)) { 900 s->iop.bypass = false; 901 s->iop.writeback = true; 902 } 903 904 if (s->iop.bypass) { 905 s->iop.bio = s->orig_bio; 906 bio_get(s->iop.bio); 907 908 if (!(bio->bi_rw & REQ_DISCARD) || 909 blk_queue_discard(bdev_get_queue(dc->bdev))) 910 closure_bio_submit(bio, cl, s->d); 911 } else if (s->iop.writeback) { 912 bch_writeback_add(dc); 913 s->iop.bio = bio; 914 915 if (bio->bi_rw & REQ_FLUSH) { 916 /* Also need to send a flush to the backing device */ 917 struct bio *flush = bio_alloc_bioset(GFP_NOIO, 0, 918 dc->disk.bio_split); 919 920 flush->bi_rw = WRITE_FLUSH; 921 flush->bi_bdev = bio->bi_bdev; 922 flush->bi_end_io = request_endio; 923 flush->bi_private = cl; 924 925 closure_bio_submit(flush, cl, s->d); 926 } 927 } else { 928 s->iop.bio = bio_clone_fast(bio, GFP_NOIO, dc->disk.bio_split); 929 930 closure_bio_submit(bio, cl, s->d); 931 } 932 933 closure_call(&s->iop.cl, bch_data_insert, NULL, cl); 934 continue_at(cl, cached_dev_write_complete, NULL); 935 } 936 937 static void cached_dev_nodata(struct closure *cl) 938 { 939 struct search *s = container_of(cl, struct search, cl); 940 struct bio *bio = &s->bio.bio; 941 942 if (s->iop.flush_journal) 943 bch_journal_meta(s->iop.c, cl); 944 945 /* If it's a flush, we send the flush to the backing device too */ 946 closure_bio_submit(bio, cl, s->d); 947 948 continue_at(cl, cached_dev_bio_complete, NULL); 949 } 950 951 /* Cached devices - read & write stuff */ 952 953 static void cached_dev_make_request(struct request_queue *q, struct bio *bio) 954 { 955 struct search *s; 956 struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; 957 struct cached_dev *dc = container_of(d, struct cached_dev, disk); 958 int rw = bio_data_dir(bio); 959 960 generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0); 961 962 bio->bi_bdev = dc->bdev; 963 bio->bi_iter.bi_sector += dc->sb.data_offset; 964 965 if (cached_dev_get(dc)) { 966 s = search_alloc(bio, d); 967 trace_bcache_request_start(s->d, bio); 968 969 if (!bio->bi_iter.bi_size) { 970 /* 971 * can't call bch_journal_meta from under 972 * generic_make_request 973 */ 974 continue_at_nobarrier(&s->cl, 975 cached_dev_nodata, 976 bcache_wq); 977 } else { 978 s->iop.bypass = check_should_bypass(dc, bio); 979 980 if (rw) 981 cached_dev_write(dc, s); 982 else 983 cached_dev_read(dc, s); 984 } 985 } else { 986 if ((bio->bi_rw & REQ_DISCARD) && 987 !blk_queue_discard(bdev_get_queue(dc->bdev))) 988 bio_endio(bio, 0); 989 else 990 bch_generic_make_request(bio, &d->bio_split_hook); 991 } 992 } 993 994 static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, 995 unsigned int cmd, unsigned long arg) 996 { 997 struct cached_dev *dc = container_of(d, struct cached_dev, disk); 998 return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); 999 } 1000 1001 static int cached_dev_congested(void *data, int bits) 1002 { 1003 struct bcache_device *d = data; 1004 struct cached_dev *dc = container_of(d, struct cached_dev, disk); 1005 struct request_queue *q = bdev_get_queue(dc->bdev); 1006 int ret = 0; 1007 1008 if (bdi_congested(&q->backing_dev_info, bits)) 1009 return 1; 1010 1011 if (cached_dev_get(dc)) { 1012 unsigned i; 1013 struct cache *ca; 1014 1015 for_each_cache(ca, d->c, i) { 1016 q = bdev_get_queue(ca->bdev); 1017 ret |= bdi_congested(&q->backing_dev_info, bits); 1018 } 1019 1020 cached_dev_put(dc); 1021 } 1022 1023 return ret; 1024 } 1025 1026 
void bch_cached_dev_request_init(struct cached_dev *dc)
{
	struct gendisk *g = dc->disk.disk;

	g->queue->make_request_fn = cached_dev_make_request;
	g->queue->backing_dev_info.congested_fn = cached_dev_congested;
	dc->disk.cache_miss = cached_dev_cache_miss;
	dc->disk.ioctl = cached_dev_ioctl;
}

/* Flash backed devices */

static int flash_dev_cache_miss(struct btree *b, struct search *s,
				struct bio *bio, unsigned sectors)
{
	unsigned bytes = min(sectors, bio_sectors(bio)) << 9;

	swap(bio->bi_iter.bi_size, bytes);
	zero_fill_bio(bio);
	swap(bio->bi_iter.bi_size, bytes);

	bio_advance(bio, bytes);

	if (!bio->bi_iter.bi_size)
		return MAP_DONE;

	return MAP_CONTINUE;
}

static void flash_dev_nodata(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);

	if (s->iop.flush_journal)
		bch_journal_meta(s->iop.c, cl);

	continue_at(cl, search_free, NULL);
}

static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
{
	struct search *s;
	struct closure *cl;
	struct bcache_device *d = bio->bi_bdev->bd_disk->private_data;
	int rw = bio_data_dir(bio);

	generic_start_io_acct(rw, bio_sectors(bio), &d->disk->part0);

	s = search_alloc(bio, d);
	cl = &s->cl;
	bio = &s->bio.bio;

	trace_bcache_request_start(s->d, bio);

	if (!bio->bi_iter.bi_size) {
		/*
		 * can't call bch_journal_meta from under
		 * generic_make_request
		 */
		continue_at_nobarrier(&s->cl,
				      flash_dev_nodata,
				      bcache_wq);
	} else if (rw) {
		bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
					&KEY(d->id, bio->bi_iter.bi_sector, 0),
					&KEY(d->id, bio_end_sector(bio), 0));

		s->iop.bypass = (bio->bi_rw & REQ_DISCARD) != 0;
		s->iop.writeback = true;
		s->iop.bio = bio;

		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
	} else {
		closure_call(&s->iop.cl, cache_lookup, NULL, cl);
	}

	continue_at(cl, search_free, NULL);
}

static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	return -ENOTTY;
}

static int flash_dev_congested(void *data, int bits)
{
	struct bcache_device *d = data;
	struct request_queue *q;
	struct cache *ca;
	unsigned i;
	int ret = 0;

	for_each_cache(ca, d->c, i) {
		q = bdev_get_queue(ca->bdev);
		ret |= bdi_congested(&q->backing_dev_info, bits);
	}

	return ret;
}

void bch_flash_dev_request_init(struct bcache_device *d)
{
	struct gendisk *g = d->disk;

	g->queue->make_request_fn = flash_dev_make_request;
	g->queue->backing_dev_info.congested_fn = flash_dev_congested;
	d->cache_miss = flash_dev_cache_miss;
	d->ioctl = flash_dev_ioctl;
}

void bch_request_exit(void)
{
	if (bch_search_cache)
		kmem_cache_destroy(bch_search_cache);
}

int __init bch_request_init(void)
{
	bch_search_cache = KMEM_CACHE(search, 0);
	if (!bch_search_cache)
		return -ENOMEM;

	return 0;
}