/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include "dm-bufio.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
#include <linux/rbtree.h>

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 *	dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_PERCENT	75

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	30

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	300

/*
 * The number of bytes of cached data to keep around.
 */
#define DM_BUFIO_DEFAULT_RETAIN_BYTES	(256 * 1024)

/*
 * The number of bvec entries that are embedded directly in the buffer.
 * If the chunk size is larger, dm-io is used to do the io.
 */
#define DM_BUFIO_INLINE_VECS		16

/*
 * Don't try to use kmem_cache_alloc for blocks larger than this.
 * For explanation, see alloc_buffer_data below.
 */
#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT	(PAGE_SIZE >> 1)
#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT	(PAGE_SIZE << (MAX_ORDER - 1))

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*
 * Linking of buffers:
 *	All buffers are indexed in the buffer_tree rb-tree with their
 *	node field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field.  When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	dirty_lru too.  They are later added to lru in the process
 *	context.
 */
struct dm_bufio_client {
	struct mutex lock;

	struct list_head lru[LIST_SIZE];
	unsigned long n_buffers[LIST_SIZE];

	struct block_device *bdev;
	unsigned block_size;
	unsigned char sectors_per_block_bits;
	unsigned char pages_per_block_bits;
	unsigned char blocks_per_page_bits;
	unsigned aux_size;
	void (*alloc_callback)(struct dm_buffer *);
	void (*write_callback)(struct dm_buffer *);

	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned need_reserved_buffers;

	unsigned minimum_buffers;

	struct rb_root buffer_tree;
	wait_queue_head_t free_buffer_wait;

	int async_write_error;

	struct list_head client_list;
	struct shrinker shrinker;
};

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	struct rb_node node;
	struct list_head lru_list;
	sector_t block;
	void *data;
	enum data_mode data_mode;
	unsigned char list_mode;		/* LIST_* */
	unsigned hold_count;
	int read_error;
	int write_error;
	unsigned long state;
	unsigned long last_accessed;
	struct dm_bufio_client *c;
	struct list_head write_list;
	struct bio bio;
	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
};

/*----------------------------------------------------------------*/

static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];

static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
{
	unsigned ret = c->blocks_per_page_bits - 1;

	BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));

	return ret;
}

#define DM_BUFIO_CACHE(c)	(dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c)	(dm_bufio_cache_names[dm_bufio_cache_index(c)])

#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static int dm_bufio_trylock(struct dm_bufio_client *c)
{
	return mutex_trylock(&c->lock);
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	mutex_unlock(&c->lock);
}

/*
 * FIXME Move to sched.h?
 */
#ifdef CONFIG_PREEMPT_VOLUNTARY
# define dm_bufio_cond_resched()		\
do {						\
	if (unlikely(need_resched()))		\
		_cond_resched();		\
} while (0)
#else
# define dm_bufio_cond_resched()		do { } while (0)
#endif

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time.  If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(param_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 */
static unsigned long dm_bufio_cache_size_per_client;

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch,
 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

/*----------------------------------------------------------------
 * A red/black tree acts as an index for all the buffers.
 *--------------------------------------------------------------*/
static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
{
	struct rb_node *n = c->buffer_tree.rb_node;
	struct dm_buffer *b;

	while (n) {
		b = container_of(n, struct dm_buffer, node);

		if (b->block == block)
			return b;

		n = (b->block < block) ? n->rb_left : n->rb_right;
	}

	return NULL;
}

static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
{
	struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
	struct dm_buffer *found;

	while (*new) {
		found = container_of(*new, struct dm_buffer, node);

		if (found->block == b->block) {
			BUG_ON(found != b);
			return;
		}

		parent = *new;
		new = (found->block < b->block) ?
			&((*new)->rb_left) : &((*new)->rb_right);
	}

	rb_link_node(&b->node, parent, new);
	rb_insert_color(&b->node, &c->buffer_tree);
}

static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
{
	rb_erase(&b->node, &c->buffer_tree);
}

/*----------------------------------------------------------------*/

static void adjust_total_allocated(enum data_mode data_mode, long diff)
{
	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	spin_lock(&param_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	spin_unlock(&param_spinlock);
}

/*
 * Change the number of clients and recalculate per-client limit.
 */
static void __cache_size_refresh(void)
{
	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
	BUG_ON(dm_bufio_client_count < 0);

	dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}

	dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
					 (dm_bufio_client_count ? : 1);
}
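
/*
 * Illustration (hypothetical numbers, not taken from the code above):
 * __cache_size_refresh() splits the total cache evenly between clients.
 * With a dm_bufio_cache_size of 100 MB and four registered clients,
 * dm_bufio_cache_size_per_client becomes 25 MB; with no clients the
 * "?:" fallback divides by 1 to avoid a division by zero.
 */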

/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages.  Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed.  Don't use __get_free_pages if it
 * always fails (i.e. order >= MAX_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc.  This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       enum data_mode *data_mode)
{
	unsigned noio_flag;
	void *ptr;

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
	}

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
						c->pages_per_block_bits);
	}

	*data_mode = DATA_MODE_VMALLOC;

	/*
	 * __vmalloc allocates the data pages and auxiliary structures with
	 * gfp_flags that were specified, but pagetables are always allocated
	 * with GFP_KERNEL, no matter what was specified as gfp_mask.
	 *
	 * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
	 * all allocations done by this process (including pagetables) are done
	 * as if GFP_NOIO was specified.
	 */

	if (gfp_mask & __GFP_NORETRY)
		noio_flag = memalloc_noio_save();

	ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);

	if (gfp_mask & __GFP_NORETRY)
		memalloc_noio_restore(noio_flag);

	return ptr;
}

/*
 * Free buffer's data.
 */
static void free_buffer_data(struct dm_bufio_client *c,
			     void *data, enum data_mode data_mode)
{
	switch (data_mode) {
	case DATA_MODE_SLAB:
		kmem_cache_free(DM_BUFIO_CACHE(c), data);
		break;

	case DATA_MODE_GET_FREE_PAGES:
		free_pages((unsigned long)data, c->pages_per_block_bits);
		break;

	case DATA_MODE_VMALLOC:
		vfree(data);
		break;

	default:
		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
		       data_mode);
		BUG();
	}
}

/*
 * Allocate buffer and its data.
 */
static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
{
	struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
				      gfp_mask);

	if (!b)
		return NULL;

	b->c = c;

	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
	if (!b->data) {
		kfree(b);
		return NULL;
	}

	adjust_total_allocated(b->data_mode, (long)c->block_size);

	return b;
}

/*
 * Free buffer and its data.
 */
static void free_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	adjust_total_allocated(b->data_mode, -(long)c->block_size);

	free_buffer_data(c, b->data, b->data_mode);
	kfree(b);
}
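
/*
 * Illustration (assumes 4 KiB pages; the numbers are examples, not taken
 * from a real configuration): DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT is then 2 KiB
 * and DM_BUFIO_BLOCK_SIZE_GFP_LIMIT is PAGE_SIZE << (MAX_ORDER - 1).  So a
 * 1 KiB block is served from a slab cache (DATA_MODE_SLAB), a 64 KiB block
 * requested with __GFP_NORETRY comes from __get_free_pages
 * (DATA_MODE_GET_FREE_PAGES), and a reserve allocation done with plain
 * GFP_KERNEL for a block larger than the slab limit falls through to
 * __vmalloc (DATA_MODE_VMALLOC).
 */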

/*
 * Link buffer to the hash list and clean or dirty queue.
 */
static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
{
	struct dm_bufio_client *c = b->c;

	c->n_buffers[dirty]++;
	b->block = block;
	b->list_mode = dirty;
	list_add(&b->lru_list, &c->lru[dirty]);
	__insert(b->c, b);
	b->last_accessed = jiffies;
}

/*
 * Unlink buffer from the hash list and dirty or clean queue.
 */
static void __unlink_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	__remove(b->c, b);
	list_del(&b->lru_list);
}

/*
 * Place the buffer to the head of dirty or clean LRU queue.
 */
static void __relink_lru(struct dm_buffer *b, int dirty)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	c->n_buffers[dirty]++;
	b->list_mode = dirty;
	list_move(&b->lru_list, &c->lru[dirty]);
	b->last_accessed = jiffies;
}

/*----------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * Bio interface is faster but it has some problems:
 *	the vector list is limited (increasing this limit increases
 *	memory-consumption per buffer, so it is not viable);
 *
 *	the memory must be direct-mapped, not vmalloced;
 *
 *	the I/O driver can reject requests spuriously if it thinks that
 *	the requests are too big for the device or if they cross a
 *	controller-defined memory boundary.
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use dm-io layer to do the I/O.
 * The dm-io layer splits the I/O into multiple requests, avoiding the above
 * shortcomings.
 *--------------------------------------------------------------*/

/*
 * dm-io completion routine.  It just calls b->bio.bi_end_io, pretending
 * that the request was handled directly with bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
}

static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
		     bio_end_io_t *end_io)
{
	int r;
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.notify.fn = dmio_complete,
		.notify.context = b,
		.client = b->c->dm_io,
	};
	struct dm_io_region region = {
		.bdev = b->c->bdev,
		.sector = block << b->c->sectors_per_block_bits,
		.count = b->c->block_size >> SECTOR_SHIFT,
	};

	if (b->data_mode != DATA_MODE_VMALLOC) {
		io_req.mem.type = DM_IO_KMEM;
		io_req.mem.ptr.addr = b->data;
	} else {
		io_req.mem.type = DM_IO_VMA;
		io_req.mem.ptr.vma = b->data;
	}

	b->bio.bi_end_io = end_io;

	r = dm_io(&io_req, 1, &region, NULL);
	if (r)
		end_io(&b->bio, r);
}

static void inline_endio(struct bio *bio, int error)
{
	bio_end_io_t *end_fn = bio->bi_private;

	/*
	 * Reset the bio to free any attached resources
	 * (e.g. bio integrity profiles).
	 */
	bio_reset(bio);

	end_fn(bio, error);
}

static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
			   bio_end_io_t *end_io)
{
	char *ptr;
	int len;

	bio_init(&b->bio);
	b->bio.bi_io_vec = b->bio_vec;
	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
	b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
	b->bio.bi_bdev = b->c->bdev;
	b->bio.bi_end_io = inline_endio;
	/*
	 * Use of .bi_private isn't a problem here because
	 * the dm_buffer's inline bio is local to bufio.
	 */
	b->bio.bi_private = end_io;

	/*
	 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
	 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
	 */
	ptr = b->data;
	len = b->c->block_size;

	if (len >= PAGE_SIZE)
		BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
	else
		BUG_ON((unsigned long)ptr & (len - 1));

	do {
		if (!bio_add_page(&b->bio, virt_to_page(ptr),
				  len < PAGE_SIZE ? len : PAGE_SIZE,
				  virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
			BUG_ON(b->c->block_size <= PAGE_SIZE);
			use_dmio(b, rw, block, end_io);
			return;
		}

		len -= PAGE_SIZE;
		ptr += PAGE_SIZE;
	} while (len > 0);

	submit_bio(rw, &b->bio);
}

static void submit_io(struct dm_buffer *b, int rw, sector_t block,
		      bio_end_io_t *end_io)
{
	if (rw == WRITE && b->c->write_callback)
		b->c->write_callback(b);

	if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
	    b->data_mode != DATA_MODE_VMALLOC)
		use_inline_bio(b, rw, block, end_io);
	else
		use_dmio(b, rw, block, end_io);
}

/*----------------------------------------------------------------
 * Writing dirty buffers
 *--------------------------------------------------------------*/

/*
 * The endio routine for write.
 *
 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 * it.
 */
static void write_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->write_error = error;
	if (unlikely(error)) {
		struct dm_bufio_client *c = b->c;
		(void)cmpxchg(&c->async_write_error, 0, error);
	}

	BUG_ON(!test_bit(B_WRITING, &b->state));

	smp_mb__before_atomic();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_atomic();

	wake_up_bit(&b->state, B_WRITING);
}

/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is some previous write going on, wait for it to finish (we can't
 *   have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it.  We set B_WRITING indicating
 *   that there is a write in progress.
 */
static void __write_dirty_buffer(struct dm_buffer *b,
				 struct list_head *write_list)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);

	if (!write_list)
		submit_io(b, WRITE, b->block, write_endio);
	else
		list_add_tail(&b->write_list, write_list);
}

static void __flush_write_list(struct list_head *write_list)
{
	struct blk_plug plug;
	blk_start_plug(&plug);
	while (!list_empty(write_list)) {
		struct dm_buffer *b =
			list_entry(write_list->next, struct dm_buffer, write_list);
		list_del(&b->write_list);
		submit_io(b, WRITE, b->block, write_endio);
		dm_bufio_cond_resched();
	}
	blk_finish_plug(&plug);
}

/*
 * Wait until any activity on the buffer finishes.  Possibly write the
 * buffer if it is dirty.  When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(b->hold_count);

	if (!b->state)	/* fast case */
		return;

	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b, NULL);
	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
}

/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.
 */
static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
		BUG_ON(test_bit(B_WRITING, &b->state));
		BUG_ON(test_bit(B_DIRTY, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	return NULL;
}

/*
 * Wait until some other thread frees a buffer or releases its hold count
 * on some buffer.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&c->free_buffer_wait, &wait);
	set_task_state(current, TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	io_schedule();

	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}

enum new_flag {
	NF_FRESH = 0,
	NF_READ = 1,
	NF_GET = 2,
	NF_PREFETCH = 3
};

/*
 * Allocate a new buffer.  If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in case all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOIO: don't recurse into the I/O layer
	 *	__GFP_NORETRY: don't retry and rather return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
	while (1) {
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		if (nf == NF_PREFETCH)
			return NULL;

		if (!list_empty(&c->reserved_buffers)) {
			b = list_entry(c->reserved_buffers.next,
				       struct dm_buffer, lru_list);
			list_del(&b->lru_list);
			c->need_reserved_buffers++;

			return b;
		}

		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		__wait_for_free_buffer(c);
	}
}

static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);

	if (!b)
		return NULL;

	if (c->alloc_callback)
		c->alloc_callback(b);

	return b;
}

/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
static void __free_buffer_wake(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	if (!c->need_reserved_buffers)
		free_buffer(b);
	else {
		list_add(&b->lru_list, &c->reserved_buffers);
		c->need_reserved_buffers--;
	}

	wake_up(&c->free_buffer_wait);
}

static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
					struct list_head *write_list)
{
	struct dm_buffer *b, *tmp;

	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state)) {
			__relink_lru(b, LIST_CLEAN);
			continue;
		}

		if (no_wait && test_bit(B_WRITING, &b->state))
			return;

		__write_dirty_buffer(b, write_list);
		dm_bufio_cond_resched();
	}
}

/*
 * Get writeback threshold and buffer limit for a given client.
 */
static void __get_memory_limit(struct dm_bufio_client *c,
			       unsigned long *threshold_buffers,
			       unsigned long *limit_buffers)
{
	unsigned long buffers;

	if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
		mutex_lock(&dm_bufio_clients_lock);
		__cache_size_refresh();
		mutex_unlock(&dm_bufio_clients_lock);
	}

	buffers = dm_bufio_cache_size_per_client >>
		  (c->sectors_per_block_bits + SECTOR_SHIFT);

	if (buffers < c->minimum_buffers)
		buffers = c->minimum_buffers;

	*limit_buffers = buffers;
	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
}

/*
 * Check if we're over the watermark.
 * If we are over "limit_buffers", free unclaimed buffers until we get
 * under the limit.  If we are over "threshold_buffers", start writing
 * dirty buffers back asynchronously.
 */
static void __check_watermark(struct dm_bufio_client *c,
			      struct list_head *write_list)
{
	unsigned long threshold_buffers, limit_buffers;

	__get_memory_limit(c, &threshold_buffers, &limit_buffers);

	while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
	       limit_buffers) {

		struct dm_buffer *b = __get_unclaimed_buffer(c);

		if (!b)
			return;

		__free_buffer_wake(b);
		dm_bufio_cond_resched();
	}

	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
		__write_dirty_buffers_async(c, 1, write_list);
}

/*----------------------------------------------------------------
 * Getting a buffer
 *--------------------------------------------------------------*/

static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
				     enum new_flag nf, int *need_submit,
				     struct list_head *write_list)
{
	struct dm_buffer *b, *new_b = NULL;

	*need_submit = 0;

	b = __find(c, block);
	if (b)
		goto found_buffer;

	if (nf == NF_GET)
		return NULL;

	new_b = __alloc_buffer_wait(c, nf);
	if (!new_b)
		return NULL;

	/*
	 * We've had a period where the mutex was unlocked, so we need to
	 * recheck the buffer tree.
	 */
	b = __find(c, block);
	if (b) {
		__free_buffer_wake(new_b);
		goto found_buffer;
	}

	__check_watermark(c, write_list);

	b = new_b;
	b->hold_count = 1;
	b->read_error = 0;
	b->write_error = 0;
	__link_buffer(b, block, LIST_CLEAN);

	if (nf == NF_FRESH) {
		b->state = 0;
		return b;
	}

	b->state = 1 << B_READING;
	*need_submit = 1;

	return b;

found_buffer:
	if (nf == NF_PREFETCH)
		return NULL;
	/*
	 * Note: it is essential that we don't wait for the buffer to be
	 * read if the dm_bufio_get function is used.  Both dm_bufio_get and
	 * dm_bufio_prefetch can be used in the driver request routine.
	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
	 * the same buffer, it would deadlock if we waited.
	 */
	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
		return NULL;

	b->hold_count++;
	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
		     test_bit(B_WRITING, &b->state));
	return b;
}

/*
 * The endio routine for reading: set the error, clear the bit and wake up
 * anyone waiting on the buffer.
 */
static void read_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->read_error = error;

	BUG_ON(!test_bit(B_READING, &b->state));

	smp_mb__before_atomic();
	clear_bit(B_READING, &b->state);
	smp_mb__after_atomic();

	wake_up_bit(&b->state, B_READING);
}

/*
 * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
 * functions is similar except that dm_bufio_new doesn't read the
 * buffer from the disk (assuming that the caller overwrites all the data
 * and uses dm_bufio_mark_buffer_dirty to write new data back).
 */
static void *new_read(struct dm_bufio_client *c, sector_t block,
		      enum new_flag nf, struct dm_buffer **bp)
{
	int need_submit;
	struct dm_buffer *b;

	LIST_HEAD(write_list);

	dm_bufio_lock(c);
	b = __bufio_new(c, block, nf, &need_submit, &write_list);
	dm_bufio_unlock(c);

	__flush_write_list(&write_list);

	if (!b)
		return b;

	if (need_submit)
		submit_io(b, READ, b->block, read_endio);

	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);

	if (b->read_error) {
		int error = b->read_error;

		dm_bufio_release(b);

		return ERR_PTR(error);
	}

	*bp = b;

	return b->data;
}

void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);

void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_READ, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);

void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_FRESH, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_new);

void dm_bufio_prefetch(struct dm_bufio_client *c,
		       sector_t block, unsigned n_blocks)
{
	struct blk_plug plug;

	LIST_HEAD(write_list);

	BUG_ON(dm_bufio_in_request());

	blk_start_plug(&plug);
	dm_bufio_lock(c);

	for (; n_blocks--; block++) {
		int need_submit;
		struct dm_buffer *b;
		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
				&write_list);
		if (unlikely(!list_empty(&write_list))) {
			dm_bufio_unlock(c);
			blk_finish_plug(&plug);
			__flush_write_list(&write_list);
			blk_start_plug(&plug);
			dm_bufio_lock(c);
		}
		if (unlikely(b != NULL)) {
			dm_bufio_unlock(c);

			if (need_submit)
				submit_io(b, READ, b->block, read_endio);
			dm_bufio_release(b);

			dm_bufio_cond_resched();

			if (!n_blocks)
				goto flush_plug;
			dm_bufio_lock(c);
		}
	}

	dm_bufio_unlock(c);

flush_plug:
	blk_finish_plug(&plug);
}
EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
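
/*
 * Illustrative usage sketch (not part of the original code; "client" and
 * "block_nr" are hypothetical names).  A caller typically reads a block,
 * uses the data and then drops its reference:
 *
 *	struct dm_buffer *buf;
 *	void *data;
 *
 *	data = dm_bufio_read(client, block_nr, &buf);
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	... inspect up to dm_bufio_get_block_size(client) bytes at data ...
 *	dm_bufio_release(buf);
 *
 * dm_bufio_get() follows the same pattern but may also return NULL when
 * the block is not already cached, and dm_bufio_prefetch() only starts
 * the reads without returning a buffer.
 */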

void dm_bufio_release(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(!b->hold_count);

	b->hold_count--;
	if (!b->hold_count) {
		wake_up(&c->free_buffer_wait);

		/*
		 * If there were errors on the buffer, and the buffer is not
		 * to be written, free the buffer.  There is no point in
		 * caching an invalid buffer.
		 */
		if ((b->read_error || b->write_error) &&
		    !test_bit(B_READING, &b->state) &&
		    !test_bit(B_WRITING, &b->state) &&
		    !test_bit(B_DIRTY, &b->state)) {
			__unlink_buffer(b);
			__free_buffer_wake(b);
		}
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_release);

void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(test_bit(B_READING, &b->state));

	if (!test_and_set_bit(B_DIRTY, &b->state))
		__relink_lru(b, LIST_DIRTY);

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);

void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
	LIST_HEAD(write_list);

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);

/*
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush hardware disk cache.
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long buffers_processed = 0;
	struct dm_buffer *b, *tmp;

	LIST_HEAD(write_list);

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
	dm_bufio_lock(c);

again:
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		int dropped_lock = 0;

		if (buffers_processed < c->n_buffers[LIST_DIRTY])
			buffers_processed++;

		BUG_ON(test_bit(B_READING, &b->state));

		if (test_bit(B_WRITING, &b->state)) {
			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
				dropped_lock = 1;
				b->hold_count++;
				dm_bufio_unlock(c);
				wait_on_bit_io(&b->state, B_WRITING,
					       TASK_UNINTERRUPTIBLE);
				dm_bufio_lock(c);
				b->hold_count--;
			} else
				wait_on_bit_io(&b->state, B_WRITING,
					       TASK_UNINTERRUPTIBLE);
		}

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state))
			__relink_lru(b, LIST_CLEAN);

		dm_bufio_cond_resched();

		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just processed is
		 * relinked to the clean list, so we won't loop scanning the
		 * same buffer again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means that
		 * someone is doing some writes simultaneously with us.  In
		 * this case, stop, dropping the lock.
		 */
		if (dropped_lock)
			goto again;
	}
	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	a = xchg(&c->async_write_error, 0);
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
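
/*
 * Illustrative usage sketch (not part of the original code; "client" and
 * "block_nr" are hypothetical names).  A typical update of a whole block:
 *
 *	struct dm_buffer *buf;
 *	void *data = dm_bufio_new(client, block_nr, &buf);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	memset(data, 0, dm_bufio_get_block_size(client));
 *	dm_bufio_mark_buffer_dirty(buf);
 *	dm_bufio_release(buf);
 *
 *	return dm_bufio_write_dirty_buffers(client);
 *
 * dm_bufio_new() skips the read because the whole block is overwritten;
 * use dm_bufio_read() instead for a read-modify-write.  The final call
 * writes the dirty buffers back and flushes the device cache.
 */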

/*
 * Use dm-io to send an empty barrier to flush the device.
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_rw = WRITE_FLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	BUG_ON(dm_bufio_in_request());

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);

/*
 * We first delete any other buffer that may be at that new location.
 *
 * Then, we write the buffer to the original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, relink the buffer
 * in the buffer tree under the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but not relink it, because that other user needs to have the buffer
 * at the same place.
 */
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
{
	struct dm_bufio_client *c = b->c;
	struct dm_buffer *new;

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);

retry:
	new = __find(c, new_block);
	if (new) {
		if (new->hold_count) {
			__wait_for_free_buffer(c);
			goto retry;
		}

		/*
		 * FIXME: Is there any point waiting for a write that's going
		 * to be overwritten in a bit?
		 */
		__make_buffer_clean(new);
		__unlink_buffer(new);
		__free_buffer_wake(new);
	}

	BUG_ON(!b->hold_count);
	BUG_ON(test_bit(B_READING, &b->state));

	__write_dirty_buffer(b, NULL);
	if (b->hold_count == 1) {
		wait_on_bit_io(&b->state, B_WRITING,
			       TASK_UNINTERRUPTIBLE);
		set_bit(B_DIRTY, &b->state);
		__unlink_buffer(b);
		__link_buffer(b, new_block, LIST_DIRTY);
	} else {
		sector_t old_block;
		wait_on_bit_lock_io(&b->state, B_WRITING,
				    TASK_UNINTERRUPTIBLE);
		/*
		 * Relink buffer to "new_block" so that write_callback
		 * sees "new_block" as a block number.
		 * After the write, link the buffer back to old_block.
		 * All this must be done in bufio lock, so that block number
		 * change isn't visible to other threads.
		 */
		old_block = b->block;
		__unlink_buffer(b);
		__link_buffer(b, new_block, b->list_mode);
		submit_io(b, WRITE, new_block, write_endio);
		wait_on_bit_io(&b->state, B_WRITING,
			       TASK_UNINTERRUPTIBLE);
		__unlink_buffer(b);
		__link_buffer(b, old_block, b->list_mode);
	}

	dm_bufio_unlock(c);
	dm_bufio_release(b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);

/*
 * Free the given buffer.
 *
 * This is just a hint: if the buffer is in use or dirty, this function
 * does nothing.
 */
void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;

	dm_bufio_lock(c);

	b = __find(c, block);
	if (b && likely(!b->hold_count) && likely(!b->state)) {
		__unlink_buffer(b);
		__free_buffer_wake(b);
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL(dm_bufio_forget);

void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
{
	c->minimum_buffers = n;
}
EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);

unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
{
	return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);

sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
{
	return i_size_read(c->bdev->bd_inode) >>
			   (SECTOR_SHIFT + c->sectors_per_block_bits);
}
EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);

sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);

void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);

void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);

struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);

static void drop_buffers(struct dm_bufio_client *c)
{
	struct dm_buffer *b;
	int i;

	BUG_ON(dm_bufio_in_request());

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	for (i = 0; i < LIST_SIZE; i++)
		list_for_each_entry(b, &c->lru[i], lru_list)
			DMERR("leaked buffer %llx, hold count %u, list %d",
			      (unsigned long long)b->block, b->hold_count, i);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(!list_empty(&c->lru[i]));

	dm_bufio_unlock(c);
}

/*
 * We may not be able to evict this buffer if IO is pending or the client
 * is still using it.  The caller is expected to know the buffer is too old.
 *
 * And if GFP_NOFS is used, we must not do any I/O because we hold
 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
 * rerouted to a different bufio client.
 */
static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
{
	if (!(gfp & __GFP_FS)) {
		if (test_bit(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return false;
	}

	if (b->hold_count)
		return false;

	__make_buffer_clean(b);
	__unlink_buffer(b);
	__free_buffer_wake(b);

	return true;
}

static unsigned get_retain_buffers(struct dm_bufio_client *c)
{
	unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
	return retain_bytes / c->block_size;
}

static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
			    gfp_t gfp_mask)
{
	int l;
	struct dm_buffer *b, *tmp;
	unsigned long freed = 0;
	unsigned long count = nr_to_scan;
	unsigned retain_target = get_retain_buffers(c);

	for (l = 0; l < LIST_SIZE; l++) {
		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
			if (__try_evict_buffer(b, gfp_mask))
				freed++;
			if (!--nr_to_scan || ((count - freed) <= retain_target))
				return freed;
			dm_bufio_cond_resched();
		}
	}
	return freed;
}

static unsigned long
dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct dm_bufio_client *c;
	unsigned long freed;

	c = container_of(shrink, struct dm_bufio_client, shrinker);
	if (sc->gfp_mask & __GFP_FS)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return SHRINK_STOP;

	freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
	dm_bufio_unlock(c);
	return freed;
}

static unsigned long
dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	struct dm_bufio_client *c;
	unsigned long count;

	c = container_of(shrink, struct dm_bufio_client, shrinker);
	if (sc->gfp_mask & __GFP_FS)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return 0;

	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	dm_bufio_unlock(c);
	return count;
}
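
/*
 * Illustration (hypothetical values, not from a real configuration): with
 * the default retain_bytes of 256 KiB and a 4 KiB block size,
 * get_retain_buffers() above returns 262144 / 4096 = 64.  __scan() and
 * __evict_old_buffers() below treat that value as a retain target and stop
 * evicting once it is reached; buffers that are held or have I/O in flight
 * are skipped by __try_evict_buffer().
 */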

/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
					       unsigned reserved_buffers, unsigned aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *))
{
	int r;
	struct dm_bufio_client *c;
	unsigned i;

	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
	       (block_size & (block_size - 1)));

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	c->buffer_tree = RB_ROOT;

	c->bdev = bdev;
	c->block_size = block_size;
	c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
	c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
				  ffs(block_size) - 1 - PAGE_SHIFT : 0;
	c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
				  PAGE_SHIFT - (ffs(block_size) - 1) : 0);

	c->aux_size = aux_size;
	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	for (i = 0; i < LIST_SIZE; i++) {
		INIT_LIST_HEAD(&c->lru[i]);
		c->n_buffers[i] = 0;
	}

	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	mutex_lock(&dm_bufio_clients_lock);
	if (c->blocks_per_page_bits) {
		if (!DM_BUFIO_CACHE_NAME(c)) {
			DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
			if (!DM_BUFIO_CACHE_NAME(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}

		if (!DM_BUFIO_CACHE(c)) {
			DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
							      c->block_size,
							      c->block_size, 0, NULL);
			if (!DM_BUFIO_CACHE(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}
	}
	mutex_unlock(&dm_bufio_clients_lock);

	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad_buffer;
		}
		__free_buffer_wake(b);
	}

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	c->shrinker.count_objects = dm_bufio_shrink_count;
	c->shrinker.scan_objects = dm_bufio_shrink_scan;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	register_shrinker(&c->shrinker);

	return c;

bad_buffer:
bad_cache:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);
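
/*
 * Illustrative usage sketch (not part of the original code; "bdev" and the
 * parameter values are hypothetical).  A target typically creates one
 * client per metadata device and destroys it on teardown:
 *
 *	struct dm_bufio_client *client;
 *
 *	client = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
 *	if (IS_ERR(client))
 *		return PTR_ERR(client);
 *	...
 *	dm_bufio_client_destroy(client);
 *
 * Here 4096 is the block size (a power of two, at least one sector), 1 is
 * the number of reserved buffers and 0 means no per-buffer aux data; the
 * two NULLs skip the alloc and write callbacks.
 */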

/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
	unsigned i;

	drop_buffers(c);

	unregister_shrinker(&c->shrinker);

	mutex_lock(&dm_bufio_clients_lock);

	list_del(&c->client_list);
	dm_bufio_client_count--;
	__cache_size_refresh();

	mutex_unlock(&dm_bufio_clients_lock);

	BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
	BUG_ON(c->need_reserved_buffers);

	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}

	for (i = 0; i < LIST_SIZE; i++)
		if (c->n_buffers[i])
			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(c->n_buffers[i]);

	dm_io_client_destroy(c->dm_io);
	kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);

static unsigned get_max_age_hz(void)
{
	unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);

	if (max_age > UINT_MAX / HZ)
		max_age = UINT_MAX / HZ;

	return max_age * HZ;
}

static bool older_than(struct dm_buffer *b, unsigned long age_hz)
{
	return (jiffies - b->last_accessed) >= age_hz;
}

static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
{
	struct dm_buffer *b, *tmp;
	unsigned retain_target = get_retain_buffers(c);
	unsigned count;

	dm_bufio_lock(c);

	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
		if (count <= retain_target)
			break;

		if (!older_than(b, age_hz))
			break;

		if (__try_evict_buffer(b, 0))
			count--;

		dm_bufio_cond_resched();
	}

	dm_bufio_unlock(c);
}

static void cleanup_old_buffers(void)
{
	unsigned long max_age_hz = get_max_age_hz();
	struct dm_bufio_client *c;

	mutex_lock(&dm_bufio_clients_lock);

	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
		__evict_old_buffers(c, max_age_hz);

	mutex_unlock(&dm_bufio_clients_lock);
}

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_work;

static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}

/*----------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------*/

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes the memory limit.
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	dm_bufio_allocated_kmem_cache = 0;
	dm_bufio_allocated_get_free_pages = 0;
	dm_bufio_allocated_vmalloc = 0;
	dm_bufio_current_allocated = 0;

	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);

	mem = (__u64)((totalram_pages - totalhigh_pages) *
		      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	/*
	 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
	 * in fs/proc/internal.h
	 */
	if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
		mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
#endif

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}

/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;
	int i;

	cancel_delayed_work_sync(&dm_bufio_work);
	destroy_workqueue(dm_bufio_wq);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
		struct kmem_cache *kc = dm_bufio_caches[i];

		if (kc)
			kmem_cache_destroy(kc);
	}

	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
		kfree(dm_bufio_cache_names[i]);

	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
		       __func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
		       __func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	if (bug)
		BUG();
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes,
		   dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");