/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include "dm-bufio.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/version.h>
#include <linux/shrinker.h>
#include <linux/module.h>

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 *	dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_PERCENT	75

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	10

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	60

/*
 * The number of bvec entries that are embedded directly in the buffer.
 * If the chunk size is larger, dm-io is used to do the io.
 */
#define DM_BUFIO_INLINE_VECS		16

/*
 * Buffer hash
 */
#define DM_BUFIO_HASH_BITS	20
#define DM_BUFIO_HASH(block) \
	((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
	 ((1 << DM_BUFIO_HASH_BITS) - 1))

/*
 * Don't try to use kmem_cache_alloc for blocks larger than this.
 * For explanation, see alloc_buffer_data below.
 */
#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT	(PAGE_SIZE >> 1)
#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT	(PAGE_SIZE << (MAX_ORDER - 1))

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*
 * Linking of buffers:
 *	All buffers are linked to cache_hash with their hash_list field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field. When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	dirty_lru too. They are later added to lru in the process
 *	context.
 */
struct dm_bufio_client {
	struct mutex lock;

	struct list_head lru[LIST_SIZE];
	unsigned long n_buffers[LIST_SIZE];

	struct block_device *bdev;
	unsigned block_size;
	unsigned char sectors_per_block_bits;
	unsigned char pages_per_block_bits;
	unsigned char blocks_per_page_bits;
	unsigned aux_size;
	void (*alloc_callback)(struct dm_buffer *);
	void (*write_callback)(struct dm_buffer *);

	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned need_reserved_buffers;

	struct hlist_head *cache_hash;
	wait_queue_head_t free_buffer_wait;

	int async_write_error;

	struct list_head client_list;
	struct shrinker shrinker;
};

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
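 *
 * The mode chosen at allocation time is recorded in each dm_buffer so
 * that free_buffer_data() can release the memory with the matching
 * free routine.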
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	struct hlist_node hash_list;
	struct list_head lru_list;
	sector_t block;
	void *data;
	enum data_mode data_mode;
	unsigned char list_mode;		/* LIST_* */
	unsigned hold_count;
	int read_error;
	int write_error;
	unsigned long state;
	unsigned long last_accessed;
	struct dm_bufio_client *c;
	struct bio bio;
	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
};

/*----------------------------------------------------------------*/

static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];

static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
{
	unsigned ret = c->blocks_per_page_bits - 1;

	BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));

	return ret;
}

#define DM_BUFIO_CACHE(c)	(dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c)	(dm_bufio_cache_names[dm_bufio_cache_index(c)])

#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static int dm_bufio_trylock(struct dm_bufio_client *c)
{
	return mutex_trylock(&c->lock);
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	mutex_unlock(&c->lock);
}

/*
 * FIXME Move to sched.h?
 */
#ifdef CONFIG_PREEMPT_VOLUNTARY
#  define dm_bufio_cond_resched()		\
do {						\
	if (unlikely(need_resched()))		\
		_cond_resched();		\
} while (0)
#else
#  define dm_bufio_cond_resched()		do { } while (0)
#endif

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time. If the two values disagree, the user has changed the
 * cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(param_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 */
static unsigned long dm_bufio_cache_size_per_client;

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
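 * It is walked under dm_bufio_clients_lock by the periodic cleanup work.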
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch,
 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

/*----------------------------------------------------------------*/

static void adjust_total_allocated(enum data_mode data_mode, long diff)
{
	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	spin_lock(&param_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	spin_unlock(&param_spinlock);
}

/*
 * Recalculate the per-client cache size limit. This is called whenever
 * the number of clients or the total cache size changes.
 */
static void __cache_size_refresh(void)
{
	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
	BUG_ON(dm_bufio_client_count < 0);

	dm_bufio_cache_size_latch = dm_bufio_cache_size;

	barrier();

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}

	dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
					 (dm_bufio_client_count ? : 1);
}

/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 * always fails (i.e. order >= MAX_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       enum data_mode *data_mode)
{
	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
	}

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
						c->pages_per_block_bits);
	}

	*data_mode = DATA_MODE_VMALLOC;
	return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
}

/*
 * Free buffer's data.
 */
static void free_buffer_data(struct dm_bufio_client *c,
			     void *data, enum data_mode data_mode)
{
	switch (data_mode) {
	case DATA_MODE_SLAB:
		kmem_cache_free(DM_BUFIO_CACHE(c), data);
		break;

	case DATA_MODE_GET_FREE_PAGES:
		free_pages((unsigned long)data, c->pages_per_block_bits);
		break;

	case DATA_MODE_VMALLOC:
		vfree(data);
		break;

	default:
		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
		       data_mode);
		BUG();
	}
}

/*
 * Allocate buffer and its data.
 */
static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
{
	struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
				      gfp_mask);

	if (!b)
		return NULL;

	b->c = c;

	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
	if (!b->data) {
		kfree(b);
		return NULL;
	}

	adjust_total_allocated(b->data_mode, (long)c->block_size);

	return b;
}

/*
 * Free buffer and its data.
 */
static void free_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	adjust_total_allocated(b->data_mode, -(long)c->block_size);

	free_buffer_data(c, b->data, b->data_mode);
	kfree(b);
}

/*
 * Link buffer to the hash list and clean or dirty queue.
 */
static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
{
	struct dm_bufio_client *c = b->c;

	c->n_buffers[dirty]++;
	b->block = block;
	b->list_mode = dirty;
	list_add(&b->lru_list, &c->lru[dirty]);
	hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
	b->last_accessed = jiffies;
}

/*
 * Unlink buffer from the hash list and dirty or clean queue.
 */
static void __unlink_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	hlist_del(&b->hash_list);
	list_del(&b->lru_list);
}

/*
 * Place the buffer at the head of the dirty or clean LRU queue.
 */
static void __relink_lru(struct dm_buffer *b, int dirty)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	c->n_buffers[dirty]++;
	b->list_mode = dirty;
	list_del(&b->lru_list);
	list_add(&b->lru_list, &c->lru[dirty]);
}

/*----------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * The bio interface is faster, but it has some problems:
 *	the vector list is limited (increasing this limit increases
 *	memory consumption per buffer, so it is not viable);
 *
 *	the memory must be direct-mapped, not vmalloced;
 *
 *	the I/O driver can reject requests spuriously if it thinks that
 *	the requests are too big for the device or if they cross a
 *	controller-defined memory boundary.
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use the dm-io layer to do the
 * I/O. The dm-io layer splits the I/O into multiple requests, avoiding
 * the above shortcomings.
 *--------------------------------------------------------------*/

/*
 * dm-io completion routine.
 * It just calls b->bio.bi_end_io, pretending that the request was
 * handled directly with the bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
}

static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
		     bio_end_io_t *end_io)
{
	int r;
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.notify.fn = dmio_complete,
		.notify.context = b,
		.client = b->c->dm_io,
	};
	struct dm_io_region region = {
		.bdev = b->c->bdev,
		.sector = block << b->c->sectors_per_block_bits,
		.count = b->c->block_size >> SECTOR_SHIFT,
	};

	if (b->data_mode != DATA_MODE_VMALLOC) {
		io_req.mem.type = DM_IO_KMEM;
		io_req.mem.ptr.addr = b->data;
	} else {
		io_req.mem.type = DM_IO_VMA;
		io_req.mem.ptr.vma = b->data;
	}

	b->bio.bi_end_io = end_io;

	r = dm_io(&io_req, 1, &region, NULL);
	if (r)
		end_io(&b->bio, r);
}

static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
			   bio_end_io_t *end_io)
{
	char *ptr;
	int len;

	bio_init(&b->bio);
	b->bio.bi_io_vec = b->bio_vec;
	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
	b->bio.bi_sector = block << b->c->sectors_per_block_bits;
	b->bio.bi_bdev = b->c->bdev;
	b->bio.bi_end_io = end_io;

	/*
	 * We assume that if len >= PAGE_SIZE, ptr is page-aligned.
	 * If len < PAGE_SIZE, the buffer doesn't cross a page boundary.
	 */
	ptr = b->data;
	len = b->c->block_size;

	if (len >= PAGE_SIZE)
		BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
	else
		BUG_ON((unsigned long)ptr & (len - 1));

	do {
		if (!bio_add_page(&b->bio, virt_to_page(ptr),
				  len < PAGE_SIZE ? len : PAGE_SIZE,
				  virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
			BUG_ON(b->c->block_size <= PAGE_SIZE);
			use_dmio(b, rw, block, end_io);
			return;
		}

		len -= PAGE_SIZE;
		ptr += PAGE_SIZE;
	} while (len > 0);

	submit_bio(rw, &b->bio);
}

static void submit_io(struct dm_buffer *b, int rw, sector_t block,
		      bio_end_io_t *end_io)
{
	if (rw == WRITE && b->c->write_callback)
		b->c->write_callback(b);

	if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
	    b->data_mode != DATA_MODE_VMALLOC)
		use_inline_bio(b, rw, block, end_io);
	else
		use_dmio(b, rw, block, end_io);
}

/*----------------------------------------------------------------
 * Writing dirty buffers
 *--------------------------------------------------------------*/

/*
 * The endio routine for write.
 *
 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 * it.
 */
static void write_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->write_error = error;
	if (error) {
		struct dm_bufio_client *c = b->c;
		(void)cmpxchg(&c->async_write_error, 0, error);
	}

	BUG_ON(!test_bit(B_WRITING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_WRITING);
}

/*
 * This function is called when wait_on_bit is actually waiting.
 */
static int do_io_schedule(void *word)
{
	io_schedule();

	return 0;
}

/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is a previous write going on, wait for it to finish (we can't
 *   have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it. We set B_WRITING indicating
 *   that there is a write in progress.
 */
static void __write_dirty_buffer(struct dm_buffer *b)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	wait_on_bit_lock(&b->state, B_WRITING,
			 do_io_schedule, TASK_UNINTERRUPTIBLE);

	submit_io(b, WRITE, b->block, write_endio);
}

/*
 * Wait until any activity on the buffer finishes. Possibly write the
 * buffer if it is dirty. When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(b->hold_count);

	if (!b->state)	/* fast case */
		return;

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b);
	wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
}

/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.
 */
static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
		BUG_ON(test_bit(B_WRITING, &b->state));
		BUG_ON(test_bit(B_DIRTY, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	return NULL;
}

/*
 * Wait until some other thread frees a buffer or drops its hold count
 * on one.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&c->free_buffer_wait, &wait);
	set_task_state(current, TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	io_schedule();

	set_task_state(current, TASK_RUNNING);
	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}

/*
 * Allocate a new buffer. If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in case all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOIO:	  don't recurse into the I/O layer
	 *	__GFP_NORETRY:	  don't retry and rather return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN:	  don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
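	 *
	 * If the opportunistic allocation fails, fall back to the list of
	 * reserved buffers, then to reclaiming an unclaimed cached buffer,
	 * and finally sleep until another thread releases a buffer.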
	 */
	while (1) {
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		if (!list_empty(&c->reserved_buffers)) {
			b = list_entry(c->reserved_buffers.next,
				       struct dm_buffer, lru_list);
			list_del(&b->lru_list);
			c->need_reserved_buffers++;

			return b;
		}

		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		__wait_for_free_buffer(c);
	}
}

static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c)
{
	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c);

	if (c->alloc_callback)
		c->alloc_callback(b);

	return b;
}

/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
static void __free_buffer_wake(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	if (!c->need_reserved_buffers)
		free_buffer(b);
	else {
		list_add(&b->lru_list, &c->reserved_buffers);
		c->need_reserved_buffers--;
	}

	wake_up(&c->free_buffer_wait);
}

static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
{
	struct dm_buffer *b, *tmp;

	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state)) {
			__relink_lru(b, LIST_CLEAN);
			continue;
		}

		if (no_wait && test_bit(B_WRITING, &b->state))
			return;

		__write_dirty_buffer(b);
		dm_bufio_cond_resched();
	}
}

/*
 * Get writeback threshold and buffer limit for a given client.
 */
static void __get_memory_limit(struct dm_bufio_client *c,
			       unsigned long *threshold_buffers,
			       unsigned long *limit_buffers)
{
	unsigned long buffers;

	if (dm_bufio_cache_size != dm_bufio_cache_size_latch) {
		mutex_lock(&dm_bufio_clients_lock);
		__cache_size_refresh();
		mutex_unlock(&dm_bufio_clients_lock);
	}

	buffers = dm_bufio_cache_size_per_client >>
		  (c->sectors_per_block_bits + SECTOR_SHIFT);

	if (buffers < DM_BUFIO_MIN_BUFFERS)
		buffers = DM_BUFIO_MIN_BUFFERS;

	*limit_buffers = buffers;
	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
}

/*
 * Check if we're over the watermark.
 *
 * If we're over limit_buffers, free unclaimed buffers until we get under
 * the limit.
 *
 * If we're over threshold_buffers, start asynchronous writeback of dirty
 * buffers.
 */
static void __check_watermark(struct dm_bufio_client *c)
{
	unsigned long threshold_buffers, limit_buffers;

	__get_memory_limit(c, &threshold_buffers, &limit_buffers);

	while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
	       limit_buffers) {

		struct dm_buffer *b = __get_unclaimed_buffer(c);

		if (!b)
			return;

		__free_buffer_wake(b);
		dm_bufio_cond_resched();
	}

	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
		__write_dirty_buffers_async(c, 1);
}

/*
 * Find a buffer in the hash.
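 * The chain is selected by DM_BUFIO_HASH(block), which folds the bits
 * above DM_BUFIO_HASH_BITS back into the low bits of the block number.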
 */
static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;
	struct hlist_node *hn;

	hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)],
			     hash_list) {
		dm_bufio_cond_resched();
		if (b->block == block)
			return b;
	}

	return NULL;
}

/*----------------------------------------------------------------
 * Getting a buffer
 *--------------------------------------------------------------*/

enum new_flag {
	NF_FRESH = 0,
	NF_READ = 1,
	NF_GET = 2
};

static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
				     enum new_flag nf, struct dm_buffer **bp,
				     int *need_submit)
{
	struct dm_buffer *b, *new_b = NULL;

	*need_submit = 0;

	b = __find(c, block);
	if (b) {
		b->hold_count++;
		__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
			     test_bit(B_WRITING, &b->state));
		return b;
	}

	if (nf == NF_GET)
		return NULL;

	new_b = __alloc_buffer_wait(c);

	/*
	 * We've had a period where the mutex was unlocked, so we need to
	 * recheck the hash table.
	 */
	b = __find(c, block);
	if (b) {
		__free_buffer_wake(new_b);
		b->hold_count++;
		__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
			     test_bit(B_WRITING, &b->state));
		return b;
	}

	__check_watermark(c);

	b = new_b;
	b->hold_count = 1;
	b->read_error = 0;
	b->write_error = 0;
	__link_buffer(b, block, LIST_CLEAN);

	if (nf == NF_FRESH) {
		b->state = 0;
		return b;
	}

	b->state = 1 << B_READING;
	*need_submit = 1;

	return b;
}

/*
 * The endio routine for reading: set the error, clear the bit and wake up
 * anyone waiting on the buffer.
 */
static void read_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->read_error = error;

	BUG_ON(!test_bit(B_READING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_READING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_READING);
}

/*
 * A common routine for dm_bufio_new and dm_bufio_read. Operation of these
 * functions is similar except that dm_bufio_new doesn't read the
 * buffer from the disk (assuming that the caller overwrites all the data
 * and uses dm_bufio_mark_buffer_dirty to write new data back).
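 *
 * A minimal usage sketch for a hypothetical caller (error handling
 * trimmed; "c" and "block" are assumed to exist already):
 *
 *	struct dm_buffer *bp;
 *	void *data = dm_bufio_read(c, block, &bp);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	... use dm_bufio_get_block_size(c) bytes at data ...
 *	dm_bufio_mark_buffer_dirty(bp);	(only if the data was modified)
 *	dm_bufio_release(bp);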
 */
static void *new_read(struct dm_bufio_client *c, sector_t block,
		      enum new_flag nf, struct dm_buffer **bp)
{
	int need_submit;
	struct dm_buffer *b;

	dm_bufio_lock(c);
	b = __bufio_new(c, block, nf, bp, &need_submit);
	dm_bufio_unlock(c);

	if (!b || IS_ERR(b))
		return b;

	if (need_submit)
		submit_io(b, READ, b->block, read_endio);

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);

	if (b->read_error) {
		int error = b->read_error;

		dm_bufio_release(b);

		return ERR_PTR(error);
	}

	*bp = b;

	return b->data;
}

void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);

void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_READ, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);

void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_FRESH, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_new);

void dm_bufio_release(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(test_bit(B_READING, &b->state));
	BUG_ON(!b->hold_count);

	b->hold_count--;
	if (!b->hold_count) {
		wake_up(&c->free_buffer_wait);

		/*
		 * If there were errors on the buffer, and the buffer is not
		 * to be written, free the buffer. There is no point in
		 * caching an invalid buffer.
		 */
		if ((b->read_error || b->write_error) &&
		    !test_bit(B_WRITING, &b->state) &&
		    !test_bit(B_DIRTY, &b->state)) {
			__unlink_buffer(b);
			__free_buffer_wake(b);
		}
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_release);

void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	if (!test_and_set_bit(B_DIRTY, &b->state))
		__relink_lru(b, LIST_DIRTY);

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);

void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0);
	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);

/*
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush the hardware disk cache.
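 *
 * Returns the first asynchronous write error recorded for this client,
 * if any; otherwise the result of the final flush.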
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long buffers_processed = 0;
	struct dm_buffer *b, *tmp;

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0);

again:
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		int dropped_lock = 0;

		if (buffers_processed < c->n_buffers[LIST_DIRTY])
			buffers_processed++;

		BUG_ON(test_bit(B_READING, &b->state));

		if (test_bit(B_WRITING, &b->state)) {
			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
				dropped_lock = 1;
				b->hold_count++;
				dm_bufio_unlock(c);
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
				dm_bufio_lock(c);
				b->hold_count--;
			} else
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
		}

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state))
			__relink_lru(b, LIST_CLEAN);

		dm_bufio_cond_resched();

		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just processed is
		 * relinked to the clean list, so we won't loop scanning the
		 * same buffer again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means that
		 * someone is doing some writes simultaneously with us. In
		 * this case, stop dropping the lock and wait for the writes
		 * with the lock held.
		 */
		if (dropped_lock)
			goto again;
	}
	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	a = xchg(&c->async_write_error, 0);
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);

/*
 * Use dm-io to send an empty barrier to flush the device.
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_rw = REQ_FLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	BUG_ON(dm_bufio_in_request());

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);

/*
 * We first delete any other buffer that may be at that new location.
 *
 * Then, we write the buffer to the original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, relink the buffer
 * in the hash queue for the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but do not relink it, because that other user needs to have the
 * buffer at the same place.
 */
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
{
	struct dm_bufio_client *c = b->c;
	struct dm_buffer *new;

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);

retry:
	new = __find(c, new_block);
	if (new) {
		if (new->hold_count) {
			__wait_for_free_buffer(c);
			goto retry;
		}

		/*
		 * FIXME: Is there any point waiting for a write that's going
		 * to be overwritten in a bit?
		 */
		__make_buffer_clean(new);
		__unlink_buffer(new);
		__free_buffer_wake(new);
	}

	BUG_ON(!b->hold_count);
	BUG_ON(test_bit(B_READING, &b->state));

	__write_dirty_buffer(b);
	if (b->hold_count == 1) {
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		set_bit(B_DIRTY, &b->state);
		__unlink_buffer(b);
		__link_buffer(b, new_block, LIST_DIRTY);
	} else {
		sector_t old_block;
		wait_on_bit_lock(&b->state, B_WRITING,
				 do_io_schedule, TASK_UNINTERRUPTIBLE);
		/*
		 * Relink buffer to "new_block" so that write_callback
		 * sees "new_block" as the block number.
		 * After the write, link the buffer back to old_block.
		 * All this must be done under the bufio lock, so that the
		 * block number change isn't visible to other threads.
		 */
		old_block = b->block;
		__unlink_buffer(b);
		__link_buffer(b, new_block, b->list_mode);
		submit_io(b, WRITE, new_block, write_endio);
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		__unlink_buffer(b);
		__link_buffer(b, old_block, b->list_mode);
	}

	dm_bufio_unlock(c);
	dm_bufio_release(b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);

unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
{
	return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);

sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
{
	return i_size_read(c->bdev->bd_inode) >>
	       (SECTOR_SHIFT + c->sectors_per_block_bits);
}
EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);

sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);

void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);

void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);

struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);

static void drop_buffers(struct dm_bufio_client *c)
{
	struct dm_buffer *b;
	int i;

	BUG_ON(dm_bufio_in_request());

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	for (i = 0; i < LIST_SIZE; i++)
		list_for_each_entry(b, &c->lru[i], lru_list)
			DMERR("leaked buffer %llx, hold count %u, list %d",
			      (unsigned long long)b->block, b->hold_count, i);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(!list_empty(&c->lru[i]));

	dm_bufio_unlock(c);
}

/*
 * Test if the buffer is unused and too old, and if so, write it out (if
 * dirty) and free it.
 * And if __GFP_IO is not set, we must not do any I/O because we hold
 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
 * rerouted to a different bufio client.
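 *
 * Returns 0 if the buffer was freed, 1 if it was kept.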
 */
static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
				unsigned long max_jiffies)
{
	if (jiffies - b->last_accessed < max_jiffies)
		return 1;

	if (!(gfp & __GFP_IO)) {
		if (test_bit(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return 1;
	}

	if (b->hold_count)
		return 1;

	__make_buffer_clean(b);
	__unlink_buffer(b);
	__free_buffer_wake(b);

	return 0;
}

static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
		   struct shrink_control *sc)
{
	int l;
	struct dm_buffer *b, *tmp;

	for (l = 0; l < LIST_SIZE; l++) {
		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list)
			if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) &&
			    !--nr_to_scan)
				return;
		dm_bufio_cond_resched();
	}
}

static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
{
	struct dm_bufio_client *c =
	    container_of(shrinker, struct dm_bufio_client, shrinker);
	unsigned long r;
	unsigned long nr_to_scan = sc->nr_to_scan;

	if (sc->gfp_mask & __GFP_IO)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return !nr_to_scan ? 0 : -1;

	if (nr_to_scan)
		__scan(c, nr_to_scan, sc);

	r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	if (r > INT_MAX)
		r = INT_MAX;

	dm_bufio_unlock(c);

	return r;
}

/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
					       unsigned reserved_buffers, unsigned aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *))
{
	int r;
	struct dm_bufio_client *c;
	unsigned i;

	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
	       (block_size & (block_size - 1)));

	c = kmalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
	if (!c->cache_hash) {
		r = -ENOMEM;
		goto bad_hash;
	}

	c->bdev = bdev;
	c->block_size = block_size;
	c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
	c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
				  ffs(block_size) - 1 - PAGE_SHIFT : 0;
	c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
				   PAGE_SHIFT - (ffs(block_size) - 1) : 0);

	c->aux_size = aux_size;
	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	for (i = 0; i < LIST_SIZE; i++) {
		INIT_LIST_HEAD(&c->lru[i]);
		c->n_buffers[i] = 0;
	}

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		INIT_HLIST_HEAD(&c->cache_hash[i]);

	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	mutex_lock(&dm_bufio_clients_lock);
	if (c->blocks_per_page_bits) {
		if (!DM_BUFIO_CACHE_NAME(c)) {
			DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
			if (!DM_BUFIO_CACHE_NAME(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}

		if (!DM_BUFIO_CACHE(c)) {
			DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
							      c->block_size,
							      c->block_size, 0, NULL);
			if (!DM_BUFIO_CACHE(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}
	}
	mutex_unlock(&dm_bufio_clients_lock);

	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad_buffer;
		}
		__free_buffer_wake(b);
	}

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	c->shrinker.shrink = shrink;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	register_shrinker(&c->shrinker);

	return c;

bad_buffer:
bad_cache:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	vfree(c->cache_hash);
bad_hash:
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);

/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
	unsigned i;

	drop_buffers(c);

	unregister_shrinker(&c->shrinker);

	mutex_lock(&dm_bufio_clients_lock);

	list_del(&c->client_list);
	dm_bufio_client_count--;
	__cache_size_refresh();

	mutex_unlock(&dm_bufio_clients_lock);

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		BUG_ON(!hlist_empty(&c->cache_hash[i]));

	BUG_ON(c->need_reserved_buffers);

	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}

	for (i = 0; i < LIST_SIZE; i++)
		if (c->n_buffers[i])
			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(c->n_buffers[i]);

	dm_io_client_destroy(c->dm_io);
	vfree(c->cache_hash);
	kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);

static void cleanup_old_buffers(void)
{
	unsigned long max_age = dm_bufio_max_age;
	struct dm_bufio_client *c;

	barrier();

	if (max_age > ULONG_MAX / HZ)
		max_age = ULONG_MAX / HZ;

	mutex_lock(&dm_bufio_clients_lock);
	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
		if (!dm_bufio_trylock(c))
			continue;

		while (!list_empty(&c->lru[LIST_CLEAN])) {
			struct dm_buffer *b;
			b = list_entry(c->lru[LIST_CLEAN].prev,
				       struct dm_buffer, lru_list);
			if (__cleanup_old_buffer(b, 0, max_age * HZ))
				break;
			dm_bufio_cond_resched();
		}

		dm_bufio_unlock(c);
		dm_bufio_cond_resched();
	}
	mutex_unlock(&dm_bufio_clients_lock);
}

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_work;

static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}

/*----------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------*/

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes the memory limit.
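 *
 * For example, with 4 GiB of directly mapped memory and the default
 * DM_BUFIO_MEMORY_PERCENT of 2, the default cache size works out to
 * roughly 82 MiB (unless the DM_BUFIO_VMALLOC_PERCENT limit is lower).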
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);

	mem = (__u64)((totalram_pages - totalhigh_pages) *
		      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	/*
	 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
	 * in fs/proc/internal.h
	 */
	if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
		mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
#endif

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}

/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;
	int i;

	cancel_delayed_work_sync(&dm_bufio_work);
	destroy_workqueue(dm_bufio_wq);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
		struct kmem_cache *kc = dm_bufio_caches[i];

		if (kc)
			kmem_cache_destroy(kc);
	}

	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
		kfree(dm_bufio_cache_names[i]);

	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
		       __func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
		       __func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	if (bug)
		BUG();
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");