/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include "dm-bufio.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 * Start background writeback when the number of dirty buffers exceeds
 * DM_BUFIO_WRITEBACK_PERCENT of the buffer limit.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_PERCENT	75

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	10

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	60

/*
 * The number of bvec entries that are embedded directly in the buffer.
 * If the chunk size is larger, dm-io is used to do the I/O.
 */
#define DM_BUFIO_INLINE_VECS		16

/*
 * Buffer hash
 */
#define DM_BUFIO_HASH_BITS	20
#define DM_BUFIO_HASH(block) \
	((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
	 ((1 << DM_BUFIO_HASH_BITS) - 1))

/*
 * Don't try to use kmem_cache_alloc for blocks larger than this.
 * For explanation, see alloc_buffer_data below.
 */
#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT	(PAGE_SIZE >> 1)
#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT	(PAGE_SIZE << (MAX_ORDER - 1))

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*
 * Linking of buffers:
 * All buffers are linked to cache_hash with their hash_list field.
 *
 * Clean buffers that are not being written (B_WRITING not set)
 * are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 * Dirty and clean buffers that are being written are linked to
 * lru[LIST_DIRTY] with their lru_list field. When the write
 * finishes, the buffer cannot be relinked immediately (because we
 * are in an interrupt context and relinking requires process
 * context), so some clean-not-writing buffers can be held on
 * dirty_lru too. They are later added to lru in the process
 * context.
 */
struct dm_bufio_client {
	struct mutex lock;

	struct list_head lru[LIST_SIZE];
	unsigned long n_buffers[LIST_SIZE];

	struct block_device *bdev;
	unsigned block_size;
	unsigned char sectors_per_block_bits;
	unsigned char pages_per_block_bits;
	unsigned char blocks_per_page_bits;
	unsigned aux_size;
	void (*alloc_callback)(struct dm_buffer *);
	void (*write_callback)(struct dm_buffer *);

	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned need_reserved_buffers;

	struct hlist_head *cache_hash;
	wait_queue_head_t free_buffer_wait;

	int async_write_error;

	struct list_head client_list;
	struct shrinker shrinker;
};

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

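/*
 * Illustrative sketch (not part of the driver) of how the state bits above
 * are used: B_READING and B_WRITING are set while the corresponding I/O is
 * in flight and B_DIRTY marks data that still has to be written back.  The
 * endio routines clear the I/O bits and wake waiters, which block with the
 * wait_on_bit() helpers used throughout this file, e.g.
 *
 *	wait_on_bit(&b->state, B_WRITING, do_io_schedule,
 *		    TASK_UNINTERRUPTIBLE);
 *
 * (do_io_schedule(), read_endio() and write_endio() are defined below.)
 */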

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	struct hlist_node hash_list;
	struct list_head lru_list;
	sector_t block;
	void *data;
	enum data_mode data_mode;
	unsigned char list_mode;		/* LIST_* */
	unsigned hold_count;
	int read_error;
	int write_error;
	unsigned long state;
	unsigned long last_accessed;
	struct dm_bufio_client *c;
	struct bio bio;
	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
};

/*----------------------------------------------------------------*/

static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];

static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
{
	unsigned ret = c->blocks_per_page_bits - 1;

	BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));

	return ret;
}

#define DM_BUFIO_CACHE(c)	(dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c)	(dm_bufio_cache_names[dm_bufio_cache_index(c)])

#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static int dm_bufio_trylock(struct dm_bufio_client *c)
{
	return mutex_trylock(&c->lock);
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	mutex_unlock(&c->lock);
}

/*
 * FIXME Move to sched.h?
 */
#ifdef CONFIG_PREEMPT_VOLUNTARY
# define dm_bufio_cond_resched()		\
do {						\
	if (unlikely(need_resched()))		\
		_cond_resched();		\
} while (0)
#else
# define dm_bufio_cond_resched()	do { } while (0)
#endif

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time.  If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(param_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 */
static unsigned long dm_bufio_cache_size_per_client;

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

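/*
 * Worked example with illustrative numbers: if dm_bufio_cache_size_latch is
 * 64 MiB and there are 4 clients, dm_bufio_cache_size_per_client is 16 MiB.
 * __get_memory_limit() converts that to a buffer count by shifting by
 * (sectors_per_block_bits + SECTOR_SHIFT); with a 4 KiB block size this is
 * 16 MiB / 4 KiB = 4096 buffers per client, and background writeback starts
 * at 75 % of that, i.e. 3072 dirty buffers.
 */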

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch,
 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

/*----------------------------------------------------------------*/

static void adjust_total_allocated(enum data_mode data_mode, long diff)
{
	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	spin_lock(&param_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	spin_unlock(&param_spinlock);
}

/*
 * Change the number of clients and recalculate per-client limit.
 */
static void __cache_size_refresh(void)
{
	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
	BUG_ON(dm_bufio_client_count < 0);

	dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}

	dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
					 (dm_bufio_client_count ? : 1);
}

/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 * always fails (i.e. order >= MAX_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       enum data_mode *data_mode)
{
	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
	}

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
						c->pages_per_block_bits);
	}

	*data_mode = DATA_MODE_VMALLOC;
	return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
}

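/*
 * Illustrative decision table for the above, assuming 4 KiB pages and
 * MAX_ORDER = 11 (both are configuration dependent):
 *
 *	block_size 512..2048			-> DATA_MODE_SLAB
 *	block_size 4 KiB..4 MiB, __GFP_NORETRY	-> DATA_MODE_GET_FREE_PAGES
 *	anything else (e.g. the GFP_KERNEL
 *	reserve allocation, or blocks > 4 MiB)	-> DATA_MODE_VMALLOC
 */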

/*
 * Free buffer's data.
 */
static void free_buffer_data(struct dm_bufio_client *c,
			     void *data, enum data_mode data_mode)
{
	switch (data_mode) {
	case DATA_MODE_SLAB:
		kmem_cache_free(DM_BUFIO_CACHE(c), data);
		break;

	case DATA_MODE_GET_FREE_PAGES:
		free_pages((unsigned long)data, c->pages_per_block_bits);
		break;

	case DATA_MODE_VMALLOC:
		vfree(data);
		break;

	default:
		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
		       data_mode);
		BUG();
	}
}

/*
 * Allocate buffer and its data.
 */
static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
{
	struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
				      gfp_mask);

	if (!b)
		return NULL;

	b->c = c;

	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
	if (!b->data) {
		kfree(b);
		return NULL;
	}

	adjust_total_allocated(b->data_mode, (long)c->block_size);

	return b;
}

/*
 * Free buffer and its data.
 */
static void free_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	adjust_total_allocated(b->data_mode, -(long)c->block_size);

	free_buffer_data(c, b->data, b->data_mode);
	kfree(b);
}

/*
 * Link buffer to the hash list and clean or dirty queue.
 */
static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
{
	struct dm_bufio_client *c = b->c;

	c->n_buffers[dirty]++;
	b->block = block;
	b->list_mode = dirty;
	list_add(&b->lru_list, &c->lru[dirty]);
	hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
	b->last_accessed = jiffies;
}

/*
 * Unlink buffer from the hash list and dirty or clean queue.
 */
static void __unlink_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	hlist_del(&b->hash_list);
	list_del(&b->lru_list);
}

/*
 * Place the buffer at the head of the dirty or clean LRU queue.
 */
static void __relink_lru(struct dm_buffer *b, int dirty)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	c->n_buffers[dirty]++;
	b->list_mode = dirty;
	list_move(&b->lru_list, &c->lru[dirty]);
}

/*----------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * Bio interface is faster but it has some problems:
 *	the vector list is limited (increasing this limit increases
 *	memory consumption per buffer, so it is not viable);
 *
 *	the memory must be direct-mapped, not vmalloced;
 *
 *	the I/O driver can reject requests spuriously if it thinks that
 *	the requests are too big for the device or if they cross a
 *	controller-defined memory boundary.
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use the dm-io layer to do the
 * I/O.  The dm-io layer splits the I/O into multiple requests, avoiding
 * the above shortcomings.
 *--------------------------------------------------------------*/

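/*
 * Concretely (illustrative, assuming 4 KiB pages): with DM_BUFIO_INLINE_VECS
 * set to 16, a buffer of up to 64 KiB that was not vmalloc'ed is submitted
 * through the inline bio embedded in struct dm_buffer (with a fallback to
 * dm-io if the device rejects pages); a 128 KiB buffer, or any vmalloc'ed
 * buffer, goes through dm-io instead.  See submit_io() below.
 */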

/*
 * dm-io completion routine.  It just calls b->bio.bi_end_io, pretending
 * that the request was handled directly with bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
}

static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
		     bio_end_io_t *end_io)
{
	int r;
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.notify.fn = dmio_complete,
		.notify.context = b,
		.client = b->c->dm_io,
	};
	struct dm_io_region region = {
		.bdev = b->c->bdev,
		.sector = block << b->c->sectors_per_block_bits,
		.count = b->c->block_size >> SECTOR_SHIFT,
	};

	if (b->data_mode != DATA_MODE_VMALLOC) {
		io_req.mem.type = DM_IO_KMEM;
		io_req.mem.ptr.addr = b->data;
	} else {
		io_req.mem.type = DM_IO_VMA;
		io_req.mem.ptr.vma = b->data;
	}

	b->bio.bi_end_io = end_io;

	r = dm_io(&io_req, 1, &region, NULL);
	if (r)
		end_io(&b->bio, r);
}

static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
			   bio_end_io_t *end_io)
{
	char *ptr;
	int len;

	bio_init(&b->bio);
	b->bio.bi_io_vec = b->bio_vec;
	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
	b->bio.bi_sector = block << b->c->sectors_per_block_bits;
	b->bio.bi_bdev = b->c->bdev;
	b->bio.bi_end_io = end_io;

	/*
	 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
	 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
	 */
	ptr = b->data;
	len = b->c->block_size;

	if (len >= PAGE_SIZE)
		BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
	else
		BUG_ON((unsigned long)ptr & (len - 1));

	do {
		if (!bio_add_page(&b->bio, virt_to_page(ptr),
				  len < PAGE_SIZE ? len : PAGE_SIZE,
				  virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
			BUG_ON(b->c->block_size <= PAGE_SIZE);
			use_dmio(b, rw, block, end_io);
			return;
		}

		len -= PAGE_SIZE;
		ptr += PAGE_SIZE;
	} while (len > 0);

	submit_bio(rw, &b->bio);
}

static void submit_io(struct dm_buffer *b, int rw, sector_t block,
		      bio_end_io_t *end_io)
{
	if (rw == WRITE && b->c->write_callback)
		b->c->write_callback(b);

	if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
	    b->data_mode != DATA_MODE_VMALLOC)
		use_inline_bio(b, rw, block, end_io);
	else
		use_dmio(b, rw, block, end_io);
}

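/*
 * Worked example of the block-to-sector arithmetic used above (illustrative
 * numbers): with a 4 KiB block size, sectors_per_block_bits is 3, so block
 * 100 starts at sector 100 << 3 = 800 and the I/O covers
 * block_size >> SECTOR_SHIFT = 8 sectors.
 */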

/*----------------------------------------------------------------
 * Writing dirty buffers
 *--------------------------------------------------------------*/

/*
 * The endio routine for write.
 *
 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 * it.
 */
static void write_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->write_error = error;
	if (unlikely(error)) {
		struct dm_bufio_client *c = b->c;
		(void)cmpxchg(&c->async_write_error, 0, error);
	}

	BUG_ON(!test_bit(B_WRITING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_WRITING);
}

/*
 * This function is called when wait_on_bit is actually waiting.
 */
static int do_io_schedule(void *word)
{
	io_schedule();

	return 0;
}

/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is some previous write going on, wait for it to finish (we
 *   can't have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it. We set B_WRITING indicating
 *   that there is a write in progress.
 */
static void __write_dirty_buffer(struct dm_buffer *b)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	wait_on_bit_lock(&b->state, B_WRITING,
			 do_io_schedule, TASK_UNINTERRUPTIBLE);

	submit_io(b, WRITE, b->block, write_endio);
}

/*
 * Wait until any activity on the buffer finishes.  Possibly write the
 * buffer if it is dirty.  When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(b->hold_count);

	if (!b->state)	/* fast case */
		return;

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b);
	wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
}

/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.
 */
static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
		BUG_ON(test_bit(B_WRITING, &b->state));
		BUG_ON(test_bit(B_DIRTY, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	return NULL;
}

/*
 * Wait until some other threads free some buffer or release hold count on
 * some buffer.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&c->free_buffer_wait, &wait);
	set_task_state(current, TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	io_schedule();

	set_task_state(current, TASK_RUNNING);
	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}

enum new_flag {
	NF_FRESH = 0,
	NF_READ = 1,
	NF_GET = 2,
	NF_PREFETCH = 3
};

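/*
 * Summary of how the flags above are used by the entry points defined later
 * in this file: dm_bufio_new() passes NF_FRESH (the caller overwrites the
 * data, so nothing is read), dm_bufio_read() passes NF_READ, dm_bufio_get()
 * passes NF_GET (never allocates a new buffer and never waits for a read in
 * progress) and dm_bufio_prefetch() passes NF_PREFETCH (starts reads but
 * gives up instead of waiting for a free buffer).
 */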

/*
 * Allocate a new buffer. If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in case all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOIO: don't recurse into the I/O layer
	 *	__GFP_NORETRY: don't retry and rather return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
	while (1) {
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		if (nf == NF_PREFETCH)
			return NULL;

		if (!list_empty(&c->reserved_buffers)) {
			b = list_entry(c->reserved_buffers.next,
				       struct dm_buffer, lru_list);
			list_del(&b->lru_list);
			c->need_reserved_buffers++;

			return b;
		}

		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		__wait_for_free_buffer(c);
	}
}

static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);

	if (!b)
		return NULL;

	if (c->alloc_callback)
		c->alloc_callback(b);

	return b;
}

/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
static void __free_buffer_wake(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	if (!c->need_reserved_buffers)
		free_buffer(b);
	else {
		list_add(&b->lru_list, &c->reserved_buffers);
		c->need_reserved_buffers--;
	}

	wake_up(&c->free_buffer_wait);
}

static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
{
	struct dm_buffer *b, *tmp;

	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state)) {
			__relink_lru(b, LIST_CLEAN);
			continue;
		}

		if (no_wait && test_bit(B_WRITING, &b->state))
			return;

		__write_dirty_buffer(b);
		dm_bufio_cond_resched();
	}
}

/*
 * Get writeback threshold and buffer limit for a given client.
 */
static void __get_memory_limit(struct dm_bufio_client *c,
			       unsigned long *threshold_buffers,
			       unsigned long *limit_buffers)
{
	unsigned long buffers;

	if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
		mutex_lock(&dm_bufio_clients_lock);
		__cache_size_refresh();
		mutex_unlock(&dm_bufio_clients_lock);
	}

	buffers = dm_bufio_cache_size_per_client >>
		  (c->sectors_per_block_bits + SECTOR_SHIFT);

	if (buffers < DM_BUFIO_MIN_BUFFERS)
		buffers = DM_BUFIO_MIN_BUFFERS;

	*limit_buffers = buffers;
	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
}

/*
 * Check if we're over the watermarks.
 * If we are over limit_buffers, free unheld buffers (writing them out first
 * if they are dirty) until we get back under the limit.
 * If the number of dirty buffers exceeds threshold_buffers, start
 * asynchronous writeback.
 */
static void __check_watermark(struct dm_bufio_client *c)
{
	unsigned long threshold_buffers, limit_buffers;

	__get_memory_limit(c, &threshold_buffers, &limit_buffers);

	while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
	       limit_buffers) {

		struct dm_buffer *b = __get_unclaimed_buffer(c);

		if (!b)
			return;

		__free_buffer_wake(b);
		dm_bufio_cond_resched();
	}

	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
		__write_dirty_buffers_async(c, 1);
}

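/*
 * Worked example (illustrative numbers): with a per-client limit of 4096
 * buffers the writeback threshold is 3072 (75 %).  A client that has grown
 * to 5000 buffers first has unheld buffers reclaimed until it is back at
 * 4096 (buffers that are still held are left alone), and if more than 3072
 * of its buffers are dirty, asynchronous writeback is started.
 */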

/*
 * Find a buffer in the hash.
 */
static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;

	hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
			     hash_list) {
		dm_bufio_cond_resched();
		if (b->block == block)
			return b;
	}

	return NULL;
}

/*----------------------------------------------------------------
 * Getting a buffer
 *--------------------------------------------------------------*/

static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
				     enum new_flag nf, int *need_submit)
{
	struct dm_buffer *b, *new_b = NULL;

	*need_submit = 0;

	b = __find(c, block);
	if (b)
		goto found_buffer;

	if (nf == NF_GET)
		return NULL;

	new_b = __alloc_buffer_wait(c, nf);
	if (!new_b)
		return NULL;

	/*
	 * We've had a period where the mutex was unlocked, so need to
	 * recheck the hash table.
	 */
	b = __find(c, block);
	if (b) {
		__free_buffer_wake(new_b);
		goto found_buffer;
	}

	__check_watermark(c);

	b = new_b;
	b->hold_count = 1;
	b->read_error = 0;
	b->write_error = 0;
	__link_buffer(b, block, LIST_CLEAN);

	if (nf == NF_FRESH) {
		b->state = 0;
		return b;
	}

	b->state = 1 << B_READING;
	*need_submit = 1;

	return b;

found_buffer:
	if (nf == NF_PREFETCH)
		return NULL;
	/*
	 * Note: it is essential that we don't wait for the buffer to be
	 * read if dm_bufio_get function is used. Both dm_bufio_get and
	 * dm_bufio_prefetch can be used in the driver request routine.
	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
	 * the same buffer, it would deadlock if we waited.
	 */
	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
		return NULL;

	b->hold_count++;
	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
		     test_bit(B_WRITING, &b->state));
	return b;
}

/*
 * The endio routine for reading: set the error, clear the bit and wake up
 * anyone waiting on the buffer.
 */
static void read_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->read_error = error;

	BUG_ON(!test_bit(B_READING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_READING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_READING);
}

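/*
 * Illustrative sketch of the read path from a hypothetical client of this
 * API (names are made up; error handling trimmed):
 *
 *	struct dm_buffer *b;
 *	void *data;
 *
 *	data = dm_bufio_read(c, block, &b);
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	... use the block_size bytes at "data" ...
 *	dm_bufio_release(b);
 *
 * dm_bufio_read() blocks until the buffer is read; dm_bufio_get() is the
 * non-blocking variant that only returns data already in the cache.
 */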

/*
 * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
 * functions is similar except that dm_bufio_new doesn't read the
 * buffer from the disk (assuming that the caller overwrites all the data
 * and uses dm_bufio_mark_buffer_dirty to write new data back).
 */
static void *new_read(struct dm_bufio_client *c, sector_t block,
		      enum new_flag nf, struct dm_buffer **bp)
{
	int need_submit;
	struct dm_buffer *b;

	dm_bufio_lock(c);
	b = __bufio_new(c, block, nf, &need_submit);
	dm_bufio_unlock(c);

	if (!b)
		return b;

	if (need_submit)
		submit_io(b, READ, b->block, read_endio);

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);

	if (b->read_error) {
		int error = b->read_error;

		dm_bufio_release(b);

		return ERR_PTR(error);
	}

	*bp = b;

	return b->data;
}

void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);

void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_READ, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);

void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_FRESH, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_new);

void dm_bufio_prefetch(struct dm_bufio_client *c,
		       sector_t block, unsigned n_blocks)
{
	struct blk_plug plug;

	BUG_ON(dm_bufio_in_request());

	blk_start_plug(&plug);
	dm_bufio_lock(c);

	for (; n_blocks--; block++) {
		int need_submit;
		struct dm_buffer *b;
		b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
		if (unlikely(b != NULL)) {
			dm_bufio_unlock(c);

			if (need_submit)
				submit_io(b, READ, b->block, read_endio);
			dm_bufio_release(b);

			dm_bufio_cond_resched();

			if (!n_blocks)
				goto flush_plug;
			dm_bufio_lock(c);
		}

	}

	dm_bufio_unlock(c);

flush_plug:
	blk_finish_plug(&plug);
}
EXPORT_SYMBOL_GPL(dm_bufio_prefetch);

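/*
 * Illustrative sketch (hypothetical caller): a target that knows it will
 * soon read a run of metadata blocks can overlap the I/O with other work:
 *
 *	dm_bufio_prefetch(c, first_block, 8);
 *	... do something else while the reads are in flight ...
 *	data = dm_bufio_read(c, first_block, &b);   (likely already cached)
 *
 * Prefetch never waits for buffer allocation or for the reads to complete.
 */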

void dm_bufio_release(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(!b->hold_count);

	b->hold_count--;
	if (!b->hold_count) {
		wake_up(&c->free_buffer_wait);

		/*
		 * If there were errors on the buffer, and the buffer is not
		 * to be written, free the buffer. There is no point in
		 * caching an invalid buffer.
		 */
		if ((b->read_error || b->write_error) &&
		    !test_bit(B_READING, &b->state) &&
		    !test_bit(B_WRITING, &b->state) &&
		    !test_bit(B_DIRTY, &b->state)) {
			__unlink_buffer(b);
			__free_buffer_wake(b);
		}
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_release);

void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(test_bit(B_READING, &b->state));

	if (!test_and_set_bit(B_DIRTY, &b->state))
		__relink_lru(b, LIST_DIRTY);

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);

void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0);
	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);

/*
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush hardware disk cache.
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long buffers_processed = 0;
	struct dm_buffer *b, *tmp;

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0);

again:
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		int dropped_lock = 0;

		if (buffers_processed < c->n_buffers[LIST_DIRTY])
			buffers_processed++;

		BUG_ON(test_bit(B_READING, &b->state));

		if (test_bit(B_WRITING, &b->state)) {
			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
				dropped_lock = 1;
				b->hold_count++;
				dm_bufio_unlock(c);
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
				dm_bufio_lock(c);
				b->hold_count--;
			} else
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
		}

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state))
			__relink_lru(b, LIST_CLEAN);

		dm_bufio_cond_resched();

		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just processed is
		 * relinked to the clean list, so we won't loop scanning the
		 * same buffer again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means that
		 * someone is doing some writes simultaneously with us.  In
		 * this case, stop, dropping the lock.
		 */
		if (dropped_lock)
			goto again;
	}
	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	a = xchg(&c->async_write_error, 0);
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);

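/*
 * Illustrative commit sequence for a hypothetical client (sketch only):
 *
 *	dm_bufio_mark_buffer_dirty(b);
 *	dm_bufio_release(b);
 *	r = dm_bufio_write_dirty_buffers(c);
 *	if (r)
 *		... handle the write or flush error ...
 *
 * dm_bufio_write_dirty_buffers() returns any asynchronous write error
 * recorded since the last call, or else the result of the final flush.
 */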

/*
 * Use dm-io to send an empty barrier to flush the device.
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_rw = WRITE_FLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	BUG_ON(dm_bufio_in_request());

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);

/*
 * We first delete any other buffer that may be at that new location.
 *
 * Then, we write the buffer to the original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, relink the buffer
 * in the hash queue for the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but do not relink it, because that other user needs to have the
 * buffer at the same place.
 */
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
{
	struct dm_bufio_client *c = b->c;
	struct dm_buffer *new;

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);

retry:
	new = __find(c, new_block);
	if (new) {
		if (new->hold_count) {
			__wait_for_free_buffer(c);
			goto retry;
		}

		/*
		 * FIXME: Is there any point waiting for a write that's going
		 * to be overwritten in a bit?
		 */
		__make_buffer_clean(new);
		__unlink_buffer(new);
		__free_buffer_wake(new);
	}

	BUG_ON(!b->hold_count);
	BUG_ON(test_bit(B_READING, &b->state));

	__write_dirty_buffer(b);
	if (b->hold_count == 1) {
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		set_bit(B_DIRTY, &b->state);
		__unlink_buffer(b);
		__link_buffer(b, new_block, LIST_DIRTY);
	} else {
		sector_t old_block;
		wait_on_bit_lock(&b->state, B_WRITING,
				 do_io_schedule, TASK_UNINTERRUPTIBLE);
		/*
		 * Relink buffer to "new_block" so that write_callback
		 * sees "new_block" as a block number.
		 * After the write, link the buffer back to old_block.
		 * All this must be done under the bufio lock, so that the
		 * block number change isn't visible to other threads.
		 */
		old_block = b->block;
		__unlink_buffer(b);
		__link_buffer(b, new_block, b->list_mode);
		submit_io(b, WRITE, new_block, write_endio);
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		__unlink_buffer(b);
		__link_buffer(b, old_block, b->list_mode);
	}

	dm_bufio_unlock(c);
	dm_bufio_release(b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);

unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
{
	return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);

sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
{
	return i_size_read(c->bdev->bd_inode) >>
	       (SECTOR_SHIFT + c->sectors_per_block_bits);
}
EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);

sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);

void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);

void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);

struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);

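/*
 * Illustrative note on the accessors above (hypothetical client code): if
 * the client was created with a non-zero aux_size, that many bytes are
 * allocated directly behind each struct dm_buffer, and
 * dm_bufio_get_aux_data() returns a pointer to them, e.g.
 *
 *	struct my_aux *aux = dm_bufio_get_aux_data(b);   ("my_aux" is made up)
 *
 * The space is only as aligned as the end of struct dm_buffer and it is not
 * zeroed by dm-bufio.
 */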

static void drop_buffers(struct dm_bufio_client *c)
{
	struct dm_buffer *b;
	int i;

	BUG_ON(dm_bufio_in_request());

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	for (i = 0; i < LIST_SIZE; i++)
		list_for_each_entry(b, &c->lru[i], lru_list)
			DMERR("leaked buffer %llx, hold count %u, list %d",
			      (unsigned long long)b->block, b->hold_count, i);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(!list_empty(&c->lru[i]));

	dm_bufio_unlock(c);
}

/*
 * Test if the buffer is unused and too old, and if so, clean it up and free
 * it.  If __GFP_IO is not set, we must not do any I/O because we hold
 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted
 * to a different bufio client.
 */
static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
				unsigned long max_jiffies)
{
	if (jiffies - b->last_accessed < max_jiffies)
		return 1;

	if (!(gfp & __GFP_IO)) {
		if (test_bit(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return 1;
	}

	if (b->hold_count)
		return 1;

	__make_buffer_clean(b);
	__unlink_buffer(b);
	__free_buffer_wake(b);

	return 0;
}

static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
		   struct shrink_control *sc)
{
	int l;
	struct dm_buffer *b, *tmp;

	for (l = 0; l < LIST_SIZE; l++) {
		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list)
			if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) &&
			    !--nr_to_scan)
				return;
		dm_bufio_cond_resched();
	}
}

static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
{
	struct dm_bufio_client *c =
	    container_of(shrinker, struct dm_bufio_client, shrinker);
	unsigned long r;
	unsigned long nr_to_scan = sc->nr_to_scan;

	if (sc->gfp_mask & __GFP_IO)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return !nr_to_scan ? 0 : -1;

	if (nr_to_scan)
		__scan(c, nr_to_scan, sc);

	r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	if (r > INT_MAX)
		r = INT_MAX;

	dm_bufio_unlock(c);

	return r;
}

/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
					       unsigned reserved_buffers, unsigned aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *))
{
	int r;
	struct dm_bufio_client *c;
	unsigned i;

	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
	       (block_size & (block_size - 1)));

	c = kmalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
	if (!c->cache_hash) {
		r = -ENOMEM;
		goto bad_hash;
	}

	c->bdev = bdev;
	c->block_size = block_size;
	c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
	c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
				  ffs(block_size) - 1 - PAGE_SHIFT : 0;
	c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
				  PAGE_SHIFT - (ffs(block_size) - 1) : 0);

	c->aux_size = aux_size;
	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	for (i = 0; i < LIST_SIZE; i++) {
		INIT_LIST_HEAD(&c->lru[i]);
		c->n_buffers[i] = 0;
	}

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		INIT_HLIST_HEAD(&c->cache_hash[i]);

	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	mutex_lock(&dm_bufio_clients_lock);
	if (c->blocks_per_page_bits) {
		if (!DM_BUFIO_CACHE_NAME(c)) {
			DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
			if (!DM_BUFIO_CACHE_NAME(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}

		if (!DM_BUFIO_CACHE(c)) {
			DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
							      c->block_size,
							      c->block_size, 0, NULL);
			if (!DM_BUFIO_CACHE(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}
	}
	mutex_unlock(&dm_bufio_clients_lock);

	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad_buffer;
		}
		__free_buffer_wake(b);
	}

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	c->shrinker.shrink = shrink;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	register_shrinker(&c->shrinker);

	return c;

bad_buffer:
bad_cache:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	vfree(c->cache_hash);
bad_hash:
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);

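/*
 * Illustrative client lifecycle (hypothetical target, sketch only):
 *
 *	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *	... dm_bufio_read()/dm_bufio_release()/dm_bufio_write_dirty_buffers() ...
 *	dm_bufio_client_destroy(c);
 *
 * block_size must be a power of two and at least one sector; passing 1 as
 * reserved_buffers keeps a single buffer in reserve so that allocation can
 * never fail completely, and aux_size 0 requests no per-buffer client data.
 */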

/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
	unsigned i;

	drop_buffers(c);

	unregister_shrinker(&c->shrinker);

	mutex_lock(&dm_bufio_clients_lock);

	list_del(&c->client_list);
	dm_bufio_client_count--;
	__cache_size_refresh();

	mutex_unlock(&dm_bufio_clients_lock);

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		BUG_ON(!hlist_empty(&c->cache_hash[i]));

	BUG_ON(c->need_reserved_buffers);

	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}

	for (i = 0; i < LIST_SIZE; i++)
		if (c->n_buffers[i])
			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(c->n_buffers[i]);

	dm_io_client_destroy(c->dm_io);
	vfree(c->cache_hash);
	kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);

static void cleanup_old_buffers(void)
{
	unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
	struct dm_bufio_client *c;

	if (max_age > ULONG_MAX / HZ)
		max_age = ULONG_MAX / HZ;

	mutex_lock(&dm_bufio_clients_lock);
	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
		if (!dm_bufio_trylock(c))
			continue;

		while (!list_empty(&c->lru[LIST_CLEAN])) {
			struct dm_buffer *b;
			b = list_entry(c->lru[LIST_CLEAN].prev,
				       struct dm_buffer, lru_list);
			if (__cleanup_old_buffer(b, 0, max_age * HZ))
				break;
			dm_bufio_cond_resched();
		}

		dm_bufio_unlock(c);
		dm_bufio_cond_resched();
	}
	mutex_unlock(&dm_bufio_clients_lock);
}

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_work;

static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}

/*----------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------*/

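/*
 * Worked example for the memory-limit initialization below (illustrative
 * numbers): on a 64-bit machine with 8 GiB of low memory,
 * DM_BUFIO_MEMORY_PERCENT = 2 yields a default cache size of roughly
 * 160 MiB.  On a 32-bit machine with a 128 MiB vmalloc area, the
 * DM_BUFIO_VMALLOC_PERCENT = 25 clamp would instead limit the default to
 * 32 MiB, since the lower of the two limits wins.
 */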

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes the memory limit.
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);

	mem = (__u64)((totalram_pages - totalhigh_pages) *
		      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	/*
	 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
	 * in fs/proc/internal.h
	 */
	if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
		mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
#endif

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}

/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;
	int i;

	cancel_delayed_work_sync(&dm_bufio_work);
	destroy_workqueue(dm_bufio_wq);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
		struct kmem_cache *kc = dm_bufio_caches[i];

		if (kc)
			kmem_cache_destroy(kc);
	}

	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
		kfree(dm_bufio_cache_names[i]);

	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
		       __func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
		       __func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	if (bug)
		BUG();
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

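/*
 * Illustrative tuning sketch (not part of the driver): the parameters below
 * are readable, and some writable, at runtime via sysfs, assuming the
 * standard module-parameter layout, e.g.
 *
 *	echo 67108864 > /sys/module/dm_bufio/parameters/max_cache_size_bytes
 *	echo 30 > /sys/module/dm_bufio/parameters/max_age_seconds
 *
 * would cap the cache at 64 MiB and free clean buffers idle for more than
 * 30 seconds.
 */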

module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");