/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include "dm-bufio.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/jiffies.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/stacktrace.h>

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 *	dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_PERCENT	75
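
/*
 * For a sense of scale (the exact numbers depend on the machine and are
 * illustrative only): with the defaults above and 4 GiB of directly
 * mapped memory, dm_bufio_init() sets the default total cache size to
 * about 2% of that, i.e. roughly 80 MiB, possibly clamped further to 25%
 * of the vmalloc area.  That total is divided evenly among clients, and
 * a client starts background writeback once about 75% of its share is
 * dirty.
 */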

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	30

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	300

/*
 * The number of bytes of cached data to keep around.
 */
#define DM_BUFIO_DEFAULT_RETAIN_BYTES	(256 * 1024)

/*
 * The number of bvec entries that are embedded directly in the buffer.
 * If the chunk size is larger, dm-io is used to do the io.
 */
#define DM_BUFIO_INLINE_VECS		16

/*
 * Don't try to use kmem_cache_alloc for blocks larger than this.
 * For explanation, see alloc_buffer_data below.
 */
#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT	(PAGE_SIZE >> 1)
#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT	(PAGE_SIZE << (MAX_ORDER - 1))

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*
 * Linking of buffers:
 *	All buffers are linked to buffer_tree with their node field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field. When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	dirty_lru too. They are later added to lru in the process
 *	context.
 */
struct dm_bufio_client {
	struct mutex lock;

	struct list_head lru[LIST_SIZE];
	unsigned long n_buffers[LIST_SIZE];

	struct block_device *bdev;
	unsigned block_size;
	unsigned char sectors_per_block_bits;
	unsigned char pages_per_block_bits;
	unsigned char blocks_per_page_bits;
	unsigned aux_size;
	void (*alloc_callback)(struct dm_buffer *);
	void (*write_callback)(struct dm_buffer *);

	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned need_reserved_buffers;

	unsigned minimum_buffers;

	struct rb_root buffer_tree;
	wait_queue_head_t free_buffer_wait;

	int async_write_error;

	struct list_head client_list;
	struct shrinker shrinker;
};

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	struct rb_node node;
	struct list_head lru_list;
	sector_t block;
	void *data;
	enum data_mode data_mode;
	unsigned char list_mode;		/* LIST_* */
	unsigned hold_count;
	int read_error;
	int write_error;
	unsigned long state;
	unsigned long last_accessed;
	struct dm_bufio_client *c;
	struct list_head write_list;
	struct bio bio;
	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
#define MAX_STACK 10
	struct stack_trace stack_trace;
	unsigned long stack_entries[MAX_STACK];
#endif
};
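
/*
 * Note that alloc_buffer() allocates c->aux_size extra bytes immediately
 * after this structure, and dm_bufio_get_aux_data() returns exactly that
 * area (i.e. "b + 1"), which is why no pointer to the auxiliary data is
 * kept here.
 */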

/*----------------------------------------------------------------*/

static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];

static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
{
	unsigned ret = c->blocks_per_page_bits - 1;

	BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));

	return ret;
}

#define DM_BUFIO_CACHE(c)	(dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c)	(dm_bufio_cache_names[dm_bufio_cache_index(c)])

#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static int dm_bufio_trylock(struct dm_bufio_client *c)
{
	return mutex_trylock(&c->lock);
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	mutex_unlock(&c->lock);
}

/*
 * FIXME Move to sched.h?
 */
#ifdef CONFIG_PREEMPT_VOLUNTARY
#  define dm_bufio_cond_resched()		\
do {						\
	if (unlikely(need_resched()))		\
		_cond_resched();		\
} while (0)
#else
#  define dm_bufio_cond_resched()		do { } while (0)
#endif

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time. If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(param_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 */
static unsigned long dm_bufio_cache_size_per_client;

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch,
 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
static void buffer_record_stack(struct dm_buffer *b)
{
	b->stack_trace.nr_entries = 0;
	b->stack_trace.max_entries = MAX_STACK;
	b->stack_trace.entries = b->stack_entries;
	b->stack_trace.skip = 2;
	save_stack_trace(&b->stack_trace);
}
#endif

/*----------------------------------------------------------------
 * A red/black tree acts as an index for all the buffers.
 *--------------------------------------------------------------*/
static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
{
	struct rb_node *n = c->buffer_tree.rb_node;
	struct dm_buffer *b;

	while (n) {
		b = container_of(n, struct dm_buffer, node);

		if (b->block == block)
			return b;

		n = (b->block < block) ? n->rb_right : n->rb_left;
	}

	return NULL;
}

static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
{
	struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
	struct dm_buffer *found;

	while (*new) {
		found = container_of(*new, struct dm_buffer, node);

		if (found->block == b->block) {
			BUG_ON(found != b);
			return;
		}

		parent = *new;
		new = (found->block < b->block) ?
			&((*new)->rb_right) : &((*new)->rb_left);
	}

	rb_link_node(&b->node, parent, new);
	rb_insert_color(&b->node, &c->buffer_tree);
}

static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
{
	rb_erase(&b->node, &c->buffer_tree);
}

/*----------------------------------------------------------------*/

static void adjust_total_allocated(enum data_mode data_mode, long diff)
{
	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	spin_lock(&param_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	spin_unlock(&param_spinlock);
}

/*
 * Change the number of clients and recalculate per-client limit.
 */
static void __cache_size_refresh(void)
{
	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
	BUG_ON(dm_bufio_client_count < 0);

	dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}

	dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
					 (dm_bufio_client_count ? : 1);
}

/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 * always fails (i.e. order >= MAX_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       enum data_mode *data_mode)
{
	unsigned noio_flag;
	void *ptr;

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
	}

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
						c->pages_per_block_bits);
	}

	*data_mode = DATA_MODE_VMALLOC;

	/*
	 * __vmalloc allocates the data pages and auxiliary structures with
	 * gfp_flags that were specified, but pagetables are always allocated
	 * with GFP_KERNEL, no matter what was specified as gfp_mask.
	 *
	 * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
	 * all allocations done by this process (including pagetables) are done
	 * as if GFP_NOIO was specified.
	 */

	if (gfp_mask & __GFP_NORETRY)
		noio_flag = memalloc_noio_save();

	ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);

	if (gfp_mask & __GFP_NORETRY)
		memalloc_noio_restore(noio_flag);

	return ptr;
}
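
/*
 * To make the choice above concrete, assuming 4 KiB pages and the usual
 * MAX_ORDER of 11 (both machine dependent): blocks of up to 2 KiB come
 * from a per-block-size kmem_cache; blocks of up to 4 MiB are tried with
 * __get_free_pages() when the caller passed __GFP_NORETRY (i.e. the
 * allocation is allowed to fail); larger blocks, and any large allocation
 * that must not fail, use __vmalloc().
 */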

/*
 * Free buffer's data.
 */
static void free_buffer_data(struct dm_bufio_client *c,
			     void *data, enum data_mode data_mode)
{
	switch (data_mode) {
	case DATA_MODE_SLAB:
		kmem_cache_free(DM_BUFIO_CACHE(c), data);
		break;

	case DATA_MODE_GET_FREE_PAGES:
		free_pages((unsigned long)data, c->pages_per_block_bits);
		break;

	case DATA_MODE_VMALLOC:
		vfree(data);
		break;

	default:
		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
		       data_mode);
		BUG();
	}
}

/*
 * Allocate buffer and its data.
 */
static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
{
	struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
				      gfp_mask);

	if (!b)
		return NULL;

	b->c = c;

	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
	if (!b->data) {
		kfree(b);
		return NULL;
	}

	adjust_total_allocated(b->data_mode, (long)c->block_size);

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	memset(&b->stack_trace, 0, sizeof(b->stack_trace));
#endif
	return b;
}

/*
 * Free buffer and its data.
 */
static void free_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	adjust_total_allocated(b->data_mode, -(long)c->block_size);

	free_buffer_data(c, b->data, b->data_mode);
	kfree(b);
}

/*
 * Link buffer to the buffer tree and clean or dirty queue.
 */
static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
{
	struct dm_bufio_client *c = b->c;

	c->n_buffers[dirty]++;
	b->block = block;
	b->list_mode = dirty;
	list_add(&b->lru_list, &c->lru[dirty]);
	__insert(b->c, b);
	b->last_accessed = jiffies;
}

/*
 * Unlink buffer from the buffer tree and dirty or clean queue.
 */
static void __unlink_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	__remove(b->c, b);
	list_del(&b->lru_list);
}

/*
 * Place the buffer at the head of the dirty or clean LRU queue.
 */
static void __relink_lru(struct dm_buffer *b, int dirty)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	c->n_buffers[dirty]++;
	b->list_mode = dirty;
	list_move(&b->lru_list, &c->lru[dirty]);
	b->last_accessed = jiffies;
}

/*----------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * Bio interface is faster but it has some problems:
 *	the vector list is limited (increasing this limit increases
 *	memory-consumption per buffer, so it is not viable);
 *
 *	the memory must be direct-mapped, not vmalloced;
 *
 *	the I/O driver can reject requests spuriously if it thinks that
 *	the requests are too big for the device or if they cross a
 *	controller-defined memory boundary.
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use dm-io layer to do the I/O.
 * The dm-io layer splits the I/O into multiple requests, avoiding the above
 * shortcomings.
 *--------------------------------------------------------------*/
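
/*
 * With 4 KiB pages (again machine dependent) the cut-off used by
 * submit_io() below is DM_BUFIO_INLINE_VECS * PAGE_SIZE = 64 KiB:
 * buffers up to that size that were not vmalloc'ed take the inline bio
 * path, everything else goes through dm-io.
 */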

/*
 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
 * that the request was handled directly with bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->bio.bi_error = error ? -EIO : 0;
	b->bio.bi_end_io(&b->bio);
}

static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
		     bio_end_io_t *end_io)
{
	int r;
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.notify.fn = dmio_complete,
		.notify.context = b,
		.client = b->c->dm_io,
	};
	struct dm_io_region region = {
		.bdev = b->c->bdev,
		.sector = block << b->c->sectors_per_block_bits,
		.count = b->c->block_size >> SECTOR_SHIFT,
	};

	if (b->data_mode != DATA_MODE_VMALLOC) {
		io_req.mem.type = DM_IO_KMEM;
		io_req.mem.ptr.addr = b->data;
	} else {
		io_req.mem.type = DM_IO_VMA;
		io_req.mem.ptr.vma = b->data;
	}

	b->bio.bi_end_io = end_io;

	r = dm_io(&io_req, 1, &region, NULL);
	if (r) {
		b->bio.bi_error = r;
		end_io(&b->bio);
	}
}

static void inline_endio(struct bio *bio)
{
	bio_end_io_t *end_fn = bio->bi_private;
	int error = bio->bi_error;

	/*
	 * Reset the bio to free any attached resources
	 * (e.g. bio integrity profiles).
	 */
	bio_reset(bio);

	bio->bi_error = error;
	end_fn(bio);
}

static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
			   bio_end_io_t *end_io)
{
	char *ptr;
	int len;

	bio_init(&b->bio);
	b->bio.bi_io_vec = b->bio_vec;
	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
	b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
	b->bio.bi_bdev = b->c->bdev;
	b->bio.bi_end_io = inline_endio;
	/*
	 * Use of .bi_private isn't a problem here because
	 * the dm_buffer's inline bio is local to bufio.
	 */
	b->bio.bi_private = end_io;

	/*
	 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
	 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
	 */
	ptr = b->data;
	len = b->c->block_size;

	if (len >= PAGE_SIZE)
		BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
	else
		BUG_ON((unsigned long)ptr & (len - 1));

	do {
		if (!bio_add_page(&b->bio, virt_to_page(ptr),
				  len < PAGE_SIZE ? len : PAGE_SIZE,
				  offset_in_page(ptr))) {
			BUG_ON(b->c->block_size <= PAGE_SIZE);
			use_dmio(b, rw, block, end_io);
			return;
		}

		len -= PAGE_SIZE;
		ptr += PAGE_SIZE;
	} while (len > 0);

	submit_bio(rw, &b->bio);
}

static void submit_io(struct dm_buffer *b, int rw, sector_t block,
		      bio_end_io_t *end_io)
{
	if (rw == WRITE && b->c->write_callback)
		b->c->write_callback(b);

	if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
	    b->data_mode != DATA_MODE_VMALLOC)
		use_inline_bio(b, rw, block, end_io);
	else
		use_dmio(b, rw, block, end_io);
}

/*----------------------------------------------------------------
 * Writing dirty buffers
 *--------------------------------------------------------------*/

/*
 * The endio routine for write.
 *
 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 * it.
 */
static void write_endio(struct bio *bio)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->write_error = bio->bi_error;
	if (unlikely(bio->bi_error)) {
		struct dm_bufio_client *c = b->c;
		int error = bio->bi_error;
		(void)cmpxchg(&c->async_write_error, 0, error);
	}

	BUG_ON(!test_bit(B_WRITING, &b->state));

	smp_mb__before_atomic();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_atomic();

	wake_up_bit(&b->state, B_WRITING);
}

/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is some previous write going on, wait for it to finish (we can't
 *   have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it. We set B_WRITING indicating
 *   that there is a write in progress.
 */
static void __write_dirty_buffer(struct dm_buffer *b,
				 struct list_head *write_list)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);

	if (!write_list)
		submit_io(b, WRITE, b->block, write_endio);
	else
		list_add_tail(&b->write_list, write_list);
}

static void __flush_write_list(struct list_head *write_list)
{
	struct blk_plug plug;
	blk_start_plug(&plug);
	while (!list_empty(write_list)) {
		struct dm_buffer *b =
			list_entry(write_list->next, struct dm_buffer, write_list);
		list_del(&b->write_list);
		submit_io(b, WRITE, b->block, write_endio);
		dm_bufio_cond_resched();
	}
	blk_finish_plug(&plug);
}

/*
 * Wait until any activity on the buffer finishes. Possibly write the
 * buffer if it is dirty. When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(b->hold_count);

	if (!b->state)	/* fast case */
		return;

	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b, NULL);
	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
}

/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.
 */
static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
		BUG_ON(test_bit(B_WRITING, &b->state));
		BUG_ON(test_bit(B_DIRTY, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	return NULL;
}

/*
 * Wait until some other threads free some buffer or release hold count on
 * some buffer.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&c->free_buffer_wait, &wait);
	set_task_state(current, TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	io_schedule();

	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}

enum new_flag {
	NF_FRESH = 0,
	NF_READ = 1,
	NF_GET = 2,
	NF_PREFETCH = 3
};
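
/*
 * How __bufio_new() interprets the flag:
 *	NF_FRESH    - return a buffer for the block without reading it from
 *		      disk; the caller is expected to overwrite the data.
 *	NF_READ     - return the buffer, reading it from disk if it is not
 *		      cached yet.
 *	NF_GET      - return the buffer only if it is already cached and not
 *		      still being read; never block on I/O.
 *	NF_PREFETCH - like NF_READ, but give up (return NULL) rather than
 *		      sleep for memory, and return NULL if the block is
 *		      already cached; the caller releases the buffer as soon
 *		      as the read has been submitted.
 */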

/*
 * Allocate a new buffer. If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in case all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOIO: don't recurse into the I/O layer
	 *	__GFP_NORETRY: don't retry and rather return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
	while (1) {
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		if (nf == NF_PREFETCH)
			return NULL;

		if (!list_empty(&c->reserved_buffers)) {
			b = list_entry(c->reserved_buffers.next,
				       struct dm_buffer, lru_list);
			list_del(&b->lru_list);
			c->need_reserved_buffers++;

			return b;
		}

		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		__wait_for_free_buffer(c);
	}
}

static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);

	if (!b)
		return NULL;

	if (c->alloc_callback)
		c->alloc_callback(b);

	return b;
}

/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
static void __free_buffer_wake(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	if (!c->need_reserved_buffers)
		free_buffer(b);
	else {
		list_add(&b->lru_list, &c->reserved_buffers);
		c->need_reserved_buffers--;
	}

	wake_up(&c->free_buffer_wait);
}

static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
					struct list_head *write_list)
{
	struct dm_buffer *b, *tmp;

	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state)) {
			__relink_lru(b, LIST_CLEAN);
			continue;
		}

		if (no_wait && test_bit(B_WRITING, &b->state))
			return;

		__write_dirty_buffer(b, write_list);
		dm_bufio_cond_resched();
	}
}

/*
 * Get writeback threshold and buffer limit for a given client.
 */
static void __get_memory_limit(struct dm_bufio_client *c,
			       unsigned long *threshold_buffers,
			       unsigned long *limit_buffers)
{
	unsigned long buffers;

	if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
		mutex_lock(&dm_bufio_clients_lock);
		__cache_size_refresh();
		mutex_unlock(&dm_bufio_clients_lock);
	}

	buffers = dm_bufio_cache_size_per_client >>
		  (c->sectors_per_block_bits + SECTOR_SHIFT);

	if (buffers < c->minimum_buffers)
		buffers = c->minimum_buffers;

	*limit_buffers = buffers;
	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
}

/*
 * Check if we're over watermark.
 * If we are over threshold_buffers, start freeing buffers.
 * If we're over "limit_buffers", block until we get under the limit.
 */
static void __check_watermark(struct dm_bufio_client *c,
			      struct list_head *write_list)
{
	unsigned long threshold_buffers, limit_buffers;

	__get_memory_limit(c, &threshold_buffers, &limit_buffers);

	while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
	       limit_buffers) {

		struct dm_buffer *b = __get_unclaimed_buffer(c);

		if (!b)
			return;

		__free_buffer_wake(b);
		dm_bufio_cond_resched();
	}

	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
		__write_dirty_buffers_async(c, 1, write_list);
}

/*----------------------------------------------------------------
 * Getting a buffer
 *--------------------------------------------------------------*/

static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
				     enum new_flag nf, int *need_submit,
				     struct list_head *write_list)
{
	struct dm_buffer *b, *new_b = NULL;

	*need_submit = 0;

	b = __find(c, block);
	if (b)
		goto found_buffer;

	if (nf == NF_GET)
		return NULL;

	new_b = __alloc_buffer_wait(c, nf);
	if (!new_b)
		return NULL;

	/*
	 * We've had a period where the mutex was unlocked, so need to
	 * recheck the buffer tree.
	 */
	b = __find(c, block);
	if (b) {
		__free_buffer_wake(new_b);
		goto found_buffer;
	}

	__check_watermark(c, write_list);

	b = new_b;
	b->hold_count = 1;
	b->read_error = 0;
	b->write_error = 0;
	__link_buffer(b, block, LIST_CLEAN);

	if (nf == NF_FRESH) {
		b->state = 0;
		return b;
	}

	b->state = 1 << B_READING;
	*need_submit = 1;

	return b;

found_buffer:
	if (nf == NF_PREFETCH)
		return NULL;
	/*
	 * Note: it is essential that we don't wait for the buffer to be
	 * read if dm_bufio_get function is used. Both dm_bufio_get and
	 * dm_bufio_prefetch can be used in the driver request routine.
	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
	 * the same buffer, it would deadlock if we waited.
	 */
	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
		return NULL;

	b->hold_count++;
	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
		     test_bit(B_WRITING, &b->state));
	return b;
}

/*
 * The endio routine for reading: set the error, clear the bit and wake up
 * anyone waiting on the buffer.
 */
static void read_endio(struct bio *bio)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->read_error = bio->bi_error;

	BUG_ON(!test_bit(B_READING, &b->state));

	smp_mb__before_atomic();
	clear_bit(B_READING, &b->state);
	smp_mb__after_atomic();

	wake_up_bit(&b->state, B_READING);
}

/*
 * A common routine for dm_bufio_new and dm_bufio_read. Operation of these
 * functions is similar except that dm_bufio_new doesn't read the
 * buffer from the disk (assuming that the caller overwrites all the data
 * and uses dm_bufio_mark_buffer_dirty to write new data back).
 */
static void *new_read(struct dm_bufio_client *c, sector_t block,
		      enum new_flag nf, struct dm_buffer **bp)
{
	int need_submit;
	struct dm_buffer *b;

	LIST_HEAD(write_list);

	dm_bufio_lock(c);
	b = __bufio_new(c, block, nf, &need_submit, &write_list);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	if (b && b->hold_count == 1)
		buffer_record_stack(b);
#endif
	dm_bufio_unlock(c);

	__flush_write_list(&write_list);

	if (!b)
		return NULL;

	if (need_submit)
		submit_io(b, READ, b->block, read_endio);

	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);

	if (b->read_error) {
		int error = b->read_error;

		dm_bufio_release(b);

		return ERR_PTR(error);
	}

	*bp = b;

	return b->data;
}

void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);

void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_READ, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);
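
/*
 * Typical read-side use of this interface by a client (a sketch only;
 * "client" and "block" are the caller's own, error handling is elided):
 *
 *	struct dm_buffer *buf;
 *	void *data;
 *
 *	data = dm_bufio_read(client, block, &buf);
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	... read or modify the block_size bytes at "data" ...
 *	dm_bufio_mark_buffer_dirty(buf);	(only if it was modified)
 *	dm_bufio_release(buf);
 */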

void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_FRESH, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_new);

void dm_bufio_prefetch(struct dm_bufio_client *c,
		       sector_t block, unsigned n_blocks)
{
	struct blk_plug plug;

	LIST_HEAD(write_list);

	BUG_ON(dm_bufio_in_request());

	blk_start_plug(&plug);
	dm_bufio_lock(c);

	for (; n_blocks--; block++) {
		int need_submit;
		struct dm_buffer *b;
		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
				&write_list);
		if (unlikely(!list_empty(&write_list))) {
			dm_bufio_unlock(c);
			blk_finish_plug(&plug);
			__flush_write_list(&write_list);
			blk_start_plug(&plug);
			dm_bufio_lock(c);
		}
		if (unlikely(b != NULL)) {
			dm_bufio_unlock(c);

			if (need_submit)
				submit_io(b, READ, b->block, read_endio);
			dm_bufio_release(b);

			dm_bufio_cond_resched();

			if (!n_blocks)
				goto flush_plug;
			dm_bufio_lock(c);
		}
	}

	dm_bufio_unlock(c);

flush_plug:
	blk_finish_plug(&plug);
}
EXPORT_SYMBOL_GPL(dm_bufio_prefetch);

void dm_bufio_release(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(!b->hold_count);

	b->hold_count--;
	if (!b->hold_count) {
		wake_up(&c->free_buffer_wait);

		/*
		 * If there were errors on the buffer, and the buffer is not
		 * to be written, free the buffer. There is no point in caching
		 * an invalid buffer.
		 */
		if ((b->read_error || b->write_error) &&
		    !test_bit(B_READING, &b->state) &&
		    !test_bit(B_WRITING, &b->state) &&
		    !test_bit(B_DIRTY, &b->state)) {
			__unlink_buffer(b);
			__free_buffer_wake(b);
		}
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_release);

void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(test_bit(B_READING, &b->state));

	if (!test_and_set_bit(B_DIRTY, &b->state))
		__relink_lru(b, LIST_DIRTY);

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);

void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
	LIST_HEAD(write_list);

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);

/*
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush hardware disk cache.
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long buffers_processed = 0;
	struct dm_buffer *b, *tmp;

	LIST_HEAD(write_list);

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
	dm_bufio_lock(c);

again:
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		int dropped_lock = 0;

		if (buffers_processed < c->n_buffers[LIST_DIRTY])
			buffers_processed++;

		BUG_ON(test_bit(B_READING, &b->state));

		if (test_bit(B_WRITING, &b->state)) {
			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
				dropped_lock = 1;
				b->hold_count++;
				dm_bufio_unlock(c);
				wait_on_bit_io(&b->state, B_WRITING,
					       TASK_UNINTERRUPTIBLE);
				dm_bufio_lock(c);
				b->hold_count--;
			} else
				wait_on_bit_io(&b->state, B_WRITING,
					       TASK_UNINTERRUPTIBLE);
		}

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state))
			__relink_lru(b, LIST_CLEAN);

		dm_bufio_cond_resched();

		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just processed is
		 * relinked to the clean list, so we won't loop scanning the
		 * same buffer again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means that
		 * someone is doing some writes simultaneously with us. In
		 * this case, stop, dropping the lock.
		 */
		if (dropped_lock)
			goto again;
	}
	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	a = xchg(&c->async_write_error, 0);
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
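
/*
 * So a typical commit sequence in a caller looks like this (sketch;
 * error handling elided):
 *
 *	dm_bufio_mark_buffer_dirty() on every modified buffer;
 *	dm_bufio_release() those buffers;
 *	dm_bufio_write_dirty_buffers(c);  - starts the writes, waits for
 *					    them and then flushes the
 *					    device cache
 */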

/*
 * Use dm-io to send an empty barrier to flush the device.
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_rw = WRITE_FLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	BUG_ON(dm_bufio_in_request());

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);

/*
 * We first delete any other buffer that may be at that new location.
 *
 * Then, we write the buffer to the original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, relink the buffer
 * in the buffer tree for the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but not relink it, because that other user needs to have the buffer
 * at the same place.
 */
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
{
	struct dm_bufio_client *c = b->c;
	struct dm_buffer *new;

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);

retry:
	new = __find(c, new_block);
	if (new) {
		if (new->hold_count) {
			__wait_for_free_buffer(c);
			goto retry;
		}

		/*
		 * FIXME: Is there any point waiting for a write that's going
		 * to be overwritten in a bit?
		 */
		__make_buffer_clean(new);
		__unlink_buffer(new);
		__free_buffer_wake(new);
	}

	BUG_ON(!b->hold_count);
	BUG_ON(test_bit(B_READING, &b->state));

	__write_dirty_buffer(b, NULL);
	if (b->hold_count == 1) {
		wait_on_bit_io(&b->state, B_WRITING,
			       TASK_UNINTERRUPTIBLE);
		set_bit(B_DIRTY, &b->state);
		__unlink_buffer(b);
		__link_buffer(b, new_block, LIST_DIRTY);
	} else {
		sector_t old_block;
		wait_on_bit_lock_io(&b->state, B_WRITING,
				    TASK_UNINTERRUPTIBLE);
		/*
		 * Relink buffer to "new_block" so that write_callback
		 * sees "new_block" as a block number.
		 * After the write, link the buffer back to old_block.
		 * All this must be done in bufio lock, so that block number
		 * change isn't visible to other threads.
		 */
		old_block = b->block;
		__unlink_buffer(b);
		__link_buffer(b, new_block, b->list_mode);
		submit_io(b, WRITE, new_block, write_endio);
		wait_on_bit_io(&b->state, B_WRITING,
			       TASK_UNINTERRUPTIBLE);
		__unlink_buffer(b);
		__link_buffer(b, old_block, b->list_mode);
	}

	dm_bufio_unlock(c);
	dm_bufio_release(b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);

/*
 * Free the given buffer.
 *
 * This is just a hint, if the buffer is in use or dirty, this function
 * does nothing.
 */
void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;

	dm_bufio_lock(c);

	b = __find(c, block);
	if (b && likely(!b->hold_count) && likely(!b->state)) {
		__unlink_buffer(b);
		__free_buffer_wake(b);
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL(dm_bufio_forget);

void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
{
	c->minimum_buffers = n;
}
EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);

unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
{
	return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);

sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
{
	return i_size_read(c->bdev->bd_inode) >>
			   (SECTOR_SHIFT + c->sectors_per_block_bits);
}
EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);

sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);

void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);

void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);

struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);

static void drop_buffers(struct dm_bufio_client *c)
{
	struct dm_buffer *b;
	int i;
	bool warned = false;

	BUG_ON(dm_bufio_in_request());

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	for (i = 0; i < LIST_SIZE; i++)
		list_for_each_entry(b, &c->lru[i], lru_list) {
			WARN_ON(!warned);
			warned = true;
			DMERR("leaked buffer %llx, hold count %u, list %d",
			      (unsigned long long)b->block, b->hold_count, i);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
			print_stack_trace(&b->stack_trace, 1);
			b->hold_count = 0; /* mark unclaimed to avoid BUG_ON below */
#endif
		}

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);
#endif

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(!list_empty(&c->lru[i]));

	dm_bufio_unlock(c);
}

/*
 * We may not be able to evict this buffer if IO is pending or the client
 * is still using it. Caller is expected to know buffer is too old.
 *
 * And if GFP_NOFS is used, we must not do any I/O because we hold
 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
 * rerouted to different bufio client.
 */
static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
{
	if (!(gfp & __GFP_FS)) {
		if (test_bit(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return false;
	}

	if (b->hold_count)
		return false;

	__make_buffer_clean(b);
	__unlink_buffer(b);
	__free_buffer_wake(b);

	return true;
}

static unsigned get_retain_buffers(struct dm_bufio_client *c)
{
	unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
	return retain_bytes / c->block_size;
}

static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
			    gfp_t gfp_mask)
{
	int l;
	struct dm_buffer *b, *tmp;
	unsigned long freed = 0;
	unsigned long count = nr_to_scan;
	unsigned retain_target = get_retain_buffers(c);

	for (l = 0; l < LIST_SIZE; l++) {
		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
			if (__try_evict_buffer(b, gfp_mask))
				freed++;
			if (!--nr_to_scan || ((count - freed) <= retain_target))
				return freed;
			dm_bufio_cond_resched();
		}
	}
	return freed;
}

static unsigned long
dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct dm_bufio_client *c;
	unsigned long freed;

	c = container_of(shrink, struct dm_bufio_client, shrinker);
	if (sc->gfp_mask & __GFP_FS)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return SHRINK_STOP;

	freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
	dm_bufio_unlock(c);
	return freed;
}

static unsigned long
dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	struct dm_bufio_client *c;
	unsigned long count;

	c = container_of(shrink, struct dm_bufio_client, shrinker);
	if (sc->gfp_mask & __GFP_FS)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return 0;

	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	dm_bufio_unlock(c);
	return count;
}
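
/*
 * A note on the two shrinker callbacks above: the memory-management core
 * first calls dm_bufio_shrink_count() to estimate how many objects (here,
 * buffers) could be reclaimed, and then dm_bufio_shrink_scan() with
 * sc->nr_to_scan set to how many it wants freed; the scan callback returns
 * the number actually freed, or SHRINK_STOP when no progress can be made
 * right now (here: the client lock could not be taken).
 */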

/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
					       unsigned reserved_buffers, unsigned aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *))
{
	int r;
	struct dm_bufio_client *c;
	unsigned i;

	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
	       (block_size & (block_size - 1)));

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	c->buffer_tree = RB_ROOT;

	c->bdev = bdev;
	c->block_size = block_size;
	c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
	c->pages_per_block_bits = (__ffs(block_size) >= PAGE_SHIFT) ?
				  __ffs(block_size) - PAGE_SHIFT : 0;
	c->blocks_per_page_bits = (__ffs(block_size) < PAGE_SHIFT ?
				  PAGE_SHIFT - __ffs(block_size) : 0);

	c->aux_size = aux_size;
	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	for (i = 0; i < LIST_SIZE; i++) {
		INIT_LIST_HEAD(&c->lru[i]);
		c->n_buffers[i] = 0;
	}

	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	mutex_lock(&dm_bufio_clients_lock);
	if (c->blocks_per_page_bits) {
		if (!DM_BUFIO_CACHE_NAME(c)) {
			DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
			if (!DM_BUFIO_CACHE_NAME(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}

		if (!DM_BUFIO_CACHE(c)) {
			DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
							      c->block_size,
							      c->block_size, 0, NULL);
			if (!DM_BUFIO_CACHE(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}
	}
	mutex_unlock(&dm_bufio_clients_lock);

	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad_buffer;
		}
		__free_buffer_wake(b);
	}

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	c->shrinker.count_objects = dm_bufio_shrink_count;
	c->shrinker.scan_objects = dm_bufio_shrink_scan;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	register_shrinker(&c->shrinker);

	return c;

bad_buffer:
bad_cache:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);
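
/*
 * Example of creating and tearing down a client (a sketch; the 4096-byte
 * block size, single reserved buffer and zero aux_size are arbitrary
 * values chosen for illustration):
 *
 *	struct dm_bufio_client *c;
 *
 *	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *	...
 *	dm_bufio_client_destroy(c);
 */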

/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
	unsigned i;

	drop_buffers(c);

	unregister_shrinker(&c->shrinker);

	mutex_lock(&dm_bufio_clients_lock);

	list_del(&c->client_list);
	dm_bufio_client_count--;
	__cache_size_refresh();

	mutex_unlock(&dm_bufio_clients_lock);

	BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
	BUG_ON(c->need_reserved_buffers);

	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}

	for (i = 0; i < LIST_SIZE; i++)
		if (c->n_buffers[i])
			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(c->n_buffers[i]);

	dm_io_client_destroy(c->dm_io);
	kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);

static unsigned get_max_age_hz(void)
{
	unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);

	if (max_age > UINT_MAX / HZ)
		max_age = UINT_MAX / HZ;

	return max_age * HZ;
}

static bool older_than(struct dm_buffer *b, unsigned long age_hz)
{
	return time_after_eq(jiffies, b->last_accessed + age_hz);
}

static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
{
	struct dm_buffer *b, *tmp;
	unsigned retain_target = get_retain_buffers(c);
	unsigned count;

	dm_bufio_lock(c);

	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
		if (count <= retain_target)
			break;

		if (!older_than(b, age_hz))
			break;

		if (__try_evict_buffer(b, 0))
			count--;

		dm_bufio_cond_resched();
	}

	dm_bufio_unlock(c);
}

static void cleanup_old_buffers(void)
{
	unsigned long max_age_hz = get_max_age_hz();
	struct dm_bufio_client *c;

	mutex_lock(&dm_bufio_clients_lock);

	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
		__evict_old_buffers(c, max_age_hz);

	mutex_unlock(&dm_bufio_clients_lock);
}

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_work;

static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}

/*----------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------*/

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes memory limit.
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	dm_bufio_allocated_kmem_cache = 0;
	dm_bufio_allocated_get_free_pages = 0;
	dm_bufio_allocated_vmalloc = 0;
	dm_bufio_current_allocated = 0;

	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);

	mem = (__u64)((totalram_pages - totalhigh_pages) *
		      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	/*
	 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
	 * in fs/proc/internal.h
	 */
	if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
		mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
#endif

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}

/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;
	int i;

	cancel_delayed_work_sync(&dm_bufio_work);
	destroy_workqueue(dm_bufio_wq);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++)
		kmem_cache_destroy(dm_bufio_caches[i]);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
		kfree(dm_bufio_cache_names[i]);

	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
		       __func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
		       __func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	BUG_ON(bug);
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
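
/*
 * The parameters above appear under /sys/module/dm_bufio/parameters/.
 * For example, the total cache size can be changed at run time with
 * something like (the value is illustrative; only the parameters marked
 * S_IWUSR above are writable):
 *
 *	echo 16777216 > /sys/module/dm_bufio/parameters/max_cache_size_bytes
 */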

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");