/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/migrate.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>

#include "xfs_sb.h"
#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trace.h"

static kmem_zone_t *xfs_buf_zone;
STATIC int xfsbufd(void *);
STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);

static struct workqueue_struct *xfslogd_workqueue;
struct workqueue_struct *xfsdatad_workqueue;
struct workqueue_struct *xfsconvertd_workqueue;

#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
#else
# define XB_SET_OWNER(bp)	do { } while (0)
# define XB_CLEAR_OWNER(bp)	do { } while (0)
# define XB_GET_OWNER(bp)	do { } while (0)
#endif

#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
	  ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)

#define xb_to_km(flags) \
	 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)

#define xfs_buf_allocate(flags) \
	kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
#define xfs_buf_deallocate(bp) \
	kmem_zone_free(xfs_buf_zone, (bp));

static inline int
xfs_buf_is_vmapped(
	struct xfs_buf	*bp)
{
	/*
	 * Return true if the buffer is vmapped.
	 *
	 * The XBF_MAPPED flag is set if the buffer should be mapped, but the
	 * code is clever enough to know it doesn't have to map a single page,
	 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
	 */
	return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
}

static inline int
xfs_buf_vmap_len(
	struct xfs_buf	*bp)
{
	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}

/*
 * xfs_buf_lru_add - add a buffer to the LRU.
 *
 * The LRU takes a new reference to the buffer so that it will only be freed
 * once the shrinker takes the buffer off the LRU.
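 *
 * Based on the code below, that reference is dropped again either in
 * xfs_buf_stale(), when the buffer is pulled off the LRU early, or by
 * xfs_buf_rele() once the shrinker has moved the buffer to its dispose list.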
 */
STATIC void
xfs_buf_lru_add(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	spin_lock(&btp->bt_lru_lock);
	if (list_empty(&bp->b_lru)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_lru, &btp->bt_lru);
		btp->bt_lru_nr++;
	}
	spin_unlock(&btp->bt_lru_lock);
}

/*
 * xfs_buf_lru_del - remove a buffer from the LRU
 *
 * The unlocked check is safe here because it only occurs when there are no
 * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
 * to optimise the shrinker removing the buffer from the LRU and calling
 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
 * bt_lru_lock.
 */
STATIC void
xfs_buf_lru_del(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	if (list_empty(&bp->b_lru))
		return;

	spin_lock(&btp->bt_lru_lock);
	if (!list_empty(&bp->b_lru)) {
		list_del_init(&bp->b_lru);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);
}

/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that the LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
	bp->b_flags |= XBF_STALE;
	atomic_set(&(bp)->b_lru_ref, 0);
	if (!list_empty(&bp->b_lru)) {
		struct xfs_buftarg *btp = bp->b_target;

		spin_lock(&btp->bt_lru_lock);
		if (!list_empty(&bp->b_lru)) {
			list_del_init(&bp->b_lru);
			btp->bt_lru_nr--;
			atomic_dec(&bp->b_hold);
		}
		spin_unlock(&btp->bt_lru_lock);
	}
	ASSERT(atomic_read(&bp->b_hold) >= 1);
}

STATIC void
_xfs_buf_initialize(
	xfs_buf_t		*bp,
	xfs_buftarg_t		*target,
	xfs_off_t		range_base,
	size_t			range_length,
	xfs_buf_flags_t		flags)
{
	/*
	 * We don't want certain flags to appear in b_flags.
	 */
	flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);

	memset(bp, 0, sizeof(xfs_buf_t));
	atomic_set(&bp->b_hold, 1);
	atomic_set(&bp->b_lru_ref, 1);
	init_completion(&bp->b_iowait);
	INIT_LIST_HEAD(&bp->b_lru);
	INIT_LIST_HEAD(&bp->b_list);
	RB_CLEAR_NODE(&bp->b_rbnode);
	sema_init(&bp->b_sema, 0); /* held, no waiters */
	XB_SET_OWNER(bp);
	bp->b_target = target;
	bp->b_file_offset = range_base;
	/*
	 * Set buffer_length and count_desired to the same value initially.
	 * I/O routines should use count_desired, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	bp->b_buffer_length = bp->b_count_desired = range_length;
	bp->b_flags = flags;
	bp->b_bn = XFS_BUF_DADDR_NULL;
	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(xb_create);

	trace_xfs_buf_init(bp, _RET_IP_);
}

/*
 * Allocate a page array capable of holding a specified number
 * of pages, and point the page buf at it.
 */
STATIC int
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
	int			page_count,
	xfs_buf_flags_t		flags)
{
	/* Make sure that we have a page list */
	if (bp->b_pages == NULL) {
		bp->b_offset = xfs_buf_poff(bp->b_file_offset);
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
		} else {
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
					page_count, xb_to_km(flags));
			if (bp->b_pages == NULL)
				return -ENOMEM;
		}
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 * Frees b_pages if it was allocated.
 */
STATIC void
_xfs_buf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->b_pages != bp->b_page_array) {
		kmem_free(bp->b_pages);
		bp->b_pages = NULL;
	}
}

/*
 * Releases the specified buffer.
 *
 * The modification state of any associated pages is left unchanged.
 * The buffer must not be on any hash - use xfs_buf_rele instead for
 * hashed and refcounted buffers.
 */
void
xfs_buf_free(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_free(bp, _RET_IP_);

	ASSERT(list_empty(&bp->b_lru));

	if (bp->b_flags & _XBF_PAGES) {
		uint		i;

		if (xfs_buf_is_vmapped(bp))
			vm_unmap_ram(bp->b_addr - bp->b_offset,
					bp->b_page_count);

		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

			__free_page(page);
		}
	} else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);
	_xfs_buf_free_pages(bp);
	xfs_buf_deallocate(bp);
}

/*
 * Allocates all the pages for the buffer in question and builds its page list.
 */
STATIC int
xfs_buf_allocate_memory(
	xfs_buf_t		*bp,
	uint			flags)
{
	size_t			size = bp->b_count_desired;
	size_t			nbytes, offset;
	gfp_t			gfp_mask = xb_to_gfp(flags);
	unsigned short		page_count, i;
	xfs_off_t		end;
	int			error;

	/*
	 * for buffers that are contained within a single page, just allocate
	 * the memory from the heap - there's no need for the complexity of
	 * page arrays to keep allocation down to order 0.
	 */
	if (bp->b_buffer_length < PAGE_SIZE) {
		bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
		if (!bp->b_addr) {
			/* low memory - use alloc_page loop instead */
			goto use_alloc_page;
		}

		if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
								PAGE_MASK) !=
		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
			/* b_addr spans two pages - use alloc_page instead */
			kmem_free(bp->b_addr);
			bp->b_addr = NULL;
			goto use_alloc_page;
		}
		bp->b_offset = offset_in_page(bp->b_addr);
		bp->b_pages = bp->b_page_array;
		bp->b_pages[0] = virt_to_page(bp->b_addr);
		bp->b_page_count = 1;
		bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
		return 0;
	}

use_alloc_page:
	end = bp->b_file_offset + bp->b_buffer_length;
	page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
	error = _xfs_buf_get_pages(bp, page_count, flags);
	if (unlikely(error))
		return error;

	offset = bp->b_offset;
	bp->b_flags |= _XBF_PAGES;

	for (i = 0; i < bp->b_page_count; i++) {
		struct page	*page;
		uint		retries = 0;
retry:
		page = alloc_page(gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & XBF_READ_AHEAD) {
				bp->b_page_count = i;
				error = ENOMEM;
				goto out_free_pages;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				xfs_err(NULL,
		"possible memory allocation deadlock in %s (mode:0x%x)",
					__func__, gfp_mask);

			XFS_STATS_INC(xb_page_retries);
			congestion_wait(BLK_RW_ASYNC, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(xb_page_found);

		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
		size -= nbytes;
		bp->b_pages[i] = page;
		offset = 0;
	}
	return 0;

out_free_pages:
	for (i = 0; i < bp->b_page_count; i++)
		__free_page(bp->b_pages[i]);
	return error;
}

/*
 * Map buffer into kernel address-space if necessary.
 */
STATIC int
_xfs_buf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	ASSERT(bp->b_flags & _XBF_PAGES);
	if (bp->b_page_count == 1) {
		/* A single page buffer is always mappable */
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
		bp->b_flags |= XBF_MAPPED;
	} else if (flags & XBF_MAPPED) {
		int retried = 0;

		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1, PAGE_KERNEL);
			if (bp->b_addr)
				break;
			vm_unmap_aliases();
		} while (retried++ <= 1);

		if (!bp->b_addr)
			return -ENOMEM;
		bp->b_addr += bp->b_offset;
		bp->b_flags |= XBF_MAPPED;
	}

	return 0;
}

/*
 * Finding and Reading Buffers
 */

/*
 * Look up, and create if absent, a lockable buffer for
 * a given range of an inode. The buffer is returned
 * locked. If other overlapping buffers exist, they are
 * released before the new buffer is created and locked,
 * which may imply that this call will block until those buffers
 * are unlocked. No I/O is implied by this call.
 */
xfs_buf_t *
_xfs_buf_find(
	xfs_buftarg_t		*btp,	/* block device target		*/
	xfs_off_t		ioff,	/* starting offset of range	*/
	size_t			isize,	/* length of range		*/
	xfs_buf_flags_t		flags,
	xfs_buf_t		*new_bp)
{
	xfs_off_t		range_base;
	size_t			range_length;
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent;
	xfs_buf_t		*bp;

	range_base = (ioff << BBSHIFT);
	range_length = (isize << BBSHIFT);

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(range_length < (1 << btp->bt_sshift)));
	ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));

	/* get tree root */
	pag = xfs_perag_get(btp->bt_mount,
				xfs_daddr_to_agno(btp->bt_mount, ioff));

	/* walk tree */
	spin_lock(&pag->pag_buf_lock);
	rbp = &pag->pag_buf_tree.rb_node;
	parent = NULL;
	bp = NULL;
	while (*rbp) {
		parent = *rbp;
		bp = rb_entry(parent, struct xfs_buf, b_rbnode);

		if (range_base < bp->b_file_offset)
			rbp = &(*rbp)->rb_left;
		else if (range_base > bp->b_file_offset)
			rbp = &(*rbp)->rb_right;
		else {
			/*
			 * found a block offset match. If the range doesn't
			 * match, the only way this is allowed is if the buffer
			 * in the cache is stale and the transaction that made
			 * it stale has not yet committed. i.e. we are
			 * reallocating a busy extent. Skip this buffer and
			 * continue searching to the right for an exact match.
			 */
			if (bp->b_buffer_length != range_length) {
				ASSERT(bp->b_flags & XBF_STALE);
				rbp = &(*rbp)->rb_right;
				continue;
			}
			atomic_inc(&bp->b_hold);
			goto found;
		}
	}

	/* No match found */
	if (new_bp) {
		_xfs_buf_initialize(new_bp, btp, range_base,
							range_length, flags);
		rb_link_node(&new_bp->b_rbnode, parent, rbp);
		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
		/* the buffer keeps the perag reference until it is freed */
		new_bp->b_pag = pag;
		spin_unlock(&pag->pag_buf_lock);
	} else {
		XFS_STATS_INC(xb_miss_locked);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
	}
	return new_bp;

found:
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);

	if (!xfs_buf_trylock(bp)) {
		if (flags & XBF_TRYLOCK) {
			xfs_buf_rele(bp);
			XFS_STATS_INC(xb_busy_locked);
			return NULL;
		}
		xfs_buf_lock(bp);
		XFS_STATS_INC(xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	XFS_STATS_INC(xb_get_locked);
	return bp;
}

/*
 * Assembles a buffer covering the specified range.
 * Storage in memory for all portions of the buffer will be allocated,
 * although backing storage may not be.
 */
xfs_buf_t *
xfs_buf_get(
	xfs_buftarg_t		*target,/* target for buffer		*/
	xfs_off_t		ioff,	/* starting offset of range	*/
	size_t			isize,	/* length of range		*/
	xfs_buf_flags_t		flags)
{
	xfs_buf_t		*bp, *new_bp;
	int			error = 0;

	new_bp = xfs_buf_allocate(flags);
	if (unlikely(!new_bp))
		return NULL;

	bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
	if (bp == new_bp) {
		error = xfs_buf_allocate_memory(bp, flags);
		if (error)
			goto no_buffer;
	} else {
		xfs_buf_deallocate(new_bp);
		if (unlikely(bp == NULL))
			return NULL;
	}

	if (!(bp->b_flags & XBF_MAPPED)) {
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			xfs_warn(target->bt_mount,
				"%s: failed to map pages\n", __func__);
			goto no_buffer;
		}
	}

	XFS_STATS_INC(xb_get);

	/*
	 * Always fill in the block number now, the mapped cases can do
	 * their own overlay of this later.
	 */
	bp->b_bn = ioff;
	bp->b_count_desired = bp->b_buffer_length;

	trace_xfs_buf_get(bp, flags, _RET_IP_);
	return bp;

no_buffer:
	if (flags & (XBF_LOCK | XBF_TRYLOCK))
		xfs_buf_unlock(bp);
	xfs_buf_rele(bp);
	return NULL;
}

STATIC int
_xfs_buf_read(
	xfs_buf_t		*bp,
	xfs_buf_flags_t		flags)
{
	int			status;

	ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);

	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

	status = xfs_buf_iorequest(bp);
	if (status || bp->b_error || (flags & XBF_ASYNC))
		return status;
	return xfs_buf_iowait(bp);
}

xfs_buf_t *
xfs_buf_read(
	xfs_buftarg_t		*target,
	xfs_off_t		ioff,
	size_t			isize,
	xfs_buf_flags_t		flags)
{
	xfs_buf_t		*bp;

	flags |= XBF_READ;

	bp = xfs_buf_get(target, ioff, isize, flags);
	if (bp) {
		trace_xfs_buf_read(bp, flags, _RET_IP_);

		if (!XFS_BUF_ISDONE(bp)) {
			XFS_STATS_INC(xb_get_read);
			_xfs_buf_read(bp, flags);
		} else if (flags & XBF_ASYNC) {
			/*
			 * Read ahead call which is already satisfied,
			 * drop the buffer
			 */
			goto no_buffer;
		} else {
			/* We do not want read in the flags */
			bp->b_flags &= ~XBF_READ;
		}
	}

	return bp;

no_buffer:
	if (flags & (XBF_LOCK | XBF_TRYLOCK))
		xfs_buf_unlock(bp);
	xfs_buf_rele(bp);
	return NULL;
}

/*
 * If we are not low on memory then do the readahead in a deadlock
 * safe manner.
 */
void
xfs_buf_readahead(
	xfs_buftarg_t		*target,
	xfs_off_t		ioff,
	size_t			isize)
{
	if (bdi_read_congested(target->bt_bdi))
		return;

	xfs_buf_read(target, ioff, isize,
		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
}

/*
 * Read an uncached buffer from disk. Allocates and returns a locked
 * buffer containing the disk contents or nothing.
 */
struct xfs_buf *
xfs_buf_read_uncached(
	struct xfs_mount	*mp,
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
	size_t			length,
	int			flags)
{
	xfs_buf_t		*bp;
	int			error;

	bp = xfs_buf_get_uncached(target, length, flags);
	if (!bp)
		return NULL;

	/* set up the buffer for a read IO */
	XFS_BUF_SET_ADDR(bp, daddr);
	XFS_BUF_READ(bp);

	xfsbdstrat(mp, bp);
	error = xfs_buf_iowait(bp);
	if (error || bp->b_error) {
		xfs_buf_relse(bp);
		return NULL;
	}
	return bp;
}

xfs_buf_t *
xfs_buf_get_empty(
	size_t			len,
	xfs_buftarg_t		*target)
{
	xfs_buf_t		*bp;

	bp = xfs_buf_allocate(0);
	if (bp)
		_xfs_buf_initialize(bp, target, 0, len, 0);
	return bp;
}

/*
 * Return a buffer allocated as an empty buffer and associated to external
 * memory via xfs_buf_associate_memory() back to its empty state.
 */
void
xfs_buf_set_empty(
	struct xfs_buf		*bp,
	size_t			len)
{
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_page_count = 0;
	bp->b_addr = NULL;
	bp->b_file_offset = 0;
	bp->b_buffer_length = bp->b_count_desired = len;
	bp->b_bn = XFS_BUF_DADDR_NULL;
	bp->b_flags &= ~XBF_MAPPED;
}

static inline struct page *
mem_to_page(
	void			*addr)
{
	if ((!is_vmalloc_addr(addr))) {
		return virt_to_page(addr);
	} else {
		return vmalloc_to_page(addr);
	}
}

int
xfs_buf_associate_memory(
	xfs_buf_t		*bp,
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
	unsigned long		pageaddr;
	unsigned long		offset;
	size_t			buflen;
	int			page_count;

	pageaddr = (unsigned long)mem & PAGE_MASK;
	offset = (unsigned long)mem - pageaddr;
	buflen = PAGE_ALIGN(len + offset);
	page_count = buflen >> PAGE_SHIFT;

	/* Free any previous set of page pointers */
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_addr = mem;

	rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
	if (rval)
		return rval;

	bp->b_offset = offset;

	for (i = 0; i < bp->b_page_count; i++) {
		bp->b_pages[i] = mem_to_page((void *)pageaddr);
		pageaddr += PAGE_SIZE;
	}

	bp->b_count_desired = len;
	bp->b_buffer_length = buflen;
	bp->b_flags |= XBF_MAPPED;

	return 0;
}

xfs_buf_t *
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
	size_t			len,
	int			flags)
{
	unsigned long		page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
	int			error, i;
	xfs_buf_t		*bp;

	bp = xfs_buf_allocate(0);
	if (unlikely(bp == NULL))
		goto fail;
	_xfs_buf_initialize(bp, target, 0, len, 0);

	error = _xfs_buf_get_pages(bp, page_count, 0);
	if (error)
		goto fail_free_buf;

	for (i = 0; i < page_count; i++) {
		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
		if (!bp->b_pages[i])
			goto fail_free_mem;
	}
	bp->b_flags |= _XBF_PAGES;

	error = _xfs_buf_map_pages(bp, XBF_MAPPED);
	if (unlikely(error)) {
		xfs_warn(target->bt_mount,
			"%s: failed to map pages\n", __func__);
		goto fail_free_mem;
	}

	trace_xfs_buf_get_uncached(bp, _RET_IP_);
	return bp;

fail_free_mem:
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
	_xfs_buf_free_pages(bp);
fail_free_buf:
	xfs_buf_deallocate(bp);
fail:
	return NULL;
}

/*
 * Increment reference count on buffer, to hold the buffer concurrently
 * with another thread which may release (free) the buffer asynchronously.
 * Must hold the buffer already to call this function.
 */
void
xfs_buf_hold(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);
	atomic_inc(&bp->b_hold);
}

/*
 * Releases a hold on the specified buffer. If the
 * hold count is 1, calls xfs_buf_free.
 */
void
xfs_buf_rele(
	xfs_buf_t		*bp)
{
	struct xfs_perag	*pag = bp->b_pag;

	trace_xfs_buf_rele(bp, _RET_IP_);

	if (!pag) {
		ASSERT(list_empty(&bp->b_lru));
		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
		if (atomic_dec_and_test(&bp->b_hold))
			xfs_buf_free(bp);
		return;
	}

	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));

	ASSERT(atomic_read(&bp->b_hold) > 0);
	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
		if (!(bp->b_flags & XBF_STALE) &&
		    atomic_read(&bp->b_lru_ref)) {
			xfs_buf_lru_add(bp);
			spin_unlock(&pag->pag_buf_lock);
		} else {
			xfs_buf_lru_del(bp);
			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
			spin_unlock(&pag->pag_buf_lock);
			xfs_perag_put(pag);
			xfs_buf_free(bp);
		}
	}
}

/*
 * Lock a buffer object, if it is not already locked.
 *
 * If we come across a stale, pinned, locked buffer, we know that we are
 * being asked to lock a buffer that has been reallocated. Because it is
 * pinned, we know that the log has not been pushed to disk and hence it
 * will still be locked.  Rather than continuing to have trylock attempts
 * fail until someone else pushes the log, push it ourselves before
 * returning.  This means that the xfsaild will not get stuck trying
 * to push on stale inode buffers.
 */
int
xfs_buf_trylock(
	struct xfs_buf		*bp)
{
	int			locked;

	locked = down_trylock(&bp->b_sema) == 0;
	if (locked)
		XB_SET_OWNER(bp);
	else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);

	trace_xfs_buf_trylock(bp, _RET_IP_);
	return locked;
}

/*
 * Lock a buffer object.
 *
 * If we come across a stale, pinned, locked buffer, we know that we
 * are being asked to lock a buffer that has been reallocated. Because
 * it is pinned, we know that the log has not been pushed to disk and
 * hence it will still be locked. Rather than sleeping until someone
 * else pushes the log, push it ourselves before trying to get the lock.
 */
void
xfs_buf_lock(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_lock(bp, _RET_IP_);

	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);
	down(&bp->b_sema);
	XB_SET_OWNER(bp);

	trace_xfs_buf_lock_done(bp, _RET_IP_);
}

/*
 * Releases the lock on the buffer object.
 * If the buffer is marked delwri but is not queued, do so before we
 * unlock the buffer as we need to set flags correctly. We also need to
 * take a reference for the delwri queue because the unlocker is going to
 * drop theirs and they don't know we just queued it.
 */
void
xfs_buf_unlock(
	struct xfs_buf		*bp)
{
	if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
		atomic_inc(&bp->b_hold);
		bp->b_flags |= XBF_ASYNC;
		xfs_buf_delwri_queue(bp, 0);
	}

	XB_CLEAR_OWNER(bp);
	up(&bp->b_sema);

	trace_xfs_buf_unlock(bp, _RET_IP_);
}

STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 * Buffer Utility Routines
 */

STATIC void
xfs_buf_iodone_work(
	struct work_struct	*work)
{
	xfs_buf_t		*bp =
		container_of(work, xfs_buf_t, b_iodone_work);

	if (bp->b_iodone)
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
}

void
xfs_buf_ioend(
	xfs_buf_t		*bp,
	int			schedule)
{
	trace_xfs_buf_iodone(bp, _RET_IP_);

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
	if (bp->b_error == 0)
		bp->b_flags |= XBF_DONE;

	if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
		if (schedule) {
			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
		} else {
			xfs_buf_iodone_work(&bp->b_iodone_work);
		}
	} else {
		complete(&bp->b_iowait);
	}
}

void
xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error)
{
	ASSERT(error >= 0 && error <= 0xffff);
	bp->b_error = (unsigned short)error;
	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
}

int
xfs_bwrite(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp)
{
	int			error;

	bp->b_flags |= XBF_WRITE;
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ);

	xfs_buf_delwri_dequeue(bp);
	xfs_bdstrat_cb(bp);

	error = xfs_buf_iowait(bp);
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_buf_relse(bp);
	return error;
}

void
xfs_bdwrite(
	void			*mp,
	struct xfs_buf		*bp)
{
	trace_xfs_buf_bdwrite(bp, _RET_IP_);

	bp->b_flags &= ~XBF_READ;
	bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);

	xfs_buf_delwri_queue(bp, 1);
}

/*
 * Called when we want to stop a buffer from getting written or read.
 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
 * so that the proper iodone callbacks get called.
 */
STATIC int
xfs_bioerror(
	xfs_buf_t *bp)
{
#ifdef XFSERRORDEBUG
	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
#endif

	/*
	 * No need to wait until the buffer is unpinned, we aren't flushing it.
	 */
	xfs_buf_ioerror(bp, EIO);

	/*
	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_UNDELAYWRITE(bp);
	XFS_BUF_UNDONE(bp);
	XFS_BUF_STALE(bp);

	xfs_buf_ioend(bp, 0);

	return EIO;
}

/*
 * Same as xfs_bioerror, except that we are releasing the buffer
 * here ourselves, and avoiding the xfs_buf_ioend call.
 * This is meant for userdata errors; metadata bufs come with
 * iodone functions attached, so that we can track down errors.
 */
STATIC int
xfs_bioerror_relse(
	struct xfs_buf	*bp)
{
	int64_t		fl = bp->b_flags;
	/*
	 * No need to wait until the buffer is unpinned.
	 * We aren't flushing it.
	 *
	 * chunkhold expects B_DONE to be set, whether
	 * we actually finish the I/O or not. We don't want to
	 * change that interface.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_UNDELAYWRITE(bp);
	XFS_BUF_DONE(bp);
	XFS_BUF_STALE(bp);
	bp->b_iodone = NULL;
	if (!(fl & XBF_ASYNC)) {
		/*
		 * Mark b_error and B_ERROR _both_.
		 * Lots of chunkcache code assumes that.
		 * There's no reason to mark error for
		 * ASYNC buffers.
		 */
		xfs_buf_ioerror(bp, EIO);
		XFS_BUF_FINISH_IOWAIT(bp);
	} else {
		xfs_buf_relse(bp);
	}

	return EIO;
}

/*
 * All xfs metadata buffers except log state machine buffers
 * get this attached as their b_bdstrat callback function.
 * This is so that we can catch a buffer
 * after prematurely unpinning it to forcibly shutdown the filesystem.
 */
int
xfs_bdstrat_cb(
	struct xfs_buf	*bp)
{
	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		trace_xfs_bdstrat_shut(bp, _RET_IP_);
		/*
		 * Metadata write that didn't get logged but
		 * written delayed anyway. These aren't associated
		 * with a transaction, and can be ignored.
		 */
		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
			return xfs_bioerror_relse(bp);
		else
			return xfs_bioerror(bp);
	}

	xfs_buf_iorequest(bp);
	return 0;
}

/*
 * Wrapper around bdstrat so that we can stop data from going to disk in case
 * we are shutting down the filesystem.  Typically user data goes through this
 * path; one of the exceptions is the superblock.
 */
void
xfsbdstrat(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp)
{
	if (XFS_FORCED_SHUTDOWN(mp)) {
		trace_xfs_bdstrat_shut(bp, _RET_IP_);
		xfs_bioerror_relse(bp);
		return;
	}

	xfs_buf_iorequest(bp);
}

STATIC void
_xfs_buf_ioend(
	xfs_buf_t		*bp,
	int			schedule)
{
	if (atomic_dec_and_test(&bp->b_io_remaining))
		xfs_buf_ioend(bp, schedule);
}

STATIC void
xfs_buf_bio_end_io(
	struct bio		*bio,
	int			error)
{
	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;

	xfs_buf_ioerror(bp, -error);

	if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

	_xfs_buf_ioend(bp, 1);
	bio_put(bio);
}

STATIC void
_xfs_buf_ioapply(
	xfs_buf_t		*bp)
{
	int			rw, map_i, total_nr_pages, nr_pages;
	struct bio		*bio;
	int			offset = bp->b_offset;
	int			size = bp->b_count_desired;
	sector_t		sector = bp->b_bn;

	total_nr_pages = bp->b_page_count;
	map_i = 0;

	if (bp->b_flags & XBF_WRITE) {
		if (bp->b_flags & XBF_SYNCIO)
			rw = WRITE_SYNC;
		else
			rw = WRITE;
		if (bp->b_flags & XBF_FUA)
			rw |= REQ_FUA;
		if (bp->b_flags & XBF_FLUSH)
			rw |= REQ_FLUSH;
	} else if (bp->b_flags & XBF_READ_AHEAD) {
		rw = READA;
	} else {
		rw = READ;
	}

	/* we only use the buffer cache for meta-data */
	rw |= REQ_META;

next_chunk:
	atomic_inc(&bp->b_io_remaining);
	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
	if (nr_pages > total_nr_pages)
		nr_pages = total_nr_pages;

	bio = bio_alloc(GFP_NOIO, nr_pages);
	bio->bi_bdev = bp->b_target->bt_bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;

	for (; size && nr_pages; nr_pages--, map_i++) {
		int	rbytes, nbytes = PAGE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
		if (rbytes < nbytes)
			break;

		offset = 0;
		sector += nbytes >> BBSHIFT;
		size -= nbytes;
		total_nr_pages--;
	}

	if (likely(bio->bi_size)) {
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
		submit_bio(rw, bio);
		if (size)
			goto next_chunk;
	} else {
		xfs_buf_ioerror(bp, EIO);
		bio_put(bio);
	}
}

int
xfs_buf_iorequest(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_iorequest(bp, _RET_IP_);

	if (bp->b_flags & XBF_DELWRI) {
		xfs_buf_delwri_queue(bp, 1);
		return 0;
	}

	if (bp->b_flags & XBF_WRITE) {
		xfs_buf_wait_unpin(bp);
	}

	xfs_buf_hold(bp);

	/*
	 * Set the count to 1 initially, so that an I/O completion callout
	 * which happens before we have started all the I/O cannot call
	 * xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	_xfs_buf_ioapply(bp);
	_xfs_buf_ioend(bp, 0);

	xfs_buf_rele(bp);
	return 0;
}

/*
 * Waits for I/O to complete on the buffer supplied.
 * It returns immediately if no I/O is pending.
 * It returns the I/O error code, if any, or 0 if there was no error.
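 *
 * A minimal usage sketch (mirroring _xfs_buf_read() above, assuming the
 * caller already holds a reference to bp):
 *
 *	status = xfs_buf_iorequest(bp);
 *	if (!status && !bp->b_error && !(flags & XBF_ASYNC))
 *		status = xfs_buf_iowait(bp);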
 */
int
xfs_buf_iowait(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_iowait(bp, _RET_IP_);

	wait_for_completion(&bp->b_iowait);

	trace_xfs_buf_iowait_done(bp, _RET_IP_);
	return bp->b_error;
}

xfs_caddr_t
xfs_buf_offset(
	xfs_buf_t		*bp,
	size_t			offset)
{
	struct page		*page;

	if (bp->b_flags & XBF_MAPPED)
		return bp->b_addr + offset;

	offset += bp->b_offset;
	page = bp->b_pages[offset >> PAGE_SHIFT];
	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
}

/*
 * Move data into or out of a buffer.
 */
void
xfs_buf_iomove(
	xfs_buf_t		*bp,	/* buffer to process		*/
	size_t			boff,	/* starting buffer offset	*/
	size_t			bsize,	/* length to copy		*/
	void			*data,	/* data address			*/
	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
{
	size_t			bend, cpoff, csize;
	struct page		*page;

	bend = boff + bsize;
	while (boff < bend) {
		page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
		cpoff = xfs_buf_poff(boff + bp->b_offset);
		csize = min_t(size_t,
			      PAGE_SIZE-cpoff, bp->b_count_desired-boff);

		ASSERT(((csize + cpoff) <= PAGE_SIZE));

		switch (mode) {
		case XBRW_ZERO:
			memset(page_address(page) + cpoff, 0, csize);
			break;
		case XBRW_READ:
			memcpy(data, page_address(page) + cpoff, csize);
			break;
		case XBRW_WRITE:
			memcpy(page_address(page) + cpoff, data, csize);
		}

		boff += csize;
		data += csize;
	}
}

/*
 * Handling of buffer targets (buftargs).
 */

/*
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
 */
void
xfs_wait_buftarg(
	struct xfs_buftarg	*btp)
{
	struct xfs_buf		*bp;

restart:
	spin_lock(&btp->bt_lru_lock);
	while (!list_empty(&btp->bt_lru)) {
		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
		if (atomic_read(&bp->b_hold) > 1) {
			spin_unlock(&btp->bt_lru_lock);
			delay(100);
			goto restart;
		}
		/*
		 * clear the LRU reference count so the buffer doesn't get
		 * ignored in xfs_buf_rele().
		 */
		atomic_set(&bp->b_lru_ref, 0);
		spin_unlock(&btp->bt_lru_lock);
		xfs_buf_rele(bp);
		spin_lock(&btp->bt_lru_lock);
	}
	spin_unlock(&btp->bt_lru_lock);
}

int
xfs_buftarg_shrink(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	struct xfs_buf		*bp;
	int			nr_to_scan = sc->nr_to_scan;
	LIST_HEAD(dispose);

	if (!nr_to_scan)
		return btp->bt_lru_nr;

	spin_lock(&btp->bt_lru_lock);
	while (!list_empty(&btp->bt_lru)) {
		if (nr_to_scan-- <= 0)
			break;

		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);

		/*
		 * Decrement the b_lru_ref count unless the value is already
		 * zero. If the value is already zero, we need to reclaim the
		 * buffer, otherwise it gets another trip through the LRU.
		 */
		if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
			list_move_tail(&bp->b_lru, &btp->bt_lru);
			continue;
		}

		/*
		 * remove the buffer from the LRU now to avoid needing another
		 * lock round trip inside xfs_buf_rele().
		 */
		list_move(&bp->b_lru, &dispose);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);

	while (!list_empty(&dispose)) {
		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		xfs_buf_rele(bp);
	}

	return btp->bt_lru_nr;
}

void
xfs_free_buftarg(
	struct xfs_mount	*mp,
	struct xfs_buftarg	*btp)
{
	unregister_shrinker(&btp->bt_shrinker);

	xfs_flush_buftarg(btp, 1);
	if (mp->m_flags & XFS_MOUNT_BARRIER)
		xfs_blkdev_issue_flush(btp);

	kthread_stop(btp->bt_task);
	kmem_free(btp);
}

STATIC int
xfs_setsize_buftarg_flags(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize,
	int			verbose)
{
	btp->bt_bsize = blocksize;
	btp->bt_sshift = ffs(sectorsize) - 1;
	btp->bt_smask = sectorsize - 1;

	if (set_blocksize(btp->bt_bdev, sectorsize)) {
		xfs_warn(btp->bt_mount,
			"Cannot set_blocksize to %u on device %s\n",
			sectorsize, xfs_buf_target_name(btp));
		return EINVAL;
	}

	return 0;
}

/*
 * When allocating the initial buffer target we have not yet
 * read in the superblock, so don't know what sized sectors
 * are being used at this early stage.  Play safe.
 */
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	return xfs_setsize_buftarg_flags(btp,
			PAGE_SIZE, bdev_logical_block_size(bdev), 0);
}

int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize)
{
	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
}

STATIC int
xfs_alloc_delwrite_queue(
	xfs_buftarg_t		*btp,
	const char		*fsname)
{
	INIT_LIST_HEAD(&btp->bt_delwrite_queue);
	spin_lock_init(&btp->bt_delwrite_lock);
	btp->bt_flags = 0;
	btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
	if (IS_ERR(btp->bt_task))
		return PTR_ERR(btp->bt_task);
	return 0;
}

xfs_buftarg_t *
xfs_alloc_buftarg(
	struct xfs_mount	*mp,
	struct block_device	*bdev,
	int			external,
	const char		*fsname)
{
	xfs_buftarg_t		*btp;

	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);

	btp->bt_mount = mp;
	btp->bt_dev = bdev->bd_dev;
	btp->bt_bdev = bdev;
	btp->bt_bdi = blk_get_backing_dev_info(bdev);
	if (!btp->bt_bdi)
		goto error;

	INIT_LIST_HEAD(&btp->bt_lru);
	spin_lock_init(&btp->bt_lru_lock);
	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error;
	if (xfs_alloc_delwrite_queue(btp, fsname))
		goto error;
	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
	register_shrinker(&btp->bt_shrinker);
	return btp;

error:
	kmem_free(btp);
	return NULL;
}

/*
 * Delayed write buffer handling
 */
STATIC void
xfs_buf_delwri_queue(
	xfs_buf_t		*bp,
	int			unlock)
{
	struct list_head	*dwq = &bp->b_target->bt_delwrite_queue;
	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;

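	/*
	 * A buffer already on the delwri list is dequeued and re-added at
	 * the tail, an empty list wakes the xfsbufd thread, and b_queuetime
	 * records when the buffer was queued so xfsbufd can age it before
	 * writing it back.
	 */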
	trace_xfs_buf_delwri_queue(bp, _RET_IP_);

	ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));

	spin_lock(dwlk);
	/* If already in the queue, dequeue and place at tail */
	if (!list_empty(&bp->b_list)) {
		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
		if (unlock)
			atomic_dec(&bp->b_hold);
		list_del(&bp->b_list);
	}

	if (list_empty(dwq)) {
		/* start xfsbufd as it is about to have something to do */
		wake_up_process(bp->b_target->bt_task);
	}

	bp->b_flags |= _XBF_DELWRI_Q;
	list_add_tail(&bp->b_list, dwq);
	bp->b_queuetime = jiffies;
	spin_unlock(dwlk);

	if (unlock)
		xfs_buf_unlock(bp);
}

void
xfs_buf_delwri_dequeue(
	xfs_buf_t		*bp)
{
	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
	int			dequeued = 0;

	spin_lock(dwlk);
	if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
		list_del_init(&bp->b_list);
		dequeued = 1;
	}
	bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
	spin_unlock(dwlk);

	if (dequeued)
		xfs_buf_rele(bp);

	trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
}

/*
 * If a delwri buffer needs to be pushed before it has aged out, then promote
 * it to the head of the delwri queue so that it will be flushed on the next
 * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
 * than the age currently needed to flush the buffer. Hence the next time the
 * xfsbufd sees it is guaranteed to be considered old enough to flush.
 */
void
xfs_buf_delwri_promote(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;
	long		age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;

	ASSERT(bp->b_flags & XBF_DELWRI);
	ASSERT(bp->b_flags & _XBF_DELWRI_Q);

	/*
	 * Check the buffer age before locking the delayed write queue as we
	 * don't need to promote buffers that are already past the flush age.
	 */
	if (bp->b_queuetime < jiffies - age)
		return;
	bp->b_queuetime = jiffies - age;
	spin_lock(&btp->bt_delwrite_lock);
	list_move(&bp->b_list, &btp->bt_delwrite_queue);
	spin_unlock(&btp->bt_delwrite_lock);
}

STATIC void
xfs_buf_runall_queues(
	struct workqueue_struct	*queue)
{
	flush_workqueue(queue);
}

/*
 * Move as many buffers as specified to the supplied list,
 * indicating if we skipped any buffers to prevent deadlocks.
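 *
 * Only unpinned buffers that can be locked without blocking are moved;
 * unless XBT_FORCE_FLUSH was set, buffers younger than the supplied age
 * are left on the queue for a later pass.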
 */
STATIC int
xfs_buf_delwri_split(
	xfs_buftarg_t	*target,
	struct list_head *list,
	unsigned long	age)
{
	xfs_buf_t	*bp, *n;
	struct list_head *dwq = &target->bt_delwrite_queue;
	spinlock_t	*dwlk = &target->bt_delwrite_lock;
	int		skipped = 0;
	int		force;

	force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
	INIT_LIST_HEAD(list);
	spin_lock(dwlk);
	list_for_each_entry_safe(bp, n, dwq, b_list) {
		ASSERT(bp->b_flags & XBF_DELWRI);

		if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
			if (!force &&
			    time_before(jiffies, bp->b_queuetime + age)) {
				xfs_buf_unlock(bp);
				break;
			}

			bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q);
			bp->b_flags |= XBF_WRITE;
			list_move_tail(&bp->b_list, list);
			trace_xfs_buf_delwri_split(bp, _RET_IP_);
		} else
			skipped++;
	}
	spin_unlock(dwlk);

	return skipped;
}

/*
 * Compare function is more complex than it needs to be because
 * the return value is only 32 bits and we are doing comparisons
 * on 64 bit values
 */
static int
xfs_buf_cmp(
	void		*priv,
	struct list_head *a,
	struct list_head *b)
{
	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
	xfs_daddr_t	diff;

	diff = ap->b_bn - bp->b_bn;
	if (diff < 0)
		return -1;
	if (diff > 0)
		return 1;
	return 0;
}

STATIC int
xfsbufd(
	void		*data)
{
	xfs_buftarg_t	*target = (xfs_buftarg_t *)data;

	current->flags |= PF_MEMALLOC;

	set_freezable();

	do {
		long	age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
		long	tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
		struct list_head tmp;
		struct blk_plug plug;

		if (unlikely(freezing(current))) {
			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
			refrigerator();
		} else {
			clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
		}

		/* sleep for a long time if there is nothing to do. */
		if (list_empty(&target->bt_delwrite_queue))
			tout = MAX_SCHEDULE_TIMEOUT;
		schedule_timeout_interruptible(tout);

		xfs_buf_delwri_split(target, &tmp, age);
		list_sort(NULL, &tmp, xfs_buf_cmp);

		blk_start_plug(&plug);
		while (!list_empty(&tmp)) {
			struct xfs_buf *bp;
			bp = list_first_entry(&tmp, struct xfs_buf, b_list);
			list_del_init(&bp->b_list);
			xfs_bdstrat_cb(bp);
		}
		blk_finish_plug(&plug);
	} while (!kthread_should_stop());

	return 0;
}

/*
 * Go through all incore buffers, and release buffers if they belong to
 * the given device. This is used in filesystem error handling to
 * preserve the consistency of its metadata.
 */
int
xfs_flush_buftarg(
	xfs_buftarg_t	*target,
	int		wait)
{
	xfs_buf_t	*bp;
	int		pincount = 0;
	LIST_HEAD(tmp_list);
	LIST_HEAD(wait_list);
	struct blk_plug plug;

	xfs_buf_runall_queues(xfsconvertd_workqueue);
	xfs_buf_runall_queues(xfsdatad_workqueue);
	xfs_buf_runall_queues(xfslogd_workqueue);

	set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
	pincount = xfs_buf_delwri_split(target, &tmp_list, 0);

	/*
	 * Dropped the delayed write list lock, now walk the temporary list.
	 * All I/O is issued async and then if we need to wait for completion
	 * we do that after issuing all the IO.
	 */
	list_sort(NULL, &tmp_list, xfs_buf_cmp);

	blk_start_plug(&plug);
	while (!list_empty(&tmp_list)) {
		bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
		ASSERT(target == bp->b_target);
		list_del_init(&bp->b_list);
		if (wait) {
			bp->b_flags &= ~XBF_ASYNC;
			list_add(&bp->b_list, &wait_list);
		}
		xfs_bdstrat_cb(bp);
	}
	blk_finish_plug(&plug);

	if (wait) {
		/* Wait for IO to complete. */
		while (!list_empty(&wait_list)) {
			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);

			list_del_init(&bp->b_list);
			xfs_buf_iowait(bp);
			xfs_buf_relse(bp);
		}
	}

	return pincount;
}

int __init
xfs_buf_init(void)
{
	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
						KM_ZONE_HWALIGN, NULL);
	if (!xfs_buf_zone)
		goto out;

	xfslogd_workqueue = alloc_workqueue("xfslogd",
					WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
	if (!xfslogd_workqueue)
		goto out_free_buf_zone;

	xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
	if (!xfsdatad_workqueue)
		goto out_destroy_xfslogd_workqueue;

	xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
						WQ_MEM_RECLAIM, 1);
	if (!xfsconvertd_workqueue)
		goto out_destroy_xfsdatad_workqueue;

	return 0;

out_destroy_xfsdatad_workqueue:
	destroy_workqueue(xfsdatad_workqueue);
out_destroy_xfslogd_workqueue:
	destroy_workqueue(xfslogd_workqueue);
out_free_buf_zone:
	kmem_zone_destroy(xfs_buf_zone);
out:
	return -ENOMEM;
}

void
xfs_buf_terminate(void)
{
	destroy_workqueue(xfsconvertd_workqueue);
	destroy_workqueue(xfsdatad_workqueue);
	destroy_workqueue(xfslogd_workqueue);
	kmem_zone_destroy(xfs_buf_zone);
}

#ifdef CONFIG_KDB_MODULES
struct list_head *
xfs_get_buftarg_list(void)
{
	return &xfs_buftarg_list;
}
#endif