/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/migrate.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>

#include "xfs_sb.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trace.h"

static kmem_zone_t *xfs_buf_zone;

static struct workqueue_struct *xfslogd_workqueue;

#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
#else
# define XB_SET_OWNER(bp)	do { } while (0)
# define XB_CLEAR_OWNER(bp)	do { } while (0)
# define XB_GET_OWNER(bp)	do { } while (0)
#endif

#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)


static inline int
xfs_buf_is_vmapped(
	struct xfs_buf	*bp)
{
	/*
	 * Return true if the buffer is vmapped.
	 *
	 * b_addr is null if the buffer is not mapped, but the code is clever
	 * enough to know it doesn't have to map a single page, so the check
	 * has to be both for b_addr and bp->b_page_count > 1.
	 */
	return bp->b_addr && bp->b_page_count > 1;
}

static inline int
xfs_buf_vmap_len(
	struct xfs_buf	*bp)
{
	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}

/*
 * xfs_buf_lru_add - add a buffer to the LRU.
 *
 * The LRU takes a new reference to the buffer so that it will only be freed
 * once the shrinker takes the buffer off the LRU.
 */
STATIC void
xfs_buf_lru_add(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	spin_lock(&btp->bt_lru_lock);
	if (list_empty(&bp->b_lru)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_lru, &btp->bt_lru);
		btp->bt_lru_nr++;
	}
	spin_unlock(&btp->bt_lru_lock);
}

/*
 * xfs_buf_lru_del - remove a buffer from the LRU
 *
 * The unlocked check is safe here because it only occurs when there are no
 * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
 * to optimise the shrinker removing the buffer from the LRU and calling
 * xfs_buf_free(), i.e. it removes an unnecessary round trip on the
 * bt_lru_lock.
 */
STATIC void
xfs_buf_lru_del(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	if (list_empty(&bp->b_lru))
		return;

	spin_lock(&btp->bt_lru_lock);
	if (!list_empty(&bp->b_lru)) {
		list_del_init(&bp->b_lru);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);
}

/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_STALE;

	/*
	 * Clear the delwri status so that a delwri queue walker will not
	 * flush this buffer to disk now that it is stale. The delwri queue has
	 * a reference to the buffer, so this is safe to do.
	 */
	bp->b_flags &= ~_XBF_DELWRI_Q;

	atomic_set(&(bp)->b_lru_ref, 0);
	if (!list_empty(&bp->b_lru)) {
		struct xfs_buftarg *btp = bp->b_target;

		spin_lock(&btp->bt_lru_lock);
		if (!list_empty(&bp->b_lru)) {
			list_del_init(&bp->b_lru);
			btp->bt_lru_nr--;
			atomic_dec(&bp->b_hold);
		}
		spin_unlock(&btp->bt_lru_lock);
	}
	ASSERT(atomic_read(&bp->b_hold) >= 1);
}

struct xfs_buf *
xfs_buf_alloc(
	struct xfs_buftarg	*target,
	xfs_daddr_t		blkno,
	size_t			numblks,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;

	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
	if (unlikely(!bp))
		return NULL;

	/*
	 * We don't want certain flags to appear in b_flags unless they are
	 * specifically set by later operations on the buffer.
	 */
	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);

	atomic_set(&bp->b_hold, 1);
	atomic_set(&bp->b_lru_ref, 1);
	init_completion(&bp->b_iowait);
	INIT_LIST_HEAD(&bp->b_lru);
	INIT_LIST_HEAD(&bp->b_list);
	RB_CLEAR_NODE(&bp->b_rbnode);
	sema_init(&bp->b_sema, 0); /* held, no waiters */
	XB_SET_OWNER(bp);
	bp->b_target = target;

	/*
	 * Set length and io_length to the same value initially.
	 * I/O routines should use io_length, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	bp->b_length = numblks;
	bp->b_io_length = numblks;
	bp->b_flags = flags;
	bp->b_bn = blkno;
	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(xb_create);
	trace_xfs_buf_init(bp, _RET_IP_);

	return bp;
}

/*
 * Allocate a page array capable of holding a specified number
 * of pages, and point the page buf at it.
 */
STATIC int
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
	int			page_count,
	xfs_buf_flags_t		flags)
{
	/* Make sure that we have a page list */
	if (bp->b_pages == NULL) {
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
		} else {
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
						 page_count, KM_NOFS);
			if (bp->b_pages == NULL)
				return -ENOMEM;
		}
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 * Frees b_pages if it was allocated.
 */
STATIC void
_xfs_buf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->b_pages != bp->b_page_array) {
		kmem_free(bp->b_pages);
		bp->b_pages = NULL;
	}
}

/*
 * Releases the specified buffer.
 *
 * The modification state of any associated pages is left unchanged.
 * The buffer must not be on any hash - use xfs_buf_rele instead for
 * hashed and refcounted buffers.
 */
void
xfs_buf_free(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_free(bp, _RET_IP_);

	ASSERT(list_empty(&bp->b_lru));

	if (bp->b_flags & _XBF_PAGES) {
		uint		i;

		if (xfs_buf_is_vmapped(bp))
			vm_unmap_ram(bp->b_addr - bp->b_offset,
					bp->b_page_count);

		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

			__free_page(page);
		}
	} else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);
	_xfs_buf_free_pages(bp);
	kmem_zone_free(xfs_buf_zone, bp);
}

/*
 * Allocates all the pages for the buffer in question and builds its page list.
 */
STATIC int
xfs_buf_allocate_memory(
	xfs_buf_t		*bp,
	uint			flags)
{
	size_t			size;
	size_t			nbytes, offset;
	gfp_t			gfp_mask = xb_to_gfp(flags);
	unsigned short		page_count, i;
	xfs_off_t		start, end;
	int			error;

	/*
	 * for buffers that are contained within a single page, just allocate
	 * the memory from the heap - there's no need for the complexity of
	 * page arrays to keep allocation down to order 0.
	 */
	size = BBTOB(bp->b_length);
	if (size < PAGE_SIZE) {
		bp->b_addr = kmem_alloc(size, KM_NOFS);
		if (!bp->b_addr) {
			/* low memory - use alloc_page loop instead */
			goto use_alloc_page;
		}

		if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
			/* b_addr spans two pages - use alloc_page instead */
			kmem_free(bp->b_addr);
			bp->b_addr = NULL;
			goto use_alloc_page;
		}
		bp->b_offset = offset_in_page(bp->b_addr);
		bp->b_pages = bp->b_page_array;
		bp->b_pages[0] = virt_to_page(bp->b_addr);
		bp->b_page_count = 1;
		bp->b_flags |= _XBF_KMEM;
		return 0;
	}

use_alloc_page:
	start = BBTOB(bp->b_bn) >> PAGE_SHIFT;
	end = (BBTOB(bp->b_bn + bp->b_length) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	page_count = end - start;
	error = _xfs_buf_get_pages(bp, page_count, flags);
	if (unlikely(error))
		return error;

	offset = bp->b_offset;
	bp->b_flags |= _XBF_PAGES;

	for (i = 0; i < bp->b_page_count; i++) {
		struct page	*page;
		uint		retries = 0;
retry:
		page = alloc_page(gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & XBF_READ_AHEAD) {
				bp->b_page_count = i;
				error = ENOMEM;
				goto out_free_pages;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				xfs_err(NULL,
		"possible memory allocation deadlock in %s (mode:0x%x)",
					__func__, gfp_mask);

			XFS_STATS_INC(xb_page_retries);
			congestion_wait(BLK_RW_ASYNC, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(xb_page_found);

		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
		size -= nbytes;
		bp->b_pages[i] = page;
		offset = 0;
	}
	return 0;

out_free_pages:
	for (i = 0; i < bp->b_page_count; i++)
		__free_page(bp->b_pages[i]);
	return error;
}

/*
 * Map buffer into kernel address-space if necessary.
 */
STATIC int
_xfs_buf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	ASSERT(bp->b_flags & _XBF_PAGES);
	if (bp->b_page_count == 1) {
		/* A single page buffer is always mappable */
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
	} else if (flags & XBF_UNMAPPED) {
		bp->b_addr = NULL;
	} else {
		int retried = 0;

		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1, PAGE_KERNEL);
			if (bp->b_addr)
				break;
			vm_unmap_aliases();
		} while (retried++ <= 1);

		if (!bp->b_addr)
			return -ENOMEM;
		bp->b_addr += bp->b_offset;
	}

	return 0;
}

/*
 * Finding and Reading Buffers
 */

/*
 * Look up, and create if absent, a lockable buffer for a given range of the
 * block device. The buffer is returned locked. No I/O is implied by this
 * call.
 */
xfs_buf_t *
_xfs_buf_find(
	struct xfs_buftarg	*btp,
	xfs_daddr_t		blkno,
	size_t			numblks,
	xfs_buf_flags_t		flags,
	xfs_buf_t		*new_bp)
{
	size_t			numbytes;
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent;
	xfs_buf_t		*bp;

	numbytes = BBTOB(numblks);

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(numbytes < (1 << btp->bt_sshift)));
	ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));

	/* get tree root */
	pag = xfs_perag_get(btp->bt_mount,
				xfs_daddr_to_agno(btp->bt_mount, blkno));

	/* walk tree */
	spin_lock(&pag->pag_buf_lock);
	rbp = &pag->pag_buf_tree.rb_node;
	parent = NULL;
	bp = NULL;
	while (*rbp) {
		parent = *rbp;
		bp = rb_entry(parent, struct xfs_buf, b_rbnode);

		if (blkno < bp->b_bn)
			rbp = &(*rbp)->rb_left;
		else if (blkno > bp->b_bn)
			rbp = &(*rbp)->rb_right;
		else {
			/*
			 * found a block number match. If the range doesn't
			 * match, the only way this is allowed is if the buffer
			 * in the cache is stale and the transaction that made
			 * it stale has not yet committed. i.e. we are
			 * reallocating a busy extent. Skip this buffer and
			 * continue searching to the right for an exact match.
			 */
			if (bp->b_length != numblks) {
				ASSERT(bp->b_flags & XBF_STALE);
				rbp = &(*rbp)->rb_right;
				continue;
			}
			atomic_inc(&bp->b_hold);
			goto found;
		}
	}

	/* No match found */
	if (new_bp) {
		rb_link_node(&new_bp->b_rbnode, parent, rbp);
		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
		/* the buffer keeps the perag reference until it is freed */
		new_bp->b_pag = pag;
		spin_unlock(&pag->pag_buf_lock);
	} else {
		XFS_STATS_INC(xb_miss_locked);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
	}
	return new_bp;

found:
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);

	if (!xfs_buf_trylock(bp)) {
		if (flags & XBF_TRYLOCK) {
			xfs_buf_rele(bp);
			XFS_STATS_INC(xb_busy_locked);
			return NULL;
		}
		xfs_buf_lock(bp);
		XFS_STATS_INC(xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	XFS_STATS_INC(xb_get_locked);
	return bp;
}

/*
 * Assembles a buffer covering the specified range. The code is optimised for
 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 * more hits than misses.
 */
struct xfs_buf *
xfs_buf_get(
	xfs_buftarg_t		*target,
	xfs_daddr_t		blkno,
	size_t			numblks,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	struct xfs_buf		*new_bp;
	int			error = 0;

	bp = _xfs_buf_find(target, blkno, numblks, flags, NULL);
	if (likely(bp))
		goto found;

	new_bp = xfs_buf_alloc(target, blkno, numblks, flags);
	if (unlikely(!new_bp))
		return NULL;

	error = xfs_buf_allocate_memory(new_bp, flags);
	if (error) {
		kmem_zone_free(xfs_buf_zone, new_bp);
		return NULL;
	}

	bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp);
	if (!bp) {
		xfs_buf_free(new_bp);
		return NULL;
	}

	if (bp != new_bp)
		xfs_buf_free(new_bp);

	bp->b_io_length = bp->b_length;

found:
	if (!bp->b_addr) {
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			xfs_warn(target->bt_mount,
				"%s: failed to map pages\n", __func__);
			xfs_buf_relse(bp);
			return NULL;
		}
	}

	XFS_STATS_INC(xb_get);
	trace_xfs_buf_get(bp, flags, _RET_IP_);
	return bp;
}

STATIC int
_xfs_buf_read(
	xfs_buf_t		*bp,
	xfs_buf_flags_t		flags)
{
	ASSERT(!(flags & XBF_WRITE));
	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);

	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

	xfs_buf_iorequest(bp);
	if (flags & XBF_ASYNC)
		return 0;
	return xfs_buf_iowait(bp);
}

xfs_buf_t *
xfs_buf_read(
	xfs_buftarg_t		*target,
	xfs_daddr_t		blkno,
	size_t			numblks,
	xfs_buf_flags_t		flags)
{
	xfs_buf_t		*bp;

	flags |= XBF_READ;

	bp = xfs_buf_get(target, blkno, numblks, flags);
	if (bp) {
		trace_xfs_buf_read(bp, flags, _RET_IP_);

		if (!XFS_BUF_ISDONE(bp)) {
			XFS_STATS_INC(xb_get_read);
			_xfs_buf_read(bp, flags);
		} else if (flags & XBF_ASYNC) {
			/*
			 * Read ahead call which is already satisfied,
			 * drop the buffer
			 */
			xfs_buf_relse(bp);
			return NULL;
		} else {
			/* We do not want read in the flags */
			bp->b_flags &= ~XBF_READ;
		}
	}

	return bp;
}

/*
 * If we are not low on memory then do the readahead in a deadlock
 * safe manner.
 */
void
xfs_buf_readahead(
	xfs_buftarg_t		*target,
	xfs_daddr_t		blkno,
	size_t			numblks)
{
	if (bdi_read_congested(target->bt_bdi))
		return;

	xfs_buf_read(target, blkno, numblks,
		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
}

/*
 * Read an uncached buffer from disk. Allocates and returns a locked
 * buffer containing the disk contents or nothing.
 */
struct xfs_buf *
xfs_buf_read_uncached(
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
	size_t			numblks,
	int			flags)
{
	xfs_buf_t		*bp;
	int			error;

	bp = xfs_buf_get_uncached(target, numblks, flags);
	if (!bp)
		return NULL;

	/* set up the buffer for a read IO */
	XFS_BUF_SET_ADDR(bp, daddr);
	XFS_BUF_READ(bp);

	xfsbdstrat(target->bt_mount, bp);
	error = xfs_buf_iowait(bp);
	if (error) {
		xfs_buf_relse(bp);
		return NULL;
	}
	return bp;
}

/*
 * Return a buffer allocated as an empty buffer and associated with external
 * memory via xfs_buf_associate_memory() back to its empty state.
 */
void
xfs_buf_set_empty(
	struct xfs_buf		*bp,
	size_t			numblks)
{
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_page_count = 0;
	bp->b_addr = NULL;
	bp->b_length = numblks;
	bp->b_io_length = numblks;
	bp->b_bn = XFS_BUF_DADDR_NULL;
}

static inline struct page *
mem_to_page(
	void			*addr)
{
	if ((!is_vmalloc_addr(addr))) {
		return virt_to_page(addr);
	} else {
		return vmalloc_to_page(addr);
	}
}

int
xfs_buf_associate_memory(
	xfs_buf_t		*bp,
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
	unsigned long		pageaddr;
	unsigned long		offset;
	size_t			buflen;
	int			page_count;

	pageaddr = (unsigned long)mem & PAGE_MASK;
	offset = (unsigned long)mem - pageaddr;
	buflen = PAGE_ALIGN(len + offset);
	page_count = buflen >> PAGE_SHIFT;

	/* Free any previous set of page pointers */
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_addr = mem;

	rval = _xfs_buf_get_pages(bp, page_count, 0);
	if (rval)
		return rval;

	bp->b_offset = offset;

	for (i = 0; i < bp->b_page_count; i++) {
		bp->b_pages[i] = mem_to_page((void *)pageaddr);
		pageaddr += PAGE_SIZE;
	}

	bp->b_io_length = BTOBB(len);
	bp->b_length = BTOBB(buflen);

	return 0;
}

xfs_buf_t *
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
	size_t			numblks,
	int			flags)
{
	unsigned long		page_count;
	int			error, i;
	xfs_buf_t		*bp;

	bp = xfs_buf_alloc(target, XFS_BUF_DADDR_NULL, numblks, 0);
	if (unlikely(bp == NULL))
		goto fail;

	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
	error = _xfs_buf_get_pages(bp, page_count, 0);
	if (error)
		goto fail_free_buf;

	for (i = 0; i < page_count; i++) {
		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
		if (!bp->b_pages[i])
			goto fail_free_mem;
	}
	bp->b_flags |= _XBF_PAGES;

	error = _xfs_buf_map_pages(bp, 0);
	if (unlikely(error)) {
		xfs_warn(target->bt_mount,
			"%s: failed to map pages\n", __func__);
		goto fail_free_mem;
	}

	trace_xfs_buf_get_uncached(bp, _RET_IP_);
	return bp;

 fail_free_mem:
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
	_xfs_buf_free_pages(bp);
 fail_free_buf:
	kmem_zone_free(xfs_buf_zone, bp);
 fail:
	return NULL;
}

/*
 * Increment reference count on buffer, to hold the buffer concurrently
 * with another thread which may release (free) the buffer asynchronously.
 * Must hold the buffer already to call this function.
 */
void
xfs_buf_hold(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);
	atomic_inc(&bp->b_hold);
}

/*
 * Releases a hold on the specified buffer. If the hold count is 1,
 * calls xfs_buf_free.
 */
void
xfs_buf_rele(
	xfs_buf_t		*bp)
{
	struct xfs_perag	*pag = bp->b_pag;

	trace_xfs_buf_rele(bp, _RET_IP_);

	if (!pag) {
		ASSERT(list_empty(&bp->b_lru));
		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
		if (atomic_dec_and_test(&bp->b_hold))
			xfs_buf_free(bp);
		return;
	}

	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));

	ASSERT(atomic_read(&bp->b_hold) > 0);
	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
		if (!(bp->b_flags & XBF_STALE) &&
			   atomic_read(&bp->b_lru_ref)) {
			xfs_buf_lru_add(bp);
			spin_unlock(&pag->pag_buf_lock);
		} else {
			xfs_buf_lru_del(bp);
			ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
			spin_unlock(&pag->pag_buf_lock);
			xfs_perag_put(pag);
			xfs_buf_free(bp);
		}
	}
}


/*
 * Lock a buffer object, if it is not already locked.
 *
 * If we come across a stale, pinned, locked buffer, we know that we are
 * being asked to lock a buffer that has been reallocated. Because it is
 * pinned, we know that the log has not been pushed to disk and hence it
 * will still be locked. Rather than continuing to have trylock attempts
 * fail until someone else pushes the log, push it ourselves before
 * returning. This means that the xfsaild will not get stuck trying
 * to push on stale inode buffers.
 */
int
xfs_buf_trylock(
	struct xfs_buf		*bp)
{
	int			locked;

	locked = down_trylock(&bp->b_sema) == 0;
	if (locked)
		XB_SET_OWNER(bp);
	else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);

	trace_xfs_buf_trylock(bp, _RET_IP_);
	return locked;
}
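
/*
 * Illustrative only (not part of the original file): a minimal sketch of how
 * callers typically combine the XBF_TRYLOCK semantics above with a blocking
 * fallback, mirroring the pattern used in _xfs_buf_find(). The "flags" and
 * "bp" names here are assumed caller-local values.
 *
 *	if (!xfs_buf_trylock(bp)) {
 *		if (flags & XBF_TRYLOCK)
 *			return NULL;	// caller opted out of blocking
 *		xfs_buf_lock(bp);	// may push the log for stale pinned buffers
 *	}
 */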
/*
 * Lock a buffer object.
 *
 * If we come across a stale, pinned, locked buffer, we know that we
 * are being asked to lock a buffer that has been reallocated. Because
 * it is pinned, we know that the log has not been pushed to disk and
 * hence it will still be locked. Rather than sleeping until someone
 * else pushes the log, push it ourselves before trying to get the lock.
 */
void
xfs_buf_lock(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_lock(bp, _RET_IP_);

	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);
	down(&bp->b_sema);
	XB_SET_OWNER(bp);

	trace_xfs_buf_lock_done(bp, _RET_IP_);
}

void
xfs_buf_unlock(
	struct xfs_buf		*bp)
{
	XB_CLEAR_OWNER(bp);
	up(&bp->b_sema);

	trace_xfs_buf_unlock(bp, _RET_IP_);
}

STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 * Buffer Utility Routines
 */

STATIC void
xfs_buf_iodone_work(
	struct work_struct	*work)
{
	xfs_buf_t		*bp =
		container_of(work, xfs_buf_t, b_iodone_work);

	if (bp->b_iodone)
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
}

void
xfs_buf_ioend(
	xfs_buf_t		*bp,
	int			schedule)
{
	trace_xfs_buf_iodone(bp, _RET_IP_);

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
	if (bp->b_error == 0)
		bp->b_flags |= XBF_DONE;

	if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
		if (schedule) {
			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
		} else {
			xfs_buf_iodone_work(&bp->b_iodone_work);
		}
	} else {
		complete(&bp->b_iowait);
	}
}

void
xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error)
{
	ASSERT(error >= 0 && error <= 0xffff);
	bp->b_error = (unsigned short)error;
	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
}

void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	const char		*func)
{
	xfs_alert(bp->b_target->bt_mount,
"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
		(__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
}

/*
 * Called when we want to stop a buffer from getting written or read.
 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
 * so that the proper iodone callbacks get called.
 */
STATIC int
xfs_bioerror(
	xfs_buf_t *bp)
{
#ifdef XFSERRORDEBUG
	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
#endif

	/*
	 * No need to wait until the buffer is unpinned, we aren't flushing it.
	 */
	xfs_buf_ioerror(bp, EIO);

	/*
	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_UNDONE(bp);
	xfs_buf_stale(bp);

	xfs_buf_ioend(bp, 0);

	return EIO;
}

/*
 * Same as xfs_bioerror, except that we are releasing the buffer
 * here ourselves, and avoiding the xfs_buf_ioend call.
 * This is meant for userdata errors; metadata bufs come with
 * iodone functions attached, so that we can track down errors.
 */
STATIC int
xfs_bioerror_relse(
	struct xfs_buf	*bp)
{
	int64_t		fl = bp->b_flags;
	/*
	 * No need to wait until the buffer is unpinned.
	 * We aren't flushing it.
	 *
	 * chunkhold expects B_DONE to be set, whether
	 * we actually finish the I/O or not. We don't want to
	 * change that interface.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_DONE(bp);
	xfs_buf_stale(bp);
	bp->b_iodone = NULL;
	if (!(fl & XBF_ASYNC)) {
		/*
		 * Mark b_error and B_ERROR _both_.
		 * Lots of chunkcache code assumes that.
		 * There's no reason to mark error for
		 * ASYNC buffers.
		 */
		xfs_buf_ioerror(bp, EIO);
		complete(&bp->b_iowait);
	} else {
		xfs_buf_relse(bp);
	}

	return EIO;
}

STATIC int
xfs_bdstrat_cb(
	struct xfs_buf	*bp)
{
	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		trace_xfs_bdstrat_shut(bp, _RET_IP_);
		/*
		 * Metadata write that didn't get logged but
		 * written delayed anyway. These aren't associated
		 * with a transaction, and can be ignored.
		 */
		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
			return xfs_bioerror_relse(bp);
		else
			return xfs_bioerror(bp);
	}

	xfs_buf_iorequest(bp);
	return 0;
}

int
xfs_bwrite(
	struct xfs_buf		*bp)
{
	int			error;

	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_WRITE;
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);

	xfs_bdstrat_cb(bp);

	error = xfs_buf_iowait(bp);
	if (error) {
		xfs_force_shutdown(bp->b_target->bt_mount,
				   SHUTDOWN_META_IO_ERROR);
	}
	return error;
}

/*
 * Wrapper around bdstrat so that we can stop data from going to disk in case
 * we are shutting down the filesystem. Typically user data goes through this
 * path; one of the exceptions is the superblock.
 */
void
xfsbdstrat(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp)
{
	if (XFS_FORCED_SHUTDOWN(mp)) {
		trace_xfs_bdstrat_shut(bp, _RET_IP_);
		xfs_bioerror_relse(bp);
		return;
	}

	xfs_buf_iorequest(bp);
}

STATIC void
_xfs_buf_ioend(
	xfs_buf_t		*bp,
	int			schedule)
{
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
		xfs_buf_ioend(bp, schedule);
}

STATIC void
xfs_buf_bio_end_io(
	struct bio		*bio,
	int			error)
{
	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;

	xfs_buf_ioerror(bp, -error);

	if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

	_xfs_buf_ioend(bp, 1);
	bio_put(bio);
}

STATIC void
_xfs_buf_ioapply(
	xfs_buf_t		*bp)
{
	int			rw, map_i, total_nr_pages, nr_pages;
	struct bio		*bio;
	int			offset = bp->b_offset;
	int			size = BBTOB(bp->b_io_length);
	sector_t		sector = bp->b_bn;

	total_nr_pages = bp->b_page_count;
	map_i = 0;

	if (bp->b_flags & XBF_WRITE) {
		if (bp->b_flags & XBF_SYNCIO)
			rw = WRITE_SYNC;
		else
			rw = WRITE;
		if (bp->b_flags & XBF_FUA)
			rw |= REQ_FUA;
		if (bp->b_flags & XBF_FLUSH)
			rw |= REQ_FLUSH;
	} else if (bp->b_flags & XBF_READ_AHEAD) {
		rw = READA;
	} else {
		rw = READ;
	}

	/* we only use the buffer cache for meta-data */
	rw |= REQ_META;

next_chunk:
	atomic_inc(&bp->b_io_remaining);
	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
	if (nr_pages > total_nr_pages)
		nr_pages = total_nr_pages;

	bio = bio_alloc(GFP_NOIO, nr_pages);
	bio->bi_bdev = bp->b_target->bt_bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;


	for (; size && nr_pages; nr_pages--, map_i++) {
		int	rbytes, nbytes = PAGE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
		if (rbytes < nbytes)
			break;

		offset = 0;
		sector += BTOBB(nbytes);
		size -= nbytes;
		total_nr_pages--;
	}

	if (likely(bio->bi_size)) {
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
		submit_bio(rw, bio);
		if (size)
			goto next_chunk;
	} else {
		xfs_buf_ioerror(bp, EIO);
		bio_put(bio);
	}
}

void
xfs_buf_iorequest(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_iorequest(bp, _RET_IP_);

	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));

	if (bp->b_flags & XBF_WRITE)
		xfs_buf_wait_unpin(bp);
	xfs_buf_hold(bp);

	/*
	 * Set the count to 1 initially; this will stop an I/O
	 * completion callout which happens before we have started
	 * all the I/O from calling xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	_xfs_buf_ioapply(bp);
	_xfs_buf_ioend(bp, 1);

	xfs_buf_rele(bp);
}

/*
 * Waits for I/O to complete on the buffer supplied. It returns immediately if
 * no I/O is pending or there is already a pending error on the buffer. It
 * returns the I/O error code, if any, or 0 if there was no error.
 */
int
xfs_buf_iowait(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_iowait(bp, _RET_IP_);

	if (!bp->b_error)
		wait_for_completion(&bp->b_iowait);

	trace_xfs_buf_iowait_done(bp, _RET_IP_);
	return bp->b_error;
}

xfs_caddr_t
xfs_buf_offset(
	xfs_buf_t		*bp,
	size_t			offset)
{
	struct page		*page;

	if (bp->b_addr)
		return bp->b_addr + offset;

	offset += bp->b_offset;
	page = bp->b_pages[offset >> PAGE_SHIFT];
	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
}

/*
 * Move data into or out of a buffer.
 */
void
xfs_buf_iomove(
	xfs_buf_t		*bp,	/* buffer to process		*/
	size_t			boff,	/* starting buffer offset	*/
	size_t			bsize,	/* length to copy		*/
	void			*data,	/* data address			*/
	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
{
	size_t			bend;

	bend = boff + bsize;
	while (boff < bend) {
		struct page	*page;
		int		page_index, page_offset, csize;

		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
		page = bp->b_pages[page_index];
		csize = min_t(size_t, PAGE_SIZE - page_offset,
				      BBTOB(bp->b_io_length) - boff);

		ASSERT((csize + page_offset) <= PAGE_SIZE);

		switch (mode) {
		case XBRW_ZERO:
			memset(page_address(page) + page_offset, 0, csize);
			break;
		case XBRW_READ:
			memcpy(data, page_address(page) + page_offset, csize);
			break;
		case XBRW_WRITE:
			memcpy(page_address(page) + page_offset, data, csize);
		}

		boff += csize;
		data += csize;
	}
}
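
/*
 * Illustrative only (not part of the original file): a minimal sketch of how
 * xfs_buf_iomove() above might be used by a caller to zero a locked buffer
 * and then copy its own data into the start of it. "bp", "hdr" and "hdr_len"
 * are assumed caller-local values, not names from this file.
 *
 *	xfs_buf_iomove(bp, 0, BBTOB(bp->b_io_length), NULL, XBRW_ZERO);
 *	xfs_buf_iomove(bp, 0, hdr_len, hdr, XBRW_WRITE);
 */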
/*
 * Handling of buffer targets (buftargs).
 */

/*
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
 */
void
xfs_wait_buftarg(
	struct xfs_buftarg	*btp)
{
	struct xfs_buf		*bp;

restart:
	spin_lock(&btp->bt_lru_lock);
	while (!list_empty(&btp->bt_lru)) {
		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
		if (atomic_read(&bp->b_hold) > 1) {
			spin_unlock(&btp->bt_lru_lock);
			delay(100);
			goto restart;
		}
		/*
		 * clear the LRU reference count so the buffer doesn't get
		 * ignored in xfs_buf_rele().
		 */
		atomic_set(&bp->b_lru_ref, 0);
		spin_unlock(&btp->bt_lru_lock);
		xfs_buf_rele(bp);
		spin_lock(&btp->bt_lru_lock);
	}
	spin_unlock(&btp->bt_lru_lock);
}

int
xfs_buftarg_shrink(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	struct xfs_buf		*bp;
	int nr_to_scan = sc->nr_to_scan;
	LIST_HEAD(dispose);

	if (!nr_to_scan)
		return btp->bt_lru_nr;

	spin_lock(&btp->bt_lru_lock);
	while (!list_empty(&btp->bt_lru)) {
		if (nr_to_scan-- <= 0)
			break;

		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);

		/*
		 * Decrement the b_lru_ref count unless the value is already
		 * zero. If the value is already zero, we need to reclaim the
		 * buffer, otherwise it gets another trip through the LRU.
		 */
		if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
			list_move_tail(&bp->b_lru, &btp->bt_lru);
			continue;
		}

		/*
		 * remove the buffer from the LRU now to avoid needing another
		 * lock round trip inside xfs_buf_rele().
		 */
		list_move(&bp->b_lru, &dispose);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);

	while (!list_empty(&dispose)) {
		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		xfs_buf_rele(bp);
	}

	return btp->bt_lru_nr;
}

void
xfs_free_buftarg(
	struct xfs_mount	*mp,
	struct xfs_buftarg	*btp)
{
	unregister_shrinker(&btp->bt_shrinker);

	if (mp->m_flags & XFS_MOUNT_BARRIER)
		xfs_blkdev_issue_flush(btp);

	kmem_free(btp);
}

STATIC int
xfs_setsize_buftarg_flags(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize,
	int			verbose)
{
	btp->bt_bsize = blocksize;
	btp->bt_sshift = ffs(sectorsize) - 1;
	btp->bt_smask = sectorsize - 1;

	if (set_blocksize(btp->bt_bdev, sectorsize)) {
		char name[BDEVNAME_SIZE];

		bdevname(btp->bt_bdev, name);

		xfs_warn(btp->bt_mount,
			"Cannot set_blocksize to %u on device %s\n",
			sectorsize, name);
		return EINVAL;
	}

	return 0;
}
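
/*
 * Illustrative only (not part of the original file): a worked example of the
 * shift/mask setup above. For a 512-byte sector, ffs(512) == 10, so bt_sshift
 * becomes 9 and bt_smask becomes 0x1ff; _xfs_buf_find() then uses these to
 * assert that I/O is at least one sector in size and sector aligned.
 */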
/*
 * When allocating the initial buffer target we have not yet
 * read in the superblock, so we don't know what size sectors
 * are being used at this early stage. Play safe.
 */
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	return xfs_setsize_buftarg_flags(btp,
			PAGE_SIZE, bdev_logical_block_size(bdev), 0);
}

int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize)
{
	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
}

xfs_buftarg_t *
xfs_alloc_buftarg(
	struct xfs_mount	*mp,
	struct block_device	*bdev,
	int			external,
	const char		*fsname)
{
	xfs_buftarg_t		*btp;

	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);

	btp->bt_mount = mp;
	btp->bt_dev = bdev->bd_dev;
	btp->bt_bdev = bdev;
	btp->bt_bdi = blk_get_backing_dev_info(bdev);
	if (!btp->bt_bdi)
		goto error;

	INIT_LIST_HEAD(&btp->bt_lru);
	spin_lock_init(&btp->bt_lru_lock);
	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error;
	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
	register_shrinker(&btp->bt_shrinker);
	return btp;

error:
	kmem_free(btp);
	return NULL;
}

/*
 * Add a buffer to the delayed write list.
 *
 * This queues a buffer for writeout if it hasn't already been. Note that
 * neither this routine nor the buffer list submission functions perform
 * any internal synchronization. It is expected that the lists are thread-local
 * to the callers.
 *
 * Returns true if we queued up the buffer, or false if it was already
 * on the buffer list.
 */
bool
xfs_buf_delwri_queue(
	struct xfs_buf		*bp,
	struct list_head	*list)
{
	ASSERT(xfs_buf_islocked(bp));
	ASSERT(!(bp->b_flags & XBF_READ));

	/*
	 * If the buffer is already marked delwri it already is queued up
	 * by someone else for immediate writeout. Just ignore it in that
	 * case.
	 */
	if (bp->b_flags & _XBF_DELWRI_Q) {
		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
		return false;
	}

	trace_xfs_buf_delwri_queue(bp, _RET_IP_);

	/*
	 * If a buffer gets written out synchronously or marked stale while it
	 * is on a delwri list we lazily remove it. To do this, the other party
	 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
	 * It remains referenced and on the list. In a rare corner case it
	 * might get re-added to a delwri list after the synchronous writeout,
	 * in which case we just need to re-add the flag here.
	 */
	bp->b_flags |= _XBF_DELWRI_Q;
	if (list_empty(&bp->b_list)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_list, list);
	}

	return true;
}

/*
 * Compare function is more complex than it needs to be because
 * the return value is only 32 bits and we are doing comparisons
 * on 64 bit values
 */
static int
xfs_buf_cmp(
	void		*priv,
	struct list_head *a,
	struct list_head *b)
{
	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
	xfs_daddr_t		diff;

	diff = ap->b_bn - bp->b_bn;
	if (diff < 0)
		return -1;
	if (diff > 0)
		return 1;
	return 0;
}

static int
__xfs_buf_delwri_submit(
	struct list_head	*buffer_list,
	struct list_head	*io_list,
	bool			wait)
{
	struct blk_plug		plug;
	struct xfs_buf		*bp, *n;
	int			pinned = 0;

	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
		if (!wait) {
			if (xfs_buf_ispinned(bp)) {
				pinned++;
				continue;
			}
			if (!xfs_buf_trylock(bp))
				continue;
		} else {
			xfs_buf_lock(bp);
		}

		/*
		 * Someone else might have written the buffer synchronously or
		 * marked it stale in the meantime. In that case only the
		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
		 * reference and remove it from the list here.
		 */
		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
			list_del_init(&bp->b_list);
			xfs_buf_relse(bp);
			continue;
		}

		list_move_tail(&bp->b_list, io_list);
		trace_xfs_buf_delwri_split(bp, _RET_IP_);
	}

	list_sort(NULL, io_list, xfs_buf_cmp);

	blk_start_plug(&plug);
	list_for_each_entry_safe(bp, n, io_list, b_list) {
		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
		bp->b_flags |= XBF_WRITE;

		if (!wait) {
			bp->b_flags |= XBF_ASYNC;
			list_del_init(&bp->b_list);
		}
		xfs_bdstrat_cb(bp);
	}
	blk_finish_plug(&plug);

	return pinned;
}

/*
 * Write out a buffer list asynchronously.
 *
 * This will take the @buffer_list, write all non-locked and non-pinned buffers
 * out and not wait for I/O completion on any of the buffers. This interface
 * is only safely usable for callers that can track I/O completion by higher
 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
 * function.
 */
int
xfs_buf_delwri_submit_nowait(
	struct list_head	*buffer_list)
{
	LIST_HEAD		(io_list);
	return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
}
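
/*
 * Illustrative only (not part of the original file): a minimal sketch of the
 * calling convention described above xfs_buf_delwri_queue() - a caller-local
 * list, buffers queued while locked, then a single submission call. "bp" and
 * "error" are assumed caller-local values.
 *
 *	LIST_HEAD(buffer_list);
 *
 *	xfs_buf_lock(bp);
 *	xfs_buf_delwri_queue(bp, &buffer_list);
 *	xfs_buf_unlock(bp);
 *	...
 *	error = xfs_buf_delwri_submit(&buffer_list);
 */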
/*
 * Write out a buffer list synchronously.
 *
 * This will take the @buffer_list, write all buffers out and wait for I/O
 * completion on all of the buffers. @buffer_list is consumed by the function,
 * so callers must have some other way of tracking buffers if they require such
 * functionality.
 */
int
xfs_buf_delwri_submit(
	struct list_head	*buffer_list)
{
	LIST_HEAD		(io_list);
	int			error = 0, error2;
	struct xfs_buf		*bp;

	__xfs_buf_delwri_submit(buffer_list, &io_list, true);

	/* Wait for IO to complete. */
	while (!list_empty(&io_list)) {
		bp = list_first_entry(&io_list, struct xfs_buf, b_list);

		list_del_init(&bp->b_list);
		error2 = xfs_buf_iowait(bp);
		xfs_buf_relse(bp);
		if (!error)
			error = error2;
	}

	return error;
}

int __init
xfs_buf_init(void)
{
	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
						KM_ZONE_HWALIGN, NULL);
	if (!xfs_buf_zone)
		goto out;

	xfslogd_workqueue = alloc_workqueue("xfslogd",
					WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
	if (!xfslogd_workqueue)
		goto out_free_buf_zone;

	return 0;

 out_free_buf_zone:
	kmem_zone_destroy(xfs_buf_zone);
 out:
	return -ENOMEM;
}

void
xfs_buf_terminate(void)
{
	destroy_workqueue(xfslogd_workqueue);
	kmem_zone_destroy(xfs_buf_zone);
}