/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "xfs.h"
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/migrate.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>

#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
#include "xfs_log.h"

static kmem_zone_t *xfs_buf_zone;

#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
#else
# define XB_SET_OWNER(bp)	do { } while (0)
# define XB_CLEAR_OWNER(bp)	do { } while (0)
# define XB_GET_OWNER(bp)	do { } while (0)
#endif

#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)


static inline int
xfs_buf_is_vmapped(
	struct xfs_buf	*bp)
{
	/*
	 * Return true if the buffer is vmapped.
	 *
	 * b_addr is null if the buffer is not mapped, but the code is clever
	 * enough to know it doesn't have to map a single page, so the check
	 * has to be both for b_addr and bp->b_page_count > 1.
	 */
	return bp->b_addr && bp->b_page_count > 1;
}

static inline int
xfs_buf_vmap_len(
	struct xfs_buf	*bp)
{
	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}

/*
 * Bump the I/O in flight count on the buftarg if we haven't yet done so for
 * this buffer. The count is incremented once per buffer (per hold cycle)
 * because the corresponding decrement is deferred to buffer release. Buffers
 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
 * tracking adds unnecessary overhead. This is used for synchronization
 * purposes with unmount (see xfs_wait_buftarg()), so all we really need is a
 * count of in-flight buffers.
 *
 * Buffers that are never released (e.g., superblock, iclog buffers) must set
 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
 * never reaches zero and unmount hangs indefinitely.
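 *
 * As a rough illustration only (not lifted from any particular caller), the
 * accounting brackets an async buffer's in-flight period like this:
 *
 *	xfs_buf_submit(bp);	-> xfs_buf_ioacct_inc()  (bt_io_count++)
 *	... I/O completes ...
 *	xfs_buf_rele(bp);	-> xfs_buf_ioacct_dec()  (bt_io_count--)
 *
 * xfs_wait_buftarg() then only needs to wait for bt_io_count to reach zero
 * rather than tracking every individual I/O on every buffer.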
 */
static inline void
xfs_buf_ioacct_inc(
	struct xfs_buf	*bp)
{
	if (bp->b_flags & (XBF_NO_IOACCT|_XBF_IN_FLIGHT))
		return;

	ASSERT(bp->b_flags & XBF_ASYNC);
	bp->b_flags |= _XBF_IN_FLIGHT;
	percpu_counter_inc(&bp->b_target->bt_io_count);
}

/*
 * Clear the in-flight state on a buffer about to be released to the LRU or
 * freed and unaccount from the buftarg.
 */
static inline void
xfs_buf_ioacct_dec(
	struct xfs_buf	*bp)
{
	if (!(bp->b_flags & _XBF_IN_FLIGHT))
		return;

	bp->b_flags &= ~_XBF_IN_FLIGHT;
	percpu_counter_dec(&bp->b_target->bt_io_count);
}

/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_STALE;

	/*
	 * Clear the delwri status so that a delwri queue walker will not
	 * flush this buffer to disk now that it is stale. The delwri queue has
	 * a reference to the buffer, so this is safe to do.
	 */
	bp->b_flags &= ~_XBF_DELWRI_Q;

	/*
	 * Once the buffer is marked stale and unlocked, a subsequent lookup
	 * could reset b_flags. There is no guarantee that the buffer is
	 * unaccounted (released to LRU) before that occurs. Drop in-flight
	 * status now to preserve accounting consistency.
	 */
	xfs_buf_ioacct_dec(bp);

	spin_lock(&bp->b_lock);
	atomic_set(&bp->b_lru_ref, 0);
	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
		atomic_dec(&bp->b_hold);

	ASSERT(atomic_read(&bp->b_hold) >= 1);
	spin_unlock(&bp->b_lock);
}

static int
xfs_buf_get_maps(
	struct xfs_buf		*bp,
	int			map_count)
{
	ASSERT(bp->b_maps == NULL);
	bp->b_map_count = map_count;

	if (map_count == 1) {
		bp->b_maps = &bp->__b_map;
		return 0;
	}

	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
				 KM_NOFS);
	if (!bp->b_maps)
		return -ENOMEM;
	return 0;
}

/*
 * Frees b_maps if it was allocated.
 */
static void
xfs_buf_free_maps(
	struct xfs_buf	*bp)
{
	if (bp->b_maps != &bp->__b_map) {
		kmem_free(bp->b_maps);
		bp->b_maps = NULL;
	}
}

struct xfs_buf *
_xfs_buf_alloc(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	int			error;
	int			i;

	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
	if (unlikely(!bp))
		return NULL;

	/*
	 * We don't want certain flags to appear in b_flags unless they are
	 * specifically set by later operations on the buffer.
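	 *
	 * For example (illustrative only): a lookup done with XBF_TRYLOCK
	 * only wants trylock semantics for that one lookup; the flag must not
	 * stick to the buffer and change the behaviour of later lookups or
	 * I/O submissions. The same applies to XBF_ASYNC, XBF_READ_AHEAD and
	 * XBF_UNMAPPED, which are applied per operation as needed.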
	 */
	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);

	atomic_set(&bp->b_hold, 1);
	atomic_set(&bp->b_lru_ref, 1);
	init_completion(&bp->b_iowait);
	INIT_LIST_HEAD(&bp->b_lru);
	INIT_LIST_HEAD(&bp->b_list);
	RB_CLEAR_NODE(&bp->b_rbnode);
	sema_init(&bp->b_sema, 0); /* held, no waiters */
	spin_lock_init(&bp->b_lock);
	XB_SET_OWNER(bp);
	bp->b_target = target;
	bp->b_flags = flags;

	/*
	 * Set length and io_length to the same value initially.
	 * I/O routines should use io_length, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	error = xfs_buf_get_maps(bp, nmaps);
	if (error) {
		kmem_zone_free(xfs_buf_zone, bp);
		return NULL;
	}

	bp->b_bn = map[0].bm_bn;
	bp->b_length = 0;
	for (i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bp->b_length += map[i].bm_len;
	}
	bp->b_io_length = bp->b_length;

	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(target->bt_mount, xb_create);
	trace_xfs_buf_init(bp, _RET_IP_);

	return bp;
}

/*
 * Allocate a page array capable of holding a specified number
 * of pages, and point the page buf at it.
 */
STATIC int
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
	int			page_count)
{
	/* Make sure that we have a page list */
	if (bp->b_pages == NULL) {
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
		} else {
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
						 page_count, KM_NOFS);
			if (bp->b_pages == NULL)
				return -ENOMEM;
		}
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 * Frees b_pages if it was allocated.
 */
STATIC void
_xfs_buf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->b_pages != bp->b_page_array) {
		kmem_free(bp->b_pages);
		bp->b_pages = NULL;
	}
}

/*
 * Releases the specified buffer.
 *
 * The modification state of any associated pages is left unchanged.
 * The buffer must not be on any hash - use xfs_buf_rele instead for
 * hashed and refcounted buffers.
 */
void
xfs_buf_free(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_free(bp, _RET_IP_);

	ASSERT(list_empty(&bp->b_lru));

	if (bp->b_flags & _XBF_PAGES) {
		uint		i;

		if (xfs_buf_is_vmapped(bp))
			vm_unmap_ram(bp->b_addr - bp->b_offset,
				     bp->b_page_count);

		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

			__free_page(page);
		}
	} else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);
	_xfs_buf_free_pages(bp);
	xfs_buf_free_maps(bp);
	kmem_zone_free(xfs_buf_zone, bp);
}

/*
 * Allocates all the pages for the buffer in question and builds its page list.
 */
STATIC int
xfs_buf_allocate_memory(
	xfs_buf_t		*bp,
	uint			flags)
{
	size_t			size;
	size_t			nbytes, offset;
	gfp_t			gfp_mask = xb_to_gfp(flags);
	unsigned short		page_count, i;
	xfs_off_t		start, end;
	int			error;

	/*
	 * for buffers that are contained within a single page, just allocate
	 * the memory from the heap - there's no need for the complexity of
	 * page arrays to keep allocation down to order 0.
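	 *
	 * As a rough worked example (assuming 4k pages): a single 512 byte
	 * sector buffer is kmem_alloc()ed and tagged _XBF_KMEM, while a 16k
	 * directory block buffer falls through to the alloc_page() loop below
	 * and gets a four entry page array tagged _XBF_PAGES.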
	 */
	size = BBTOB(bp->b_length);
	if (size < PAGE_SIZE) {
		bp->b_addr = kmem_alloc(size, KM_NOFS);
		if (!bp->b_addr) {
			/* low memory - use alloc_page loop instead */
			goto use_alloc_page;
		}

		if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
			/* b_addr spans two pages - use alloc_page instead */
			kmem_free(bp->b_addr);
			bp->b_addr = NULL;
			goto use_alloc_page;
		}
		bp->b_offset = offset_in_page(bp->b_addr);
		bp->b_pages = bp->b_page_array;
		bp->b_pages[0] = virt_to_page(bp->b_addr);
		bp->b_page_count = 1;
		bp->b_flags |= _XBF_KMEM;
		return 0;
	}

use_alloc_page:
	start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
	end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
								>> PAGE_SHIFT;
	page_count = end - start;
	error = _xfs_buf_get_pages(bp, page_count);
	if (unlikely(error))
		return error;

	offset = bp->b_offset;
	bp->b_flags |= _XBF_PAGES;

	for (i = 0; i < bp->b_page_count; i++) {
		struct page	*page;
		uint		retries = 0;
retry:
		page = alloc_page(gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & XBF_READ_AHEAD) {
				bp->b_page_count = i;
				error = -ENOMEM;
				goto out_free_pages;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				xfs_err(NULL,
		"%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
					current->comm, current->pid,
					__func__, gfp_mask);

			XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
			congestion_wait(BLK_RW_ASYNC, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);

		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
		size -= nbytes;
		bp->b_pages[i] = page;
		offset = 0;
	}
	return 0;

out_free_pages:
	for (i = 0; i < bp->b_page_count; i++)
		__free_page(bp->b_pages[i]);
	return error;
}

/*
 * Map buffer into kernel address-space if necessary.
 */
STATIC int
_xfs_buf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	ASSERT(bp->b_flags & _XBF_PAGES);
	if (bp->b_page_count == 1) {
		/* A single page buffer is always mappable */
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
	} else if (flags & XBF_UNMAPPED) {
		bp->b_addr = NULL;
	} else {
		int retried = 0;
		unsigned noio_flag;

		/*
		 * vm_map_ram() will allocate auxiliary structures (e.g.
		 * pagetables) with GFP_KERNEL, yet we are likely to be under
		 * GFP_NOFS context here. Hence we need to tell memory reclaim
		 * that we are in such a context via PF_MEMALLOC_NOIO to
		 * prevent memory reclaim re-entering the filesystem here and
		 * potentially deadlocking.
		 */
		noio_flag = memalloc_noio_save();
		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1, PAGE_KERNEL);
			if (bp->b_addr)
				break;
			vm_unmap_aliases();
		} while (retried++ <= 1);
		memalloc_noio_restore(noio_flag);

		if (!bp->b_addr)
			return -ENOMEM;
		bp->b_addr += bp->b_offset;
	}

	return 0;
}

/*
 * Finding and Reading Buffers
 */

/*
 * Look up, and create if absent, a lockable buffer for a given range of the
 * block device. The buffer is returned locked. No I/O is implied by this call.
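 *
 * As an illustrative sketch only (not lifted from a real caller, error
 * handling elided), a simple single-extent lookup builds a one entry map and
 * lets xfs_buf_get_map() drive the find/alloc/insert dance:
 *
 *	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
 *	struct xfs_buf	*bp;
 *
 *	bp = xfs_buf_get_map(target, &map, 1, flags);
 *	if (bp) {
 *		... use bp ...
 *		xfs_buf_relse(bp);
 *	}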
 */
xfs_buf_t *
_xfs_buf_find(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	xfs_buf_t		*new_bp)
{
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent;
	xfs_buf_t		*bp;
	xfs_daddr_t		blkno = map[0].bm_bn;
	xfs_daddr_t		eofs;
	int			numblks = 0;
	int			i;

	for (i = 0; i < nmaps; i++)
		numblks += map[i].bm_len;

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize));
	ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));

	/*
	 * Corrupted block numbers can get through to here, unfortunately, so
	 * we have to check that the buffer falls within the filesystem bounds.
	 */
	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
	if (blkno < 0 || blkno >= eofs) {
		/*
		 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
		 * but none of the higher level infrastructure supports
		 * returning a specific error on buffer lookup failures.
		 */
		xfs_alert(btp->bt_mount,
			  "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
			  __func__, blkno, eofs);
		WARN_ON(1);
		return NULL;
	}

	/* get tree root */
	pag = xfs_perag_get(btp->bt_mount,
			    xfs_daddr_to_agno(btp->bt_mount, blkno));

	/* walk tree */
	spin_lock(&pag->pag_buf_lock);
	rbp = &pag->pag_buf_tree.rb_node;
	parent = NULL;
	bp = NULL;
	while (*rbp) {
		parent = *rbp;
		bp = rb_entry(parent, struct xfs_buf, b_rbnode);

		if (blkno < bp->b_bn)
			rbp = &(*rbp)->rb_left;
		else if (blkno > bp->b_bn)
			rbp = &(*rbp)->rb_right;
		else {
			/*
			 * found a block number match. If the range doesn't
			 * match, the only way this is allowed is if the buffer
			 * in the cache is stale and the transaction that made
			 * it stale has not yet committed. i.e. we are
			 * reallocating a busy extent. Skip this buffer and
			 * continue searching to the right for an exact match.
			 */
			if (bp->b_length != numblks) {
				ASSERT(bp->b_flags & XBF_STALE);
				rbp = &(*rbp)->rb_right;
				continue;
			}
			atomic_inc(&bp->b_hold);
			goto found;
		}
	}

	/* No match found */
	if (new_bp) {
		rb_link_node(&new_bp->b_rbnode, parent, rbp);
		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
		/* the buffer keeps the perag reference until it is freed */
		new_bp->b_pag = pag;
		spin_unlock(&pag->pag_buf_lock);
	} else {
		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
	}
	return new_bp;

found:
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);

	if (!xfs_buf_trylock(bp)) {
		if (flags & XBF_TRYLOCK) {
			xfs_buf_rele(bp);
			XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
			return NULL;
		}
		xfs_buf_lock(bp);
		XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
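	 *
	 * In other words (illustration): XBF_DONE, b_ops and the previous
	 * owner's delwri state are meaningless to the new user and are
	 * dropped, but _XBF_KMEM/_XBF_PAGES still describe the memory backing
	 * the buffer and must survive so it can be reused and freed correctly.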
	 */
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		ASSERT(bp->b_iodone == NULL);
		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
		bp->b_ops = NULL;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	XFS_STATS_INC(btp->bt_mount, xb_get_locked);
	return bp;
}

/*
 * Assembles a buffer covering the specified range. The code is optimised for
 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 * more hits than misses.
 */
struct xfs_buf *
xfs_buf_get_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	struct xfs_buf		*new_bp;
	int			error = 0;

	bp = _xfs_buf_find(target, map, nmaps, flags, NULL);
	if (likely(bp))
		goto found;

	new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
	if (unlikely(!new_bp))
		return NULL;

	error = xfs_buf_allocate_memory(new_bp, flags);
	if (error) {
		xfs_buf_free(new_bp);
		return NULL;
	}

	bp = _xfs_buf_find(target, map, nmaps, flags, new_bp);
	if (!bp) {
		xfs_buf_free(new_bp);
		return NULL;
	}

	if (bp != new_bp)
		xfs_buf_free(new_bp);

found:
	if (!bp->b_addr) {
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			xfs_warn(target->bt_mount,
				"%s: failed to map pages\n", __func__);
			xfs_buf_relse(bp);
			return NULL;
		}
	}

	/*
	 * Clear b_error if this is a lookup from a caller that doesn't expect
	 * valid data to be found in the buffer.
	 */
	if (!(flags & XBF_READ))
		xfs_buf_ioerror(bp, 0);

	XFS_STATS_INC(target->bt_mount, xb_get);
	trace_xfs_buf_get(bp, flags, _RET_IP_);
	return bp;
}

STATIC int
_xfs_buf_read(
	xfs_buf_t		*bp,
	xfs_buf_flags_t		flags)
{
	ASSERT(!(flags & XBF_WRITE));
	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);

	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

	if (flags & XBF_ASYNC) {
		xfs_buf_submit(bp);
		return 0;
	}
	return xfs_buf_submit_wait(bp);
}

xfs_buf_t *
xfs_buf_read_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;

	flags |= XBF_READ;

	bp = xfs_buf_get_map(target, map, nmaps, flags);
	if (bp) {
		trace_xfs_buf_read(bp, flags, _RET_IP_);

		if (!(bp->b_flags & XBF_DONE)) {
			XFS_STATS_INC(target->bt_mount, xb_get_read);
			bp->b_ops = ops;
			_xfs_buf_read(bp, flags);
		} else if (flags & XBF_ASYNC) {
			/*
			 * Read ahead call which is already satisfied,
			 * drop the buffer
			 */
			xfs_buf_relse(bp);
			return NULL;
		} else {
			/* We do not want read in the flags */
			bp->b_flags &= ~XBF_READ;
		}
	}

	return bp;
}

/*
 * If we are not low on memory then do the readahead in a
 * deadlock safe manner.
 */
void
xfs_buf_readahead_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	const struct xfs_buf_ops *ops)
{
	if (bdi_read_congested(target->bt_bdi))
		return;

	xfs_buf_read_map(target, map, nmaps,
			 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
}

/*
 * Read an uncached buffer from disk. Allocates and returns a locked
 * buffer containing the disk contents or nothing.
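 *
 * A rough usage sketch (error handling trimmed, not taken from a real
 * caller):
 *
 *	struct xfs_buf	*bp;
 *	int		error;
 *
 *	error = xfs_buf_read_uncached(target, daddr, numblks, 0, &bp, ops);
 *	if (error)
 *		return error;
 *	... inspect bp->b_addr ...
 *	xfs_buf_relse(bp);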
 */
int
xfs_buf_read_uncached(
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
	size_t			numblks,
	int			flags,
	struct xfs_buf		**bpp,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;

	*bpp = NULL;

	bp = xfs_buf_get_uncached(target, numblks, flags);
	if (!bp)
		return -ENOMEM;

	/* set up the buffer for a read IO */
	ASSERT(bp->b_map_count == 1);
	bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */
	bp->b_maps[0].bm_bn = daddr;
	bp->b_flags |= XBF_READ;
	bp->b_ops = ops;

	xfs_buf_submit_wait(bp);
	if (bp->b_error) {
		int	error = bp->b_error;
		xfs_buf_relse(bp);
		return error;
	}

	*bpp = bp;
	return 0;
}

/*
 * Return a buffer allocated as an empty buffer and associated to external
 * memory via xfs_buf_associate_memory() back to its empty state.
 */
void
xfs_buf_set_empty(
	struct xfs_buf		*bp,
	size_t			numblks)
{
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_page_count = 0;
	bp->b_addr = NULL;
	bp->b_length = numblks;
	bp->b_io_length = numblks;

	ASSERT(bp->b_map_count == 1);
	bp->b_bn = XFS_BUF_DADDR_NULL;
	bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
	bp->b_maps[0].bm_len = bp->b_length;
}

static inline struct page *
mem_to_page(
	void			*addr)
{
	if ((!is_vmalloc_addr(addr))) {
		return virt_to_page(addr);
	} else {
		return vmalloc_to_page(addr);
	}
}

int
xfs_buf_associate_memory(
	xfs_buf_t		*bp,
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
	unsigned long		pageaddr;
	unsigned long		offset;
	size_t			buflen;
	int			page_count;

	pageaddr = (unsigned long)mem & PAGE_MASK;
	offset = (unsigned long)mem - pageaddr;
	buflen = PAGE_ALIGN(len + offset);
	page_count = buflen >> PAGE_SHIFT;

	/* Free any previous set of page pointers */
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_addr = mem;

	rval = _xfs_buf_get_pages(bp, page_count);
	if (rval)
		return rval;

	bp->b_offset = offset;

	for (i = 0; i < bp->b_page_count; i++) {
		bp->b_pages[i] = mem_to_page((void *)pageaddr);
		pageaddr += PAGE_SIZE;
	}

	bp->b_io_length = BTOBB(len);
	bp->b_length = BTOBB(buflen);

	return 0;
}

xfs_buf_t *
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
	size_t			numblks,
	int			flags)
{
	unsigned long		page_count;
	int			error, i;
	struct xfs_buf		*bp;
	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);

	/* flags might contain irrelevant bits, pass only what we care about */
	bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT);
	if (unlikely(bp == NULL))
		goto fail;

	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
	error = _xfs_buf_get_pages(bp, page_count);
	if (error)
		goto fail_free_buf;

	for (i = 0; i < page_count; i++) {
		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
		if (!bp->b_pages[i])
			goto fail_free_mem;
	}
	bp->b_flags |= _XBF_PAGES;

	error = _xfs_buf_map_pages(bp, 0);
	if (unlikely(error)) {
		xfs_warn(target->bt_mount,
			"%s: failed to map pages", __func__);
		goto fail_free_mem;
	}

	trace_xfs_buf_get_uncached(bp, _RET_IP_);
	return bp;

 fail_free_mem:
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
	_xfs_buf_free_pages(bp);
 fail_free_buf:
	xfs_buf_free_maps(bp);
	kmem_zone_free(xfs_buf_zone, bp);
 fail:
	return NULL;
}

/*
 * Increment reference count on buffer, to hold the buffer concurrently
 * with another thread which may release (free) the buffer asynchronously.
 * Must hold the buffer already to call this function.
 */
void
xfs_buf_hold(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);
	atomic_inc(&bp->b_hold);
}

/*
 * Release a hold on the specified buffer. If the hold count is 1, the buffer
 * is placed on the LRU or freed (depending on b_lru_ref).
 */
void
xfs_buf_rele(
	xfs_buf_t		*bp)
{
	struct xfs_perag	*pag = bp->b_pag;
	bool			release;
	bool			freebuf = false;

	trace_xfs_buf_rele(bp, _RET_IP_);

	if (!pag) {
		ASSERT(list_empty(&bp->b_lru));
		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
		if (atomic_dec_and_test(&bp->b_hold)) {
			xfs_buf_ioacct_dec(bp);
			xfs_buf_free(bp);
		}
		return;
	}

	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));

	ASSERT(atomic_read(&bp->b_hold) > 0);

	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
	spin_lock(&bp->b_lock);
	if (!release) {
		/*
		 * Drop the in-flight state if the buffer is already on the LRU
		 * and it holds the only reference. This is racy because we
		 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
		 * ensures the decrement occurs only once per-buf.
		 */
		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
			xfs_buf_ioacct_dec(bp);
		goto out_unlock;
	}

	/* the last reference has been dropped ... */
	xfs_buf_ioacct_dec(bp);
	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
		/*
		 * If the buffer is added to the LRU take a new reference to
		 * the buffer for the LRU and clear the (now stale) dispose
		 * list state flag
		 */
		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
			bp->b_state &= ~XFS_BSTATE_DISPOSE;
			atomic_inc(&bp->b_hold);
		}
		spin_unlock(&pag->pag_buf_lock);
	} else {
		/*
		 * most of the time buffers will already be removed from the
		 * LRU, so optimise that case by checking for the
		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
		 * was on was the disposal list
		 */
		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
			list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
		} else {
			ASSERT(list_empty(&bp->b_lru));
		}

		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
		rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
		freebuf = true;
	}

out_unlock:
	spin_unlock(&bp->b_lock);

	if (freebuf)
		xfs_buf_free(bp);
}


/*
 * Lock a buffer object, if it is not already locked.
 *
 * If we come across a stale, pinned, locked buffer, we know that we are
 * being asked to lock a buffer that has been reallocated. Because it is
 * pinned, we know that the log has not been pushed to disk and hence it
 * will still be locked. Rather than continuing to have trylock attempts
 * fail until someone else pushes the log, push it ourselves before
 * returning. This means that the xfsaild will not get stuck trying
 * to push on stale inode buffers.
 */
int
xfs_buf_trylock(
	struct xfs_buf		*bp)
{
	int			locked;

	locked = down_trylock(&bp->b_sema) == 0;
	if (locked) {
		XB_SET_OWNER(bp);
		trace_xfs_buf_trylock(bp, _RET_IP_);
	} else {
		trace_xfs_buf_trylock_fail(bp, _RET_IP_);
	}
	return locked;
}

/*
 * Lock a buffer object.
 *
 * If we come across a stale, pinned, locked buffer, we know that we
 * are being asked to lock a buffer that has been reallocated. Because
 * it is pinned, we know that the log has not been pushed to disk and
 * hence it will still be locked. Rather than sleeping until someone
 * else pushes the log, push it ourselves before trying to get the lock.
 */
void
xfs_buf_lock(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_lock(bp, _RET_IP_);

	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);
	down(&bp->b_sema);
	XB_SET_OWNER(bp);

	trace_xfs_buf_lock_done(bp, _RET_IP_);
}

void
xfs_buf_unlock(
	struct xfs_buf		*bp)
{
	XB_CLEAR_OWNER(bp);
	up(&bp->b_sema);

	trace_xfs_buf_unlock(bp, _RET_IP_);
}

STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 * Buffer Utility Routines
 */

void
xfs_buf_ioend(
	struct xfs_buf	*bp)
{
	bool		read = bp->b_flags & XBF_READ;

	trace_xfs_buf_iodone(bp, _RET_IP_);

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);

	/*
	 * Pull in IO completion errors now. We are guaranteed to be running
	 * single threaded, so we don't need the lock to read b_io_error.
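	 *
	 * (b_io_error is only ever set via cmpxchg() in xfs_buf_bio_end_io(),
	 * so the first bio failure wins; by the time we get here all bios
	 * have completed and the value is stable.)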
	 */
	if (!bp->b_error && bp->b_io_error)
		xfs_buf_ioerror(bp, bp->b_io_error);

	/* Only validate buffers that were read without errors */
	if (read && !bp->b_error && bp->b_ops) {
		ASSERT(!bp->b_iodone);
		bp->b_ops->verify_read(bp);
	}

	if (!bp->b_error)
		bp->b_flags |= XBF_DONE;

	if (bp->b_iodone)
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
	else
		complete(&bp->b_iowait);
}

static void
xfs_buf_ioend_work(
	struct work_struct	*work)
{
	struct xfs_buf		*bp =
		container_of(work, xfs_buf_t, b_ioend_work);

	xfs_buf_ioend(bp);
}

static void
xfs_buf_ioend_async(
	struct xfs_buf	*bp)
{
	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
	queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
}

void
xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error)
{
	ASSERT(error <= 0 && error >= -1000);
	bp->b_error = error;
	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
}

void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	const char		*func)
{
	xfs_alert(bp->b_target->bt_mount,
"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
		(__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
}

int
xfs_bwrite(
	struct xfs_buf		*bp)
{
	int			error;

	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_WRITE;
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
			 XBF_WRITE_FAIL | XBF_DONE);

	error = xfs_buf_submit_wait(bp);
	if (error) {
		xfs_force_shutdown(bp->b_target->bt_mount,
				   SHUTDOWN_META_IO_ERROR);
	}
	return error;
}

static void
xfs_buf_bio_end_io(
	struct bio		*bio)
{
	struct xfs_buf		*bp = (struct xfs_buf *)bio->bi_private;

	/*
	 * don't overwrite existing errors - otherwise we can lose errors on
	 * buffers that require multiple bios to complete.
	 */
	if (bio->bi_error)
		cmpxchg(&bp->b_io_error, 0, bio->bi_error);

	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
		xfs_buf_ioend_async(bp);
	bio_put(bio);
}

static void
xfs_buf_ioapply_map(
	struct xfs_buf	*bp,
	int		map,
	int		*buf_offset,
	int		*count,
	int		op,
	int		op_flags)
{
	int		page_index;
	int		total_nr_pages = bp->b_page_count;
	int		nr_pages;
	struct bio	*bio;
	sector_t	sector = bp->b_maps[map].bm_bn;
	int		size;
	int		offset;

	total_nr_pages = bp->b_page_count;

	/* skip the pages in the buffer before the start offset */
	page_index = 0;
	offset = *buf_offset;
	while (offset >= PAGE_SIZE) {
		page_index++;
		offset -= PAGE_SIZE;
	}

	/*
	 * Limit the IO size to the length of the current vector, and update
	 * the remaining IO count for the next time around.
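	 *
	 * Rough example (illustrative numbers only): for a two vector buffer
	 * with a bm_len of 8 BBs per vector, *count starts at 8k; the first
	 * call claims 4k and leaves 4k of *count for the second vector.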
	 */
	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
	*count -= size;
	*buf_offset += size;

next_chunk:
	atomic_inc(&bp->b_io_remaining);
	nr_pages = min(total_nr_pages, BIO_MAX_PAGES);

	bio = bio_alloc(GFP_NOIO, nr_pages);
	bio->bi_bdev = bp->b_target->bt_bdev;
	bio->bi_iter.bi_sector = sector;
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;
	bio_set_op_attrs(bio, op, op_flags);

	for (; size && nr_pages; nr_pages--, page_index++) {
		int	rbytes, nbytes = PAGE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
				      offset);
		if (rbytes < nbytes)
			break;

		offset = 0;
		sector += BTOBB(nbytes);
		size -= nbytes;
		total_nr_pages--;
	}

	if (likely(bio->bi_iter.bi_size)) {
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
		submit_bio(bio);
		if (size)
			goto next_chunk;
	} else {
		/*
		 * This is guaranteed not to be the last io reference count
		 * because the caller (xfs_buf_submit) holds a count itself.
		 */
		atomic_dec(&bp->b_io_remaining);
		xfs_buf_ioerror(bp, -EIO);
		bio_put(bio);
	}

}

STATIC void
_xfs_buf_ioapply(
	struct xfs_buf	*bp)
{
	struct blk_plug	plug;
	int		op;
	int		op_flags = 0;
	int		offset;
	int		size;
	int		i;

	/*
	 * Make sure we capture only current IO errors rather than stale errors
	 * left over from previous use of the buffer (e.g. failed readahead).
	 */
	bp->b_error = 0;

	/*
	 * Initialize the I/O completion workqueue if we haven't yet or the
	 * submitter has not opted to specify a custom one.
	 */
	if (!bp->b_ioend_wq)
		bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;

	if (bp->b_flags & XBF_WRITE) {
		op = REQ_OP_WRITE;
		if (bp->b_flags & XBF_SYNCIO)
			op_flags = WRITE_SYNC;
		if (bp->b_flags & XBF_FUA)
			op_flags |= REQ_FUA;
		if (bp->b_flags & XBF_FLUSH)
			op_flags |= REQ_PREFLUSH;

		/*
		 * Run the write verifier callback function if it exists. If
		 * this function fails it will mark the buffer with an error
		 * and the IO should not be dispatched.
		 */
		if (bp->b_ops) {
			bp->b_ops->verify_write(bp);
			if (bp->b_error) {
				xfs_force_shutdown(bp->b_target->bt_mount,
						   SHUTDOWN_CORRUPT_INCORE);
				return;
			}
		} else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
			struct xfs_mount *mp = bp->b_target->bt_mount;

			/*
			 * non-crc filesystems don't attach verifiers during
			 * log recovery, so don't warn for such filesystems.
			 */
			if (xfs_sb_version_hascrc(&mp->m_sb)) {
				xfs_warn(mp,
					"%s: no ops on block 0x%llx/0x%x",
					__func__, bp->b_bn, bp->b_length);
				xfs_hex_dump(bp->b_addr, 64);
				dump_stack();
			}
		}
	} else if (bp->b_flags & XBF_READ_AHEAD) {
		op = REQ_OP_READ;
		op_flags = REQ_RAHEAD;
	} else {
		op = REQ_OP_READ;
	}

	/* we only use the buffer cache for meta-data */
	op_flags |= REQ_META;

	/*
	 * Walk all the vectors issuing IO on them. Set up the initial offset
	 * into the buffer and the desired IO size before we start -
	 * xfs_buf_ioapply_map() will modify them appropriately for each
	 * subsequent call.
	 */
	offset = bp->b_offset;
	size = BBTOB(bp->b_io_length);
	blk_start_plug(&plug);
	for (i = 0; i < bp->b_map_count; i++) {
		xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags);
		if (bp->b_error)
			break;
		if (size <= 0)
			break;	/* all done */
	}
	blk_finish_plug(&plug);
}

/*
 * Asynchronous IO submission path. This transfers the buffer lock ownership
 * and the current reference to the IO. It is not safe to reference the buffer
 * after a call to this function unless the caller holds an additional
 * reference itself.
 */
void
xfs_buf_submit(
	struct xfs_buf	*bp)
{
	trace_xfs_buf_submit(bp, _RET_IP_);

	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
	ASSERT(bp->b_flags & XBF_ASYNC);

	/* on shutdown we stale and complete the buffer immediately */
	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		xfs_buf_ioerror(bp, -EIO);
		bp->b_flags &= ~XBF_DONE;
		xfs_buf_stale(bp);
		xfs_buf_ioend(bp);
		return;
	}

	if (bp->b_flags & XBF_WRITE)
		xfs_buf_wait_unpin(bp);

	/* clear the internal error state to avoid spurious errors */
	bp->b_io_error = 0;

	/*
	 * The caller's reference is released during I/O completion.
	 * This occurs some time after the last b_io_remaining reference is
	 * released, so after we drop our IO reference we have to have some
	 * other reference to ensure the buffer doesn't go away from underneath
	 * us. Take a direct reference to ensure we have safe access to the
	 * buffer until we are finished with it.
	 */
	xfs_buf_hold(bp);

	/*
	 * Set the count to 1 initially, this will stop an I/O completion
	 * callout which happens before we have started all the I/O from
	 * calling xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	xfs_buf_ioacct_inc(bp);
	_xfs_buf_ioapply(bp);

	/*
	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
	 * reference we took above. If we drop it to zero, run completion so
	 * that we don't return to the caller with completion still pending.
	 */
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
		if (bp->b_error)
			xfs_buf_ioend(bp);
		else
			xfs_buf_ioend_async(bp);
	}

	xfs_buf_rele(bp);
	/* Note: it is not safe to reference bp now we've dropped our ref */
}

/*
 * Synchronous buffer IO submission path, read or write.
 */
int
xfs_buf_submit_wait(
	struct xfs_buf	*bp)
{
	int		error;

	trace_xfs_buf_submit_wait(bp, _RET_IP_);

	ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC)));

	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		xfs_buf_ioerror(bp, -EIO);
		xfs_buf_stale(bp);
		bp->b_flags &= ~XBF_DONE;
		return -EIO;
	}

	if (bp->b_flags & XBF_WRITE)
		xfs_buf_wait_unpin(bp);

	/* clear the internal error state to avoid spurious errors */
	bp->b_io_error = 0;

	/*
	 * For synchronous IO, the IO does not inherit the submitter's
	 * reference count, nor the buffer lock. Hence we cannot release the
	 * reference we are about to take until we've waited for all IO
	 * completion to occur, including any xfs_buf_ioend_async() work that
	 * may be pending.
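	 *
	 * Contrast this with xfs_buf_submit() above, where the caller's
	 * reference and the buffer lock are handed over to the I/O and
	 * released at completion; here the caller keeps both, so we take a
	 * short-lived extra hold purely to keep the buffer alive across the
	 * wait below.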
	 */
	xfs_buf_hold(bp);

	/*
	 * Set the count to 1 initially, this will stop an I/O completion
	 * callout which happens before we have started all the I/O from
	 * calling xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	_xfs_buf_ioapply(bp);

	/*
	 * make sure we run completion synchronously if it raced with us and is
	 * already complete.
	 */
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
		xfs_buf_ioend(bp);

	/* wait for completion before gathering the error from the buffer */
	trace_xfs_buf_iowait(bp, _RET_IP_);
	wait_for_completion(&bp->b_iowait);
	trace_xfs_buf_iowait_done(bp, _RET_IP_);
	error = bp->b_error;

	/*
	 * all done now, we can release the hold that keeps the buffer
	 * referenced for the entire IO.
	 */
	xfs_buf_rele(bp);
	return error;
}

void *
xfs_buf_offset(
	struct xfs_buf		*bp,
	size_t			offset)
{
	struct page		*page;

	if (bp->b_addr)
		return bp->b_addr + offset;

	offset += bp->b_offset;
	page = bp->b_pages[offset >> PAGE_SHIFT];
	return page_address(page) + (offset & (PAGE_SIZE-1));
}

/*
 * Move data into or out of a buffer.
 */
void
xfs_buf_iomove(
	xfs_buf_t		*bp,	/* buffer to process		*/
	size_t			boff,	/* starting buffer offset	*/
	size_t			bsize,	/* length to copy		*/
	void			*data,	/* data address			*/
	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
{
	size_t			bend;

	bend = boff + bsize;
	while (boff < bend) {
		struct page	*page;
		int		page_index, page_offset, csize;

		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
		page = bp->b_pages[page_index];
		csize = min_t(size_t, PAGE_SIZE - page_offset,
				      BBTOB(bp->b_io_length) - boff);

		ASSERT((csize + page_offset) <= PAGE_SIZE);

		switch (mode) {
		case XBRW_ZERO:
			memset(page_address(page) + page_offset, 0, csize);
			break;
		case XBRW_READ:
			memcpy(data, page_address(page) + page_offset, csize);
			break;
		case XBRW_WRITE:
			memcpy(page_address(page) + page_offset, data, csize);
		}

		boff += csize;
		data += csize;
	}
}

/*
 * Handling of buffer targets (buftargs).
 */

/*
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
 */
static enum lru_status
xfs_buftarg_wait_rele(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)

{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	if (atomic_read(&bp->b_hold) > 1) {
		/* need to wait, so skip it this pass */
		trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
		return LRU_SKIP;
	}
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;

	/*
	 * clear the LRU reference count so the buffer doesn't get
	 * ignored in xfs_buf_rele().
	 */
	atomic_set(&bp->b_lru_ref, 0);
	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_lru_isolate_move(lru, item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}

void
xfs_wait_buftarg(
	struct xfs_buftarg	*btp)
{
	LIST_HEAD(dispose);
	int			loop = 0;

	/*
	 * First wait on the buftarg I/O count for all in-flight buffers to be
	 * released. This is critical as new buffers do not make the LRU until
	 * they are released.
	 *
	 * Next, flush the buffer workqueue to ensure all completion processing
	 * has finished. Just waiting on buffer locks is not sufficient for
	 * async IO as the reference count held over IO is not released until
	 * after the buffer lock is dropped. Hence we need to ensure here that
	 * all reference counts have been dropped before we start walking the
	 * LRU list.
	 */
	while (percpu_counter_sum(&btp->bt_io_count))
		delay(100);
	flush_workqueue(btp->bt_mount->m_buf_workqueue);

	/* loop until there is nothing left on the lru list. */
	while (list_lru_count(&btp->bt_lru)) {
		list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
			      &dispose, LONG_MAX);

		while (!list_empty(&dispose)) {
			struct xfs_buf *bp;
			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
			list_del_init(&bp->b_lru);
			if (bp->b_flags & XBF_WRITE_FAIL) {
				xfs_alert(btp->bt_mount,
"Corruption Alert: Buffer at block 0x%llx had permanent write failures!",
					(long long)bp->b_bn);
				xfs_alert(btp->bt_mount,
"Please run xfs_repair to determine the extent of the problem.");
			}
			xfs_buf_rele(bp);
		}
		if (loop++ != 0)
			delay(100);
	}
}

static enum lru_status
xfs_buftarg_isolate(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)
{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	/*
	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
	 * If we fail to get the lock, just skip it.
	 */
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;
	/*
	 * Decrement the b_lru_ref count unless the value is already
	 * zero. If the value is already zero, we need to reclaim the
	 * buffer, otherwise it gets another trip through the LRU.
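	 *
	 * For example (rough illustration): a buffer whose b_lru_ref was set
	 * to 2 survives two shrinker passes (each one rotates it back onto
	 * the LRU) and is only isolated for disposal on the third pass.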
	 */
	if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
		spin_unlock(&bp->b_lock);
		return LRU_ROTATE;
	}

	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_lru_isolate_move(lru, item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}

static unsigned long
xfs_buftarg_shrink_scan(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	LIST_HEAD(dispose);
	unsigned long		freed;

	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
				     xfs_buftarg_isolate, &dispose);

	while (!list_empty(&dispose)) {
		struct xfs_buf *bp;
		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		xfs_buf_rele(bp);
	}

	return freed;
}

static unsigned long
xfs_buftarg_shrink_count(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	return list_lru_shrink_count(&btp->bt_lru, sc);
}

void
xfs_free_buftarg(
	struct xfs_mount	*mp,
	struct xfs_buftarg	*btp)
{
	unregister_shrinker(&btp->bt_shrinker);
	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
	percpu_counter_destroy(&btp->bt_io_count);
	list_lru_destroy(&btp->bt_lru);

	if (mp->m_flags & XFS_MOUNT_BARRIER)
		xfs_blkdev_issue_flush(btp);

	kmem_free(btp);
}

int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		sectorsize)
{
	/* Set up metadata sector size info */
	btp->bt_meta_sectorsize = sectorsize;
	btp->bt_meta_sectormask = sectorsize - 1;

	if (set_blocksize(btp->bt_bdev, sectorsize)) {
		xfs_warn(btp->bt_mount,
			"Cannot set_blocksize to %u on device %pg",
			sectorsize, btp->bt_bdev);
		return -EINVAL;
	}

	/* Set up device logical sector size mask */
	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;

	return 0;
}

/*
 * When allocating the initial buffer target we have not yet
 * read in the superblock, so don't know what sized sectors
 * are being used at this early stage.  Play safe.
 */
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
}

xfs_buftarg_t *
xfs_alloc_buftarg(
	struct xfs_mount	*mp,
	struct block_device	*bdev)
{
	xfs_buftarg_t		*btp;

	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);

	btp->bt_mount = mp;
	btp->bt_dev = bdev->bd_dev;
	btp->bt_bdev = bdev;
	btp->bt_bdi = blk_get_backing_dev_info(bdev);

	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error;

	if (list_lru_init(&btp->bt_lru))
		goto error;

	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
		goto error;

	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
	register_shrinker(&btp->bt_shrinker);
	return btp;

error:
	kmem_free(btp);
	return NULL;
}

/*
 * Add a buffer to the delayed write list.
 *
 * This queues a buffer for writeout if it hasn't already been. Note that
 * neither this routine nor the buffer list submission functions perform
 * any internal synchronization. It is expected that the lists are thread-local
 * to the callers.
 *
 * Returns true if we queued up the buffer, or false if it had already
 * been on the buffer list.
 */
bool
xfs_buf_delwri_queue(
	struct xfs_buf		*bp,
	struct list_head	*list)
{
	ASSERT(xfs_buf_islocked(bp));
	ASSERT(!(bp->b_flags & XBF_READ));

	/*
	 * If the buffer is already marked delwri it already is queued up
	 * by someone else for immediate writeout. Just ignore it in that
	 * case.
	 */
	if (bp->b_flags & _XBF_DELWRI_Q) {
		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
		return false;
	}

	trace_xfs_buf_delwri_queue(bp, _RET_IP_);

	/*
	 * If a buffer gets written out synchronously or marked stale while it
	 * is on a delwri list we lazily remove it. To do this, the other party
	 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
	 * It remains referenced and on the list. In a rare corner case it
	 * might get re-added to a delwri list after the synchronous writeout,
	 * in which case we just need to re-add the flag here.
	 */
	bp->b_flags |= _XBF_DELWRI_Q;
	if (list_empty(&bp->b_list)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_list, list);
	}

	return true;
}

/*
 * Compare function is more complex than it needs to be because
 * the return value is only 32 bits and we are doing comparisons
 * on 64 bit values
 */
static int
xfs_buf_cmp(
	void		*priv,
	struct list_head *a,
	struct list_head *b)
{
	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
	xfs_daddr_t	diff;

	diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
	if (diff < 0)
		return -1;
	if (diff > 0)
		return 1;
	return 0;
}

/*
 * submit buffers for write.
 *
 * When we have a large buffer list, we do not want to hold all the buffers
 * locked while we block on the request queue waiting for IO dispatch.
 * To avoid this problem, we lock and submit buffers in groups of 50, thereby
 * minimising the lock hold times for lists which may contain thousands of
 * objects.
 *
 * To do this, we sort the buffer list before we walk the list to lock and
 * submit buffers, and we plug and unplug around each group of buffers we
 * submit.
 */
static int
xfs_buf_delwri_submit_buffers(
	struct list_head	*buffer_list,
	struct list_head	*wait_list)
{
	struct xfs_buf		*bp, *n;
	LIST_HEAD		(submit_list);
	int			pinned = 0;
	struct blk_plug		plug;

	list_sort(NULL, buffer_list, xfs_buf_cmp);

	blk_start_plug(&plug);
	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
		if (!wait_list) {
			if (xfs_buf_ispinned(bp)) {
				pinned++;
				continue;
			}
			if (!xfs_buf_trylock(bp))
				continue;
		} else {
			xfs_buf_lock(bp);
		}

		/*
		 * Someone else might have written the buffer synchronously or
		 * marked it stale in the meantime.  In that case only the
		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
		 * reference and remove it from the list here.
		 */
		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
			list_del_init(&bp->b_list);
			xfs_buf_relse(bp);
			continue;
		}

		trace_xfs_buf_delwri_split(bp, _RET_IP_);

		/*
		 * We do all IO submission async. This means if we need
		 * to wait for IO completion we need to take an extra
		 * reference so the buffer is still valid on the other
		 * side. We need to move the buffer onto the io_list
		 * at this point so the caller can still access it.
		 */
		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
		bp->b_flags |= XBF_WRITE | XBF_ASYNC;
		if (wait_list) {
			xfs_buf_hold(bp);
			list_move_tail(&bp->b_list, wait_list);
		} else
			list_del_init(&bp->b_list);

		xfs_buf_submit(bp);
	}
	blk_finish_plug(&plug);

	return pinned;
}

/*
 * Write out a buffer list asynchronously.
 *
 * This will take the @buffer_list, write all non-locked and non-pinned buffers
 * out and not wait for I/O completion on any of the buffers.  This interface
 * is only safely usable for callers that can track I/O completion by higher
 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
 * function.
 */
int
xfs_buf_delwri_submit_nowait(
	struct list_head	*buffer_list)
{
	return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
}

/*
 * Write out a buffer list synchronously.
 *
 * This will take the @buffer_list, write all buffers out and wait for I/O
 * completion on all of the buffers. @buffer_list is consumed by the function,
 * so callers must have some other way of tracking buffers if they require such
 * functionality.
 */
int
xfs_buf_delwri_submit(
	struct list_head	*buffer_list)
{
	LIST_HEAD		(wait_list);
	int			error = 0, error2;
	struct xfs_buf		*bp;

	xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);

	/* Wait for IO to complete. */
	while (!list_empty(&wait_list)) {
		bp = list_first_entry(&wait_list, struct xfs_buf, b_list);

		list_del_init(&bp->b_list);

		/*
		 * locking the buffer will wait for async IO completion.
		 */
		xfs_buf_lock(bp);
		error2 = bp->b_error;
		xfs_buf_relse(bp);
		if (!error)
			error = error2;
	}

	return error;
}

int __init
xfs_buf_init(void)
{
	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
						KM_ZONE_HWALIGN, NULL);
	if (!xfs_buf_zone)
		goto out;

	return 0;

 out:
	return -ENOMEM;
}

void
xfs_buf_terminate(void)
{
	kmem_zone_destroy(xfs_buf_zone);
}