1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include <linux/stddef.h> 8 #include <linux/errno.h> 9 #include <linux/gfp.h> 10 #include <linux/pagemap.h> 11 #include <linux/init.h> 12 #include <linux/vmalloc.h> 13 #include <linux/bio.h> 14 #include <linux/sysctl.h> 15 #include <linux/proc_fs.h> 16 #include <linux/workqueue.h> 17 #include <linux/percpu.h> 18 #include <linux/blkdev.h> 19 #include <linux/hash.h> 20 #include <linux/kthread.h> 21 #include <linux/migrate.h> 22 #include <linux/backing-dev.h> 23 #include <linux/freezer.h> 24 25 #include "xfs_format.h" 26 #include "xfs_log_format.h" 27 #include "xfs_trans_resv.h" 28 #include "xfs_sb.h" 29 #include "xfs_mount.h" 30 #include "xfs_trace.h" 31 #include "xfs_log.h" 32 #include "xfs_errortag.h" 33 #include "xfs_error.h" 34 35 static kmem_zone_t *xfs_buf_zone; 36 37 #define xb_to_gfp(flags) \ 38 ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN) 39 40 /* 41 * Locking orders 42 * 43 * xfs_buf_ioacct_inc: 44 * xfs_buf_ioacct_dec: 45 * b_sema (caller holds) 46 * b_lock 47 * 48 * xfs_buf_stale: 49 * b_sema (caller holds) 50 * b_lock 51 * lru_lock 52 * 53 * xfs_buf_rele: 54 * b_lock 55 * pag_buf_lock 56 * lru_lock 57 * 58 * xfs_buftarg_wait_rele 59 * lru_lock 60 * b_lock (trylock due to inversion) 61 * 62 * xfs_buftarg_isolate 63 * lru_lock 64 * b_lock (trylock due to inversion) 65 */ 66 67 static inline int 68 xfs_buf_is_vmapped( 69 struct xfs_buf *bp) 70 { 71 /* 72 * Return true if the buffer is vmapped. 73 * 74 * b_addr is null if the buffer is not mapped, but the code is clever 75 * enough to know it doesn't have to map a single page, so the check has 76 * to be both for b_addr and bp->b_page_count > 1. 77 */ 78 return bp->b_addr && bp->b_page_count > 1; 79 } 80 81 static inline int 82 xfs_buf_vmap_len( 83 struct xfs_buf *bp) 84 { 85 return (bp->b_page_count * PAGE_SIZE) - bp->b_offset; 86 } 87 88 /* 89 * Bump the I/O in flight count on the buftarg if we haven't yet done so for 90 * this buffer. The count is incremented once per buffer (per hold cycle) 91 * because the corresponding decrement is deferred to buffer release. Buffers 92 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O 93 * tracking adds unnecessary overhead. This is used for sychronization purposes 94 * with unmount (see xfs_wait_buftarg()), so all we really need is a count of 95 * in-flight buffers. 96 * 97 * Buffers that are never released (e.g., superblock, iclog buffers) must set 98 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count 99 * never reaches zero and unmount hangs indefinitely. 100 */ 101 static inline void 102 xfs_buf_ioacct_inc( 103 struct xfs_buf *bp) 104 { 105 if (bp->b_flags & XBF_NO_IOACCT) 106 return; 107 108 ASSERT(bp->b_flags & XBF_ASYNC); 109 spin_lock(&bp->b_lock); 110 if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) { 111 bp->b_state |= XFS_BSTATE_IN_FLIGHT; 112 percpu_counter_inc(&bp->b_target->bt_io_count); 113 } 114 spin_unlock(&bp->b_lock); 115 } 116 117 /* 118 * Clear the in-flight state on a buffer about to be released to the LRU or 119 * freed and unaccount from the buftarg. 
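 *
 * The caller must already hold bp->b_lock; the helper below asserts this
 * via lockdep. Callers that do not yet hold the lock use the
 * xfs_buf_ioacct_dec() wrapper instead, which is simply (minimal sketch of
 * the wrapper defined right below):
 *
 *	spin_lock(&bp->b_lock);
 *	__xfs_buf_ioacct_dec(bp);
 *	spin_unlock(&bp->b_lock);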
120 */ 121 static inline void 122 __xfs_buf_ioacct_dec( 123 struct xfs_buf *bp) 124 { 125 lockdep_assert_held(&bp->b_lock); 126 127 if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { 128 bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; 129 percpu_counter_dec(&bp->b_target->bt_io_count); 130 } 131 } 132 133 static inline void 134 xfs_buf_ioacct_dec( 135 struct xfs_buf *bp) 136 { 137 spin_lock(&bp->b_lock); 138 __xfs_buf_ioacct_dec(bp); 139 spin_unlock(&bp->b_lock); 140 } 141 142 /* 143 * When we mark a buffer stale, we remove the buffer from the LRU and clear the 144 * b_lru_ref count so that the buffer is freed immediately when the buffer 145 * reference count falls to zero. If the buffer is already on the LRU, we need 146 * to remove the reference that LRU holds on the buffer. 147 * 148 * This prevents build-up of stale buffers on the LRU. 149 */ 150 void 151 xfs_buf_stale( 152 struct xfs_buf *bp) 153 { 154 ASSERT(xfs_buf_islocked(bp)); 155 156 bp->b_flags |= XBF_STALE; 157 158 /* 159 * Clear the delwri status so that a delwri queue walker will not 160 * flush this buffer to disk now that it is stale. The delwri queue has 161 * a reference to the buffer, so this is safe to do. 162 */ 163 bp->b_flags &= ~_XBF_DELWRI_Q; 164 165 /* 166 * Once the buffer is marked stale and unlocked, a subsequent lookup 167 * could reset b_flags. There is no guarantee that the buffer is 168 * unaccounted (released to LRU) before that occurs. Drop in-flight 169 * status now to preserve accounting consistency. 170 */ 171 spin_lock(&bp->b_lock); 172 __xfs_buf_ioacct_dec(bp); 173 174 atomic_set(&bp->b_lru_ref, 0); 175 if (!(bp->b_state & XFS_BSTATE_DISPOSE) && 176 (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru))) 177 atomic_dec(&bp->b_hold); 178 179 ASSERT(atomic_read(&bp->b_hold) >= 1); 180 spin_unlock(&bp->b_lock); 181 } 182 183 static int 184 xfs_buf_get_maps( 185 struct xfs_buf *bp, 186 int map_count) 187 { 188 ASSERT(bp->b_maps == NULL); 189 bp->b_map_count = map_count; 190 191 if (map_count == 1) { 192 bp->b_maps = &bp->__b_map; 193 return 0; 194 } 195 196 bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), 197 KM_NOFS); 198 if (!bp->b_maps) 199 return -ENOMEM; 200 return 0; 201 } 202 203 /* 204 * Frees b_maps if it was allocated. 205 */ 206 static void 207 xfs_buf_free_maps( 208 struct xfs_buf *bp) 209 { 210 if (bp->b_maps != &bp->__b_map) { 211 kmem_free(bp->b_maps); 212 bp->b_maps = NULL; 213 } 214 } 215 216 struct xfs_buf * 217 _xfs_buf_alloc( 218 struct xfs_buftarg *target, 219 struct xfs_buf_map *map, 220 int nmaps, 221 xfs_buf_flags_t flags) 222 { 223 struct xfs_buf *bp; 224 int error; 225 int i; 226 227 bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS); 228 if (unlikely(!bp)) 229 return NULL; 230 231 /* 232 * We don't want certain flags to appear in b_flags unless they are 233 * specifically set by later operations on the buffer. 234 */ 235 flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); 236 237 atomic_set(&bp->b_hold, 1); 238 atomic_set(&bp->b_lru_ref, 1); 239 init_completion(&bp->b_iowait); 240 INIT_LIST_HEAD(&bp->b_lru); 241 INIT_LIST_HEAD(&bp->b_list); 242 INIT_LIST_HEAD(&bp->b_li_list); 243 sema_init(&bp->b_sema, 0); /* held, no waiters */ 244 spin_lock_init(&bp->b_lock); 245 bp->b_target = target; 246 bp->b_flags = flags; 247 248 /* 249 * Set length and io_length to the same value initially. 250 * I/O routines should use io_length, which will be the same in 251 * most cases but may be reset (e.g. XFS recovery).
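 *
 * Note that both values are counts of 512 byte basic blocks rather than
 * bytes; byte sizes are derived from them via BBTOB() where needed.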
252 */ 253 error = xfs_buf_get_maps(bp, nmaps); 254 if (error) { 255 kmem_zone_free(xfs_buf_zone, bp); 256 return NULL; 257 } 258 259 bp->b_bn = map[0].bm_bn; 260 bp->b_length = 0; 261 for (i = 0; i < nmaps; i++) { 262 bp->b_maps[i].bm_bn = map[i].bm_bn; 263 bp->b_maps[i].bm_len = map[i].bm_len; 264 bp->b_length += map[i].bm_len; 265 } 266 bp->b_io_length = bp->b_length; 267 268 atomic_set(&bp->b_pin_count, 0); 269 init_waitqueue_head(&bp->b_waiters); 270 271 XFS_STATS_INC(target->bt_mount, xb_create); 272 trace_xfs_buf_init(bp, _RET_IP_); 273 274 return bp; 275 } 276 277 /* 278 * Allocate a page array capable of holding a specified number 279 * of pages, and point the page buf at it. 280 */ 281 STATIC int 282 _xfs_buf_get_pages( 283 xfs_buf_t *bp, 284 int page_count) 285 { 286 /* Make sure that we have a page list */ 287 if (bp->b_pages == NULL) { 288 bp->b_page_count = page_count; 289 if (page_count <= XB_PAGES) { 290 bp->b_pages = bp->b_page_array; 291 } else { 292 bp->b_pages = kmem_alloc(sizeof(struct page *) * 293 page_count, KM_NOFS); 294 if (bp->b_pages == NULL) 295 return -ENOMEM; 296 } 297 memset(bp->b_pages, 0, sizeof(struct page *) * page_count); 298 } 299 return 0; 300 } 301 302 /* 303 * Frees b_pages if it was allocated. 304 */ 305 STATIC void 306 _xfs_buf_free_pages( 307 xfs_buf_t *bp) 308 { 309 if (bp->b_pages != bp->b_page_array) { 310 kmem_free(bp->b_pages); 311 bp->b_pages = NULL; 312 } 313 } 314 315 /* 316 * Releases the specified buffer. 317 * 318 * The modification state of any associated pages is left unchanged. 319 * The buffer must not be on any hash - use xfs_buf_rele instead for 320 * hashed and refcounted buffers 321 */ 322 void 323 xfs_buf_free( 324 xfs_buf_t *bp) 325 { 326 trace_xfs_buf_free(bp, _RET_IP_); 327 328 ASSERT(list_empty(&bp->b_lru)); 329 330 if (bp->b_flags & _XBF_PAGES) { 331 uint i; 332 333 if (xfs_buf_is_vmapped(bp)) 334 vm_unmap_ram(bp->b_addr - bp->b_offset, 335 bp->b_page_count); 336 337 for (i = 0; i < bp->b_page_count; i++) { 338 struct page *page = bp->b_pages[i]; 339 340 __free_page(page); 341 } 342 } else if (bp->b_flags & _XBF_KMEM) 343 kmem_free(bp->b_addr); 344 _xfs_buf_free_pages(bp); 345 xfs_buf_free_maps(bp); 346 kmem_zone_free(xfs_buf_zone, bp); 347 } 348 349 /* 350 * Allocates all the pages for buffer in question and builds it's page list. 351 */ 352 STATIC int 353 xfs_buf_allocate_memory( 354 xfs_buf_t *bp, 355 uint flags) 356 { 357 size_t size; 358 size_t nbytes, offset; 359 gfp_t gfp_mask = xb_to_gfp(flags); 360 unsigned short page_count, i; 361 xfs_off_t start, end; 362 int error; 363 364 /* 365 * for buffers that are contained within a single page, just allocate 366 * the memory from the heap - there's no need for the complexity of 367 * page arrays to keep allocation down to order 0. 
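 *
 * If the heap allocation happens to straddle a page boundary it is freed
 * again and we fall back to the alloc_page() path below, because a
 * heap-backed buffer must fit entirely within one page.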
368 */ 369 size = BBTOB(bp->b_length); 370 if (size < PAGE_SIZE) { 371 bp->b_addr = kmem_alloc(size, KM_NOFS); 372 if (!bp->b_addr) { 373 /* low memory - use alloc_page loop instead */ 374 goto use_alloc_page; 375 } 376 377 if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != 378 ((unsigned long)bp->b_addr & PAGE_MASK)) { 379 /* b_addr spans two pages - use alloc_page instead */ 380 kmem_free(bp->b_addr); 381 bp->b_addr = NULL; 382 goto use_alloc_page; 383 } 384 bp->b_offset = offset_in_page(bp->b_addr); 385 bp->b_pages = bp->b_page_array; 386 bp->b_pages[0] = virt_to_page(bp->b_addr); 387 bp->b_page_count = 1; 388 bp->b_flags |= _XBF_KMEM; 389 return 0; 390 } 391 392 use_alloc_page: 393 start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; 394 end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) 395 >> PAGE_SHIFT; 396 page_count = end - start; 397 error = _xfs_buf_get_pages(bp, page_count); 398 if (unlikely(error)) 399 return error; 400 401 offset = bp->b_offset; 402 bp->b_flags |= _XBF_PAGES; 403 404 for (i = 0; i < bp->b_page_count; i++) { 405 struct page *page; 406 uint retries = 0; 407 retry: 408 page = alloc_page(gfp_mask); 409 if (unlikely(page == NULL)) { 410 if (flags & XBF_READ_AHEAD) { 411 bp->b_page_count = i; 412 error = -ENOMEM; 413 goto out_free_pages; 414 } 415 416 /* 417 * This could deadlock. 418 * 419 * But until all the XFS lowlevel code is revamped to 420 * handle buffer allocation failures we can't do much. 421 */ 422 if (!(++retries % 100)) 423 xfs_err(NULL, 424 "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)", 425 current->comm, current->pid, 426 __func__, gfp_mask); 427 428 XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries); 429 congestion_wait(BLK_RW_ASYNC, HZ/50); 430 goto retry; 431 } 432 433 XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found); 434 435 nbytes = min_t(size_t, size, PAGE_SIZE - offset); 436 size -= nbytes; 437 bp->b_pages[i] = page; 438 offset = 0; 439 } 440 return 0; 441 442 out_free_pages: 443 for (i = 0; i < bp->b_page_count; i++) 444 __free_page(bp->b_pages[i]); 445 bp->b_flags &= ~_XBF_PAGES; 446 return error; 447 } 448 449 /* 450 * Map buffer into kernel address-space if necessary. 451 */ 452 STATIC int 453 _xfs_buf_map_pages( 454 xfs_buf_t *bp, 455 uint flags) 456 { 457 ASSERT(bp->b_flags & _XBF_PAGES); 458 if (bp->b_page_count == 1) { 459 /* A single page buffer is always mappable */ 460 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 461 } else if (flags & XBF_UNMAPPED) { 462 bp->b_addr = NULL; 463 } else { 464 int retried = 0; 465 unsigned nofs_flag; 466 467 /* 468 * vm_map_ram() will allocate auxillary structures (e.g. 469 * pagetables) with GFP_KERNEL, yet we are likely to be under 470 * GFP_NOFS context here. Hence we need to tell memory reclaim 471 * that we are in such a context via PF_MEMALLOC_NOFS to prevent 472 * memory reclaim re-entering the filesystem here and 473 * potentially deadlocking. 
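 *
 * memalloc_nofs_save() marks the task so that allocations made inside
 * vm_map_ram() are implicitly treated as GFP_NOFS while the mapping
 * attempts run; memalloc_nofs_restore() drops that state again afterwards.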
474 */ 475 nofs_flag = memalloc_nofs_save(); 476 do { 477 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 478 -1, PAGE_KERNEL); 479 if (bp->b_addr) 480 break; 481 vm_unmap_aliases(); 482 } while (retried++ <= 1); 483 memalloc_nofs_restore(nofs_flag); 484 485 if (!bp->b_addr) 486 return -ENOMEM; 487 bp->b_addr += bp->b_offset; 488 } 489 490 return 0; 491 } 492 493 /* 494 * Finding and Reading Buffers 495 */ 496 static int 497 _xfs_buf_obj_cmp( 498 struct rhashtable_compare_arg *arg, 499 const void *obj) 500 { 501 const struct xfs_buf_map *map = arg->key; 502 const struct xfs_buf *bp = obj; 503 504 /* 505 * The key hashing in the lookup path depends on the key being the 506 * first element of the compare_arg, make sure to assert this. 507 */ 508 BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); 509 510 if (bp->b_bn != map->bm_bn) 511 return 1; 512 513 if (unlikely(bp->b_length != map->bm_len)) { 514 /* 515 * found a block number match. If the range doesn't 516 * match, the only way this is allowed is if the buffer 517 * in the cache is stale and the transaction that made 518 * it stale has not yet committed. i.e. we are 519 * reallocating a busy extent. Skip this buffer and 520 * continue searching for an exact match. 521 */ 522 ASSERT(bp->b_flags & XBF_STALE); 523 return 1; 524 } 525 return 0; 526 } 527 528 static const struct rhashtable_params xfs_buf_hash_params = { 529 .min_size = 32, /* empty AGs have minimal footprint */ 530 .nelem_hint = 16, 531 .key_len = sizeof(xfs_daddr_t), 532 .key_offset = offsetof(struct xfs_buf, b_bn), 533 .head_offset = offsetof(struct xfs_buf, b_rhash_head), 534 .automatic_shrinking = true, 535 .obj_cmpfn = _xfs_buf_obj_cmp, 536 }; 537 538 int 539 xfs_buf_hash_init( 540 struct xfs_perag *pag) 541 { 542 spin_lock_init(&pag->pag_buf_lock); 543 return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params); 544 } 545 546 void 547 xfs_buf_hash_destroy( 548 struct xfs_perag *pag) 549 { 550 rhashtable_destroy(&pag->pag_buf_hash); 551 } 552 553 /* 554 * Look up a buffer in the buffer cache and return it referenced and locked 555 * in @found_bp. 556 * 557 * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the 558 * cache. 559 * 560 * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return 561 * -EAGAIN if we fail to lock it. 562 * 563 * Return values are: 564 * -EFSCORRUPTED if have been supplied with an invalid address 565 * -EAGAIN on trylock failure 566 * -ENOENT if we fail to find a match and @new_bp was NULL 567 * 0, with @found_bp: 568 * - @new_bp if we inserted it into the cache 569 * - the buffer we found and locked. 570 */ 571 static int 572 xfs_buf_find( 573 struct xfs_buftarg *btp, 574 struct xfs_buf_map *map, 575 int nmaps, 576 xfs_buf_flags_t flags, 577 struct xfs_buf *new_bp, 578 struct xfs_buf **found_bp) 579 { 580 struct xfs_perag *pag; 581 xfs_buf_t *bp; 582 struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; 583 xfs_daddr_t eofs; 584 int i; 585 586 *found_bp = NULL; 587 588 for (i = 0; i < nmaps; i++) 589 cmap.bm_len += map[i].bm_len; 590 591 /* Check for IOs smaller than the sector size / not sector aligned */ 592 ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize)); 593 ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); 594 595 /* 596 * Corrupted block numbers can get through to here, unfortunately, so we 597 * have to check that the buffer falls within the filesystem bounds. 
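 *
 * eofs below is the size of the filesystem in 512 byte basic blocks, so a
 * mapping that starts at a negative daddr or at/beyond eofs cannot belong
 * to this filesystem and is rejected as corruption.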
598 */ 599 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); 600 if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) { 601 xfs_alert(btp->bt_mount, 602 "%s: daddr 0x%llx out of range, EOFS 0x%llx", 603 __func__, cmap.bm_bn, eofs); 604 WARN_ON(1); 605 return -EFSCORRUPTED; 606 } 607 608 pag = xfs_perag_get(btp->bt_mount, 609 xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); 610 611 spin_lock(&pag->pag_buf_lock); 612 bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap, 613 xfs_buf_hash_params); 614 if (bp) { 615 atomic_inc(&bp->b_hold); 616 goto found; 617 } 618 619 /* No match found */ 620 if (!new_bp) { 621 XFS_STATS_INC(btp->bt_mount, xb_miss_locked); 622 spin_unlock(&pag->pag_buf_lock); 623 xfs_perag_put(pag); 624 return -ENOENT; 625 } 626 627 /* the buffer keeps the perag reference until it is freed */ 628 new_bp->b_pag = pag; 629 rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head, 630 xfs_buf_hash_params); 631 spin_unlock(&pag->pag_buf_lock); 632 *found_bp = new_bp; 633 return 0; 634 635 found: 636 spin_unlock(&pag->pag_buf_lock); 637 xfs_perag_put(pag); 638 639 if (!xfs_buf_trylock(bp)) { 640 if (flags & XBF_TRYLOCK) { 641 xfs_buf_rele(bp); 642 XFS_STATS_INC(btp->bt_mount, xb_busy_locked); 643 return -EAGAIN; 644 } 645 xfs_buf_lock(bp); 646 XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited); 647 } 648 649 /* 650 * if the buffer is stale, clear all the external state associated with 651 * it. We need to keep flags such as how we allocated the buffer memory 652 * intact here. 653 */ 654 if (bp->b_flags & XBF_STALE) { 655 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 656 ASSERT(bp->b_iodone == NULL); 657 bp->b_flags &= _XBF_KMEM | _XBF_PAGES; 658 bp->b_ops = NULL; 659 } 660 661 trace_xfs_buf_find(bp, flags, _RET_IP_); 662 XFS_STATS_INC(btp->bt_mount, xb_get_locked); 663 *found_bp = bp; 664 return 0; 665 } 666 667 struct xfs_buf * 668 xfs_buf_incore( 669 struct xfs_buftarg *target, 670 xfs_daddr_t blkno, 671 size_t numblks, 672 xfs_buf_flags_t flags) 673 { 674 struct xfs_buf *bp; 675 int error; 676 DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); 677 678 error = xfs_buf_find(target, &map, 1, flags, NULL, &bp); 679 if (error) 680 return NULL; 681 return bp; 682 } 683 684 /* 685 * Assembles a buffer covering the specified range. The code is optimised for 686 * cache hits, as metadata intensive workloads will see 3 orders of magnitude 687 * more hits than misses. 688 */ 689 struct xfs_buf * 690 xfs_buf_get_map( 691 struct xfs_buftarg *target, 692 struct xfs_buf_map *map, 693 int nmaps, 694 xfs_buf_flags_t flags) 695 { 696 struct xfs_buf *bp; 697 struct xfs_buf *new_bp; 698 int error = 0; 699 700 error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp); 701 702 switch (error) { 703 case 0: 704 /* cache hit */ 705 goto found; 706 case -EAGAIN: 707 /* cache hit, trylock failure, caller handles failure */ 708 ASSERT(flags & XBF_TRYLOCK); 709 return NULL; 710 case -ENOENT: 711 /* cache miss, go for insert */ 712 break; 713 case -EFSCORRUPTED: 714 default: 715 /* 716 * None of the higher layers understand failure types 717 * yet, so return NULL to signal a fatal lookup error. 
718 */ 719 return NULL; 720 } 721 722 new_bp = _xfs_buf_alloc(target, map, nmaps, flags); 723 if (unlikely(!new_bp)) 724 return NULL; 725 726 error = xfs_buf_allocate_memory(new_bp, flags); 727 if (error) { 728 xfs_buf_free(new_bp); 729 return NULL; 730 } 731 732 error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp); 733 if (error) { 734 xfs_buf_free(new_bp); 735 return NULL; 736 } 737 738 if (bp != new_bp) 739 xfs_buf_free(new_bp); 740 741 found: 742 if (!bp->b_addr) { 743 error = _xfs_buf_map_pages(bp, flags); 744 if (unlikely(error)) { 745 xfs_warn(target->bt_mount, 746 "%s: failed to map pages", __func__); 747 xfs_buf_relse(bp); 748 return NULL; 749 } 750 } 751 752 /* 753 * Clear b_error if this is a lookup from a caller that doesn't expect 754 * valid data to be found in the buffer. 755 */ 756 if (!(flags & XBF_READ)) 757 xfs_buf_ioerror(bp, 0); 758 759 XFS_STATS_INC(target->bt_mount, xb_get); 760 trace_xfs_buf_get(bp, flags, _RET_IP_); 761 return bp; 762 } 763 764 STATIC int 765 _xfs_buf_read( 766 xfs_buf_t *bp, 767 xfs_buf_flags_t flags) 768 { 769 ASSERT(!(flags & XBF_WRITE)); 770 ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); 771 772 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); 773 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); 774 775 return xfs_buf_submit(bp); 776 } 777 778 /* 779 * Reverify a buffer found in cache without an attached ->b_ops. 780 * 781 * If the caller passed an ops structure and the buffer doesn't have ops 782 * assigned, set the ops and use it to verify the contents. If verification 783 * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is 784 * already in XBF_DONE state on entry. 785 * 786 * Under normal operations, every in-core buffer is verified on read I/O 787 * completion. There are two scenarios that can lead to in-core buffers without 788 * an assigned ->b_ops. The first is during log recovery of buffers on a V4 789 * filesystem, though these buffers are purged at the end of recovery. The 790 * other is online repair, which intentionally reads with a NULL buffer ops to 791 * run several verifiers across an in-core buffer in order to establish buffer 792 * type. If repair can't establish that, the buffer will be left in memory 793 * with NULL buffer ops.
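 *
 * xfs_buf_read_map() below relies on this for cache hits: a buffer that is
 * already XBF_DONE is re-verified against the caller supplied ops instead
 * of being read again from disk.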
794 */ 795 int 796 xfs_buf_reverify( 797 struct xfs_buf *bp, 798 const struct xfs_buf_ops *ops) 799 { 800 ASSERT(bp->b_flags & XBF_DONE); 801 ASSERT(bp->b_error == 0); 802 803 if (!ops || bp->b_ops) 804 return 0; 805 806 bp->b_ops = ops; 807 bp->b_ops->verify_read(bp); 808 if (bp->b_error) 809 bp->b_flags &= ~XBF_DONE; 810 return bp->b_error; 811 } 812 813 xfs_buf_t * 814 xfs_buf_read_map( 815 struct xfs_buftarg *target, 816 struct xfs_buf_map *map, 817 int nmaps, 818 xfs_buf_flags_t flags, 819 const struct xfs_buf_ops *ops) 820 { 821 struct xfs_buf *bp; 822 823 flags |= XBF_READ; 824 825 bp = xfs_buf_get_map(target, map, nmaps, flags); 826 if (!bp) 827 return NULL; 828 829 trace_xfs_buf_read(bp, flags, _RET_IP_); 830 831 if (!(bp->b_flags & XBF_DONE)) { 832 XFS_STATS_INC(target->bt_mount, xb_get_read); 833 bp->b_ops = ops; 834 _xfs_buf_read(bp, flags); 835 return bp; 836 } 837 838 xfs_buf_reverify(bp, ops); 839 840 if (flags & XBF_ASYNC) { 841 /* 842 * Read ahead call which is already satisfied, 843 * drop the buffer 844 */ 845 xfs_buf_relse(bp); 846 return NULL; 847 } 848 849 /* We do not want read in the flags */ 850 bp->b_flags &= ~XBF_READ; 851 ASSERT(bp->b_ops != NULL || ops == NULL); 852 return bp; 853 } 854 855 /* 856 * If we are not low on memory then do the readahead in a deadlock 857 * safe manner. 858 */ 859 void 860 xfs_buf_readahead_map( 861 struct xfs_buftarg *target, 862 struct xfs_buf_map *map, 863 int nmaps, 864 const struct xfs_buf_ops *ops) 865 { 866 if (bdi_read_congested(target->bt_bdev->bd_bdi)) 867 return; 868 869 xfs_buf_read_map(target, map, nmaps, 870 XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); 871 } 872 873 /* 874 * Read an uncached buffer from disk. Allocates and returns a locked 875 * buffer containing the disk contents or nothing. 876 */ 877 int 878 xfs_buf_read_uncached( 879 struct xfs_buftarg *target, 880 xfs_daddr_t daddr, 881 size_t numblks, 882 int flags, 883 struct xfs_buf **bpp, 884 const struct xfs_buf_ops *ops) 885 { 886 struct xfs_buf *bp; 887 888 *bpp = NULL; 889 890 bp = xfs_buf_get_uncached(target, numblks, flags); 891 if (!bp) 892 return -ENOMEM; 893 894 /* set up the buffer for a read IO */ 895 ASSERT(bp->b_map_count == 1); 896 bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */ 897 bp->b_maps[0].bm_bn = daddr; 898 bp->b_flags |= XBF_READ; 899 bp->b_ops = ops; 900 901 xfs_buf_submit(bp); 902 if (bp->b_error) { 903 int error = bp->b_error; 904 xfs_buf_relse(bp); 905 return error; 906 } 907 908 *bpp = bp; 909 return 0; 910 } 911 912 /* 913 * Return a buffer allocated as an empty buffer and associated to external 914 * memory via xfs_buf_associate_memory() back to it's empty state. 
915 */ 916 void 917 xfs_buf_set_empty( 918 struct xfs_buf *bp, 919 size_t numblks) 920 { 921 if (bp->b_pages) 922 _xfs_buf_free_pages(bp); 923 924 bp->b_pages = NULL; 925 bp->b_page_count = 0; 926 bp->b_addr = NULL; 927 bp->b_length = numblks; 928 bp->b_io_length = numblks; 929 930 ASSERT(bp->b_map_count == 1); 931 bp->b_bn = XFS_BUF_DADDR_NULL; 932 bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL; 933 bp->b_maps[0].bm_len = bp->b_length; 934 } 935 936 static inline struct page * 937 mem_to_page( 938 void *addr) 939 { 940 if ((!is_vmalloc_addr(addr))) { 941 return virt_to_page(addr); 942 } else { 943 return vmalloc_to_page(addr); 944 } 945 } 946 947 int 948 xfs_buf_associate_memory( 949 xfs_buf_t *bp, 950 void *mem, 951 size_t len) 952 { 953 int rval; 954 int i = 0; 955 unsigned long pageaddr; 956 unsigned long offset; 957 size_t buflen; 958 int page_count; 959 960 pageaddr = (unsigned long)mem & PAGE_MASK; 961 offset = (unsigned long)mem - pageaddr; 962 buflen = PAGE_ALIGN(len + offset); 963 page_count = buflen >> PAGE_SHIFT; 964 965 /* Free any previous set of page pointers */ 966 if (bp->b_pages) 967 _xfs_buf_free_pages(bp); 968 969 bp->b_pages = NULL; 970 bp->b_addr = mem; 971 972 rval = _xfs_buf_get_pages(bp, page_count); 973 if (rval) 974 return rval; 975 976 bp->b_offset = offset; 977 978 for (i = 0; i < bp->b_page_count; i++) { 979 bp->b_pages[i] = mem_to_page((void *)pageaddr); 980 pageaddr += PAGE_SIZE; 981 } 982 983 bp->b_io_length = BTOBB(len); 984 bp->b_length = BTOBB(buflen); 985 986 return 0; 987 } 988 989 xfs_buf_t * 990 xfs_buf_get_uncached( 991 struct xfs_buftarg *target, 992 size_t numblks, 993 int flags) 994 { 995 unsigned long page_count; 996 int error, i; 997 struct xfs_buf *bp; 998 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); 999 1000 /* flags might contain irrelevant bits, pass only what we care about */ 1001 bp = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT); 1002 if (unlikely(bp == NULL)) 1003 goto fail; 1004 1005 page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT; 1006 error = _xfs_buf_get_pages(bp, page_count); 1007 if (error) 1008 goto fail_free_buf; 1009 1010 for (i = 0; i < page_count; i++) { 1011 bp->b_pages[i] = alloc_page(xb_to_gfp(flags)); 1012 if (!bp->b_pages[i]) 1013 goto fail_free_mem; 1014 } 1015 bp->b_flags |= _XBF_PAGES; 1016 1017 error = _xfs_buf_map_pages(bp, 0); 1018 if (unlikely(error)) { 1019 xfs_warn(target->bt_mount, 1020 "%s: failed to map pages", __func__); 1021 goto fail_free_mem; 1022 } 1023 1024 trace_xfs_buf_get_uncached(bp, _RET_IP_); 1025 return bp; 1026 1027 fail_free_mem: 1028 while (--i >= 0) 1029 __free_page(bp->b_pages[i]); 1030 _xfs_buf_free_pages(bp); 1031 fail_free_buf: 1032 xfs_buf_free_maps(bp); 1033 kmem_zone_free(xfs_buf_zone, bp); 1034 fail: 1035 return NULL; 1036 } 1037 1038 /* 1039 * Increment reference count on buffer, to hold the buffer concurrently 1040 * with another thread which may release (free) the buffer asynchronously. 1041 * Must hold the buffer already to call this function. 1042 */ 1043 void 1044 xfs_buf_hold( 1045 xfs_buf_t *bp) 1046 { 1047 trace_xfs_buf_hold(bp, _RET_IP_); 1048 atomic_inc(&bp->b_hold); 1049 } 1050 1051 /* 1052 * Release a hold on the specified buffer. If the hold count is 1, the buffer is 1053 * placed on LRU or freed (depending on b_lru_ref). 
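 *
 * Uncached buffers have no perag attached and are never placed on the LRU,
 * so dropping the last hold on one of those frees it immediately (the !pag
 * branch below).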
1054 */ 1055 void 1056 xfs_buf_rele( 1057 xfs_buf_t *bp) 1058 { 1059 struct xfs_perag *pag = bp->b_pag; 1060 bool release; 1061 bool freebuf = false; 1062 1063 trace_xfs_buf_rele(bp, _RET_IP_); 1064 1065 if (!pag) { 1066 ASSERT(list_empty(&bp->b_lru)); 1067 if (atomic_dec_and_test(&bp->b_hold)) { 1068 xfs_buf_ioacct_dec(bp); 1069 xfs_buf_free(bp); 1070 } 1071 return; 1072 } 1073 1074 ASSERT(atomic_read(&bp->b_hold) > 0); 1075 1076 /* 1077 * We grab the b_lock here first to serialise racing xfs_buf_rele() 1078 * calls. The pag_buf_lock being taken on the last reference only 1079 * serialises against racing lookups in xfs_buf_find(). IOWs, the second 1080 * to last reference we drop here is not serialised against the last 1081 * reference until we take bp->b_lock. Hence if we don't grab b_lock 1082 * first, the last "release" reference can win the race to the lock and 1083 * free the buffer before the second-to-last reference is processed, 1084 * leading to a use-after-free scenario. 1085 */ 1086 spin_lock(&bp->b_lock); 1087 release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); 1088 if (!release) { 1089 /* 1090 * Drop the in-flight state if the buffer is already on the LRU 1091 * and it holds the only reference. This is racy because we 1092 * haven't acquired the pag lock, but the use of XFS_BSTATE_IN_FLIGHT 1093 * ensures the decrement occurs only once per-buf. 1094 */ 1095 if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) 1096 __xfs_buf_ioacct_dec(bp); 1097 goto out_unlock; 1098 } 1099 1100 /* the last reference has been dropped ... */ 1101 __xfs_buf_ioacct_dec(bp); 1102 if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { 1103 /* 1104 * If the buffer is added to the LRU take a new reference to the 1105 * buffer for the LRU and clear the (now stale) dispose list 1106 * state flag 1107 */ 1108 if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) { 1109 bp->b_state &= ~XFS_BSTATE_DISPOSE; 1110 atomic_inc(&bp->b_hold); 1111 } 1112 spin_unlock(&pag->pag_buf_lock); 1113 } else { 1114 /* 1115 * most of the time buffers will already be removed from the 1116 * LRU, so optimise that case by checking for the 1117 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer 1118 * was on was the disposal list 1119 */ 1120 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { 1121 list_lru_del(&bp->b_target->bt_lru, &bp->b_lru); 1122 } else { 1123 ASSERT(list_empty(&bp->b_lru)); 1124 } 1125 1126 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1127 rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head, 1128 xfs_buf_hash_params); 1129 spin_unlock(&pag->pag_buf_lock); 1130 xfs_perag_put(pag); 1131 freebuf = true; 1132 } 1133 1134 out_unlock: 1135 spin_unlock(&bp->b_lock); 1136 1137 if (freebuf) 1138 xfs_buf_free(bp); 1139 } 1140 1141 1142 /* 1143 * Lock a buffer object, if it is not already locked. 1144 * 1145 * If we come across a stale, pinned, locked buffer, we know that we are 1146 * being asked to lock a buffer that has been reallocated. Because it is 1147 * pinned, we know that the log has not been pushed to disk and hence it 1148 * will still be locked. Rather than continuing to have trylock attempts 1149 * fail until someone else pushes the log, push it ourselves before 1150 * returning. This means that the xfsaild will not get stuck trying 1151 * to push on stale inode buffers.
1152 */ 1153 int 1154 xfs_buf_trylock( 1155 struct xfs_buf *bp) 1156 { 1157 int locked; 1158 1159 locked = down_trylock(&bp->b_sema) == 0; 1160 if (locked) 1161 trace_xfs_buf_trylock(bp, _RET_IP_); 1162 else 1163 trace_xfs_buf_trylock_fail(bp, _RET_IP_); 1164 return locked; 1165 } 1166 1167 /* 1168 * Lock a buffer object. 1169 * 1170 * If we come across a stale, pinned, locked buffer, we know that we 1171 * are being asked to lock a buffer that has been reallocated. Because 1172 * it is pinned, we know that the log has not been pushed to disk and 1173 * hence it will still be locked. Rather than sleeping until someone 1174 * else pushes the log, push it ourselves before trying to get the lock. 1175 */ 1176 void 1177 xfs_buf_lock( 1178 struct xfs_buf *bp) 1179 { 1180 trace_xfs_buf_lock(bp, _RET_IP_); 1181 1182 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 1183 xfs_log_force(bp->b_target->bt_mount, 0); 1184 down(&bp->b_sema); 1185 1186 trace_xfs_buf_lock_done(bp, _RET_IP_); 1187 } 1188 1189 void 1190 xfs_buf_unlock( 1191 struct xfs_buf *bp) 1192 { 1193 ASSERT(xfs_buf_islocked(bp)); 1194 1195 up(&bp->b_sema); 1196 trace_xfs_buf_unlock(bp, _RET_IP_); 1197 } 1198 1199 STATIC void 1200 xfs_buf_wait_unpin( 1201 xfs_buf_t *bp) 1202 { 1203 DECLARE_WAITQUEUE (wait, current); 1204 1205 if (atomic_read(&bp->b_pin_count) == 0) 1206 return; 1207 1208 add_wait_queue(&bp->b_waiters, &wait); 1209 for (;;) { 1210 set_current_state(TASK_UNINTERRUPTIBLE); 1211 if (atomic_read(&bp->b_pin_count) == 0) 1212 break; 1213 io_schedule(); 1214 } 1215 remove_wait_queue(&bp->b_waiters, &wait); 1216 set_current_state(TASK_RUNNING); 1217 } 1218 1219 /* 1220 * Buffer Utility Routines 1221 */ 1222 1223 void 1224 xfs_buf_ioend( 1225 struct xfs_buf *bp) 1226 { 1227 bool read = bp->b_flags & XBF_READ; 1228 1229 trace_xfs_buf_iodone(bp, _RET_IP_); 1230 1231 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); 1232 1233 /* 1234 * Pull in IO completion errors now. We are guaranteed to be running 1235 * single threaded, so we don't need the lock to read b_io_error. 
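 *
 * b_io_error is set by the bio completion handler using cmpxchg() so that
 * only the first failure of a multi-bio buffer is recorded; it is folded
 * into b_error here, which is the field callers actually inspect.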
1236 */ 1237 if (!bp->b_error && bp->b_io_error) 1238 xfs_buf_ioerror(bp, bp->b_io_error); 1239 1240 /* Only validate buffers that were read without errors */ 1241 if (read && !bp->b_error && bp->b_ops) { 1242 ASSERT(!bp->b_iodone); 1243 bp->b_ops->verify_read(bp); 1244 } 1245 1246 if (!bp->b_error) 1247 bp->b_flags |= XBF_DONE; 1248 1249 if (bp->b_iodone) 1250 (*(bp->b_iodone))(bp); 1251 else if (bp->b_flags & XBF_ASYNC) 1252 xfs_buf_relse(bp); 1253 else 1254 complete(&bp->b_iowait); 1255 } 1256 1257 static void 1258 xfs_buf_ioend_work( 1259 struct work_struct *work) 1260 { 1261 struct xfs_buf *bp = 1262 container_of(work, xfs_buf_t, b_ioend_work); 1263 1264 xfs_buf_ioend(bp); 1265 } 1266 1267 static void 1268 xfs_buf_ioend_async( 1269 struct xfs_buf *bp) 1270 { 1271 INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); 1272 queue_work(bp->b_ioend_wq, &bp->b_ioend_work); 1273 } 1274 1275 void 1276 __xfs_buf_ioerror( 1277 xfs_buf_t *bp, 1278 int error, 1279 xfs_failaddr_t failaddr) 1280 { 1281 ASSERT(error <= 0 && error >= -1000); 1282 bp->b_error = error; 1283 trace_xfs_buf_ioerror(bp, error, failaddr); 1284 } 1285 1286 void 1287 xfs_buf_ioerror_alert( 1288 struct xfs_buf *bp, 1289 const char *func) 1290 { 1291 xfs_alert(bp->b_target->bt_mount, 1292 "metadata I/O error in \"%s\" at daddr 0x%llx len %d error %d", 1293 func, (uint64_t)XFS_BUF_ADDR(bp), bp->b_length, 1294 -bp->b_error); 1295 } 1296 1297 int 1298 xfs_bwrite( 1299 struct xfs_buf *bp) 1300 { 1301 int error; 1302 1303 ASSERT(xfs_buf_islocked(bp)); 1304 1305 bp->b_flags |= XBF_WRITE; 1306 bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | 1307 XBF_WRITE_FAIL | XBF_DONE); 1308 1309 error = xfs_buf_submit(bp); 1310 if (error) { 1311 xfs_force_shutdown(bp->b_target->bt_mount, 1312 SHUTDOWN_META_IO_ERROR); 1313 } 1314 return error; 1315 } 1316 1317 static void 1318 xfs_buf_bio_end_io( 1319 struct bio *bio) 1320 { 1321 struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private; 1322 1323 /* 1324 * don't overwrite existing errors - otherwise we can lose errors on 1325 * buffers that require multiple bios to complete. 1326 */ 1327 if (bio->bi_status) { 1328 int error = blk_status_to_errno(bio->bi_status); 1329 1330 cmpxchg(&bp->b_io_error, 0, error); 1331 } 1332 1333 if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1334 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1335 1336 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) 1337 xfs_buf_ioend_async(bp); 1338 bio_put(bio); 1339 } 1340 1341 static void 1342 xfs_buf_ioapply_map( 1343 struct xfs_buf *bp, 1344 int map, 1345 int *buf_offset, 1346 int *count, 1347 int op, 1348 int op_flags) 1349 { 1350 int page_index; 1351 int total_nr_pages = bp->b_page_count; 1352 int nr_pages; 1353 struct bio *bio; 1354 sector_t sector = bp->b_maps[map].bm_bn; 1355 int size; 1356 int offset; 1357 1358 /* skip the pages in the buffer before the start offset */ 1359 page_index = 0; 1360 offset = *buf_offset; 1361 while (offset >= PAGE_SIZE) { 1362 page_index++; 1363 offset -= PAGE_SIZE; 1364 } 1365 1366 /* 1367 * Limit the IO size to the length of the current vector, and update the 1368 * remaining IO count for the next time around. 
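 *
 * Note that bm_len is in 512 byte basic blocks while *count and size are
 * byte counts, hence the BBTOB() conversion below.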
1369 */ 1370 size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count); 1371 *count -= size; 1372 *buf_offset += size; 1373 1374 next_chunk: 1375 atomic_inc(&bp->b_io_remaining); 1376 nr_pages = min(total_nr_pages, BIO_MAX_PAGES); 1377 1378 bio = bio_alloc(GFP_NOIO, nr_pages); 1379 bio_set_dev(bio, bp->b_target->bt_bdev); 1380 bio->bi_iter.bi_sector = sector; 1381 bio->bi_end_io = xfs_buf_bio_end_io; 1382 bio->bi_private = bp; 1383 bio_set_op_attrs(bio, op, op_flags); 1384 1385 for (; size && nr_pages; nr_pages--, page_index++) { 1386 int rbytes, nbytes = PAGE_SIZE - offset; 1387 1388 if (nbytes > size) 1389 nbytes = size; 1390 1391 rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes, 1392 offset); 1393 if (rbytes < nbytes) 1394 break; 1395 1396 offset = 0; 1397 sector += BTOBB(nbytes); 1398 size -= nbytes; 1399 total_nr_pages--; 1400 } 1401 1402 if (likely(bio->bi_iter.bi_size)) { 1403 if (xfs_buf_is_vmapped(bp)) { 1404 flush_kernel_vmap_range(bp->b_addr, 1405 xfs_buf_vmap_len(bp)); 1406 } 1407 submit_bio(bio); 1408 if (size) 1409 goto next_chunk; 1410 } else { 1411 /* 1412 * This is guaranteed not to be the last io reference count 1413 * because the caller (xfs_buf_submit) holds a count itself. 1414 */ 1415 atomic_dec(&bp->b_io_remaining); 1416 xfs_buf_ioerror(bp, -EIO); 1417 bio_put(bio); 1418 } 1419 1420 } 1421 1422 STATIC void 1423 _xfs_buf_ioapply( 1424 struct xfs_buf *bp) 1425 { 1426 struct blk_plug plug; 1427 int op; 1428 int op_flags = 0; 1429 int offset; 1430 int size; 1431 int i; 1432 1433 /* 1434 * Make sure we capture only current IO errors rather than stale errors 1435 * left over from previous use of the buffer (e.g. failed readahead). 1436 */ 1437 bp->b_error = 0; 1438 1439 /* 1440 * Initialize the I/O completion workqueue if we haven't yet or the 1441 * submitter has not opted to specify a custom one. 1442 */ 1443 if (!bp->b_ioend_wq) 1444 bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue; 1445 1446 if (bp->b_flags & XBF_WRITE) { 1447 op = REQ_OP_WRITE; 1448 if (bp->b_flags & XBF_SYNCIO) 1449 op_flags = REQ_SYNC; 1450 if (bp->b_flags & XBF_FUA) 1451 op_flags |= REQ_FUA; 1452 if (bp->b_flags & XBF_FLUSH) 1453 op_flags |= REQ_PREFLUSH; 1454 1455 /* 1456 * Run the write verifier callback function if it exists. If 1457 * this function fails it will mark the buffer with an error and 1458 * the IO should not be dispatched. 1459 */ 1460 if (bp->b_ops) { 1461 bp->b_ops->verify_write(bp); 1462 if (bp->b_error) { 1463 xfs_force_shutdown(bp->b_target->bt_mount, 1464 SHUTDOWN_CORRUPT_INCORE); 1465 return; 1466 } 1467 } else if (bp->b_bn != XFS_BUF_DADDR_NULL) { 1468 struct xfs_mount *mp = bp->b_target->bt_mount; 1469 1470 /* 1471 * non-crc filesystems don't attach verifiers during 1472 * log recovery, so don't warn for such filesystems. 1473 */ 1474 if (xfs_sb_version_hascrc(&mp->m_sb)) { 1475 xfs_warn(mp, 1476 "%s: no buf ops on daddr 0x%llx len %d", 1477 __func__, bp->b_bn, bp->b_length); 1478 xfs_hex_dump(bp->b_addr, 1479 XFS_CORRUPTION_DUMP_LEN); 1480 dump_stack(); 1481 } 1482 } 1483 } else if (bp->b_flags & XBF_READ_AHEAD) { 1484 op = REQ_OP_READ; 1485 op_flags = REQ_RAHEAD; 1486 } else { 1487 op = REQ_OP_READ; 1488 } 1489 1490 /* we only use the buffer cache for meta-data */ 1491 op_flags |= REQ_META; 1492 1493 /* 1494 * Walk all the vectors issuing IO on them. Set up the initial offset 1495 * into the buffer and the desired IO size before we start - 1496 * xfs_buf_ioapply_map() will modify them appropriately for each 1497 * subsequent call.
1498 */ 1499 offset = bp->b_offset; 1500 size = BBTOB(bp->b_io_length); 1501 blk_start_plug(&plug); 1502 for (i = 0; i < bp->b_map_count; i++) { 1503 xfs_buf_ioapply_map(bp, i, &offset, &size, op, op_flags); 1504 if (bp->b_error) 1505 break; 1506 if (size <= 0) 1507 break; /* all done */ 1508 } 1509 blk_finish_plug(&plug); 1510 } 1511 1512 /* 1513 * Wait for I/O completion of a sync buffer and return the I/O error code. 1514 */ 1515 static int 1516 xfs_buf_iowait( 1517 struct xfs_buf *bp) 1518 { 1519 ASSERT(!(bp->b_flags & XBF_ASYNC)); 1520 1521 trace_xfs_buf_iowait(bp, _RET_IP_); 1522 wait_for_completion(&bp->b_iowait); 1523 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1524 1525 return bp->b_error; 1526 } 1527 1528 /* 1529 * Buffer I/O submission path, read or write. Asynchronous submission transfers 1530 * the buffer lock ownership and the current reference to the IO. It is not 1531 * safe to reference the buffer after a call to this function unless the caller 1532 * holds an additional reference itself. 1533 */ 1534 int 1535 __xfs_buf_submit( 1536 struct xfs_buf *bp, 1537 bool wait) 1538 { 1539 int error = 0; 1540 1541 trace_xfs_buf_submit(bp, _RET_IP_); 1542 1543 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1544 1545 /* on shutdown we stale and complete the buffer immediately */ 1546 if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { 1547 xfs_buf_ioerror(bp, -EIO); 1548 bp->b_flags &= ~XBF_DONE; 1549 xfs_buf_stale(bp); 1550 xfs_buf_ioend(bp); 1551 return -EIO; 1552 } 1553 1554 /* 1555 * Grab a reference so the buffer does not go away underneath us. For 1556 * async buffers, I/O completion drops the callers reference, which 1557 * could occur before submission returns. 1558 */ 1559 xfs_buf_hold(bp); 1560 1561 if (bp->b_flags & XBF_WRITE) 1562 xfs_buf_wait_unpin(bp); 1563 1564 /* clear the internal error state to avoid spurious errors */ 1565 bp->b_io_error = 0; 1566 1567 /* 1568 * Set the count to 1 initially, this will stop an I/O completion 1569 * callout which happens before we have started all the I/O from calling 1570 * xfs_buf_ioend too early. 1571 */ 1572 atomic_set(&bp->b_io_remaining, 1); 1573 if (bp->b_flags & XBF_ASYNC) 1574 xfs_buf_ioacct_inc(bp); 1575 _xfs_buf_ioapply(bp); 1576 1577 /* 1578 * If _xfs_buf_ioapply failed, we can get back here with only the IO 1579 * reference we took above. If we drop it to zero, run completion so 1580 * that we don't return to the caller with completion still pending. 1581 */ 1582 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1583 if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) 1584 xfs_buf_ioend(bp); 1585 else 1586 xfs_buf_ioend_async(bp); 1587 } 1588 1589 if (wait) 1590 error = xfs_buf_iowait(bp); 1591 1592 /* 1593 * Release the hold that keeps the buffer referenced for the entire 1594 * I/O. Note that if the buffer is async, it is not safe to reference 1595 * after this release. 1596 */ 1597 xfs_buf_rele(bp); 1598 return error; 1599 } 1600 1601 void * 1602 xfs_buf_offset( 1603 struct xfs_buf *bp, 1604 size_t offset) 1605 { 1606 struct page *page; 1607 1608 if (bp->b_addr) 1609 return bp->b_addr + offset; 1610 1611 offset += bp->b_offset; 1612 page = bp->b_pages[offset >> PAGE_SHIFT]; 1613 return page_address(page) + (offset & (PAGE_SIZE-1)); 1614 } 1615 1616 /* 1617 * Move data into or out of a buffer. 
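 *
 * The mode argument selects the direction: XBRW_READ copies from the
 * buffer into data, XBRW_WRITE copies from data into the buffer, and
 * XBRW_ZERO zeroes the requested range (data is not used in that case).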
1618 */ 1619 void 1620 xfs_buf_iomove( 1621 xfs_buf_t *bp, /* buffer to process */ 1622 size_t boff, /* starting buffer offset */ 1623 size_t bsize, /* length to copy */ 1624 void *data, /* data address */ 1625 xfs_buf_rw_t mode) /* read/write/zero flag */ 1626 { 1627 size_t bend; 1628 1629 bend = boff + bsize; 1630 while (boff < bend) { 1631 struct page *page; 1632 int page_index, page_offset, csize; 1633 1634 page_index = (boff + bp->b_offset) >> PAGE_SHIFT; 1635 page_offset = (boff + bp->b_offset) & ~PAGE_MASK; 1636 page = bp->b_pages[page_index]; 1637 csize = min_t(size_t, PAGE_SIZE - page_offset, 1638 BBTOB(bp->b_io_length) - boff); 1639 1640 ASSERT((csize + page_offset) <= PAGE_SIZE); 1641 1642 switch (mode) { 1643 case XBRW_ZERO: 1644 memset(page_address(page) + page_offset, 0, csize); 1645 break; 1646 case XBRW_READ: 1647 memcpy(data, page_address(page) + page_offset, csize); 1648 break; 1649 case XBRW_WRITE: 1650 memcpy(page_address(page) + page_offset, data, csize); 1651 } 1652 1653 boff += csize; 1654 data += csize; 1655 } 1656 } 1657 1658 /* 1659 * Handling of buffer targets (buftargs). 1660 */ 1661 1662 /* 1663 * Wait for any bufs with callbacks that have been submitted but have not yet 1664 * returned. These buffers will have an elevated hold count, so wait on those 1665 * while freeing all the buffers only held by the LRU. 1666 */ 1667 static enum lru_status 1668 xfs_buftarg_wait_rele( 1669 struct list_head *item, 1670 struct list_lru_one *lru, 1671 spinlock_t *lru_lock, 1672 void *arg) 1673 1674 { 1675 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1676 struct list_head *dispose = arg; 1677 1678 if (atomic_read(&bp->b_hold) > 1) { 1679 /* need to wait, so skip it this pass */ 1680 trace_xfs_buf_wait_buftarg(bp, _RET_IP_); 1681 return LRU_SKIP; 1682 } 1683 if (!spin_trylock(&bp->b_lock)) 1684 return LRU_SKIP; 1685 1686 /* 1687 * clear the LRU reference count so the buffer doesn't get 1688 * ignored in xfs_buf_rele(). 1689 */ 1690 atomic_set(&bp->b_lru_ref, 0); 1691 bp->b_state |= XFS_BSTATE_DISPOSE; 1692 list_lru_isolate_move(lru, item, dispose); 1693 spin_unlock(&bp->b_lock); 1694 return LRU_REMOVED; 1695 } 1696 1697 void 1698 xfs_wait_buftarg( 1699 struct xfs_buftarg *btp) 1700 { 1701 LIST_HEAD(dispose); 1702 int loop = 0; 1703 1704 /* 1705 * First wait on the buftarg I/O count for all in-flight buffers to be 1706 * released. This is critical as new buffers do not make the LRU until 1707 * they are released. 1708 * 1709 * Next, flush the buffer workqueue to ensure all completion processing 1710 * has finished. Just waiting on buffer locks is not sufficient for 1711 * async IO as the reference count held over IO is not released until 1712 * after the buffer lock is dropped. Hence we need to ensure here that 1713 * all reference counts have been dropped before we start walking the 1714 * LRU list. 1715 */ 1716 while (percpu_counter_sum(&btp->bt_io_count)) 1717 delay(100); 1718 flush_workqueue(btp->bt_mount->m_buf_workqueue); 1719 1720 /* loop until there is nothing left on the lru list. 
*/ 1721 while (list_lru_count(&btp->bt_lru)) { 1722 list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, 1723 &dispose, LONG_MAX); 1724 1725 while (!list_empty(&dispose)) { 1726 struct xfs_buf *bp; 1727 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1728 list_del_init(&bp->b_lru); 1729 if (bp->b_flags & XBF_WRITE_FAIL) { 1730 xfs_alert(btp->bt_mount, 1731 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", 1732 (long long)bp->b_bn); 1733 xfs_alert(btp->bt_mount, 1734 "Please run xfs_repair to determine the extent of the problem."); 1735 } 1736 xfs_buf_rele(bp); 1737 } 1738 if (loop++ != 0) 1739 delay(100); 1740 } 1741 } 1742 1743 static enum lru_status 1744 xfs_buftarg_isolate( 1745 struct list_head *item, 1746 struct list_lru_one *lru, 1747 spinlock_t *lru_lock, 1748 void *arg) 1749 { 1750 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1751 struct list_head *dispose = arg; 1752 1753 /* 1754 * we are inverting the lru lock/bp->b_lock here, so use a trylock. 1755 * If we fail to get the lock, just skip it. 1756 */ 1757 if (!spin_trylock(&bp->b_lock)) 1758 return LRU_SKIP; 1759 /* 1760 * Decrement the b_lru_ref count unless the value is already 1761 * zero. If the value is already zero, we need to reclaim the 1762 * buffer, otherwise it gets another trip through the LRU. 1763 */ 1764 if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) { 1765 spin_unlock(&bp->b_lock); 1766 return LRU_ROTATE; 1767 } 1768 1769 bp->b_state |= XFS_BSTATE_DISPOSE; 1770 list_lru_isolate_move(lru, item, dispose); 1771 spin_unlock(&bp->b_lock); 1772 return LRU_REMOVED; 1773 } 1774 1775 static unsigned long 1776 xfs_buftarg_shrink_scan( 1777 struct shrinker *shrink, 1778 struct shrink_control *sc) 1779 { 1780 struct xfs_buftarg *btp = container_of(shrink, 1781 struct xfs_buftarg, bt_shrinker); 1782 LIST_HEAD(dispose); 1783 unsigned long freed; 1784 1785 freed = list_lru_shrink_walk(&btp->bt_lru, sc, 1786 xfs_buftarg_isolate, &dispose); 1787 1788 while (!list_empty(&dispose)) { 1789 struct xfs_buf *bp; 1790 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1791 list_del_init(&bp->b_lru); 1792 xfs_buf_rele(bp); 1793 } 1794 1795 return freed; 1796 } 1797 1798 static unsigned long 1799 xfs_buftarg_shrink_count( 1800 struct shrinker *shrink, 1801 struct shrink_control *sc) 1802 { 1803 struct xfs_buftarg *btp = container_of(shrink, 1804 struct xfs_buftarg, bt_shrinker); 1805 return list_lru_shrink_count(&btp->bt_lru, sc); 1806 } 1807 1808 void 1809 xfs_free_buftarg( 1810 struct xfs_buftarg *btp) 1811 { 1812 unregister_shrinker(&btp->bt_shrinker); 1813 ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); 1814 percpu_counter_destroy(&btp->bt_io_count); 1815 list_lru_destroy(&btp->bt_lru); 1816 1817 xfs_blkdev_issue_flush(btp); 1818 1819 kmem_free(btp); 1820 } 1821 1822 int 1823 xfs_setsize_buftarg( 1824 xfs_buftarg_t *btp, 1825 unsigned int sectorsize) 1826 { 1827 /* Set up metadata sector size info */ 1828 btp->bt_meta_sectorsize = sectorsize; 1829 btp->bt_meta_sectormask = sectorsize - 1; 1830 1831 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1832 xfs_warn(btp->bt_mount, 1833 "Cannot set_blocksize to %u on device %pg", 1834 sectorsize, btp->bt_bdev); 1835 return -EINVAL; 1836 } 1837 1838 /* Set up device logical sector size mask */ 1839 btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); 1840 btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; 1841 1842 return 0; 1843 } 1844 1845 /* 1846 * When allocating the initial buffer target we have not 
yet 1847 * read in the superblock, so don't know what sized sectors 1848 * are being used at this early stage. Play safe. 1849 */ 1850 STATIC int 1851 xfs_setsize_buftarg_early( 1852 xfs_buftarg_t *btp, 1853 struct block_device *bdev) 1854 { 1855 return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); 1856 } 1857 1858 xfs_buftarg_t * 1859 xfs_alloc_buftarg( 1860 struct xfs_mount *mp, 1861 struct block_device *bdev, 1862 struct dax_device *dax_dev) 1863 { 1864 xfs_buftarg_t *btp; 1865 1866 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS); 1867 1868 btp->bt_mount = mp; 1869 btp->bt_dev = bdev->bd_dev; 1870 btp->bt_bdev = bdev; 1871 btp->bt_daxdev = dax_dev; 1872 1873 if (xfs_setsize_buftarg_early(btp, bdev)) 1874 goto error_free; 1875 1876 if (list_lru_init(&btp->bt_lru)) 1877 goto error_free; 1878 1879 if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) 1880 goto error_lru; 1881 1882 btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; 1883 btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; 1884 btp->bt_shrinker.seeks = DEFAULT_SEEKS; 1885 btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; 1886 if (register_shrinker(&btp->bt_shrinker)) 1887 goto error_pcpu; 1888 return btp; 1889 1890 error_pcpu: 1891 percpu_counter_destroy(&btp->bt_io_count); 1892 error_lru: 1893 list_lru_destroy(&btp->bt_lru); 1894 error_free: 1895 kmem_free(btp); 1896 return NULL; 1897 } 1898 1899 /* 1900 * Cancel a delayed write list. 1901 * 1902 * Remove each buffer from the list, clear the delwri queue flag and drop the 1903 * associated buffer reference. 1904 */ 1905 void 1906 xfs_buf_delwri_cancel( 1907 struct list_head *list) 1908 { 1909 struct xfs_buf *bp; 1910 1911 while (!list_empty(list)) { 1912 bp = list_first_entry(list, struct xfs_buf, b_list); 1913 1914 xfs_buf_lock(bp); 1915 bp->b_flags &= ~_XBF_DELWRI_Q; 1916 list_del_init(&bp->b_list); 1917 xfs_buf_relse(bp); 1918 } 1919 } 1920 1921 /* 1922 * Add a buffer to the delayed write list. 1923 * 1924 * This queues a buffer for writeout if it hasn't already been. Note that 1925 * neither this routine nor the buffer list submission functions perform 1926 * any internal synchronization. It is expected that the lists are thread-local 1927 * to the callers. 1928 * 1929 * Returns true if we queued up the buffer, or false if it already had 1930 * been on the buffer list. 1931 */ 1932 bool 1933 xfs_buf_delwri_queue( 1934 struct xfs_buf *bp, 1935 struct list_head *list) 1936 { 1937 ASSERT(xfs_buf_islocked(bp)); 1938 ASSERT(!(bp->b_flags & XBF_READ)); 1939 1940 /* 1941 * If the buffer is already marked delwri it already is queued up 1942 * by someone else for immediate writeout. Just ignore it in that 1943 * case. 1944 */ 1945 if (bp->b_flags & _XBF_DELWRI_Q) { 1946 trace_xfs_buf_delwri_queued(bp, _RET_IP_); 1947 return false; 1948 } 1949 1950 trace_xfs_buf_delwri_queue(bp, _RET_IP_); 1951 1952 /* 1953 * If a buffer gets written out synchronously or marked stale while it 1954 * is on a delwri list we lazily remove it. To do this, the other party 1955 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. 1956 * It remains referenced and on the list. In a rare corner case it 1957 * might get readded to a delwri list after the synchronous writeout, in 1958 * which case we just need to re-add the flag here.
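 *
 * A minimal usage sketch (illustrative only, not a specific XFS call site):
 *
 *	LIST_HEAD(buffer_list);
 *
 *	xfs_buf_lock(bp);
 *	xfs_buf_delwri_queue(bp, &buffer_list);
 *	xfs_buf_unlock(bp);
 *	...
 *	error = xfs_buf_delwri_submit(&buffer_list);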
1959 */ 1960 bp->b_flags |= _XBF_DELWRI_Q; 1961 if (list_empty(&bp->b_list)) { 1962 atomic_inc(&bp->b_hold); 1963 list_add_tail(&bp->b_list, list); 1964 } 1965 1966 return true; 1967 } 1968 1969 /* 1970 * Compare function is more complex than it needs to be because 1971 * the return value is only 32 bits and we are doing comparisons 1972 * on 64 bit values 1973 */ 1974 static int 1975 xfs_buf_cmp( 1976 void *priv, 1977 struct list_head *a, 1978 struct list_head *b) 1979 { 1980 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); 1981 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); 1982 xfs_daddr_t diff; 1983 1984 diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; 1985 if (diff < 0) 1986 return -1; 1987 if (diff > 0) 1988 return 1; 1989 return 0; 1990 } 1991 1992 /* 1993 * Submit buffers for write. If wait_list is specified, the buffers are 1994 * submitted using sync I/O and placed on the wait list such that the caller can 1995 * iowait each buffer. Otherwise async I/O is used and the buffers are released 1996 * at I/O completion time. In either case, buffers remain locked until I/O 1997 * completes and the buffer is released from the queue. 1998 */ 1999 static int 2000 xfs_buf_delwri_submit_buffers( 2001 struct list_head *buffer_list, 2002 struct list_head *wait_list) 2003 { 2004 struct xfs_buf *bp, *n; 2005 int pinned = 0; 2006 struct blk_plug plug; 2007 2008 list_sort(NULL, buffer_list, xfs_buf_cmp); 2009 2010 blk_start_plug(&plug); 2011 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 2012 if (!wait_list) { 2013 if (xfs_buf_ispinned(bp)) { 2014 pinned++; 2015 continue; 2016 } 2017 if (!xfs_buf_trylock(bp)) 2018 continue; 2019 } else { 2020 xfs_buf_lock(bp); 2021 } 2022 2023 /* 2024 * Someone else might have written the buffer synchronously or 2025 * marked it stale in the meantime. In that case only the 2026 * _XBF_DELWRI_Q flag got cleared, and we have to drop the 2027 * reference and remove it from the list here. 2028 */ 2029 if (!(bp->b_flags & _XBF_DELWRI_Q)) { 2030 list_del_init(&bp->b_list); 2031 xfs_buf_relse(bp); 2032 continue; 2033 } 2034 2035 trace_xfs_buf_delwri_split(bp, _RET_IP_); 2036 2037 /* 2038 * If we have a wait list, each buffer (and associated delwri 2039 * queue reference) transfers to it and is submitted 2040 * synchronously. Otherwise, drop the buffer from the delwri 2041 * queue and submit async. 2042 */ 2043 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL); 2044 bp->b_flags |= XBF_WRITE; 2045 if (wait_list) { 2046 bp->b_flags &= ~XBF_ASYNC; 2047 list_move_tail(&bp->b_list, wait_list); 2048 } else { 2049 bp->b_flags |= XBF_ASYNC; 2050 list_del_init(&bp->b_list); 2051 } 2052 __xfs_buf_submit(bp, false); 2053 } 2054 blk_finish_plug(&plug); 2055 2056 return pinned; 2057 } 2058 2059 /* 2060 * Write out a buffer list asynchronously. 2061 * 2062 * This will take the @buffer_list, write all non-locked and non-pinned buffers 2063 * out and not wait for I/O completion on any of the buffers. This interface 2064 * is only safely useable for callers that can track I/O completion by higher 2065 * level means, e.g. AIL pushing as the @buffer_list is consumed in this 2066 * function. 2067 * 2068 * Note: this function will skip buffers it would block on, and in doing so 2069 * leaves them on @buffer_list so they can be retried on a later pass. As such, 2070 * it is up to the caller to ensure that the buffer list is fully submitted or 2071 * cancelled appropriately when they are finished with the list. 
Failure to 2072 * cancel or resubmit the list until it is empty will result in leaked buffers 2073 * at unmount time. 2074 */ 2075 int 2076 xfs_buf_delwri_submit_nowait( 2077 struct list_head *buffer_list) 2078 { 2079 return xfs_buf_delwri_submit_buffers(buffer_list, NULL); 2080 } 2081 2082 /* 2083 * Write out a buffer list synchronously. 2084 * 2085 * This will take the @buffer_list, write all buffers out and wait for I/O 2086 * completion on all of the buffers. @buffer_list is consumed by the function, 2087 * so callers must have some other way of tracking buffers if they require such 2088 * functionality. 2089 */ 2090 int 2091 xfs_buf_delwri_submit( 2092 struct list_head *buffer_list) 2093 { 2094 LIST_HEAD (wait_list); 2095 int error = 0, error2; 2096 struct xfs_buf *bp; 2097 2098 xfs_buf_delwri_submit_buffers(buffer_list, &wait_list); 2099 2100 /* Wait for IO to complete. */ 2101 while (!list_empty(&wait_list)) { 2102 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 2103 2104 list_del_init(&bp->b_list); 2105 2106 /* 2107 * Wait on the locked buffer, check for errors and unlock and 2108 * release the delwri queue reference. 2109 */ 2110 error2 = xfs_buf_iowait(bp); 2111 xfs_buf_relse(bp); 2112 if (!error) 2113 error = error2; 2114 } 2115 2116 return error; 2117 } 2118 2119 /* 2120 * Push a single buffer on a delwri queue. 2121 * 2122 * The purpose of this function is to submit a single buffer of a delwri queue 2123 * and return with the buffer still on the original queue. The waiting delwri 2124 * buffer submission infrastructure guarantees transfer of the delwri queue 2125 * buffer reference to a temporary wait list. We reuse this infrastructure to 2126 * transfer the buffer back to the original queue. 2127 * 2128 * Note the buffer transitions from the queued state, to the submitted and wait 2129 * listed state and back to the queued state during this call. The buffer 2130 * locking and queue management logic between _delwri_pushbuf() and 2131 * _delwri_queue() guarantee that the buffer cannot be queued to another list 2132 * before returning. 2133 */ 2134 int 2135 xfs_buf_delwri_pushbuf( 2136 struct xfs_buf *bp, 2137 struct list_head *buffer_list) 2138 { 2139 LIST_HEAD (submit_list); 2140 int error; 2141 2142 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 2143 2144 trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); 2145 2146 /* 2147 * Isolate the buffer to a new local list so we can submit it for I/O 2148 * independently from the rest of the original list. 2149 */ 2150 xfs_buf_lock(bp); 2151 list_move(&bp->b_list, &submit_list); 2152 xfs_buf_unlock(bp); 2153 2154 /* 2155 * Delwri submission clears the DELWRI_Q buffer flag and returns with 2156 * the buffer on the wait list with the original reference. Rather than 2157 * bounce the buffer from a local wait list back to the original list 2158 * after I/O completion, reuse the original list as the wait list. 2159 */ 2160 xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); 2161 2162 /* 2163 * The buffer is now locked, under I/O and wait listed on the original 2164 * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and 2165 * return with the buffer unlocked and on the original queue. 
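 * Note that the buffer remains locked across the I/O wait below; the
 * DELWRI_Q flag is restored before the lock is dropped, so the buffer ends
 * up back on the original delwri queue as described above.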
2166 */ 2167 error = xfs_buf_iowait(bp); 2168 bp->b_flags |= _XBF_DELWRI_Q; 2169 xfs_buf_unlock(bp); 2170 2171 return error; 2172 } 2173 2174 int __init 2175 xfs_buf_init(void) 2176 { 2177 xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf", 2178 KM_ZONE_HWALIGN, NULL); 2179 if (!xfs_buf_zone) 2180 goto out; 2181 2182 return 0; 2183 2184 out: 2185 return -ENOMEM; 2186 } 2187 2188 void 2189 xfs_buf_terminate(void) 2190 { 2191 kmem_zone_destroy(xfs_buf_zone); 2192 } 2193 2194 void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) 2195 { 2196 /* 2197 * Set the lru reference count to 0 based on the error injection tag. 2198 * This allows userspace to disrupt buffer caching for debug/testing 2199 * purposes. 2200 */ 2201 if (XFS_TEST_ERROR(false, bp->b_target->bt_mount, 2202 XFS_ERRTAG_BUF_LRU_REF)) 2203 lru_ref = 0; 2204 2205 atomic_set(&bp->b_lru_ref, lru_ref); 2206 } 2207 2208 /* 2209 * Verify an on-disk magic value against the magic value specified in the 2210 * verifier structure. The verifier magic is in disk byte order so the caller is 2211 * expected to pass the value directly from disk. 2212 */ 2213 bool 2214 xfs_verify_magic( 2215 struct xfs_buf *bp, 2216 __be32 dmagic) 2217 { 2218 struct xfs_mount *mp = bp->b_target->bt_mount; 2219 int idx; 2220 2221 idx = xfs_sb_version_hascrc(&mp->m_sb); 2222 if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))) 2223 return false; 2224 return dmagic == bp->b_ops->magic[idx]; 2225 } 2226 /* 2227 * Verify an on-disk magic value against the magic value specified in the 2228 * verifier structure. The verifier magic is in disk byte order so the caller is 2229 * expected to pass the value directly from disk. 2230 */ 2231 bool 2232 xfs_verify_magic16( 2233 struct xfs_buf *bp, 2234 __be16 dmagic) 2235 { 2236 struct xfs_mount *mp = bp->b_target->bt_mount; 2237 int idx; 2238 2239 idx = xfs_sb_version_hascrc(&mp->m_sb); 2240 if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))) 2241 return false; 2242 return dmagic == bp->b_ops->magic16[idx]; 2243 } 2244