1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include <linux/backing-dev.h> 8 #include <linux/dax.h> 9 10 #include "xfs_shared.h" 11 #include "xfs_format.h" 12 #include "xfs_log_format.h" 13 #include "xfs_trans_resv.h" 14 #include "xfs_mount.h" 15 #include "xfs_trace.h" 16 #include "xfs_log.h" 17 #include "xfs_log_recover.h" 18 #include "xfs_log_priv.h" 19 #include "xfs_trans.h" 20 #include "xfs_buf_item.h" 21 #include "xfs_errortag.h" 22 #include "xfs_error.h" 23 #include "xfs_ag.h" 24 #include "xfs_buf_mem.h" 25 #include "xfs_notify_failure.h" 26 27 struct kmem_cache *xfs_buf_cache; 28 29 /* 30 * Locking orders 31 * 32 * xfs_buf_stale: 33 * b_sema (caller holds) 34 * b_lock 35 * lru_lock 36 * 37 * xfs_buf_rele: 38 * b_lock 39 * lru_lock 40 * 41 * xfs_buftarg_drain_rele 42 * lru_lock 43 * b_lock (trylock due to inversion) 44 * 45 * xfs_buftarg_isolate 46 * lru_lock 47 * b_lock (trylock due to inversion) 48 */ 49 50 static void xfs_buf_submit(struct xfs_buf *bp); 51 static int xfs_buf_iowait(struct xfs_buf *bp); 52 53 static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) 54 { 55 return bp->b_rhash_key == XFS_BUF_DADDR_NULL; 56 } 57 58 /* 59 * When we mark a buffer stale, we remove the buffer from the LRU and clear the 60 * b_lru_ref count so that the buffer is freed immediately when the buffer 61 * reference count falls to zero. If the buffer is already on the LRU, we need 62 * to remove the reference that LRU holds on the buffer. 63 * 64 * This prevents build-up of stale buffers on the LRU. 65 */ 66 void 67 xfs_buf_stale( 68 struct xfs_buf *bp) 69 { 70 ASSERT(xfs_buf_islocked(bp)); 71 72 bp->b_flags |= XBF_STALE; 73 74 /* 75 * Clear the delwri status so that a delwri queue walker will not 76 * flush this buffer to disk now that it is stale. The delwri queue has 77 * a reference to the buffer, so this is safe to do. 78 */ 79 bp->b_flags &= ~_XBF_DELWRI_Q; 80 81 spin_lock(&bp->b_lock); 82 atomic_set(&bp->b_lru_ref, 0); 83 if (!(bp->b_state & XFS_BSTATE_DISPOSE) && 84 (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) 85 bp->b_hold--; 86 87 ASSERT(bp->b_hold >= 1); 88 spin_unlock(&bp->b_lock); 89 } 90 91 static void 92 xfs_buf_free_callback( 93 struct callback_head *cb) 94 { 95 struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); 96 97 if (bp->b_maps != &bp->__b_map) 98 kfree(bp->b_maps); 99 kmem_cache_free(xfs_buf_cache, bp); 100 } 101 102 static void 103 xfs_buf_free( 104 struct xfs_buf *bp) 105 { 106 unsigned int size = BBTOB(bp->b_length); 107 108 might_sleep(); 109 trace_xfs_buf_free(bp, _RET_IP_); 110 111 ASSERT(list_empty(&bp->b_lru)); 112 113 if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE) 114 mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT)); 115 116 if (is_vmalloc_addr(bp->b_addr)) 117 vfree(bp->b_addr); 118 else if (bp->b_flags & _XBF_KMEM) 119 kfree(bp->b_addr); 120 else 121 folio_put(virt_to_folio(bp->b_addr)); 122 123 call_rcu(&bp->b_rcu, xfs_buf_free_callback); 124 } 125 126 static int 127 xfs_buf_alloc_kmem( 128 struct xfs_buf *bp, 129 size_t size, 130 gfp_t gfp_mask) 131 { 132 ASSERT(is_power_of_2(size)); 133 ASSERT(size < PAGE_SIZE); 134 135 bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL); 136 if (!bp->b_addr) 137 return -ENOMEM; 138 139 /* 140 * Slab guarantees that we get back naturally aligned allocations for 141 * power of two sizes. Keep this check as the canary in the coal mine 142 * if anything changes in slab. 143 */ 144 if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) { 145 kfree(bp->b_addr); 146 bp->b_addr = NULL; 147 return -ENOMEM; 148 } 149 bp->b_flags |= _XBF_KMEM; 150 trace_xfs_buf_backing_kmem(bp, _RET_IP_); 151 return 0; 152 } 153 154 /* 155 * Allocate backing memory for a buffer. 156 * 157 * For tmpfs-backed buffers used by in-memory btrees this directly maps the 158 * tmpfs page cache folios. 159 * 160 * For real file system buffers there are three different kinds backing memory: 161 * 162 * The first type backs the buffer by a kmalloc allocation. This is done for 163 * less than PAGE_SIZE allocations to avoid wasting memory. 164 * 165 * The second type is a single folio buffer - this may be a high order folio or 166 * just a single page sized folio, but either way they get treated the same way 167 * by the rest of the code - the buffer memory spans a single contiguous memory 168 * region that we don't have to map and unmap to access the data directly. 169 * 170 * The third type of buffer is the vmalloc()d buffer. This provides the buffer 171 * with the required contiguous memory region but backed by discontiguous 172 * physical pages. 173 */ 174 static int 175 xfs_buf_alloc_backing_mem( 176 struct xfs_buf *bp, 177 xfs_buf_flags_t flags) 178 { 179 size_t size = BBTOB(bp->b_length); 180 gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; 181 struct folio *folio; 182 183 if (xfs_buftarg_is_mem(bp->b_target)) 184 return xmbuf_map_backing_mem(bp); 185 186 /* Assure zeroed buffer for non-read cases. */ 187 if (!(flags & XBF_READ)) 188 gfp_mask |= __GFP_ZERO; 189 190 if (flags & XBF_READ_AHEAD) 191 gfp_mask |= __GFP_NORETRY; 192 193 /* 194 * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that 195 * is properly aligned. The slab allocator now guarantees an aligned 196 * allocation for all power of two sizes, which matches most of the 197 * smaller than PAGE_SIZE buffers used by XFS. 198 */ 199 if (size < PAGE_SIZE && is_power_of_2(size)) 200 return xfs_buf_alloc_kmem(bp, size, gfp_mask); 201 202 /* 203 * Don't bother with the retry loop for single PAGE allocations: vmalloc 204 * won't do any better. 205 */ 206 if (size <= PAGE_SIZE) 207 gfp_mask |= __GFP_NOFAIL; 208 209 /* 210 * Optimistically attempt a single high order folio allocation for 211 * larger than PAGE_SIZE buffers. 212 * 213 * Allocating a high order folio makes the assumption that buffers are a 214 * power-of-2 size, matching the power-of-2 folios sizes available. 215 * 216 * The exception here are user xattr data buffers, which can be arbitrarily 217 * sized up to 64kB plus structure metadata, skip straight to the vmalloc 218 * path for them instead of wasting memory here. 219 */ 220 if (size > PAGE_SIZE) { 221 if (!is_power_of_2(size)) 222 goto fallback; 223 gfp_mask &= ~__GFP_DIRECT_RECLAIM; 224 gfp_mask |= __GFP_NORETRY; 225 } 226 folio = folio_alloc(gfp_mask, get_order(size)); 227 if (!folio) { 228 if (size <= PAGE_SIZE) 229 return -ENOMEM; 230 trace_xfs_buf_backing_fallback(bp, _RET_IP_); 231 goto fallback; 232 } 233 bp->b_addr = folio_address(folio); 234 trace_xfs_buf_backing_folio(bp, _RET_IP_); 235 return 0; 236 237 fallback: 238 for (;;) { 239 bp->b_addr = __vmalloc(size, gfp_mask); 240 if (bp->b_addr) 241 break; 242 if (flags & XBF_READ_AHEAD) 243 return -ENOMEM; 244 XFS_STATS_INC(bp->b_mount, xb_page_retries); 245 memalloc_retry_wait(gfp_mask); 246 } 247 248 trace_xfs_buf_backing_vmalloc(bp, _RET_IP_); 249 return 0; 250 } 251 252 static int 253 xfs_buf_alloc( 254 struct xfs_buftarg *target, 255 struct xfs_buf_map *map, 256 int nmaps, 257 xfs_buf_flags_t flags, 258 struct xfs_buf **bpp) 259 { 260 struct xfs_buf *bp; 261 int error; 262 int i; 263 264 *bpp = NULL; 265 bp = kmem_cache_zalloc(xfs_buf_cache, 266 GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); 267 268 /* 269 * We don't want certain flags to appear in b_flags unless they are 270 * specifically set by later operations on the buffer. 271 */ 272 flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); 273 274 /* 275 * A new buffer is held and locked by the owner. This ensures that the 276 * buffer is owned by the caller and racing RCU lookups right after 277 * inserting into the hash table are safe (and will have to wait for 278 * the unlock to do anything non-trivial). 279 */ 280 bp->b_hold = 1; 281 sema_init(&bp->b_sema, 0); /* held, no waiters */ 282 283 spin_lock_init(&bp->b_lock); 284 atomic_set(&bp->b_lru_ref, 1); 285 init_completion(&bp->b_iowait); 286 INIT_LIST_HEAD(&bp->b_lru); 287 INIT_LIST_HEAD(&bp->b_list); 288 INIT_LIST_HEAD(&bp->b_li_list); 289 bp->b_target = target; 290 bp->b_mount = target->bt_mount; 291 bp->b_flags = flags; 292 bp->b_rhash_key = map[0].bm_bn; 293 bp->b_length = 0; 294 bp->b_map_count = nmaps; 295 if (nmaps == 1) 296 bp->b_maps = &bp->__b_map; 297 else 298 bp->b_maps = kcalloc(nmaps, sizeof(struct xfs_buf_map), 299 GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); 300 for (i = 0; i < nmaps; i++) { 301 bp->b_maps[i].bm_bn = map[i].bm_bn; 302 bp->b_maps[i].bm_len = map[i].bm_len; 303 bp->b_length += map[i].bm_len; 304 } 305 306 atomic_set(&bp->b_pin_count, 0); 307 init_waitqueue_head(&bp->b_waiters); 308 309 XFS_STATS_INC(bp->b_mount, xb_create); 310 trace_xfs_buf_init(bp, _RET_IP_); 311 312 error = xfs_buf_alloc_backing_mem(bp, flags); 313 if (error) { 314 xfs_buf_free(bp); 315 return error; 316 } 317 318 *bpp = bp; 319 return 0; 320 } 321 322 /* 323 * Finding and Reading Buffers 324 */ 325 static int 326 _xfs_buf_obj_cmp( 327 struct rhashtable_compare_arg *arg, 328 const void *obj) 329 { 330 const struct xfs_buf_map *map = arg->key; 331 const struct xfs_buf *bp = obj; 332 333 /* 334 * The key hashing in the lookup path depends on the key being the 335 * first element of the compare_arg, make sure to assert this. 336 */ 337 BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); 338 339 if (bp->b_rhash_key != map->bm_bn) 340 return 1; 341 342 if (unlikely(bp->b_length != map->bm_len)) { 343 /* 344 * found a block number match. If the range doesn't 345 * match, the only way this is allowed is if the buffer 346 * in the cache is stale and the transaction that made 347 * it stale has not yet committed. i.e. we are 348 * reallocating a busy extent. Skip this buffer and 349 * continue searching for an exact match. 350 * 351 * Note: If we're scanning for incore buffers to stale, don't 352 * complain if we find non-stale buffers. 353 */ 354 if (!(map->bm_flags & XBM_LIVESCAN)) 355 ASSERT(bp->b_flags & XBF_STALE); 356 return 1; 357 } 358 return 0; 359 } 360 361 static const struct rhashtable_params xfs_buf_hash_params = { 362 .min_size = 32, /* empty AGs have minimal footprint */ 363 .nelem_hint = 16, 364 .key_len = sizeof(xfs_daddr_t), 365 .key_offset = offsetof(struct xfs_buf, b_rhash_key), 366 .head_offset = offsetof(struct xfs_buf, b_rhash_head), 367 .automatic_shrinking = true, 368 .obj_cmpfn = _xfs_buf_obj_cmp, 369 }; 370 371 int 372 xfs_buf_cache_init( 373 struct xfs_buf_cache *bch) 374 { 375 return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params); 376 } 377 378 void 379 xfs_buf_cache_destroy( 380 struct xfs_buf_cache *bch) 381 { 382 rhashtable_destroy(&bch->bc_hash); 383 } 384 385 static int 386 xfs_buf_map_verify( 387 struct xfs_buftarg *btp, 388 struct xfs_buf_map *map) 389 { 390 xfs_daddr_t eofs; 391 392 /* Check for IOs smaller than the sector size / not sector aligned */ 393 ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); 394 ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); 395 396 /* 397 * Corrupted block numbers can get through to here, unfortunately, so we 398 * have to check that the buffer falls within the filesystem bounds. 399 */ 400 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); 401 if (map->bm_bn < 0 || map->bm_bn >= eofs) { 402 xfs_alert(btp->bt_mount, 403 "%s: daddr 0x%llx out of range, EOFS 0x%llx", 404 __func__, map->bm_bn, eofs); 405 WARN_ON(1); 406 return -EFSCORRUPTED; 407 } 408 return 0; 409 } 410 411 static int 412 xfs_buf_find_lock( 413 struct xfs_buf *bp, 414 xfs_buf_flags_t flags) 415 { 416 if (flags & XBF_TRYLOCK) { 417 if (!xfs_buf_trylock(bp)) { 418 XFS_STATS_INC(bp->b_mount, xb_busy_locked); 419 return -EAGAIN; 420 } 421 } else { 422 xfs_buf_lock(bp); 423 XFS_STATS_INC(bp->b_mount, xb_get_locked_waited); 424 } 425 426 /* 427 * if the buffer is stale, clear all the external state associated with 428 * it. We need to keep flags such as how we allocated the buffer memory 429 * intact here. 430 */ 431 if (bp->b_flags & XBF_STALE) { 432 if (flags & XBF_LIVESCAN) { 433 xfs_buf_unlock(bp); 434 return -ENOENT; 435 } 436 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 437 bp->b_flags &= _XBF_KMEM; 438 bp->b_ops = NULL; 439 } 440 return 0; 441 } 442 443 static bool 444 xfs_buf_try_hold( 445 struct xfs_buf *bp) 446 { 447 spin_lock(&bp->b_lock); 448 if (bp->b_hold == 0) { 449 spin_unlock(&bp->b_lock); 450 return false; 451 } 452 bp->b_hold++; 453 spin_unlock(&bp->b_lock); 454 return true; 455 } 456 457 static inline int 458 xfs_buf_lookup( 459 struct xfs_buf_cache *bch, 460 struct xfs_buf_map *map, 461 xfs_buf_flags_t flags, 462 struct xfs_buf **bpp) 463 { 464 struct xfs_buf *bp; 465 int error; 466 467 rcu_read_lock(); 468 bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params); 469 if (!bp || !xfs_buf_try_hold(bp)) { 470 rcu_read_unlock(); 471 return -ENOENT; 472 } 473 rcu_read_unlock(); 474 475 error = xfs_buf_find_lock(bp, flags); 476 if (error) { 477 xfs_buf_rele(bp); 478 return error; 479 } 480 481 trace_xfs_buf_find(bp, flags, _RET_IP_); 482 *bpp = bp; 483 return 0; 484 } 485 486 /* 487 * Insert the new_bp into the hash table. This consumes the perag reference 488 * taken for the lookup regardless of the result of the insert. 489 */ 490 static int 491 xfs_buf_find_insert( 492 struct xfs_buftarg *btp, 493 struct xfs_buf_cache *bch, 494 struct xfs_perag *pag, 495 struct xfs_buf_map *cmap, 496 struct xfs_buf_map *map, 497 int nmaps, 498 xfs_buf_flags_t flags, 499 struct xfs_buf **bpp) 500 { 501 struct xfs_buf *new_bp; 502 struct xfs_buf *bp; 503 int error; 504 505 error = xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); 506 if (error) 507 goto out_drop_pag; 508 509 /* The new buffer keeps the perag reference until it is freed. */ 510 new_bp->b_pag = pag; 511 512 rcu_read_lock(); 513 bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, 514 &new_bp->b_rhash_head, xfs_buf_hash_params); 515 if (IS_ERR(bp)) { 516 rcu_read_unlock(); 517 error = PTR_ERR(bp); 518 goto out_free_buf; 519 } 520 if (bp && xfs_buf_try_hold(bp)) { 521 /* found an existing buffer */ 522 rcu_read_unlock(); 523 error = xfs_buf_find_lock(bp, flags); 524 if (error) 525 xfs_buf_rele(bp); 526 else 527 *bpp = bp; 528 goto out_free_buf; 529 } 530 rcu_read_unlock(); 531 532 *bpp = new_bp; 533 return 0; 534 535 out_free_buf: 536 xfs_buf_free(new_bp); 537 out_drop_pag: 538 if (pag) 539 xfs_perag_put(pag); 540 return error; 541 } 542 543 static inline struct xfs_perag * 544 xfs_buftarg_get_pag( 545 struct xfs_buftarg *btp, 546 const struct xfs_buf_map *map) 547 { 548 struct xfs_mount *mp = btp->bt_mount; 549 550 if (xfs_buftarg_is_mem(btp)) 551 return NULL; 552 return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn)); 553 } 554 555 static inline struct xfs_buf_cache * 556 xfs_buftarg_buf_cache( 557 struct xfs_buftarg *btp, 558 struct xfs_perag *pag) 559 { 560 if (pag) 561 return &pag->pag_bcache; 562 return btp->bt_cache; 563 } 564 565 /* 566 * Assembles a buffer covering the specified range. The code is optimised for 567 * cache hits, as metadata intensive workloads will see 3 orders of magnitude 568 * more hits than misses. 569 */ 570 int 571 xfs_buf_get_map( 572 struct xfs_buftarg *btp, 573 struct xfs_buf_map *map, 574 int nmaps, 575 xfs_buf_flags_t flags, 576 struct xfs_buf **bpp) 577 { 578 struct xfs_buf_cache *bch; 579 struct xfs_perag *pag; 580 struct xfs_buf *bp = NULL; 581 struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; 582 int error; 583 int i; 584 585 if (flags & XBF_LIVESCAN) 586 cmap.bm_flags |= XBM_LIVESCAN; 587 for (i = 0; i < nmaps; i++) 588 cmap.bm_len += map[i].bm_len; 589 590 error = xfs_buf_map_verify(btp, &cmap); 591 if (error) 592 return error; 593 594 pag = xfs_buftarg_get_pag(btp, &cmap); 595 bch = xfs_buftarg_buf_cache(btp, pag); 596 597 error = xfs_buf_lookup(bch, &cmap, flags, &bp); 598 if (error && error != -ENOENT) 599 goto out_put_perag; 600 601 /* cache hits always outnumber misses by at least 10:1 */ 602 if (unlikely(!bp)) { 603 XFS_STATS_INC(btp->bt_mount, xb_miss_locked); 604 605 if (flags & XBF_INCORE) 606 goto out_put_perag; 607 608 /* xfs_buf_find_insert() consumes the perag reference. */ 609 error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps, 610 flags, &bp); 611 if (error) 612 return error; 613 } else { 614 XFS_STATS_INC(btp->bt_mount, xb_get_locked); 615 if (pag) 616 xfs_perag_put(pag); 617 } 618 619 /* 620 * Clear b_error if this is a lookup from a caller that doesn't expect 621 * valid data to be found in the buffer. 622 */ 623 if (!(flags & XBF_READ)) 624 xfs_buf_ioerror(bp, 0); 625 626 XFS_STATS_INC(btp->bt_mount, xb_get); 627 trace_xfs_buf_get(bp, flags, _RET_IP_); 628 *bpp = bp; 629 return 0; 630 631 out_put_perag: 632 if (pag) 633 xfs_perag_put(pag); 634 return error; 635 } 636 637 int 638 _xfs_buf_read( 639 struct xfs_buf *bp) 640 { 641 ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); 642 643 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); 644 bp->b_flags |= XBF_READ; 645 xfs_buf_submit(bp); 646 return xfs_buf_iowait(bp); 647 } 648 649 /* 650 * Reverify a buffer found in cache without an attached ->b_ops. 651 * 652 * If the caller passed an ops structure and the buffer doesn't have ops 653 * assigned, set the ops and use it to verify the contents. If verification 654 * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is 655 * already in XBF_DONE state on entry. 656 * 657 * Under normal operations, every in-core buffer is verified on read I/O 658 * completion. There are two scenarios that can lead to in-core buffers without 659 * an assigned ->b_ops. The first is during log recovery of buffers on a V4 660 * filesystem, though these buffers are purged at the end of recovery. The 661 * other is online repair, which intentionally reads with a NULL buffer ops to 662 * run several verifiers across an in-core buffer in order to establish buffer 663 * type. If repair can't establish that, the buffer will be left in memory 664 * with NULL buffer ops. 665 */ 666 int 667 xfs_buf_reverify( 668 struct xfs_buf *bp, 669 const struct xfs_buf_ops *ops) 670 { 671 ASSERT(bp->b_flags & XBF_DONE); 672 ASSERT(bp->b_error == 0); 673 674 if (!ops || bp->b_ops) 675 return 0; 676 677 bp->b_ops = ops; 678 bp->b_ops->verify_read(bp); 679 if (bp->b_error) 680 bp->b_flags &= ~XBF_DONE; 681 return bp->b_error; 682 } 683 684 int 685 xfs_buf_read_map( 686 struct xfs_buftarg *target, 687 struct xfs_buf_map *map, 688 int nmaps, 689 xfs_buf_flags_t flags, 690 struct xfs_buf **bpp, 691 const struct xfs_buf_ops *ops, 692 xfs_failaddr_t fa) 693 { 694 struct xfs_buf *bp; 695 int error; 696 697 ASSERT(!(flags & (XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD))); 698 699 flags |= XBF_READ; 700 *bpp = NULL; 701 702 error = xfs_buf_get_map(target, map, nmaps, flags, &bp); 703 if (error) 704 return error; 705 706 trace_xfs_buf_read(bp, flags, _RET_IP_); 707 708 if (!(bp->b_flags & XBF_DONE)) { 709 /* Initiate the buffer read and wait. */ 710 XFS_STATS_INC(target->bt_mount, xb_get_read); 711 bp->b_ops = ops; 712 error = _xfs_buf_read(bp); 713 } else { 714 /* Buffer already read; all we need to do is check it. */ 715 error = xfs_buf_reverify(bp, ops); 716 717 /* We do not want read in the flags */ 718 bp->b_flags &= ~XBF_READ; 719 ASSERT(bp->b_ops != NULL || ops == NULL); 720 } 721 722 /* 723 * If we've had a read error, then the contents of the buffer are 724 * invalid and should not be used. To ensure that a followup read tries 725 * to pull the buffer from disk again, we clear the XBF_DONE flag and 726 * mark the buffer stale. This ensures that anyone who has a current 727 * reference to the buffer will interpret it's contents correctly and 728 * future cache lookups will also treat it as an empty, uninitialised 729 * buffer. 730 */ 731 if (error) { 732 /* 733 * Check against log shutdown for error reporting because 734 * metadata writeback may require a read first and we need to 735 * report errors in metadata writeback until the log is shut 736 * down. High level transaction read functions already check 737 * against mount shutdown, anyway, so we only need to be 738 * concerned about low level IO interactions here. 739 */ 740 if (!xlog_is_shutdown(target->bt_mount->m_log)) 741 xfs_buf_ioerror_alert(bp, fa); 742 743 bp->b_flags &= ~XBF_DONE; 744 xfs_buf_stale(bp); 745 xfs_buf_relse(bp); 746 747 /* bad CRC means corrupted metadata */ 748 if (error == -EFSBADCRC) 749 error = -EFSCORRUPTED; 750 return error; 751 } 752 753 *bpp = bp; 754 return 0; 755 } 756 757 /* 758 * If we are not low on memory then do the readahead in a deadlock 759 * safe manner. 760 */ 761 void 762 xfs_buf_readahead_map( 763 struct xfs_buftarg *target, 764 struct xfs_buf_map *map, 765 int nmaps, 766 const struct xfs_buf_ops *ops) 767 { 768 const xfs_buf_flags_t flags = XBF_READ | XBF_ASYNC | XBF_READ_AHEAD; 769 struct xfs_buf *bp; 770 771 /* 772 * Currently we don't have a good means or justification for performing 773 * xmbuf_map_page asynchronously, so we don't do readahead. 774 */ 775 if (xfs_buftarg_is_mem(target)) 776 return; 777 778 if (xfs_buf_get_map(target, map, nmaps, flags | XBF_TRYLOCK, &bp)) 779 return; 780 trace_xfs_buf_readahead(bp, 0, _RET_IP_); 781 782 if (bp->b_flags & XBF_DONE) { 783 xfs_buf_reverify(bp, ops); 784 xfs_buf_relse(bp); 785 return; 786 } 787 XFS_STATS_INC(target->bt_mount, xb_get_read); 788 bp->b_ops = ops; 789 bp->b_flags &= ~(XBF_WRITE | XBF_DONE); 790 bp->b_flags |= flags; 791 percpu_counter_inc(&target->bt_readahead_count); 792 xfs_buf_submit(bp); 793 } 794 795 /* 796 * Read an uncached buffer from disk. Allocates and returns a locked 797 * buffer containing the disk contents or nothing. Uncached buffers always have 798 * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer 799 * is cached or uncached during fault diagnosis. 800 */ 801 int 802 xfs_buf_read_uncached( 803 struct xfs_buftarg *target, 804 xfs_daddr_t daddr, 805 size_t numblks, 806 struct xfs_buf **bpp, 807 const struct xfs_buf_ops *ops) 808 { 809 struct xfs_buf *bp; 810 int error; 811 812 *bpp = NULL; 813 814 error = xfs_buf_get_uncached(target, numblks, &bp); 815 if (error) 816 return error; 817 818 /* set up the buffer for a read IO */ 819 ASSERT(bp->b_map_count == 1); 820 bp->b_rhash_key = XFS_BUF_DADDR_NULL; 821 bp->b_maps[0].bm_bn = daddr; 822 bp->b_flags |= XBF_READ; 823 bp->b_ops = ops; 824 825 xfs_buf_submit(bp); 826 error = xfs_buf_iowait(bp); 827 if (error) { 828 xfs_buf_relse(bp); 829 return error; 830 } 831 832 *bpp = bp; 833 return 0; 834 } 835 836 int 837 xfs_buf_get_uncached( 838 struct xfs_buftarg *target, 839 size_t numblks, 840 struct xfs_buf **bpp) 841 { 842 int error; 843 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); 844 845 error = xfs_buf_alloc(target, &map, 1, 0, bpp); 846 if (!error) 847 trace_xfs_buf_get_uncached(*bpp, _RET_IP_); 848 return error; 849 } 850 851 /* 852 * Increment reference count on buffer, to hold the buffer concurrently 853 * with another thread which may release (free) the buffer asynchronously. 854 * Must hold the buffer already to call this function. 855 */ 856 void 857 xfs_buf_hold( 858 struct xfs_buf *bp) 859 { 860 trace_xfs_buf_hold(bp, _RET_IP_); 861 862 spin_lock(&bp->b_lock); 863 bp->b_hold++; 864 spin_unlock(&bp->b_lock); 865 } 866 867 static void 868 xfs_buf_rele_uncached( 869 struct xfs_buf *bp) 870 { 871 ASSERT(list_empty(&bp->b_lru)); 872 873 spin_lock(&bp->b_lock); 874 if (--bp->b_hold) { 875 spin_unlock(&bp->b_lock); 876 return; 877 } 878 spin_unlock(&bp->b_lock); 879 xfs_buf_free(bp); 880 } 881 882 static void 883 xfs_buf_rele_cached( 884 struct xfs_buf *bp) 885 { 886 struct xfs_buftarg *btp = bp->b_target; 887 struct xfs_perag *pag = bp->b_pag; 888 struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag); 889 bool freebuf = false; 890 891 trace_xfs_buf_rele(bp, _RET_IP_); 892 893 spin_lock(&bp->b_lock); 894 ASSERT(bp->b_hold >= 1); 895 if (bp->b_hold > 1) { 896 bp->b_hold--; 897 goto out_unlock; 898 } 899 900 /* we are asked to drop the last reference */ 901 if (atomic_read(&bp->b_lru_ref)) { 902 /* 903 * If the buffer is added to the LRU, keep the reference to the 904 * buffer for the LRU and clear the (now stale) dispose list 905 * state flag, else drop the reference. 906 */ 907 if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) 908 bp->b_state &= ~XFS_BSTATE_DISPOSE; 909 else 910 bp->b_hold--; 911 } else { 912 bp->b_hold--; 913 /* 914 * most of the time buffers will already be removed from the 915 * LRU, so optimise that case by checking for the 916 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer 917 * was on was the disposal list 918 */ 919 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { 920 list_lru_del_obj(&btp->bt_lru, &bp->b_lru); 921 } else { 922 ASSERT(list_empty(&bp->b_lru)); 923 } 924 925 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 926 rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head, 927 xfs_buf_hash_params); 928 if (pag) 929 xfs_perag_put(pag); 930 freebuf = true; 931 } 932 933 out_unlock: 934 spin_unlock(&bp->b_lock); 935 936 if (freebuf) 937 xfs_buf_free(bp); 938 } 939 940 /* 941 * Release a hold on the specified buffer. 942 */ 943 void 944 xfs_buf_rele( 945 struct xfs_buf *bp) 946 { 947 trace_xfs_buf_rele(bp, _RET_IP_); 948 if (xfs_buf_is_uncached(bp)) 949 xfs_buf_rele_uncached(bp); 950 else 951 xfs_buf_rele_cached(bp); 952 } 953 954 /* 955 * Lock a buffer object, if it is not already locked. 956 * 957 * If we come across a stale, pinned, locked buffer, we know that we are 958 * being asked to lock a buffer that has been reallocated. Because it is 959 * pinned, we know that the log has not been pushed to disk and hence it 960 * will still be locked. Rather than continuing to have trylock attempts 961 * fail until someone else pushes the log, push it ourselves before 962 * returning. This means that the xfsaild will not get stuck trying 963 * to push on stale inode buffers. 964 */ 965 int 966 xfs_buf_trylock( 967 struct xfs_buf *bp) 968 { 969 int locked; 970 971 locked = down_trylock(&bp->b_sema) == 0; 972 if (locked) 973 trace_xfs_buf_trylock(bp, _RET_IP_); 974 else 975 trace_xfs_buf_trylock_fail(bp, _RET_IP_); 976 return locked; 977 } 978 979 /* 980 * Lock a buffer object. 981 * 982 * If we come across a stale, pinned, locked buffer, we know that we 983 * are being asked to lock a buffer that has been reallocated. Because 984 * it is pinned, we know that the log has not been pushed to disk and 985 * hence it will still be locked. Rather than sleeping until someone 986 * else pushes the log, push it ourselves before trying to get the lock. 987 */ 988 void 989 xfs_buf_lock( 990 struct xfs_buf *bp) 991 { 992 trace_xfs_buf_lock(bp, _RET_IP_); 993 994 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 995 xfs_log_force(bp->b_mount, 0); 996 down(&bp->b_sema); 997 998 trace_xfs_buf_lock_done(bp, _RET_IP_); 999 } 1000 1001 void 1002 xfs_buf_unlock( 1003 struct xfs_buf *bp) 1004 { 1005 ASSERT(xfs_buf_islocked(bp)); 1006 1007 up(&bp->b_sema); 1008 trace_xfs_buf_unlock(bp, _RET_IP_); 1009 } 1010 1011 STATIC void 1012 xfs_buf_wait_unpin( 1013 struct xfs_buf *bp) 1014 { 1015 DECLARE_WAITQUEUE (wait, current); 1016 1017 if (atomic_read(&bp->b_pin_count) == 0) 1018 return; 1019 1020 add_wait_queue(&bp->b_waiters, &wait); 1021 for (;;) { 1022 set_current_state(TASK_UNINTERRUPTIBLE); 1023 if (atomic_read(&bp->b_pin_count) == 0) 1024 break; 1025 io_schedule(); 1026 } 1027 remove_wait_queue(&bp->b_waiters, &wait); 1028 set_current_state(TASK_RUNNING); 1029 } 1030 1031 static void 1032 xfs_buf_ioerror_alert_ratelimited( 1033 struct xfs_buf *bp) 1034 { 1035 static unsigned long lasttime; 1036 static struct xfs_buftarg *lasttarg; 1037 1038 if (bp->b_target != lasttarg || 1039 time_after(jiffies, (lasttime + 5*HZ))) { 1040 lasttime = jiffies; 1041 xfs_buf_ioerror_alert(bp, __this_address); 1042 } 1043 lasttarg = bp->b_target; 1044 } 1045 1046 /* 1047 * Account for this latest trip around the retry handler, and decide if 1048 * we've failed enough times to constitute a permanent failure. 1049 */ 1050 static bool 1051 xfs_buf_ioerror_permanent( 1052 struct xfs_buf *bp, 1053 struct xfs_error_cfg *cfg) 1054 { 1055 struct xfs_mount *mp = bp->b_mount; 1056 1057 if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && 1058 ++bp->b_retries > cfg->max_retries) 1059 return true; 1060 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && 1061 time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) 1062 return true; 1063 1064 /* At unmount we may treat errors differently */ 1065 if (xfs_is_unmounting(mp) && mp->m_fail_unmount) 1066 return true; 1067 1068 return false; 1069 } 1070 1071 /* 1072 * On a sync write or shutdown we just want to stale the buffer and let the 1073 * caller handle the error in bp->b_error appropriately. 1074 * 1075 * If the write was asynchronous then no one will be looking for the error. If 1076 * this is the first failure of this type, clear the error state and write the 1077 * buffer out again. This means we always retry an async write failure at least 1078 * once, but we also need to set the buffer up to behave correctly now for 1079 * repeated failures. 1080 * 1081 * If we get repeated async write failures, then we take action according to the 1082 * error configuration we have been set up to use. 1083 * 1084 * Returns true if this function took care of error handling and the caller must 1085 * not touch the buffer again. Return false if the caller should proceed with 1086 * normal I/O completion handling. 1087 */ 1088 static bool 1089 xfs_buf_ioend_handle_error( 1090 struct xfs_buf *bp) 1091 { 1092 struct xfs_mount *mp = bp->b_mount; 1093 struct xfs_error_cfg *cfg; 1094 struct xfs_log_item *lip; 1095 1096 /* 1097 * If we've already shutdown the journal because of I/O errors, there's 1098 * no point in giving this a retry. 1099 */ 1100 if (xlog_is_shutdown(mp->m_log)) 1101 goto out_stale; 1102 1103 xfs_buf_ioerror_alert_ratelimited(bp); 1104 1105 /* 1106 * We're not going to bother about retrying this during recovery. 1107 * One strike! 1108 */ 1109 if (bp->b_flags & _XBF_LOGRECOVERY) { 1110 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1111 return false; 1112 } 1113 1114 /* 1115 * Synchronous writes will have callers process the error. 1116 */ 1117 if (!(bp->b_flags & XBF_ASYNC)) 1118 goto out_stale; 1119 1120 trace_xfs_buf_iodone_async(bp, _RET_IP_); 1121 1122 cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); 1123 if (bp->b_last_error != bp->b_error || 1124 !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) { 1125 bp->b_last_error = bp->b_error; 1126 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && 1127 !bp->b_first_retry_time) 1128 bp->b_first_retry_time = jiffies; 1129 goto resubmit; 1130 } 1131 1132 /* 1133 * Permanent error - we need to trigger a shutdown if we haven't already 1134 * to indicate that inconsistency will result from this action. 1135 */ 1136 if (xfs_buf_ioerror_permanent(bp, cfg)) { 1137 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1138 goto out_stale; 1139 } 1140 1141 /* Still considered a transient error. Caller will schedule retries. */ 1142 list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { 1143 set_bit(XFS_LI_FAILED, &lip->li_flags); 1144 clear_bit(XFS_LI_FLUSHING, &lip->li_flags); 1145 } 1146 1147 xfs_buf_ioerror(bp, 0); 1148 xfs_buf_relse(bp); 1149 return true; 1150 1151 resubmit: 1152 xfs_buf_ioerror(bp, 0); 1153 bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL); 1154 reinit_completion(&bp->b_iowait); 1155 xfs_buf_submit(bp); 1156 return true; 1157 out_stale: 1158 xfs_buf_stale(bp); 1159 bp->b_flags |= XBF_DONE; 1160 bp->b_flags &= ~XBF_WRITE; 1161 trace_xfs_buf_error_relse(bp, _RET_IP_); 1162 return false; 1163 } 1164 1165 /* returns false if the caller needs to resubmit the I/O, else true */ 1166 static bool 1167 __xfs_buf_ioend( 1168 struct xfs_buf *bp) 1169 { 1170 trace_xfs_buf_iodone(bp, _RET_IP_); 1171 1172 if (bp->b_flags & XBF_READ) { 1173 if (!bp->b_error && is_vmalloc_addr(bp->b_addr)) 1174 invalidate_kernel_vmap_range(bp->b_addr, 1175 roundup(BBTOB(bp->b_length), PAGE_SIZE)); 1176 if (!bp->b_error && bp->b_ops) 1177 bp->b_ops->verify_read(bp); 1178 if (!bp->b_error) 1179 bp->b_flags |= XBF_DONE; 1180 if (bp->b_flags & XBF_READ_AHEAD) 1181 percpu_counter_dec(&bp->b_target->bt_readahead_count); 1182 } else { 1183 if (!bp->b_error) { 1184 bp->b_flags &= ~XBF_WRITE_FAIL; 1185 bp->b_flags |= XBF_DONE; 1186 } 1187 1188 if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp)) 1189 return false; 1190 1191 /* clear the retry state */ 1192 bp->b_last_error = 0; 1193 bp->b_retries = 0; 1194 bp->b_first_retry_time = 0; 1195 1196 /* 1197 * Note that for things like remote attribute buffers, there may 1198 * not be a buffer log item here, so processing the buffer log 1199 * item must remain optional. 1200 */ 1201 if (bp->b_log_item) 1202 xfs_buf_item_done(bp); 1203 1204 if (bp->b_iodone) 1205 bp->b_iodone(bp); 1206 } 1207 1208 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | 1209 _XBF_LOGRECOVERY); 1210 return true; 1211 } 1212 1213 static void 1214 xfs_buf_ioend( 1215 struct xfs_buf *bp) 1216 { 1217 if (!__xfs_buf_ioend(bp)) 1218 return; 1219 if (bp->b_flags & XBF_ASYNC) 1220 xfs_buf_relse(bp); 1221 else 1222 complete(&bp->b_iowait); 1223 } 1224 1225 static void 1226 xfs_buf_ioend_work( 1227 struct work_struct *work) 1228 { 1229 struct xfs_buf *bp = 1230 container_of(work, struct xfs_buf, b_ioend_work); 1231 1232 if (__xfs_buf_ioend(bp)) 1233 xfs_buf_relse(bp); 1234 } 1235 1236 void 1237 __xfs_buf_ioerror( 1238 struct xfs_buf *bp, 1239 int error, 1240 xfs_failaddr_t failaddr) 1241 { 1242 ASSERT(error <= 0 && error >= -1000); 1243 bp->b_error = error; 1244 trace_xfs_buf_ioerror(bp, error, failaddr); 1245 } 1246 1247 void 1248 xfs_buf_ioerror_alert( 1249 struct xfs_buf *bp, 1250 xfs_failaddr_t func) 1251 { 1252 xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error", 1253 "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", 1254 func, (uint64_t)xfs_buf_daddr(bp), 1255 bp->b_length, -bp->b_error); 1256 } 1257 1258 /* 1259 * To simulate an I/O failure, the buffer must be locked and held with at least 1260 * three references. The LRU reference is dropped by the stale call. The buf 1261 * item reference is dropped via ioend processing. The third reference is owned 1262 * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC. 1263 */ 1264 void 1265 xfs_buf_ioend_fail( 1266 struct xfs_buf *bp) 1267 { 1268 bp->b_flags &= ~XBF_DONE; 1269 xfs_buf_stale(bp); 1270 xfs_buf_ioerror(bp, -EIO); 1271 xfs_buf_ioend(bp); 1272 } 1273 1274 int 1275 xfs_bwrite( 1276 struct xfs_buf *bp) 1277 { 1278 int error; 1279 1280 ASSERT(xfs_buf_islocked(bp)); 1281 1282 bp->b_flags |= XBF_WRITE; 1283 bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | 1284 XBF_DONE); 1285 1286 xfs_buf_submit(bp); 1287 error = xfs_buf_iowait(bp); 1288 if (error) 1289 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); 1290 return error; 1291 } 1292 1293 static void 1294 xfs_buf_bio_end_io( 1295 struct bio *bio) 1296 { 1297 struct xfs_buf *bp = bio->bi_private; 1298 1299 if (bio->bi_status) 1300 xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status)); 1301 else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && 1302 XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) 1303 xfs_buf_ioerror(bp, -EIO); 1304 1305 if (bp->b_flags & XBF_ASYNC) { 1306 INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); 1307 queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work); 1308 } else { 1309 complete(&bp->b_iowait); 1310 } 1311 1312 bio_put(bio); 1313 } 1314 1315 static inline blk_opf_t 1316 xfs_buf_bio_op( 1317 struct xfs_buf *bp) 1318 { 1319 blk_opf_t op; 1320 1321 if (bp->b_flags & XBF_WRITE) { 1322 op = REQ_OP_WRITE; 1323 } else { 1324 op = REQ_OP_READ; 1325 if (bp->b_flags & XBF_READ_AHEAD) 1326 op |= REQ_RAHEAD; 1327 } 1328 1329 return op | REQ_META; 1330 } 1331 1332 static void 1333 xfs_buf_submit_bio( 1334 struct xfs_buf *bp) 1335 { 1336 unsigned int len = BBTOB(bp->b_length); 1337 unsigned int nr_vecs = bio_add_max_vecs(bp->b_addr, len); 1338 unsigned int map = 0; 1339 struct blk_plug plug; 1340 struct bio *bio; 1341 1342 bio = bio_alloc(bp->b_target->bt_bdev, nr_vecs, xfs_buf_bio_op(bp), 1343 GFP_NOIO); 1344 if (is_vmalloc_addr(bp->b_addr)) 1345 bio_add_vmalloc(bio, bp->b_addr, len); 1346 else 1347 bio_add_virt_nofail(bio, bp->b_addr, len); 1348 bio->bi_private = bp; 1349 bio->bi_end_io = xfs_buf_bio_end_io; 1350 1351 /* 1352 * If there is more than one map segment, split out a new bio for each 1353 * map except of the last one. The last map is handled by the 1354 * remainder of the original bio outside the loop. 1355 */ 1356 blk_start_plug(&plug); 1357 for (map = 0; map < bp->b_map_count - 1; map++) { 1358 struct bio *split; 1359 1360 split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS, 1361 &fs_bio_set); 1362 split->bi_iter.bi_sector = bp->b_maps[map].bm_bn; 1363 bio_chain(split, bio); 1364 submit_bio(split); 1365 } 1366 bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn; 1367 submit_bio(bio); 1368 blk_finish_plug(&plug); 1369 } 1370 1371 /* 1372 * Wait for I/O completion of a sync buffer and return the I/O error code. 1373 */ 1374 static int 1375 xfs_buf_iowait( 1376 struct xfs_buf *bp) 1377 { 1378 ASSERT(!(bp->b_flags & XBF_ASYNC)); 1379 1380 do { 1381 trace_xfs_buf_iowait(bp, _RET_IP_); 1382 wait_for_completion(&bp->b_iowait); 1383 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1384 } while (!__xfs_buf_ioend(bp)); 1385 1386 return bp->b_error; 1387 } 1388 1389 /* 1390 * Run the write verifier callback function if it exists. If this fails, mark 1391 * the buffer with an error and do not dispatch the I/O. 1392 */ 1393 static bool 1394 xfs_buf_verify_write( 1395 struct xfs_buf *bp) 1396 { 1397 if (bp->b_ops) { 1398 bp->b_ops->verify_write(bp); 1399 if (bp->b_error) 1400 return false; 1401 } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { 1402 /* 1403 * Non-crc filesystems don't attach verifiers during log 1404 * recovery, so don't warn for such filesystems. 1405 */ 1406 if (xfs_has_crc(bp->b_mount)) { 1407 xfs_warn(bp->b_mount, 1408 "%s: no buf ops on daddr 0x%llx len %d", 1409 __func__, xfs_buf_daddr(bp), 1410 bp->b_length); 1411 xfs_hex_dump(bp->b_addr, XFS_CORRUPTION_DUMP_LEN); 1412 dump_stack(); 1413 } 1414 } 1415 1416 return true; 1417 } 1418 1419 /* 1420 * Buffer I/O submission path, read or write. Asynchronous submission transfers 1421 * the buffer lock ownership and the current reference to the IO. It is not 1422 * safe to reference the buffer after a call to this function unless the caller 1423 * holds an additional reference itself. 1424 */ 1425 static void 1426 xfs_buf_submit( 1427 struct xfs_buf *bp) 1428 { 1429 trace_xfs_buf_submit(bp, _RET_IP_); 1430 1431 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1432 1433 /* 1434 * On log shutdown we stale and complete the buffer immediately. We can 1435 * be called to read the superblock before the log has been set up, so 1436 * be careful checking the log state. 1437 * 1438 * Checking the mount shutdown state here can result in the log tail 1439 * moving inappropriately on disk as the log may not yet be shut down. 1440 * i.e. failing this buffer on mount shutdown can remove it from the AIL 1441 * and move the tail of the log forwards without having written this 1442 * buffer to disk. This corrupts the log tail state in memory, and 1443 * because the log may not be shut down yet, it can then be propagated 1444 * to disk before the log is shutdown. Hence we check log shutdown 1445 * state here rather than mount state to avoid corrupting the log tail 1446 * on shutdown. 1447 */ 1448 if (bp->b_mount->m_log && xlog_is_shutdown(bp->b_mount->m_log)) { 1449 xfs_buf_ioend_fail(bp); 1450 return; 1451 } 1452 1453 if (bp->b_flags & XBF_WRITE) 1454 xfs_buf_wait_unpin(bp); 1455 1456 /* 1457 * Make sure we capture only current IO errors rather than stale errors 1458 * left over from previous use of the buffer (e.g. failed readahead). 1459 */ 1460 bp->b_error = 0; 1461 1462 if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) { 1463 xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE); 1464 xfs_buf_ioend(bp); 1465 return; 1466 } 1467 1468 /* In-memory targets are directly mapped, no I/O required. */ 1469 if (xfs_buftarg_is_mem(bp->b_target)) { 1470 xfs_buf_ioend(bp); 1471 return; 1472 } 1473 1474 xfs_buf_submit_bio(bp); 1475 } 1476 1477 /* 1478 * Log a message about and stale a buffer that a caller has decided is corrupt. 1479 * 1480 * This function should be called for the kinds of metadata corruption that 1481 * cannot be detect from a verifier, such as incorrect inter-block relationship 1482 * data. Do /not/ call this function from a verifier function. 1483 * 1484 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will 1485 * be marked stale, but b_error will not be set. The caller is responsible for 1486 * releasing the buffer or fixing it. 1487 */ 1488 void 1489 __xfs_buf_mark_corrupt( 1490 struct xfs_buf *bp, 1491 xfs_failaddr_t fa) 1492 { 1493 ASSERT(bp->b_flags & XBF_DONE); 1494 1495 xfs_buf_corruption_error(bp, fa); 1496 xfs_buf_stale(bp); 1497 } 1498 1499 /* 1500 * Handling of buffer targets (buftargs). 1501 */ 1502 1503 /* 1504 * Wait for any bufs with callbacks that have been submitted but have not yet 1505 * returned. These buffers will have an elevated hold count, so wait on those 1506 * while freeing all the buffers only held by the LRU. 1507 */ 1508 static enum lru_status 1509 xfs_buftarg_drain_rele( 1510 struct list_head *item, 1511 struct list_lru_one *lru, 1512 void *arg) 1513 1514 { 1515 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1516 struct list_head *dispose = arg; 1517 1518 if (!spin_trylock(&bp->b_lock)) 1519 return LRU_SKIP; 1520 if (bp->b_hold > 1) { 1521 /* need to wait, so skip it this pass */ 1522 spin_unlock(&bp->b_lock); 1523 trace_xfs_buf_drain_buftarg(bp, _RET_IP_); 1524 return LRU_SKIP; 1525 } 1526 1527 /* 1528 * clear the LRU reference count so the buffer doesn't get 1529 * ignored in xfs_buf_rele(). 1530 */ 1531 atomic_set(&bp->b_lru_ref, 0); 1532 bp->b_state |= XFS_BSTATE_DISPOSE; 1533 list_lru_isolate_move(lru, item, dispose); 1534 spin_unlock(&bp->b_lock); 1535 return LRU_REMOVED; 1536 } 1537 1538 /* 1539 * Wait for outstanding I/O on the buftarg to complete. 1540 */ 1541 void 1542 xfs_buftarg_wait( 1543 struct xfs_buftarg *btp) 1544 { 1545 /* 1546 * First wait for all in-flight readahead buffers to be released. This is 1547 * critical as new buffers do not make the LRU until they are released. 1548 * 1549 * Next, flush the buffer workqueue to ensure all completion processing 1550 * has finished. Just waiting on buffer locks is not sufficient for 1551 * async IO as the reference count held over IO is not released until 1552 * after the buffer lock is dropped. Hence we need to ensure here that 1553 * all reference counts have been dropped before we start walking the 1554 * LRU list. 1555 */ 1556 while (percpu_counter_sum(&btp->bt_readahead_count)) 1557 delay(100); 1558 flush_workqueue(btp->bt_mount->m_buf_workqueue); 1559 } 1560 1561 void 1562 xfs_buftarg_drain( 1563 struct xfs_buftarg *btp) 1564 { 1565 LIST_HEAD(dispose); 1566 int loop = 0; 1567 bool write_fail = false; 1568 1569 xfs_buftarg_wait(btp); 1570 1571 /* loop until there is nothing left on the lru list. */ 1572 while (list_lru_count(&btp->bt_lru)) { 1573 list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele, 1574 &dispose, LONG_MAX); 1575 1576 while (!list_empty(&dispose)) { 1577 struct xfs_buf *bp; 1578 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1579 list_del_init(&bp->b_lru); 1580 if (bp->b_flags & XBF_WRITE_FAIL) { 1581 write_fail = true; 1582 xfs_buf_alert_ratelimited(bp, 1583 "XFS: Corruption Alert", 1584 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", 1585 (long long)xfs_buf_daddr(bp)); 1586 } 1587 xfs_buf_rele(bp); 1588 } 1589 if (loop++ != 0) 1590 delay(100); 1591 } 1592 1593 /* 1594 * If one or more failed buffers were freed, that means dirty metadata 1595 * was thrown away. This should only ever happen after I/O completion 1596 * handling has elevated I/O error(s) to permanent failures and shuts 1597 * down the journal. 1598 */ 1599 if (write_fail) { 1600 ASSERT(xlog_is_shutdown(btp->bt_mount->m_log)); 1601 xfs_alert(btp->bt_mount, 1602 "Please run xfs_repair to determine the extent of the problem."); 1603 } 1604 } 1605 1606 static enum lru_status 1607 xfs_buftarg_isolate( 1608 struct list_head *item, 1609 struct list_lru_one *lru, 1610 void *arg) 1611 { 1612 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1613 struct list_head *dispose = arg; 1614 1615 /* 1616 * we are inverting the lru lock/bp->b_lock here, so use a trylock. 1617 * If we fail to get the lock, just skip it. 1618 */ 1619 if (!spin_trylock(&bp->b_lock)) 1620 return LRU_SKIP; 1621 /* 1622 * Decrement the b_lru_ref count unless the value is already 1623 * zero. If the value is already zero, we need to reclaim the 1624 * buffer, otherwise it gets another trip through the LRU. 1625 */ 1626 if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) { 1627 spin_unlock(&bp->b_lock); 1628 return LRU_ROTATE; 1629 } 1630 1631 bp->b_state |= XFS_BSTATE_DISPOSE; 1632 list_lru_isolate_move(lru, item, dispose); 1633 spin_unlock(&bp->b_lock); 1634 return LRU_REMOVED; 1635 } 1636 1637 static unsigned long 1638 xfs_buftarg_shrink_scan( 1639 struct shrinker *shrink, 1640 struct shrink_control *sc) 1641 { 1642 struct xfs_buftarg *btp = shrink->private_data; 1643 LIST_HEAD(dispose); 1644 unsigned long freed; 1645 1646 freed = list_lru_shrink_walk(&btp->bt_lru, sc, 1647 xfs_buftarg_isolate, &dispose); 1648 1649 while (!list_empty(&dispose)) { 1650 struct xfs_buf *bp; 1651 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1652 list_del_init(&bp->b_lru); 1653 xfs_buf_rele(bp); 1654 } 1655 1656 return freed; 1657 } 1658 1659 static unsigned long 1660 xfs_buftarg_shrink_count( 1661 struct shrinker *shrink, 1662 struct shrink_control *sc) 1663 { 1664 struct xfs_buftarg *btp = shrink->private_data; 1665 return list_lru_shrink_count(&btp->bt_lru, sc); 1666 } 1667 1668 void 1669 xfs_destroy_buftarg( 1670 struct xfs_buftarg *btp) 1671 { 1672 shrinker_free(btp->bt_shrinker); 1673 ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); 1674 percpu_counter_destroy(&btp->bt_readahead_count); 1675 list_lru_destroy(&btp->bt_lru); 1676 } 1677 1678 void 1679 xfs_free_buftarg( 1680 struct xfs_buftarg *btp) 1681 { 1682 xfs_destroy_buftarg(btp); 1683 fs_put_dax(btp->bt_daxdev, btp->bt_mount); 1684 /* the main block device is closed by kill_block_super */ 1685 if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) 1686 bdev_fput(btp->bt_file); 1687 kfree(btp); 1688 } 1689 1690 /* 1691 * Configure this buffer target for hardware-assisted atomic writes if the 1692 * underlying block device supports is congruent with the filesystem geometry. 1693 */ 1694 static inline void 1695 xfs_configure_buftarg_atomic_writes( 1696 struct xfs_buftarg *btp) 1697 { 1698 struct xfs_mount *mp = btp->bt_mount; 1699 unsigned int min_bytes, max_bytes; 1700 1701 min_bytes = bdev_atomic_write_unit_min_bytes(btp->bt_bdev); 1702 max_bytes = bdev_atomic_write_unit_max_bytes(btp->bt_bdev); 1703 1704 /* 1705 * Ignore atomic write geometry that is nonsense or doesn't even cover 1706 * a single fsblock. 1707 */ 1708 if (min_bytes > max_bytes || 1709 min_bytes > mp->m_sb.sb_blocksize || 1710 max_bytes < mp->m_sb.sb_blocksize) { 1711 min_bytes = 0; 1712 max_bytes = 0; 1713 } 1714 1715 btp->bt_awu_min = min_bytes; 1716 btp->bt_awu_max = max_bytes; 1717 } 1718 1719 /* Configure a buffer target that abstracts a block device. */ 1720 int 1721 xfs_configure_buftarg( 1722 struct xfs_buftarg *btp, 1723 unsigned int sectorsize) 1724 { 1725 int error; 1726 1727 ASSERT(btp->bt_bdev != NULL); 1728 1729 /* Set up metadata sector size info */ 1730 btp->bt_meta_sectorsize = sectorsize; 1731 btp->bt_meta_sectormask = sectorsize - 1; 1732 1733 error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); 1734 if (error) { 1735 xfs_warn(btp->bt_mount, 1736 "Cannot use blocksize %u on device %pg, err %d", 1737 sectorsize, btp->bt_bdev, error); 1738 return -EINVAL; 1739 } 1740 1741 if (bdev_can_atomic_write(btp->bt_bdev)) 1742 xfs_configure_buftarg_atomic_writes(btp); 1743 return 0; 1744 } 1745 1746 int 1747 xfs_init_buftarg( 1748 struct xfs_buftarg *btp, 1749 size_t logical_sectorsize, 1750 const char *descr) 1751 { 1752 /* Set up device logical sector size mask */ 1753 btp->bt_logical_sectorsize = logical_sectorsize; 1754 btp->bt_logical_sectormask = logical_sectorsize - 1; 1755 1756 /* 1757 * Buffer IO error rate limiting. Limit it to no more than 10 messages 1758 * per 30 seconds so as to not spam logs too much on repeated errors. 1759 */ 1760 ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, 1761 DEFAULT_RATELIMIT_BURST); 1762 1763 if (list_lru_init(&btp->bt_lru)) 1764 return -ENOMEM; 1765 if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL)) 1766 goto out_destroy_lru; 1767 1768 btp->bt_shrinker = 1769 shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr); 1770 if (!btp->bt_shrinker) 1771 goto out_destroy_io_count; 1772 btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; 1773 btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; 1774 btp->bt_shrinker->private_data = btp; 1775 shrinker_register(btp->bt_shrinker); 1776 return 0; 1777 1778 out_destroy_io_count: 1779 percpu_counter_destroy(&btp->bt_readahead_count); 1780 out_destroy_lru: 1781 list_lru_destroy(&btp->bt_lru); 1782 return -ENOMEM; 1783 } 1784 1785 struct xfs_buftarg * 1786 xfs_alloc_buftarg( 1787 struct xfs_mount *mp, 1788 struct file *bdev_file) 1789 { 1790 struct xfs_buftarg *btp; 1791 const struct dax_holder_operations *ops = NULL; 1792 int error; 1793 1794 1795 #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) 1796 ops = &xfs_dax_holder_operations; 1797 #endif 1798 btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL); 1799 1800 btp->bt_mount = mp; 1801 btp->bt_file = bdev_file; 1802 btp->bt_bdev = file_bdev(bdev_file); 1803 btp->bt_dev = btp->bt_bdev->bd_dev; 1804 btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, 1805 mp, ops); 1806 1807 /* 1808 * Flush and invalidate all devices' pagecaches before reading any 1809 * metadata because XFS doesn't use the bdev pagecache. 1810 */ 1811 error = sync_blockdev(btp->bt_bdev); 1812 if (error) 1813 goto error_free; 1814 1815 /* 1816 * When allocating the buftargs we have not yet read the super block and 1817 * thus don't know the file system sector size yet. 1818 */ 1819 btp->bt_meta_sectorsize = bdev_logical_block_size(btp->bt_bdev); 1820 btp->bt_meta_sectormask = btp->bt_meta_sectorsize - 1; 1821 1822 error = xfs_init_buftarg(btp, btp->bt_meta_sectorsize, 1823 mp->m_super->s_id); 1824 if (error) 1825 goto error_free; 1826 1827 return btp; 1828 1829 error_free: 1830 kfree(btp); 1831 return ERR_PTR(error); 1832 } 1833 1834 static inline void 1835 xfs_buf_list_del( 1836 struct xfs_buf *bp) 1837 { 1838 list_del_init(&bp->b_list); 1839 wake_up_var(&bp->b_list); 1840 } 1841 1842 /* 1843 * Cancel a delayed write list. 1844 * 1845 * Remove each buffer from the list, clear the delwri queue flag and drop the 1846 * associated buffer reference. 1847 */ 1848 void 1849 xfs_buf_delwri_cancel( 1850 struct list_head *list) 1851 { 1852 struct xfs_buf *bp; 1853 1854 while (!list_empty(list)) { 1855 bp = list_first_entry(list, struct xfs_buf, b_list); 1856 1857 xfs_buf_lock(bp); 1858 bp->b_flags &= ~_XBF_DELWRI_Q; 1859 xfs_buf_list_del(bp); 1860 xfs_buf_relse(bp); 1861 } 1862 } 1863 1864 /* 1865 * Add a buffer to the delayed write list. 1866 * 1867 * This queues a buffer for writeout if it hasn't already been. Note that 1868 * neither this routine nor the buffer list submission functions perform 1869 * any internal synchronization. It is expected that the lists are thread-local 1870 * to the callers. 1871 * 1872 * Returns true if we queued up the buffer, or false if it already had 1873 * been on the buffer list. 1874 */ 1875 bool 1876 xfs_buf_delwri_queue( 1877 struct xfs_buf *bp, 1878 struct list_head *list) 1879 { 1880 ASSERT(xfs_buf_islocked(bp)); 1881 ASSERT(!(bp->b_flags & XBF_READ)); 1882 1883 /* 1884 * If the buffer is already marked delwri it already is queued up 1885 * by someone else for imediate writeout. Just ignore it in that 1886 * case. 1887 */ 1888 if (bp->b_flags & _XBF_DELWRI_Q) { 1889 trace_xfs_buf_delwri_queued(bp, _RET_IP_); 1890 return false; 1891 } 1892 1893 trace_xfs_buf_delwri_queue(bp, _RET_IP_); 1894 1895 /* 1896 * If a buffer gets written out synchronously or marked stale while it 1897 * is on a delwri list we lazily remove it. To do this, the other party 1898 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. 1899 * It remains referenced and on the list. In a rare corner case it 1900 * might get readded to a delwri list after the synchronous writeout, in 1901 * which case we need just need to re-add the flag here. 1902 */ 1903 bp->b_flags |= _XBF_DELWRI_Q; 1904 if (list_empty(&bp->b_list)) { 1905 xfs_buf_hold(bp); 1906 list_add_tail(&bp->b_list, list); 1907 } 1908 1909 return true; 1910 } 1911 1912 /* 1913 * Queue a buffer to this delwri list as part of a data integrity operation. 1914 * If the buffer is on any other delwri list, we'll wait for that to clear 1915 * so that the caller can submit the buffer for IO and wait for the result. 1916 * Callers must ensure the buffer is not already on the list. 1917 */ 1918 void 1919 xfs_buf_delwri_queue_here( 1920 struct xfs_buf *bp, 1921 struct list_head *buffer_list) 1922 { 1923 /* 1924 * We need this buffer to end up on the /caller's/ delwri list, not any 1925 * old list. This can happen if the buffer is marked stale (which 1926 * clears DELWRI_Q) after the AIL queues the buffer to its list but 1927 * before the AIL has a chance to submit the list. 1928 */ 1929 while (!list_empty(&bp->b_list)) { 1930 xfs_buf_unlock(bp); 1931 wait_var_event(&bp->b_list, list_empty(&bp->b_list)); 1932 xfs_buf_lock(bp); 1933 } 1934 1935 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1936 1937 xfs_buf_delwri_queue(bp, buffer_list); 1938 } 1939 1940 /* 1941 * Compare function is more complex than it needs to be because 1942 * the return value is only 32 bits and we are doing comparisons 1943 * on 64 bit values 1944 */ 1945 static int 1946 xfs_buf_cmp( 1947 void *priv, 1948 const struct list_head *a, 1949 const struct list_head *b) 1950 { 1951 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); 1952 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); 1953 xfs_daddr_t diff; 1954 1955 diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; 1956 if (diff < 0) 1957 return -1; 1958 if (diff > 0) 1959 return 1; 1960 return 0; 1961 } 1962 1963 static bool 1964 xfs_buf_delwri_submit_prep( 1965 struct xfs_buf *bp) 1966 { 1967 /* 1968 * Someone else might have written the buffer synchronously or marked it 1969 * stale in the meantime. In that case only the _XBF_DELWRI_Q flag got 1970 * cleared, and we have to drop the reference and remove it from the 1971 * list here. 1972 */ 1973 if (!(bp->b_flags & _XBF_DELWRI_Q)) { 1974 xfs_buf_list_del(bp); 1975 xfs_buf_relse(bp); 1976 return false; 1977 } 1978 1979 trace_xfs_buf_delwri_split(bp, _RET_IP_); 1980 bp->b_flags &= ~_XBF_DELWRI_Q; 1981 bp->b_flags |= XBF_WRITE; 1982 return true; 1983 } 1984 1985 /* 1986 * Write out a buffer list asynchronously. 1987 * 1988 * This will take the @buffer_list, write all non-locked and non-pinned buffers 1989 * out and not wait for I/O completion on any of the buffers. This interface 1990 * is only safely useable for callers that can track I/O completion by higher 1991 * level means, e.g. AIL pushing as the @buffer_list is consumed in this 1992 * function. 1993 * 1994 * Note: this function will skip buffers it would block on, and in doing so 1995 * leaves them on @buffer_list so they can be retried on a later pass. As such, 1996 * it is up to the caller to ensure that the buffer list is fully submitted or 1997 * cancelled appropriately when they are finished with the list. Failure to 1998 * cancel or resubmit the list until it is empty will result in leaked buffers 1999 * at unmount time. 2000 */ 2001 int 2002 xfs_buf_delwri_submit_nowait( 2003 struct list_head *buffer_list) 2004 { 2005 struct xfs_buf *bp, *n; 2006 int pinned = 0; 2007 struct blk_plug plug; 2008 2009 list_sort(NULL, buffer_list, xfs_buf_cmp); 2010 2011 blk_start_plug(&plug); 2012 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 2013 if (!xfs_buf_trylock(bp)) 2014 continue; 2015 if (xfs_buf_ispinned(bp)) { 2016 xfs_buf_unlock(bp); 2017 pinned++; 2018 continue; 2019 } 2020 if (!xfs_buf_delwri_submit_prep(bp)) 2021 continue; 2022 bp->b_flags |= XBF_ASYNC; 2023 xfs_buf_list_del(bp); 2024 xfs_buf_submit(bp); 2025 } 2026 blk_finish_plug(&plug); 2027 2028 return pinned; 2029 } 2030 2031 /* 2032 * Write out a buffer list synchronously. 2033 * 2034 * This will take the @buffer_list, write all buffers out and wait for I/O 2035 * completion on all of the buffers. @buffer_list is consumed by the function, 2036 * so callers must have some other way of tracking buffers if they require such 2037 * functionality. 2038 */ 2039 int 2040 xfs_buf_delwri_submit( 2041 struct list_head *buffer_list) 2042 { 2043 LIST_HEAD (wait_list); 2044 int error = 0, error2; 2045 struct xfs_buf *bp, *n; 2046 struct blk_plug plug; 2047 2048 list_sort(NULL, buffer_list, xfs_buf_cmp); 2049 2050 blk_start_plug(&plug); 2051 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 2052 xfs_buf_lock(bp); 2053 if (!xfs_buf_delwri_submit_prep(bp)) 2054 continue; 2055 bp->b_flags &= ~XBF_ASYNC; 2056 list_move_tail(&bp->b_list, &wait_list); 2057 xfs_buf_submit(bp); 2058 } 2059 blk_finish_plug(&plug); 2060 2061 /* Wait for IO to complete. */ 2062 while (!list_empty(&wait_list)) { 2063 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 2064 2065 xfs_buf_list_del(bp); 2066 2067 /* 2068 * Wait on the locked buffer, check for errors and unlock and 2069 * release the delwri queue reference. 2070 */ 2071 error2 = xfs_buf_iowait(bp); 2072 xfs_buf_relse(bp); 2073 if (!error) 2074 error = error2; 2075 } 2076 2077 return error; 2078 } 2079 2080 void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) 2081 { 2082 /* 2083 * Set the lru reference count to 0 based on the error injection tag. 2084 * This allows userspace to disrupt buffer caching for debug/testing 2085 * purposes. 2086 */ 2087 if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) 2088 lru_ref = 0; 2089 2090 atomic_set(&bp->b_lru_ref, lru_ref); 2091 } 2092 2093 /* 2094 * Verify an on-disk magic value against the magic value specified in the 2095 * verifier structure. The verifier magic is in disk byte order so the caller is 2096 * expected to pass the value directly from disk. 2097 */ 2098 bool 2099 xfs_verify_magic( 2100 struct xfs_buf *bp, 2101 __be32 dmagic) 2102 { 2103 struct xfs_mount *mp = bp->b_mount; 2104 int idx; 2105 2106 idx = xfs_has_crc(mp); 2107 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) 2108 return false; 2109 return dmagic == bp->b_ops->magic[idx]; 2110 } 2111 /* 2112 * Verify an on-disk magic value against the magic value specified in the 2113 * verifier structure. The verifier magic is in disk byte order so the caller is 2114 * expected to pass the value directly from disk. 2115 */ 2116 bool 2117 xfs_verify_magic16( 2118 struct xfs_buf *bp, 2119 __be16 dmagic) 2120 { 2121 struct xfs_mount *mp = bp->b_mount; 2122 int idx; 2123 2124 idx = xfs_has_crc(mp); 2125 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) 2126 return false; 2127 return dmagic == bp->b_ops->magic16[idx]; 2128 } 2129