1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include <linux/backing-dev.h> 8 #include <linux/dax.h> 9 10 #include "xfs_shared.h" 11 #include "xfs_format.h" 12 #include "xfs_log_format.h" 13 #include "xfs_trans_resv.h" 14 #include "xfs_mount.h" 15 #include "xfs_trace.h" 16 #include "xfs_log.h" 17 #include "xfs_log_recover.h" 18 #include "xfs_log_priv.h" 19 #include "xfs_trans.h" 20 #include "xfs_buf_item.h" 21 #include "xfs_errortag.h" 22 #include "xfs_error.h" 23 #include "xfs_ag.h" 24 #include "xfs_buf_mem.h" 25 #include "xfs_notify_failure.h" 26 27 struct kmem_cache *xfs_buf_cache; 28 29 /* 30 * Locking orders 31 * 32 * xfs_buf_stale: 33 * b_sema (caller holds) 34 * b_lock 35 * lru_lock 36 * 37 * xfs_buf_rele: 38 * b_lock 39 * lru_lock 40 * 41 * xfs_buftarg_drain_rele 42 * lru_lock 43 * b_lock (trylock due to inversion) 44 * 45 * xfs_buftarg_isolate 46 * lru_lock 47 * b_lock (trylock due to inversion) 48 */ 49 50 static void xfs_buf_submit(struct xfs_buf *bp); 51 static int xfs_buf_iowait(struct xfs_buf *bp); 52 53 static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) 54 { 55 return bp->b_rhash_key == XFS_BUF_DADDR_NULL; 56 } 57 58 /* 59 * When we mark a buffer stale, we remove the buffer from the LRU and clear the 60 * b_lru_ref count so that the buffer is freed immediately when the buffer 61 * reference count falls to zero. If the buffer is already on the LRU, we need 62 * to remove the reference that LRU holds on the buffer. 63 * 64 * This prevents build-up of stale buffers on the LRU. 65 */ 66 void 67 xfs_buf_stale( 68 struct xfs_buf *bp) 69 { 70 ASSERT(xfs_buf_islocked(bp)); 71 72 bp->b_flags |= XBF_STALE; 73 74 /* 75 * Clear the delwri status so that a delwri queue walker will not 76 * flush this buffer to disk now that it is stale. The delwri queue has 77 * a reference to the buffer, so this is safe to do. 78 */ 79 bp->b_flags &= ~_XBF_DELWRI_Q; 80 81 spin_lock(&bp->b_lock); 82 atomic_set(&bp->b_lru_ref, 0); 83 if (!(bp->b_state & XFS_BSTATE_DISPOSE) && 84 (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) 85 bp->b_hold--; 86 87 ASSERT(bp->b_hold >= 1); 88 spin_unlock(&bp->b_lock); 89 } 90 91 static void 92 xfs_buf_free_callback( 93 struct callback_head *cb) 94 { 95 struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); 96 97 if (bp->b_maps != &bp->__b_map) 98 kfree(bp->b_maps); 99 kmem_cache_free(xfs_buf_cache, bp); 100 } 101 102 static void 103 xfs_buf_free( 104 struct xfs_buf *bp) 105 { 106 unsigned int size = BBTOB(bp->b_length); 107 108 might_sleep(); 109 trace_xfs_buf_free(bp, _RET_IP_); 110 111 ASSERT(list_empty(&bp->b_lru)); 112 113 if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE) 114 mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT)); 115 116 if (is_vmalloc_addr(bp->b_addr)) 117 vfree(bp->b_addr); 118 else if (bp->b_flags & _XBF_KMEM) 119 kfree(bp->b_addr); 120 else 121 folio_put(virt_to_folio(bp->b_addr)); 122 123 call_rcu(&bp->b_rcu, xfs_buf_free_callback); 124 } 125 126 static int 127 xfs_buf_alloc_kmem( 128 struct xfs_buf *bp, 129 size_t size, 130 gfp_t gfp_mask) 131 { 132 ASSERT(is_power_of_2(size)); 133 ASSERT(size < PAGE_SIZE); 134 135 bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL); 136 if (!bp->b_addr) 137 return -ENOMEM; 138 139 /* 140 * Slab guarantees that we get back naturally aligned allocations for 141 * power of two sizes. Keep this check as the canary in the coal mine 142 * if anything changes in slab. 143 */ 144 if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) { 145 kfree(bp->b_addr); 146 bp->b_addr = NULL; 147 return -ENOMEM; 148 } 149 bp->b_flags |= _XBF_KMEM; 150 trace_xfs_buf_backing_kmem(bp, _RET_IP_); 151 return 0; 152 } 153 154 /* 155 * Allocate backing memory for a buffer. 156 * 157 * For tmpfs-backed buffers used by in-memory btrees this directly maps the 158 * tmpfs page cache folios. 159 * 160 * For real file system buffers there are three different kinds backing memory: 161 * 162 * The first type backs the buffer by a kmalloc allocation. This is done for 163 * less than PAGE_SIZE allocations to avoid wasting memory. 164 * 165 * The second type is a single folio buffer - this may be a high order folio or 166 * just a single page sized folio, but either way they get treated the same way 167 * by the rest of the code - the buffer memory spans a single contiguous memory 168 * region that we don't have to map and unmap to access the data directly. 169 * 170 * The third type of buffer is the vmalloc()d buffer. This provides the buffer 171 * with the required contiguous memory region but backed by discontiguous 172 * physical pages. 173 */ 174 static int 175 xfs_buf_alloc_backing_mem( 176 struct xfs_buf *bp, 177 xfs_buf_flags_t flags) 178 { 179 size_t size = BBTOB(bp->b_length); 180 gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; 181 struct folio *folio; 182 183 if (xfs_buftarg_is_mem(bp->b_target)) 184 return xmbuf_map_backing_mem(bp); 185 186 /* Assure zeroed buffer for non-read cases. */ 187 if (!(flags & XBF_READ)) 188 gfp_mask |= __GFP_ZERO; 189 190 if (flags & XBF_READ_AHEAD) 191 gfp_mask |= __GFP_NORETRY; 192 193 /* 194 * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that 195 * is properly aligned. The slab allocator now guarantees an aligned 196 * allocation for all power of two sizes, which matches most of the 197 * smaller than PAGE_SIZE buffers used by XFS. 198 */ 199 if (size < PAGE_SIZE && is_power_of_2(size)) 200 return xfs_buf_alloc_kmem(bp, size, gfp_mask); 201 202 /* 203 * Don't bother with the retry loop for single PAGE allocations: vmalloc 204 * won't do any better. 205 */ 206 if (size <= PAGE_SIZE) 207 gfp_mask |= __GFP_NOFAIL; 208 209 /* 210 * Optimistically attempt a single high order folio allocation for 211 * larger than PAGE_SIZE buffers. 212 * 213 * Allocating a high order folio makes the assumption that buffers are a 214 * power-of-2 size, matching the power-of-2 folios sizes available. 215 * 216 * The exception here are user xattr data buffers, which can be arbitrarily 217 * sized up to 64kB plus structure metadata, skip straight to the vmalloc 218 * path for them instead of wasting memory here. 219 */ 220 if (size > PAGE_SIZE) { 221 if (!is_power_of_2(size)) 222 goto fallback; 223 gfp_mask &= ~__GFP_DIRECT_RECLAIM; 224 gfp_mask |= __GFP_NORETRY; 225 } 226 folio = folio_alloc(gfp_mask, get_order(size)); 227 if (!folio) { 228 if (size <= PAGE_SIZE) 229 return -ENOMEM; 230 trace_xfs_buf_backing_fallback(bp, _RET_IP_); 231 goto fallback; 232 } 233 bp->b_addr = folio_address(folio); 234 trace_xfs_buf_backing_folio(bp, _RET_IP_); 235 return 0; 236 237 fallback: 238 for (;;) { 239 bp->b_addr = __vmalloc(size, gfp_mask); 240 if (bp->b_addr) 241 break; 242 if (flags & XBF_READ_AHEAD) 243 return -ENOMEM; 244 XFS_STATS_INC(bp->b_mount, xb_page_retries); 245 memalloc_retry_wait(gfp_mask); 246 } 247 248 trace_xfs_buf_backing_vmalloc(bp, _RET_IP_); 249 return 0; 250 } 251 252 static int 253 xfs_buf_alloc( 254 struct xfs_buftarg *target, 255 struct xfs_buf_map *map, 256 int nmaps, 257 xfs_buf_flags_t flags, 258 struct xfs_buf **bpp) 259 { 260 struct xfs_buf *bp; 261 int error; 262 int i; 263 264 *bpp = NULL; 265 bp = kmem_cache_zalloc(xfs_buf_cache, 266 GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); 267 268 /* 269 * We don't want certain flags to appear in b_flags unless they are 270 * specifically set by later operations on the buffer. 271 */ 272 flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); 273 274 /* 275 * A new buffer is held and locked by the owner. This ensures that the 276 * buffer is owned by the caller and racing RCU lookups right after 277 * inserting into the hash table are safe (and will have to wait for 278 * the unlock to do anything non-trivial). 279 */ 280 bp->b_hold = 1; 281 sema_init(&bp->b_sema, 0); /* held, no waiters */ 282 283 spin_lock_init(&bp->b_lock); 284 atomic_set(&bp->b_lru_ref, 1); 285 init_completion(&bp->b_iowait); 286 INIT_LIST_HEAD(&bp->b_lru); 287 INIT_LIST_HEAD(&bp->b_list); 288 INIT_LIST_HEAD(&bp->b_li_list); 289 bp->b_target = target; 290 bp->b_mount = target->bt_mount; 291 bp->b_flags = flags; 292 bp->b_rhash_key = map[0].bm_bn; 293 bp->b_length = 0; 294 bp->b_map_count = nmaps; 295 if (nmaps == 1) 296 bp->b_maps = &bp->__b_map; 297 else 298 bp->b_maps = kcalloc(nmaps, sizeof(struct xfs_buf_map), 299 GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); 300 for (i = 0; i < nmaps; i++) { 301 bp->b_maps[i].bm_bn = map[i].bm_bn; 302 bp->b_maps[i].bm_len = map[i].bm_len; 303 bp->b_length += map[i].bm_len; 304 } 305 306 atomic_set(&bp->b_pin_count, 0); 307 init_waitqueue_head(&bp->b_waiters); 308 309 XFS_STATS_INC(bp->b_mount, xb_create); 310 trace_xfs_buf_init(bp, _RET_IP_); 311 312 error = xfs_buf_alloc_backing_mem(bp, flags); 313 if (error) { 314 xfs_buf_free(bp); 315 return error; 316 } 317 318 *bpp = bp; 319 return 0; 320 } 321 322 /* 323 * Finding and Reading Buffers 324 */ 325 static int 326 _xfs_buf_obj_cmp( 327 struct rhashtable_compare_arg *arg, 328 const void *obj) 329 { 330 const struct xfs_buf_map *map = arg->key; 331 const struct xfs_buf *bp = obj; 332 333 /* 334 * The key hashing in the lookup path depends on the key being the 335 * first element of the compare_arg, make sure to assert this. 336 */ 337 BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); 338 339 if (bp->b_rhash_key != map->bm_bn) 340 return 1; 341 342 if (unlikely(bp->b_length != map->bm_len)) { 343 /* 344 * found a block number match. If the range doesn't 345 * match, the only way this is allowed is if the buffer 346 * in the cache is stale and the transaction that made 347 * it stale has not yet committed. i.e. we are 348 * reallocating a busy extent. Skip this buffer and 349 * continue searching for an exact match. 350 * 351 * Note: If we're scanning for incore buffers to stale, don't 352 * complain if we find non-stale buffers. 353 */ 354 if (!(map->bm_flags & XBM_LIVESCAN)) 355 ASSERT(bp->b_flags & XBF_STALE); 356 return 1; 357 } 358 return 0; 359 } 360 361 static const struct rhashtable_params xfs_buf_hash_params = { 362 .min_size = 32, /* empty AGs have minimal footprint */ 363 .nelem_hint = 16, 364 .key_len = sizeof(xfs_daddr_t), 365 .key_offset = offsetof(struct xfs_buf, b_rhash_key), 366 .head_offset = offsetof(struct xfs_buf, b_rhash_head), 367 .automatic_shrinking = true, 368 .obj_cmpfn = _xfs_buf_obj_cmp, 369 }; 370 371 int 372 xfs_buf_cache_init( 373 struct xfs_buf_cache *bch) 374 { 375 return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params); 376 } 377 378 void 379 xfs_buf_cache_destroy( 380 struct xfs_buf_cache *bch) 381 { 382 rhashtable_destroy(&bch->bc_hash); 383 } 384 385 static int 386 xfs_buf_map_verify( 387 struct xfs_buftarg *btp, 388 struct xfs_buf_map *map) 389 { 390 /* Check for IOs smaller than the sector size / not sector aligned */ 391 ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); 392 ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); 393 394 /* 395 * Corrupted block numbers can get through to here, unfortunately, so we 396 * have to check that the buffer falls within the filesystem bounds. 397 */ 398 if (map->bm_bn < 0 || map->bm_bn >= btp->bt_nr_sectors) { 399 xfs_alert(btp->bt_mount, 400 "%s: daddr 0x%llx out of range, EOFS 0x%llx", 401 __func__, map->bm_bn, btp->bt_nr_sectors); 402 WARN_ON(1); 403 return -EFSCORRUPTED; 404 } 405 return 0; 406 } 407 408 static int 409 xfs_buf_find_lock( 410 struct xfs_buf *bp, 411 xfs_buf_flags_t flags) 412 { 413 if (flags & XBF_TRYLOCK) { 414 if (!xfs_buf_trylock(bp)) { 415 XFS_STATS_INC(bp->b_mount, xb_busy_locked); 416 return -EAGAIN; 417 } 418 } else { 419 xfs_buf_lock(bp); 420 XFS_STATS_INC(bp->b_mount, xb_get_locked_waited); 421 } 422 423 /* 424 * if the buffer is stale, clear all the external state associated with 425 * it. We need to keep flags such as how we allocated the buffer memory 426 * intact here. 427 */ 428 if (bp->b_flags & XBF_STALE) { 429 if (flags & XBF_LIVESCAN) { 430 xfs_buf_unlock(bp); 431 return -ENOENT; 432 } 433 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 434 bp->b_flags &= _XBF_KMEM; 435 bp->b_ops = NULL; 436 } 437 return 0; 438 } 439 440 static bool 441 xfs_buf_try_hold( 442 struct xfs_buf *bp) 443 { 444 spin_lock(&bp->b_lock); 445 if (bp->b_hold == 0) { 446 spin_unlock(&bp->b_lock); 447 return false; 448 } 449 bp->b_hold++; 450 spin_unlock(&bp->b_lock); 451 return true; 452 } 453 454 static inline int 455 xfs_buf_lookup( 456 struct xfs_buf_cache *bch, 457 struct xfs_buf_map *map, 458 xfs_buf_flags_t flags, 459 struct xfs_buf **bpp) 460 { 461 struct xfs_buf *bp; 462 int error; 463 464 rcu_read_lock(); 465 bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params); 466 if (!bp || !xfs_buf_try_hold(bp)) { 467 rcu_read_unlock(); 468 return -ENOENT; 469 } 470 rcu_read_unlock(); 471 472 error = xfs_buf_find_lock(bp, flags); 473 if (error) { 474 xfs_buf_rele(bp); 475 return error; 476 } 477 478 trace_xfs_buf_find(bp, flags, _RET_IP_); 479 *bpp = bp; 480 return 0; 481 } 482 483 /* 484 * Insert the new_bp into the hash table. This consumes the perag reference 485 * taken for the lookup regardless of the result of the insert. 486 */ 487 static int 488 xfs_buf_find_insert( 489 struct xfs_buftarg *btp, 490 struct xfs_buf_cache *bch, 491 struct xfs_perag *pag, 492 struct xfs_buf_map *cmap, 493 struct xfs_buf_map *map, 494 int nmaps, 495 xfs_buf_flags_t flags, 496 struct xfs_buf **bpp) 497 { 498 struct xfs_buf *new_bp; 499 struct xfs_buf *bp; 500 int error; 501 502 error = xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); 503 if (error) 504 goto out_drop_pag; 505 506 /* The new buffer keeps the perag reference until it is freed. */ 507 new_bp->b_pag = pag; 508 509 rcu_read_lock(); 510 bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, 511 &new_bp->b_rhash_head, xfs_buf_hash_params); 512 if (IS_ERR(bp)) { 513 rcu_read_unlock(); 514 error = PTR_ERR(bp); 515 goto out_free_buf; 516 } 517 if (bp && xfs_buf_try_hold(bp)) { 518 /* found an existing buffer */ 519 rcu_read_unlock(); 520 error = xfs_buf_find_lock(bp, flags); 521 if (error) 522 xfs_buf_rele(bp); 523 else 524 *bpp = bp; 525 goto out_free_buf; 526 } 527 rcu_read_unlock(); 528 529 *bpp = new_bp; 530 return 0; 531 532 out_free_buf: 533 xfs_buf_free(new_bp); 534 out_drop_pag: 535 if (pag) 536 xfs_perag_put(pag); 537 return error; 538 } 539 540 static inline struct xfs_perag * 541 xfs_buftarg_get_pag( 542 struct xfs_buftarg *btp, 543 const struct xfs_buf_map *map) 544 { 545 struct xfs_mount *mp = btp->bt_mount; 546 547 if (xfs_buftarg_is_mem(btp)) 548 return NULL; 549 return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn)); 550 } 551 552 static inline struct xfs_buf_cache * 553 xfs_buftarg_buf_cache( 554 struct xfs_buftarg *btp, 555 struct xfs_perag *pag) 556 { 557 if (pag) 558 return &pag->pag_bcache; 559 return btp->bt_cache; 560 } 561 562 /* 563 * Assembles a buffer covering the specified range. The code is optimised for 564 * cache hits, as metadata intensive workloads will see 3 orders of magnitude 565 * more hits than misses. 566 */ 567 int 568 xfs_buf_get_map( 569 struct xfs_buftarg *btp, 570 struct xfs_buf_map *map, 571 int nmaps, 572 xfs_buf_flags_t flags, 573 struct xfs_buf **bpp) 574 { 575 struct xfs_buf_cache *bch; 576 struct xfs_perag *pag; 577 struct xfs_buf *bp = NULL; 578 struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; 579 int error; 580 int i; 581 582 if (flags & XBF_LIVESCAN) 583 cmap.bm_flags |= XBM_LIVESCAN; 584 for (i = 0; i < nmaps; i++) 585 cmap.bm_len += map[i].bm_len; 586 587 error = xfs_buf_map_verify(btp, &cmap); 588 if (error) 589 return error; 590 591 pag = xfs_buftarg_get_pag(btp, &cmap); 592 bch = xfs_buftarg_buf_cache(btp, pag); 593 594 error = xfs_buf_lookup(bch, &cmap, flags, &bp); 595 if (error && error != -ENOENT) 596 goto out_put_perag; 597 598 /* cache hits always outnumber misses by at least 10:1 */ 599 if (unlikely(!bp)) { 600 XFS_STATS_INC(btp->bt_mount, xb_miss_locked); 601 602 if (flags & XBF_INCORE) 603 goto out_put_perag; 604 605 /* xfs_buf_find_insert() consumes the perag reference. */ 606 error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps, 607 flags, &bp); 608 if (error) 609 return error; 610 } else { 611 XFS_STATS_INC(btp->bt_mount, xb_get_locked); 612 if (pag) 613 xfs_perag_put(pag); 614 } 615 616 /* 617 * Clear b_error if this is a lookup from a caller that doesn't expect 618 * valid data to be found in the buffer. 619 */ 620 if (!(flags & XBF_READ)) 621 xfs_buf_ioerror(bp, 0); 622 623 XFS_STATS_INC(btp->bt_mount, xb_get); 624 trace_xfs_buf_get(bp, flags, _RET_IP_); 625 *bpp = bp; 626 return 0; 627 628 out_put_perag: 629 if (pag) 630 xfs_perag_put(pag); 631 return error; 632 } 633 634 int 635 _xfs_buf_read( 636 struct xfs_buf *bp) 637 { 638 ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); 639 640 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); 641 bp->b_flags |= XBF_READ; 642 xfs_buf_submit(bp); 643 return xfs_buf_iowait(bp); 644 } 645 646 /* 647 * Reverify a buffer found in cache without an attached ->b_ops. 648 * 649 * If the caller passed an ops structure and the buffer doesn't have ops 650 * assigned, set the ops and use it to verify the contents. If verification 651 * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is 652 * already in XBF_DONE state on entry. 653 * 654 * Under normal operations, every in-core buffer is verified on read I/O 655 * completion. There are two scenarios that can lead to in-core buffers without 656 * an assigned ->b_ops. The first is during log recovery of buffers on a V4 657 * filesystem, though these buffers are purged at the end of recovery. The 658 * other is online repair, which intentionally reads with a NULL buffer ops to 659 * run several verifiers across an in-core buffer in order to establish buffer 660 * type. If repair can't establish that, the buffer will be left in memory 661 * with NULL buffer ops. 662 */ 663 int 664 xfs_buf_reverify( 665 struct xfs_buf *bp, 666 const struct xfs_buf_ops *ops) 667 { 668 ASSERT(bp->b_flags & XBF_DONE); 669 ASSERT(bp->b_error == 0); 670 671 if (!ops || bp->b_ops) 672 return 0; 673 674 bp->b_ops = ops; 675 bp->b_ops->verify_read(bp); 676 if (bp->b_error) 677 bp->b_flags &= ~XBF_DONE; 678 return bp->b_error; 679 } 680 681 int 682 xfs_buf_read_map( 683 struct xfs_buftarg *target, 684 struct xfs_buf_map *map, 685 int nmaps, 686 xfs_buf_flags_t flags, 687 struct xfs_buf **bpp, 688 const struct xfs_buf_ops *ops, 689 xfs_failaddr_t fa) 690 { 691 struct xfs_buf *bp; 692 int error; 693 694 ASSERT(!(flags & (XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD))); 695 696 flags |= XBF_READ; 697 *bpp = NULL; 698 699 error = xfs_buf_get_map(target, map, nmaps, flags, &bp); 700 if (error) 701 return error; 702 703 trace_xfs_buf_read(bp, flags, _RET_IP_); 704 705 if (!(bp->b_flags & XBF_DONE)) { 706 /* Initiate the buffer read and wait. */ 707 XFS_STATS_INC(target->bt_mount, xb_get_read); 708 bp->b_ops = ops; 709 error = _xfs_buf_read(bp); 710 } else { 711 /* Buffer already read; all we need to do is check it. */ 712 error = xfs_buf_reverify(bp, ops); 713 714 /* We do not want read in the flags */ 715 bp->b_flags &= ~XBF_READ; 716 ASSERT(bp->b_ops != NULL || ops == NULL); 717 } 718 719 /* 720 * If we've had a read error, then the contents of the buffer are 721 * invalid and should not be used. To ensure that a followup read tries 722 * to pull the buffer from disk again, we clear the XBF_DONE flag and 723 * mark the buffer stale. This ensures that anyone who has a current 724 * reference to the buffer will interpret it's contents correctly and 725 * future cache lookups will also treat it as an empty, uninitialised 726 * buffer. 727 */ 728 if (error) { 729 /* 730 * Check against log shutdown for error reporting because 731 * metadata writeback may require a read first and we need to 732 * report errors in metadata writeback until the log is shut 733 * down. High level transaction read functions already check 734 * against mount shutdown, anyway, so we only need to be 735 * concerned about low level IO interactions here. 736 */ 737 if (!xlog_is_shutdown(target->bt_mount->m_log)) 738 xfs_buf_ioerror_alert(bp, fa); 739 740 bp->b_flags &= ~XBF_DONE; 741 xfs_buf_stale(bp); 742 xfs_buf_relse(bp); 743 744 /* bad CRC means corrupted metadata */ 745 if (error == -EFSBADCRC) 746 error = -EFSCORRUPTED; 747 return error; 748 } 749 750 *bpp = bp; 751 return 0; 752 } 753 754 /* 755 * If we are not low on memory then do the readahead in a deadlock 756 * safe manner. 757 */ 758 void 759 xfs_buf_readahead_map( 760 struct xfs_buftarg *target, 761 struct xfs_buf_map *map, 762 int nmaps, 763 const struct xfs_buf_ops *ops) 764 { 765 const xfs_buf_flags_t flags = XBF_READ | XBF_ASYNC | XBF_READ_AHEAD; 766 struct xfs_buf *bp; 767 768 /* 769 * Currently we don't have a good means or justification for performing 770 * xmbuf_map_page asynchronously, so we don't do readahead. 771 */ 772 if (xfs_buftarg_is_mem(target)) 773 return; 774 775 if (xfs_buf_get_map(target, map, nmaps, flags | XBF_TRYLOCK, &bp)) 776 return; 777 trace_xfs_buf_readahead(bp, 0, _RET_IP_); 778 779 if (bp->b_flags & XBF_DONE) { 780 xfs_buf_reverify(bp, ops); 781 xfs_buf_relse(bp); 782 return; 783 } 784 XFS_STATS_INC(target->bt_mount, xb_get_read); 785 bp->b_ops = ops; 786 bp->b_flags &= ~(XBF_WRITE | XBF_DONE); 787 bp->b_flags |= flags; 788 percpu_counter_inc(&target->bt_readahead_count); 789 xfs_buf_submit(bp); 790 } 791 792 /* 793 * Read an uncached buffer from disk. Allocates and returns a locked 794 * buffer containing the disk contents or nothing. Uncached buffers always have 795 * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer 796 * is cached or uncached during fault diagnosis. 797 */ 798 int 799 xfs_buf_read_uncached( 800 struct xfs_buftarg *target, 801 xfs_daddr_t daddr, 802 size_t numblks, 803 struct xfs_buf **bpp, 804 const struct xfs_buf_ops *ops) 805 { 806 struct xfs_buf *bp; 807 int error; 808 809 *bpp = NULL; 810 811 error = xfs_buf_get_uncached(target, numblks, &bp); 812 if (error) 813 return error; 814 815 /* set up the buffer for a read IO */ 816 ASSERT(bp->b_map_count == 1); 817 bp->b_rhash_key = XFS_BUF_DADDR_NULL; 818 bp->b_maps[0].bm_bn = daddr; 819 bp->b_flags |= XBF_READ; 820 bp->b_ops = ops; 821 822 xfs_buf_submit(bp); 823 error = xfs_buf_iowait(bp); 824 if (error) { 825 xfs_buf_relse(bp); 826 return error; 827 } 828 829 *bpp = bp; 830 return 0; 831 } 832 833 int 834 xfs_buf_get_uncached( 835 struct xfs_buftarg *target, 836 size_t numblks, 837 struct xfs_buf **bpp) 838 { 839 int error; 840 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); 841 842 error = xfs_buf_alloc(target, &map, 1, 0, bpp); 843 if (!error) 844 trace_xfs_buf_get_uncached(*bpp, _RET_IP_); 845 return error; 846 } 847 848 /* 849 * Increment reference count on buffer, to hold the buffer concurrently 850 * with another thread which may release (free) the buffer asynchronously. 851 * Must hold the buffer already to call this function. 852 */ 853 void 854 xfs_buf_hold( 855 struct xfs_buf *bp) 856 { 857 trace_xfs_buf_hold(bp, _RET_IP_); 858 859 spin_lock(&bp->b_lock); 860 bp->b_hold++; 861 spin_unlock(&bp->b_lock); 862 } 863 864 static void 865 xfs_buf_rele_uncached( 866 struct xfs_buf *bp) 867 { 868 ASSERT(list_empty(&bp->b_lru)); 869 870 spin_lock(&bp->b_lock); 871 if (--bp->b_hold) { 872 spin_unlock(&bp->b_lock); 873 return; 874 } 875 spin_unlock(&bp->b_lock); 876 xfs_buf_free(bp); 877 } 878 879 static void 880 xfs_buf_rele_cached( 881 struct xfs_buf *bp) 882 { 883 struct xfs_buftarg *btp = bp->b_target; 884 struct xfs_perag *pag = bp->b_pag; 885 struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag); 886 bool freebuf = false; 887 888 trace_xfs_buf_rele(bp, _RET_IP_); 889 890 spin_lock(&bp->b_lock); 891 ASSERT(bp->b_hold >= 1); 892 if (bp->b_hold > 1) { 893 bp->b_hold--; 894 goto out_unlock; 895 } 896 897 /* we are asked to drop the last reference */ 898 if (atomic_read(&bp->b_lru_ref)) { 899 /* 900 * If the buffer is added to the LRU, keep the reference to the 901 * buffer for the LRU and clear the (now stale) dispose list 902 * state flag, else drop the reference. 903 */ 904 if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) 905 bp->b_state &= ~XFS_BSTATE_DISPOSE; 906 else 907 bp->b_hold--; 908 } else { 909 bp->b_hold--; 910 /* 911 * most of the time buffers will already be removed from the 912 * LRU, so optimise that case by checking for the 913 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer 914 * was on was the disposal list 915 */ 916 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { 917 list_lru_del_obj(&btp->bt_lru, &bp->b_lru); 918 } else { 919 ASSERT(list_empty(&bp->b_lru)); 920 } 921 922 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 923 rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head, 924 xfs_buf_hash_params); 925 if (pag) 926 xfs_perag_put(pag); 927 freebuf = true; 928 } 929 930 out_unlock: 931 spin_unlock(&bp->b_lock); 932 933 if (freebuf) 934 xfs_buf_free(bp); 935 } 936 937 /* 938 * Release a hold on the specified buffer. 939 */ 940 void 941 xfs_buf_rele( 942 struct xfs_buf *bp) 943 { 944 trace_xfs_buf_rele(bp, _RET_IP_); 945 if (xfs_buf_is_uncached(bp)) 946 xfs_buf_rele_uncached(bp); 947 else 948 xfs_buf_rele_cached(bp); 949 } 950 951 /* 952 * Lock a buffer object, if it is not already locked. 953 * 954 * If we come across a stale, pinned, locked buffer, we know that we are 955 * being asked to lock a buffer that has been reallocated. Because it is 956 * pinned, we know that the log has not been pushed to disk and hence it 957 * will still be locked. Rather than continuing to have trylock attempts 958 * fail until someone else pushes the log, push it ourselves before 959 * returning. This means that the xfsaild will not get stuck trying 960 * to push on stale inode buffers. 961 */ 962 int 963 xfs_buf_trylock( 964 struct xfs_buf *bp) 965 { 966 int locked; 967 968 locked = down_trylock(&bp->b_sema) == 0; 969 if (locked) 970 trace_xfs_buf_trylock(bp, _RET_IP_); 971 else 972 trace_xfs_buf_trylock_fail(bp, _RET_IP_); 973 return locked; 974 } 975 976 /* 977 * Lock a buffer object. 978 * 979 * If we come across a stale, pinned, locked buffer, we know that we 980 * are being asked to lock a buffer that has been reallocated. Because 981 * it is pinned, we know that the log has not been pushed to disk and 982 * hence it will still be locked. Rather than sleeping until someone 983 * else pushes the log, push it ourselves before trying to get the lock. 984 */ 985 void 986 xfs_buf_lock( 987 struct xfs_buf *bp) 988 { 989 trace_xfs_buf_lock(bp, _RET_IP_); 990 991 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 992 xfs_log_force(bp->b_mount, 0); 993 down(&bp->b_sema); 994 995 trace_xfs_buf_lock_done(bp, _RET_IP_); 996 } 997 998 void 999 xfs_buf_unlock( 1000 struct xfs_buf *bp) 1001 { 1002 ASSERT(xfs_buf_islocked(bp)); 1003 1004 up(&bp->b_sema); 1005 trace_xfs_buf_unlock(bp, _RET_IP_); 1006 } 1007 1008 STATIC void 1009 xfs_buf_wait_unpin( 1010 struct xfs_buf *bp) 1011 { 1012 DECLARE_WAITQUEUE (wait, current); 1013 1014 if (atomic_read(&bp->b_pin_count) == 0) 1015 return; 1016 1017 add_wait_queue(&bp->b_waiters, &wait); 1018 for (;;) { 1019 set_current_state(TASK_UNINTERRUPTIBLE); 1020 if (atomic_read(&bp->b_pin_count) == 0) 1021 break; 1022 io_schedule(); 1023 } 1024 remove_wait_queue(&bp->b_waiters, &wait); 1025 set_current_state(TASK_RUNNING); 1026 } 1027 1028 static void 1029 xfs_buf_ioerror_alert_ratelimited( 1030 struct xfs_buf *bp) 1031 { 1032 static unsigned long lasttime; 1033 static struct xfs_buftarg *lasttarg; 1034 1035 if (bp->b_target != lasttarg || 1036 time_after(jiffies, (lasttime + 5*HZ))) { 1037 lasttime = jiffies; 1038 xfs_buf_ioerror_alert(bp, __this_address); 1039 } 1040 lasttarg = bp->b_target; 1041 } 1042 1043 /* 1044 * Account for this latest trip around the retry handler, and decide if 1045 * we've failed enough times to constitute a permanent failure. 1046 */ 1047 static bool 1048 xfs_buf_ioerror_permanent( 1049 struct xfs_buf *bp, 1050 struct xfs_error_cfg *cfg) 1051 { 1052 struct xfs_mount *mp = bp->b_mount; 1053 1054 if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && 1055 ++bp->b_retries > cfg->max_retries) 1056 return true; 1057 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && 1058 time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) 1059 return true; 1060 1061 /* At unmount we may treat errors differently */ 1062 if (xfs_is_unmounting(mp) && mp->m_fail_unmount) 1063 return true; 1064 1065 return false; 1066 } 1067 1068 /* 1069 * On a sync write or shutdown we just want to stale the buffer and let the 1070 * caller handle the error in bp->b_error appropriately. 1071 * 1072 * If the write was asynchronous then no one will be looking for the error. If 1073 * this is the first failure of this type, clear the error state and write the 1074 * buffer out again. This means we always retry an async write failure at least 1075 * once, but we also need to set the buffer up to behave correctly now for 1076 * repeated failures. 1077 * 1078 * If we get repeated async write failures, then we take action according to the 1079 * error configuration we have been set up to use. 1080 * 1081 * Returns true if this function took care of error handling and the caller must 1082 * not touch the buffer again. Return false if the caller should proceed with 1083 * normal I/O completion handling. 1084 */ 1085 static bool 1086 xfs_buf_ioend_handle_error( 1087 struct xfs_buf *bp) 1088 { 1089 struct xfs_mount *mp = bp->b_mount; 1090 struct xfs_error_cfg *cfg; 1091 struct xfs_log_item *lip; 1092 1093 /* 1094 * If we've already shutdown the journal because of I/O errors, there's 1095 * no point in giving this a retry. 1096 */ 1097 if (xlog_is_shutdown(mp->m_log)) 1098 goto out_stale; 1099 1100 xfs_buf_ioerror_alert_ratelimited(bp); 1101 1102 /* 1103 * We're not going to bother about retrying this during recovery. 1104 * One strike! 1105 */ 1106 if (bp->b_flags & _XBF_LOGRECOVERY) { 1107 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1108 return false; 1109 } 1110 1111 /* 1112 * Synchronous writes will have callers process the error. 1113 */ 1114 if (!(bp->b_flags & XBF_ASYNC)) 1115 goto out_stale; 1116 1117 trace_xfs_buf_iodone_async(bp, _RET_IP_); 1118 1119 cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); 1120 if (bp->b_last_error != bp->b_error || 1121 !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) { 1122 bp->b_last_error = bp->b_error; 1123 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && 1124 !bp->b_first_retry_time) 1125 bp->b_first_retry_time = jiffies; 1126 goto resubmit; 1127 } 1128 1129 /* 1130 * Permanent error - we need to trigger a shutdown if we haven't already 1131 * to indicate that inconsistency will result from this action. 1132 */ 1133 if (xfs_buf_ioerror_permanent(bp, cfg)) { 1134 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1135 goto out_stale; 1136 } 1137 1138 /* Still considered a transient error. Caller will schedule retries. */ 1139 list_for_each_entry(lip, &bp->b_li_list, li_bio_list) { 1140 set_bit(XFS_LI_FAILED, &lip->li_flags); 1141 clear_bit(XFS_LI_FLUSHING, &lip->li_flags); 1142 } 1143 1144 xfs_buf_ioerror(bp, 0); 1145 xfs_buf_relse(bp); 1146 return true; 1147 1148 resubmit: 1149 xfs_buf_ioerror(bp, 0); 1150 bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL); 1151 reinit_completion(&bp->b_iowait); 1152 xfs_buf_submit(bp); 1153 return true; 1154 out_stale: 1155 xfs_buf_stale(bp); 1156 bp->b_flags |= XBF_DONE; 1157 bp->b_flags &= ~XBF_WRITE; 1158 trace_xfs_buf_error_relse(bp, _RET_IP_); 1159 return false; 1160 } 1161 1162 /* returns false if the caller needs to resubmit the I/O, else true */ 1163 static bool 1164 __xfs_buf_ioend( 1165 struct xfs_buf *bp) 1166 { 1167 trace_xfs_buf_iodone(bp, _RET_IP_); 1168 1169 if (bp->b_flags & XBF_READ) { 1170 if (!bp->b_error && is_vmalloc_addr(bp->b_addr)) 1171 invalidate_kernel_vmap_range(bp->b_addr, 1172 roundup(BBTOB(bp->b_length), PAGE_SIZE)); 1173 if (!bp->b_error && bp->b_ops) 1174 bp->b_ops->verify_read(bp); 1175 if (!bp->b_error) 1176 bp->b_flags |= XBF_DONE; 1177 if (bp->b_flags & XBF_READ_AHEAD) 1178 percpu_counter_dec(&bp->b_target->bt_readahead_count); 1179 } else { 1180 if (!bp->b_error) { 1181 bp->b_flags &= ~XBF_WRITE_FAIL; 1182 bp->b_flags |= XBF_DONE; 1183 } 1184 1185 if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp)) 1186 return false; 1187 1188 /* clear the retry state */ 1189 bp->b_last_error = 0; 1190 bp->b_retries = 0; 1191 bp->b_first_retry_time = 0; 1192 1193 /* 1194 * Note that for things like remote attribute buffers, there may 1195 * not be a buffer log item here, so processing the buffer log 1196 * item must remain optional. 1197 */ 1198 if (bp->b_log_item) 1199 xfs_buf_item_done(bp); 1200 1201 if (bp->b_iodone) 1202 bp->b_iodone(bp); 1203 } 1204 1205 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | 1206 _XBF_LOGRECOVERY); 1207 return true; 1208 } 1209 1210 static void 1211 xfs_buf_ioend( 1212 struct xfs_buf *bp) 1213 { 1214 if (!__xfs_buf_ioend(bp)) 1215 return; 1216 if (bp->b_flags & XBF_ASYNC) 1217 xfs_buf_relse(bp); 1218 else 1219 complete(&bp->b_iowait); 1220 } 1221 1222 static void 1223 xfs_buf_ioend_work( 1224 struct work_struct *work) 1225 { 1226 struct xfs_buf *bp = 1227 container_of(work, struct xfs_buf, b_ioend_work); 1228 1229 if (__xfs_buf_ioend(bp)) 1230 xfs_buf_relse(bp); 1231 } 1232 1233 void 1234 __xfs_buf_ioerror( 1235 struct xfs_buf *bp, 1236 int error, 1237 xfs_failaddr_t failaddr) 1238 { 1239 ASSERT(error <= 0 && error >= -1000); 1240 bp->b_error = error; 1241 trace_xfs_buf_ioerror(bp, error, failaddr); 1242 } 1243 1244 void 1245 xfs_buf_ioerror_alert( 1246 struct xfs_buf *bp, 1247 xfs_failaddr_t func) 1248 { 1249 xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error", 1250 "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", 1251 func, (uint64_t)xfs_buf_daddr(bp), 1252 bp->b_length, -bp->b_error); 1253 } 1254 1255 /* 1256 * To simulate an I/O failure, the buffer must be locked and held with at least 1257 * three references. The LRU reference is dropped by the stale call. The buf 1258 * item reference is dropped via ioend processing. The third reference is owned 1259 * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC. 1260 */ 1261 void 1262 xfs_buf_ioend_fail( 1263 struct xfs_buf *bp) 1264 { 1265 bp->b_flags &= ~XBF_DONE; 1266 xfs_buf_stale(bp); 1267 xfs_buf_ioerror(bp, -EIO); 1268 xfs_buf_ioend(bp); 1269 } 1270 1271 int 1272 xfs_bwrite( 1273 struct xfs_buf *bp) 1274 { 1275 int error; 1276 1277 ASSERT(xfs_buf_islocked(bp)); 1278 1279 bp->b_flags |= XBF_WRITE; 1280 bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | 1281 XBF_DONE); 1282 1283 xfs_buf_submit(bp); 1284 error = xfs_buf_iowait(bp); 1285 if (error) 1286 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); 1287 return error; 1288 } 1289 1290 static void 1291 xfs_buf_bio_end_io( 1292 struct bio *bio) 1293 { 1294 struct xfs_buf *bp = bio->bi_private; 1295 1296 if (bio->bi_status) 1297 xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status)); 1298 else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && 1299 XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) 1300 xfs_buf_ioerror(bp, -EIO); 1301 1302 if (bp->b_flags & XBF_ASYNC) { 1303 INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); 1304 queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work); 1305 } else { 1306 complete(&bp->b_iowait); 1307 } 1308 1309 bio_put(bio); 1310 } 1311 1312 static inline blk_opf_t 1313 xfs_buf_bio_op( 1314 struct xfs_buf *bp) 1315 { 1316 blk_opf_t op; 1317 1318 if (bp->b_flags & XBF_WRITE) { 1319 op = REQ_OP_WRITE; 1320 } else { 1321 op = REQ_OP_READ; 1322 if (bp->b_flags & XBF_READ_AHEAD) 1323 op |= REQ_RAHEAD; 1324 } 1325 1326 return op | REQ_META; 1327 } 1328 1329 static void 1330 xfs_buf_submit_bio( 1331 struct xfs_buf *bp) 1332 { 1333 unsigned int len = BBTOB(bp->b_length); 1334 unsigned int nr_vecs = bio_add_max_vecs(bp->b_addr, len); 1335 unsigned int map = 0; 1336 struct blk_plug plug; 1337 struct bio *bio; 1338 1339 bio = bio_alloc(bp->b_target->bt_bdev, nr_vecs, xfs_buf_bio_op(bp), 1340 GFP_NOIO); 1341 if (is_vmalloc_addr(bp->b_addr)) 1342 bio_add_vmalloc(bio, bp->b_addr, len); 1343 else 1344 bio_add_virt_nofail(bio, bp->b_addr, len); 1345 bio->bi_private = bp; 1346 bio->bi_end_io = xfs_buf_bio_end_io; 1347 1348 /* 1349 * If there is more than one map segment, split out a new bio for each 1350 * map except of the last one. The last map is handled by the 1351 * remainder of the original bio outside the loop. 1352 */ 1353 blk_start_plug(&plug); 1354 for (map = 0; map < bp->b_map_count - 1; map++) { 1355 struct bio *split; 1356 1357 split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS, 1358 &fs_bio_set); 1359 split->bi_iter.bi_sector = bp->b_maps[map].bm_bn; 1360 bio_chain(split, bio); 1361 submit_bio(split); 1362 } 1363 bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn; 1364 submit_bio(bio); 1365 blk_finish_plug(&plug); 1366 } 1367 1368 /* 1369 * Wait for I/O completion of a sync buffer and return the I/O error code. 1370 */ 1371 static int 1372 xfs_buf_iowait( 1373 struct xfs_buf *bp) 1374 { 1375 ASSERT(!(bp->b_flags & XBF_ASYNC)); 1376 1377 do { 1378 trace_xfs_buf_iowait(bp, _RET_IP_); 1379 wait_for_completion(&bp->b_iowait); 1380 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1381 } while (!__xfs_buf_ioend(bp)); 1382 1383 return bp->b_error; 1384 } 1385 1386 /* 1387 * Run the write verifier callback function if it exists. If this fails, mark 1388 * the buffer with an error and do not dispatch the I/O. 1389 */ 1390 static bool 1391 xfs_buf_verify_write( 1392 struct xfs_buf *bp) 1393 { 1394 if (bp->b_ops) { 1395 bp->b_ops->verify_write(bp); 1396 if (bp->b_error) 1397 return false; 1398 } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { 1399 /* 1400 * Non-crc filesystems don't attach verifiers during log 1401 * recovery, so don't warn for such filesystems. 1402 */ 1403 if (xfs_has_crc(bp->b_mount)) { 1404 xfs_warn(bp->b_mount, 1405 "%s: no buf ops on daddr 0x%llx len %d", 1406 __func__, xfs_buf_daddr(bp), 1407 bp->b_length); 1408 xfs_hex_dump(bp->b_addr, XFS_CORRUPTION_DUMP_LEN); 1409 dump_stack(); 1410 } 1411 } 1412 1413 return true; 1414 } 1415 1416 /* 1417 * Buffer I/O submission path, read or write. Asynchronous submission transfers 1418 * the buffer lock ownership and the current reference to the IO. It is not 1419 * safe to reference the buffer after a call to this function unless the caller 1420 * holds an additional reference itself. 1421 */ 1422 static void 1423 xfs_buf_submit( 1424 struct xfs_buf *bp) 1425 { 1426 trace_xfs_buf_submit(bp, _RET_IP_); 1427 1428 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1429 1430 /* 1431 * On log shutdown we stale and complete the buffer immediately. We can 1432 * be called to read the superblock before the log has been set up, so 1433 * be careful checking the log state. 1434 * 1435 * Checking the mount shutdown state here can result in the log tail 1436 * moving inappropriately on disk as the log may not yet be shut down. 1437 * i.e. failing this buffer on mount shutdown can remove it from the AIL 1438 * and move the tail of the log forwards without having written this 1439 * buffer to disk. This corrupts the log tail state in memory, and 1440 * because the log may not be shut down yet, it can then be propagated 1441 * to disk before the log is shutdown. Hence we check log shutdown 1442 * state here rather than mount state to avoid corrupting the log tail 1443 * on shutdown. 1444 */ 1445 if (bp->b_mount->m_log && xlog_is_shutdown(bp->b_mount->m_log)) { 1446 xfs_buf_ioend_fail(bp); 1447 return; 1448 } 1449 1450 if (bp->b_flags & XBF_WRITE) 1451 xfs_buf_wait_unpin(bp); 1452 1453 /* 1454 * Make sure we capture only current IO errors rather than stale errors 1455 * left over from previous use of the buffer (e.g. failed readahead). 1456 */ 1457 bp->b_error = 0; 1458 1459 if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) { 1460 xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE); 1461 xfs_buf_ioend(bp); 1462 return; 1463 } 1464 1465 /* In-memory targets are directly mapped, no I/O required. */ 1466 if (xfs_buftarg_is_mem(bp->b_target)) { 1467 xfs_buf_ioend(bp); 1468 return; 1469 } 1470 1471 xfs_buf_submit_bio(bp); 1472 } 1473 1474 /* 1475 * Log a message about and stale a buffer that a caller has decided is corrupt. 1476 * 1477 * This function should be called for the kinds of metadata corruption that 1478 * cannot be detect from a verifier, such as incorrect inter-block relationship 1479 * data. Do /not/ call this function from a verifier function. 1480 * 1481 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will 1482 * be marked stale, but b_error will not be set. The caller is responsible for 1483 * releasing the buffer or fixing it. 1484 */ 1485 void 1486 __xfs_buf_mark_corrupt( 1487 struct xfs_buf *bp, 1488 xfs_failaddr_t fa) 1489 { 1490 ASSERT(bp->b_flags & XBF_DONE); 1491 1492 xfs_buf_corruption_error(bp, fa); 1493 xfs_buf_stale(bp); 1494 } 1495 1496 /* 1497 * Handling of buffer targets (buftargs). 1498 */ 1499 1500 /* 1501 * Wait for any bufs with callbacks that have been submitted but have not yet 1502 * returned. These buffers will have an elevated hold count, so wait on those 1503 * while freeing all the buffers only held by the LRU. 1504 */ 1505 static enum lru_status 1506 xfs_buftarg_drain_rele( 1507 struct list_head *item, 1508 struct list_lru_one *lru, 1509 void *arg) 1510 1511 { 1512 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1513 struct list_head *dispose = arg; 1514 1515 if (!spin_trylock(&bp->b_lock)) 1516 return LRU_SKIP; 1517 if (bp->b_hold > 1) { 1518 /* need to wait, so skip it this pass */ 1519 spin_unlock(&bp->b_lock); 1520 trace_xfs_buf_drain_buftarg(bp, _RET_IP_); 1521 return LRU_SKIP; 1522 } 1523 1524 /* 1525 * clear the LRU reference count so the buffer doesn't get 1526 * ignored in xfs_buf_rele(). 1527 */ 1528 atomic_set(&bp->b_lru_ref, 0); 1529 bp->b_state |= XFS_BSTATE_DISPOSE; 1530 list_lru_isolate_move(lru, item, dispose); 1531 spin_unlock(&bp->b_lock); 1532 return LRU_REMOVED; 1533 } 1534 1535 /* 1536 * Wait for outstanding I/O on the buftarg to complete. 1537 */ 1538 void 1539 xfs_buftarg_wait( 1540 struct xfs_buftarg *btp) 1541 { 1542 /* 1543 * First wait for all in-flight readahead buffers to be released. This is 1544 * critical as new buffers do not make the LRU until they are released. 1545 * 1546 * Next, flush the buffer workqueue to ensure all completion processing 1547 * has finished. Just waiting on buffer locks is not sufficient for 1548 * async IO as the reference count held over IO is not released until 1549 * after the buffer lock is dropped. Hence we need to ensure here that 1550 * all reference counts have been dropped before we start walking the 1551 * LRU list. 1552 */ 1553 while (percpu_counter_sum(&btp->bt_readahead_count)) 1554 delay(100); 1555 flush_workqueue(btp->bt_mount->m_buf_workqueue); 1556 } 1557 1558 void 1559 xfs_buftarg_drain( 1560 struct xfs_buftarg *btp) 1561 { 1562 LIST_HEAD(dispose); 1563 int loop = 0; 1564 bool write_fail = false; 1565 1566 xfs_buftarg_wait(btp); 1567 1568 /* loop until there is nothing left on the lru list. */ 1569 while (list_lru_count(&btp->bt_lru)) { 1570 list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele, 1571 &dispose, LONG_MAX); 1572 1573 while (!list_empty(&dispose)) { 1574 struct xfs_buf *bp; 1575 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1576 list_del_init(&bp->b_lru); 1577 if (bp->b_flags & XBF_WRITE_FAIL) { 1578 write_fail = true; 1579 xfs_buf_alert_ratelimited(bp, 1580 "XFS: Corruption Alert", 1581 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", 1582 (long long)xfs_buf_daddr(bp)); 1583 } 1584 xfs_buf_rele(bp); 1585 } 1586 if (loop++ != 0) 1587 delay(100); 1588 } 1589 1590 /* 1591 * If one or more failed buffers were freed, that means dirty metadata 1592 * was thrown away. This should only ever happen after I/O completion 1593 * handling has elevated I/O error(s) to permanent failures and shuts 1594 * down the journal. 1595 */ 1596 if (write_fail) { 1597 ASSERT(xlog_is_shutdown(btp->bt_mount->m_log)); 1598 xfs_alert(btp->bt_mount, 1599 "Please run xfs_repair to determine the extent of the problem."); 1600 } 1601 } 1602 1603 static enum lru_status 1604 xfs_buftarg_isolate( 1605 struct list_head *item, 1606 struct list_lru_one *lru, 1607 void *arg) 1608 { 1609 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1610 struct list_head *dispose = arg; 1611 1612 /* 1613 * we are inverting the lru lock/bp->b_lock here, so use a trylock. 1614 * If we fail to get the lock, just skip it. 1615 */ 1616 if (!spin_trylock(&bp->b_lock)) 1617 return LRU_SKIP; 1618 /* 1619 * Decrement the b_lru_ref count unless the value is already 1620 * zero. If the value is already zero, we need to reclaim the 1621 * buffer, otherwise it gets another trip through the LRU. 1622 */ 1623 if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) { 1624 spin_unlock(&bp->b_lock); 1625 return LRU_ROTATE; 1626 } 1627 1628 bp->b_state |= XFS_BSTATE_DISPOSE; 1629 list_lru_isolate_move(lru, item, dispose); 1630 spin_unlock(&bp->b_lock); 1631 return LRU_REMOVED; 1632 } 1633 1634 static unsigned long 1635 xfs_buftarg_shrink_scan( 1636 struct shrinker *shrink, 1637 struct shrink_control *sc) 1638 { 1639 struct xfs_buftarg *btp = shrink->private_data; 1640 LIST_HEAD(dispose); 1641 unsigned long freed; 1642 1643 freed = list_lru_shrink_walk(&btp->bt_lru, sc, 1644 xfs_buftarg_isolate, &dispose); 1645 1646 while (!list_empty(&dispose)) { 1647 struct xfs_buf *bp; 1648 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1649 list_del_init(&bp->b_lru); 1650 xfs_buf_rele(bp); 1651 } 1652 1653 return freed; 1654 } 1655 1656 static unsigned long 1657 xfs_buftarg_shrink_count( 1658 struct shrinker *shrink, 1659 struct shrink_control *sc) 1660 { 1661 struct xfs_buftarg *btp = shrink->private_data; 1662 return list_lru_shrink_count(&btp->bt_lru, sc); 1663 } 1664 1665 void 1666 xfs_destroy_buftarg( 1667 struct xfs_buftarg *btp) 1668 { 1669 shrinker_free(btp->bt_shrinker); 1670 ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); 1671 percpu_counter_destroy(&btp->bt_readahead_count); 1672 list_lru_destroy(&btp->bt_lru); 1673 } 1674 1675 void 1676 xfs_free_buftarg( 1677 struct xfs_buftarg *btp) 1678 { 1679 xfs_destroy_buftarg(btp); 1680 fs_put_dax(btp->bt_daxdev, btp->bt_mount); 1681 /* the main block device is closed by kill_block_super */ 1682 if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) 1683 bdev_fput(btp->bt_file); 1684 kfree(btp); 1685 } 1686 1687 /* 1688 * Configure this buffer target for hardware-assisted atomic writes if the 1689 * underlying block device supports is congruent with the filesystem geometry. 1690 */ 1691 static inline void 1692 xfs_configure_buftarg_atomic_writes( 1693 struct xfs_buftarg *btp) 1694 { 1695 struct xfs_mount *mp = btp->bt_mount; 1696 unsigned int min_bytes, max_bytes; 1697 1698 min_bytes = bdev_atomic_write_unit_min_bytes(btp->bt_bdev); 1699 max_bytes = bdev_atomic_write_unit_max_bytes(btp->bt_bdev); 1700 1701 /* 1702 * Ignore atomic write geometry that is nonsense or doesn't even cover 1703 * a single fsblock. 1704 */ 1705 if (min_bytes > max_bytes || 1706 min_bytes > mp->m_sb.sb_blocksize || 1707 max_bytes < mp->m_sb.sb_blocksize) { 1708 min_bytes = 0; 1709 max_bytes = 0; 1710 } 1711 1712 btp->bt_awu_min = min_bytes; 1713 btp->bt_awu_max = max_bytes; 1714 } 1715 1716 /* Configure a buffer target that abstracts a block device. */ 1717 int 1718 xfs_configure_buftarg( 1719 struct xfs_buftarg *btp, 1720 unsigned int sectorsize, 1721 xfs_rfsblock_t nr_blocks) 1722 { 1723 struct xfs_mount *mp = btp->bt_mount; 1724 1725 if (btp->bt_bdev) { 1726 int error; 1727 1728 error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); 1729 if (error) { 1730 xfs_warn(mp, 1731 "Cannot use blocksize %u on device %pg, err %d", 1732 sectorsize, btp->bt_bdev, error); 1733 return -EINVAL; 1734 } 1735 1736 if (bdev_can_atomic_write(btp->bt_bdev)) 1737 xfs_configure_buftarg_atomic_writes(btp); 1738 } 1739 1740 btp->bt_meta_sectorsize = sectorsize; 1741 btp->bt_meta_sectormask = sectorsize - 1; 1742 /* m_blkbb_log is not set up yet */ 1743 btp->bt_nr_sectors = nr_blocks << (mp->m_sb.sb_blocklog - BBSHIFT); 1744 return 0; 1745 } 1746 1747 int 1748 xfs_init_buftarg( 1749 struct xfs_buftarg *btp, 1750 size_t logical_sectorsize, 1751 const char *descr) 1752 { 1753 /* The maximum size of the buftarg is only known once the sb is read. */ 1754 btp->bt_nr_sectors = (xfs_daddr_t)-1; 1755 1756 /* Set up device logical sector size mask */ 1757 btp->bt_logical_sectorsize = logical_sectorsize; 1758 btp->bt_logical_sectormask = logical_sectorsize - 1; 1759 1760 /* 1761 * Buffer IO error rate limiting. Limit it to no more than 10 messages 1762 * per 30 seconds so as to not spam logs too much on repeated errors. 1763 */ 1764 ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, 1765 DEFAULT_RATELIMIT_BURST); 1766 1767 if (list_lru_init(&btp->bt_lru)) 1768 return -ENOMEM; 1769 if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL)) 1770 goto out_destroy_lru; 1771 1772 btp->bt_shrinker = 1773 shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr); 1774 if (!btp->bt_shrinker) 1775 goto out_destroy_io_count; 1776 btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; 1777 btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; 1778 btp->bt_shrinker->private_data = btp; 1779 shrinker_register(btp->bt_shrinker); 1780 return 0; 1781 1782 out_destroy_io_count: 1783 percpu_counter_destroy(&btp->bt_readahead_count); 1784 out_destroy_lru: 1785 list_lru_destroy(&btp->bt_lru); 1786 return -ENOMEM; 1787 } 1788 1789 struct xfs_buftarg * 1790 xfs_alloc_buftarg( 1791 struct xfs_mount *mp, 1792 struct file *bdev_file) 1793 { 1794 struct xfs_buftarg *btp; 1795 const struct dax_holder_operations *ops = NULL; 1796 int error; 1797 1798 1799 #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) 1800 ops = &xfs_dax_holder_operations; 1801 #endif 1802 btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL); 1803 1804 btp->bt_mount = mp; 1805 btp->bt_file = bdev_file; 1806 btp->bt_bdev = file_bdev(bdev_file); 1807 btp->bt_dev = btp->bt_bdev->bd_dev; 1808 btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, 1809 mp, ops); 1810 1811 /* 1812 * Flush and invalidate all devices' pagecaches before reading any 1813 * metadata because XFS doesn't use the bdev pagecache. 1814 */ 1815 error = sync_blockdev(btp->bt_bdev); 1816 if (error) 1817 goto error_free; 1818 1819 /* 1820 * When allocating the buftargs we have not yet read the super block and 1821 * thus don't know the file system sector size yet. 1822 */ 1823 btp->bt_meta_sectorsize = bdev_logical_block_size(btp->bt_bdev); 1824 btp->bt_meta_sectormask = btp->bt_meta_sectorsize - 1; 1825 1826 error = xfs_init_buftarg(btp, btp->bt_meta_sectorsize, 1827 mp->m_super->s_id); 1828 if (error) 1829 goto error_free; 1830 1831 return btp; 1832 1833 error_free: 1834 kfree(btp); 1835 return ERR_PTR(error); 1836 } 1837 1838 static inline void 1839 xfs_buf_list_del( 1840 struct xfs_buf *bp) 1841 { 1842 list_del_init(&bp->b_list); 1843 wake_up_var(&bp->b_list); 1844 } 1845 1846 /* 1847 * Cancel a delayed write list. 1848 * 1849 * Remove each buffer from the list, clear the delwri queue flag and drop the 1850 * associated buffer reference. 1851 */ 1852 void 1853 xfs_buf_delwri_cancel( 1854 struct list_head *list) 1855 { 1856 struct xfs_buf *bp; 1857 1858 while (!list_empty(list)) { 1859 bp = list_first_entry(list, struct xfs_buf, b_list); 1860 1861 xfs_buf_lock(bp); 1862 bp->b_flags &= ~_XBF_DELWRI_Q; 1863 xfs_buf_list_del(bp); 1864 xfs_buf_relse(bp); 1865 } 1866 } 1867 1868 /* 1869 * Add a buffer to the delayed write list. 1870 * 1871 * This queues a buffer for writeout if it hasn't already been. Note that 1872 * neither this routine nor the buffer list submission functions perform 1873 * any internal synchronization. It is expected that the lists are thread-local 1874 * to the callers. 1875 * 1876 * Returns true if we queued up the buffer, or false if it already had 1877 * been on the buffer list. 1878 */ 1879 bool 1880 xfs_buf_delwri_queue( 1881 struct xfs_buf *bp, 1882 struct list_head *list) 1883 { 1884 ASSERT(xfs_buf_islocked(bp)); 1885 ASSERT(!(bp->b_flags & XBF_READ)); 1886 1887 /* 1888 * If the buffer is already marked delwri it already is queued up 1889 * by someone else for imediate writeout. Just ignore it in that 1890 * case. 1891 */ 1892 if (bp->b_flags & _XBF_DELWRI_Q) { 1893 trace_xfs_buf_delwri_queued(bp, _RET_IP_); 1894 return false; 1895 } 1896 1897 trace_xfs_buf_delwri_queue(bp, _RET_IP_); 1898 1899 /* 1900 * If a buffer gets written out synchronously or marked stale while it 1901 * is on a delwri list we lazily remove it. To do this, the other party 1902 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. 1903 * It remains referenced and on the list. In a rare corner case it 1904 * might get readded to a delwri list after the synchronous writeout, in 1905 * which case we need just need to re-add the flag here. 1906 */ 1907 bp->b_flags |= _XBF_DELWRI_Q; 1908 if (list_empty(&bp->b_list)) { 1909 xfs_buf_hold(bp); 1910 list_add_tail(&bp->b_list, list); 1911 } 1912 1913 return true; 1914 } 1915 1916 /* 1917 * Queue a buffer to this delwri list as part of a data integrity operation. 1918 * If the buffer is on any other delwri list, we'll wait for that to clear 1919 * so that the caller can submit the buffer for IO and wait for the result. 1920 * Callers must ensure the buffer is not already on the list. 1921 */ 1922 void 1923 xfs_buf_delwri_queue_here( 1924 struct xfs_buf *bp, 1925 struct list_head *buffer_list) 1926 { 1927 /* 1928 * We need this buffer to end up on the /caller's/ delwri list, not any 1929 * old list. This can happen if the buffer is marked stale (which 1930 * clears DELWRI_Q) after the AIL queues the buffer to its list but 1931 * before the AIL has a chance to submit the list. 1932 */ 1933 while (!list_empty(&bp->b_list)) { 1934 xfs_buf_unlock(bp); 1935 wait_var_event(&bp->b_list, list_empty(&bp->b_list)); 1936 xfs_buf_lock(bp); 1937 } 1938 1939 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1940 1941 xfs_buf_delwri_queue(bp, buffer_list); 1942 } 1943 1944 /* 1945 * Compare function is more complex than it needs to be because 1946 * the return value is only 32 bits and we are doing comparisons 1947 * on 64 bit values 1948 */ 1949 static int 1950 xfs_buf_cmp( 1951 void *priv, 1952 const struct list_head *a, 1953 const struct list_head *b) 1954 { 1955 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); 1956 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); 1957 xfs_daddr_t diff; 1958 1959 diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; 1960 if (diff < 0) 1961 return -1; 1962 if (diff > 0) 1963 return 1; 1964 return 0; 1965 } 1966 1967 static bool 1968 xfs_buf_delwri_submit_prep( 1969 struct xfs_buf *bp) 1970 { 1971 /* 1972 * Someone else might have written the buffer synchronously or marked it 1973 * stale in the meantime. In that case only the _XBF_DELWRI_Q flag got 1974 * cleared, and we have to drop the reference and remove it from the 1975 * list here. 1976 */ 1977 if (!(bp->b_flags & _XBF_DELWRI_Q)) { 1978 xfs_buf_list_del(bp); 1979 xfs_buf_relse(bp); 1980 return false; 1981 } 1982 1983 trace_xfs_buf_delwri_split(bp, _RET_IP_); 1984 bp->b_flags &= ~_XBF_DELWRI_Q; 1985 bp->b_flags |= XBF_WRITE; 1986 return true; 1987 } 1988 1989 /* 1990 * Write out a buffer list asynchronously. 1991 * 1992 * This will take the @buffer_list, write all non-locked and non-pinned buffers 1993 * out and not wait for I/O completion on any of the buffers. This interface 1994 * is only safely useable for callers that can track I/O completion by higher 1995 * level means, e.g. AIL pushing as the @buffer_list is consumed in this 1996 * function. 1997 * 1998 * Note: this function will skip buffers it would block on, and in doing so 1999 * leaves them on @buffer_list so they can be retried on a later pass. As such, 2000 * it is up to the caller to ensure that the buffer list is fully submitted or 2001 * cancelled appropriately when they are finished with the list. Failure to 2002 * cancel or resubmit the list until it is empty will result in leaked buffers 2003 * at unmount time. 2004 */ 2005 int 2006 xfs_buf_delwri_submit_nowait( 2007 struct list_head *buffer_list) 2008 { 2009 struct xfs_buf *bp, *n; 2010 int pinned = 0; 2011 struct blk_plug plug; 2012 2013 list_sort(NULL, buffer_list, xfs_buf_cmp); 2014 2015 blk_start_plug(&plug); 2016 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 2017 if (!xfs_buf_trylock(bp)) 2018 continue; 2019 if (xfs_buf_ispinned(bp)) { 2020 xfs_buf_unlock(bp); 2021 pinned++; 2022 continue; 2023 } 2024 if (!xfs_buf_delwri_submit_prep(bp)) 2025 continue; 2026 bp->b_flags |= XBF_ASYNC; 2027 xfs_buf_list_del(bp); 2028 xfs_buf_submit(bp); 2029 } 2030 blk_finish_plug(&plug); 2031 2032 return pinned; 2033 } 2034 2035 /* 2036 * Write out a buffer list synchronously. 2037 * 2038 * This will take the @buffer_list, write all buffers out and wait for I/O 2039 * completion on all of the buffers. @buffer_list is consumed by the function, 2040 * so callers must have some other way of tracking buffers if they require such 2041 * functionality. 2042 */ 2043 int 2044 xfs_buf_delwri_submit( 2045 struct list_head *buffer_list) 2046 { 2047 LIST_HEAD (wait_list); 2048 int error = 0, error2; 2049 struct xfs_buf *bp, *n; 2050 struct blk_plug plug; 2051 2052 list_sort(NULL, buffer_list, xfs_buf_cmp); 2053 2054 blk_start_plug(&plug); 2055 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 2056 xfs_buf_lock(bp); 2057 if (!xfs_buf_delwri_submit_prep(bp)) 2058 continue; 2059 bp->b_flags &= ~XBF_ASYNC; 2060 list_move_tail(&bp->b_list, &wait_list); 2061 xfs_buf_submit(bp); 2062 } 2063 blk_finish_plug(&plug); 2064 2065 /* Wait for IO to complete. */ 2066 while (!list_empty(&wait_list)) { 2067 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 2068 2069 xfs_buf_list_del(bp); 2070 2071 /* 2072 * Wait on the locked buffer, check for errors and unlock and 2073 * release the delwri queue reference. 2074 */ 2075 error2 = xfs_buf_iowait(bp); 2076 xfs_buf_relse(bp); 2077 if (!error) 2078 error = error2; 2079 } 2080 2081 return error; 2082 } 2083 2084 void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) 2085 { 2086 /* 2087 * Set the lru reference count to 0 based on the error injection tag. 2088 * This allows userspace to disrupt buffer caching for debug/testing 2089 * purposes. 2090 */ 2091 if (XFS_TEST_ERROR(bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) 2092 lru_ref = 0; 2093 2094 atomic_set(&bp->b_lru_ref, lru_ref); 2095 } 2096 2097 /* 2098 * Verify an on-disk magic value against the magic value specified in the 2099 * verifier structure. The verifier magic is in disk byte order so the caller is 2100 * expected to pass the value directly from disk. 2101 */ 2102 bool 2103 xfs_verify_magic( 2104 struct xfs_buf *bp, 2105 __be32 dmagic) 2106 { 2107 struct xfs_mount *mp = bp->b_mount; 2108 int idx; 2109 2110 idx = xfs_has_crc(mp); 2111 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) 2112 return false; 2113 return dmagic == bp->b_ops->magic[idx]; 2114 } 2115 /* 2116 * Verify an on-disk magic value against the magic value specified in the 2117 * verifier structure. The verifier magic is in disk byte order so the caller is 2118 * expected to pass the value directly from disk. 2119 */ 2120 bool 2121 xfs_verify_magic16( 2122 struct xfs_buf *bp, 2123 __be16 dmagic) 2124 { 2125 struct xfs_mount *mp = bp->b_mount; 2126 int idx; 2127 2128 idx = xfs_has_crc(mp); 2129 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) 2130 return false; 2131 return dmagic == bp->b_ops->magic16[idx]; 2132 } 2133