1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include <linux/backing-dev.h> 8 #include <linux/dax.h> 9 10 #include "xfs_shared.h" 11 #include "xfs_format.h" 12 #include "xfs_log_format.h" 13 #include "xfs_trans_resv.h" 14 #include "xfs_mount.h" 15 #include "xfs_trace.h" 16 #include "xfs_log.h" 17 #include "xfs_log_recover.h" 18 #include "xfs_log_priv.h" 19 #include "xfs_trans.h" 20 #include "xfs_buf_item.h" 21 #include "xfs_errortag.h" 22 #include "xfs_error.h" 23 #include "xfs_ag.h" 24 #include "xfs_buf_mem.h" 25 #include "xfs_notify_failure.h" 26 27 struct kmem_cache *xfs_buf_cache; 28 29 /* 30 * Locking orders 31 * 32 * xfs_buf_stale: 33 * b_sema (caller holds) 34 * b_lock 35 * lru_lock 36 * 37 * xfs_buf_rele: 38 * b_lock 39 * lru_lock 40 * 41 * xfs_buftarg_drain_rele 42 * lru_lock 43 * b_lock (trylock due to inversion) 44 * 45 * xfs_buftarg_isolate 46 * lru_lock 47 * b_lock (trylock due to inversion) 48 */ 49 50 static void xfs_buf_submit(struct xfs_buf *bp); 51 static int xfs_buf_iowait(struct xfs_buf *bp); 52 53 static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) 54 { 55 return bp->b_rhash_key == XFS_BUF_DADDR_NULL; 56 } 57 58 /* 59 * When we mark a buffer stale, we remove the buffer from the LRU and clear the 60 * b_lru_ref count so that the buffer is freed immediately when the buffer 61 * reference count falls to zero. If the buffer is already on the LRU, we need 62 * to remove the reference that LRU holds on the buffer. 63 * 64 * This prevents build-up of stale buffers on the LRU. 65 */ 66 void 67 xfs_buf_stale( 68 struct xfs_buf *bp) 69 { 70 ASSERT(xfs_buf_islocked(bp)); 71 72 bp->b_flags |= XBF_STALE; 73 74 /* 75 * Clear the delwri status so that a delwri queue walker will not 76 * flush this buffer to disk now that it is stale. The delwri queue has 77 * a reference to the buffer, so this is safe to do. 
78 */ 79 bp->b_flags &= ~_XBF_DELWRI_Q; 80 81 spin_lock(&bp->b_lock); 82 atomic_set(&bp->b_lru_ref, 0); 83 if (!(bp->b_state & XFS_BSTATE_DISPOSE) && 84 (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) 85 bp->b_hold--; 86 87 ASSERT(bp->b_hold >= 1); 88 spin_unlock(&bp->b_lock); 89 } 90 91 static void 92 xfs_buf_free_callback( 93 struct callback_head *cb) 94 { 95 struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); 96 97 if (bp->b_maps != &bp->__b_map) 98 kfree(bp->b_maps); 99 kmem_cache_free(xfs_buf_cache, bp); 100 } 101 102 static void 103 xfs_buf_free( 104 struct xfs_buf *bp) 105 { 106 unsigned int size = BBTOB(bp->b_length); 107 108 trace_xfs_buf_free(bp, _RET_IP_); 109 110 ASSERT(list_empty(&bp->b_lru)); 111 112 if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE) 113 mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT)); 114 115 if (is_vmalloc_addr(bp->b_addr)) 116 vfree(bp->b_addr); 117 else if (bp->b_flags & _XBF_KMEM) 118 kfree(bp->b_addr); 119 else 120 folio_put(virt_to_folio(bp->b_addr)); 121 122 call_rcu(&bp->b_rcu, xfs_buf_free_callback); 123 } 124 125 static int 126 xfs_buf_alloc_kmem( 127 struct xfs_buf *bp, 128 size_t size, 129 gfp_t gfp_mask) 130 { 131 ASSERT(is_power_of_2(size)); 132 ASSERT(size < PAGE_SIZE); 133 134 bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL); 135 if (!bp->b_addr) 136 return -ENOMEM; 137 138 /* 139 * Slab guarantees that we get back naturally aligned allocations for 140 * power of two sizes. Keep this check as the canary in the coal mine 141 * if anything changes in slab. 142 */ 143 if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) { 144 kfree(bp->b_addr); 145 bp->b_addr = NULL; 146 return -ENOMEM; 147 } 148 bp->b_flags |= _XBF_KMEM; 149 trace_xfs_buf_backing_kmem(bp, _RET_IP_); 150 return 0; 151 } 152 153 /* 154 * Allocate backing memory for a buffer. 155 * 156 * For tmpfs-backed buffers used by in-memory btrees this directly maps the 157 * tmpfs page cache folios. 
 *
 * For real file system buffers there are three different kinds of backing
 * memory:
 *
 * The first type backs the buffer by a kmalloc allocation.  This is done for
 * less than PAGE_SIZE allocations to avoid wasting memory.
 *
 * The second type is a single folio buffer - this may be a high order folio or
 * just a single page sized folio, but either way they get treated the same way
 * by the rest of the code - the buffer memory spans a single contiguous memory
 * region that we don't have to map and unmap to access the data directly.
 *
 * The third type of buffer is the vmalloc()d buffer. This provides the buffer
 * with the required contiguous memory region but backed by discontiguous
 * physical pages.
 *
 * Returns 0 with bp->b_addr set, or a negative errno.
 */
static int
xfs_buf_alloc_backing_mem(
	struct xfs_buf	*bp,
	xfs_buf_flags_t	flags)
{
	size_t		size = BBTOB(bp->b_length);
	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
	struct folio	*folio;

	/* In-memory buffer targets map their backing store separately. */
	if (xfs_buftarg_is_mem(bp->b_target))
		return xmbuf_map_backing_mem(bp);

	/* Assure zeroed buffer for non-read cases. */
	if (!(flags & XBF_READ))
		gfp_mask |= __GFP_ZERO;

	/* Readahead is best effort, so don't let the allocator retry. */
	if (flags & XBF_READ_AHEAD)
		gfp_mask |= __GFP_NORETRY;

	/*
	 * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that
	 * is properly aligned.  The slab allocator now guarantees an aligned
	 * allocation for all power of two sizes, which matches most of the
	 * smaller than PAGE_SIZE buffers used by XFS.
	 */
	if (size < PAGE_SIZE && is_power_of_2(size))
		return xfs_buf_alloc_kmem(bp, size, gfp_mask);

	/*
	 * Don't bother with the retry loop for single PAGE allocations: vmalloc
	 * won't do any better.
	 */
	if (size <= PAGE_SIZE)
		gfp_mask |= __GFP_NOFAIL;

	/*
	 * Optimistically attempt a single high order folio allocation for
	 * larger than PAGE_SIZE buffers.
	 *
	 * Allocating a high order folio makes the assumption that buffers are a
	 * power-of-2 size, matching the power-of-2 folios sizes available.
	 *
	 * The exception here are user xattr data buffers, which can be arbitrarily
	 * sized up to 64kB plus structure metadata, skip straight to the vmalloc
	 * path for them instead of wasting memory here.
	 */
	if (size > PAGE_SIZE) {
		if (!is_power_of_2(size))
			goto fallback;
		/*
		 * Note that the mask modified here is also the one used by
		 * the vmalloc fallback loop below.
		 */
		gfp_mask &= ~__GFP_DIRECT_RECLAIM;
		gfp_mask |= __GFP_NORETRY;
	}
	folio = folio_alloc(gfp_mask, get_order(size));
	if (!folio) {
		/* A single page or less has no vmalloc fallback. */
		if (size <= PAGE_SIZE)
			return -ENOMEM;
		trace_xfs_buf_backing_fallback(bp, _RET_IP_);
		goto fallback;
	}
	bp->b_addr = folio_address(folio);
	trace_xfs_buf_backing_folio(bp, _RET_IP_);
	return 0;

fallback:
	/*
	 * Keep retrying the vmalloc allocation until it succeeds; only
	 * readahead is allowed to give up with -ENOMEM.
	 */
	for (;;) {
		bp->b_addr = __vmalloc(size, gfp_mask);
		if (bp->b_addr)
			break;
		if (flags & XBF_READ_AHEAD)
			return -ENOMEM;
		XFS_STATS_INC(bp->b_mount, xb_page_retries);
		memalloc_retry_wait(gfp_mask);
	}

	trace_xfs_buf_backing_vmalloc(bp, _RET_IP_);
	return 0;
}

/*
 * Allocate and initialise a new buffer covering the @nmaps extents in @map,
 * including its backing memory.
 *
 * On success the buffer is returned in @bpp with a single hold and with its
 * semaphore already held by the caller.  On failure *bpp is NULL and a
 * negative errno is returned.
 */
static int
xfs_buf_alloc(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp)
{
	struct xfs_buf		*bp;
	int			error;
	int			i;

	*bpp = NULL;
	bp = kmem_cache_zalloc(xfs_buf_cache,
			GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);

	/*
	 * We don't want certain flags to appear in b_flags unless they are
	 * specifically set by later operations on the buffer.
	 */
	flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);

	/*
	 * A new buffer is held and locked by the owner.  This ensures that the
	 * buffer is owned by the caller and racing RCU lookups right after
	 * inserting into the hash table are safe (and will have to wait for
	 * the unlock to do anything non-trivial).
	 */
	bp->b_hold = 1;
	sema_init(&bp->b_sema, 0); /* held, no waiters */

	spin_lock_init(&bp->b_lock);
	atomic_set(&bp->b_lru_ref, 1);
	init_completion(&bp->b_iowait);
	INIT_LIST_HEAD(&bp->b_lru);
	INIT_LIST_HEAD(&bp->b_list);
	INIT_LIST_HEAD(&bp->b_li_list);
	bp->b_target = target;
	bp->b_mount = target->bt_mount;
	bp->b_flags = flags;
	bp->b_rhash_key = map[0].bm_bn;
	bp->b_length = 0;
	bp->b_map_count = nmaps;
	/* Single-extent buffers use the map embedded in the buffer. */
	if (nmaps == 1)
		bp->b_maps = &bp->__b_map;
	else
		bp->b_maps = kcalloc(nmaps, sizeof(struct xfs_buf_map),
				GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
	/* Copy the extents and total up the buffer length. */
	for (i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bp->b_length += map[i].bm_len;
	}

	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(bp->b_mount, xb_create);
	trace_xfs_buf_init(bp, _RET_IP_);

	/*
	 * NOTE(review): XBF_READ_AHEAD was cleared from flags above, so the
	 * XBF_READ_AHEAD tests in xfs_buf_alloc_backing_mem() cannot fire for
	 * buffers allocated through here - confirm whether the unmasked flags
	 * were meant to be passed.
	 */
	error = xfs_buf_alloc_backing_mem(bp, flags);
	if (error) {
		xfs_buf_free(bp);
		return error;
	}

	*bpp = bp;
	return 0;
}

/*
 *	Finding and Reading Buffers
 */

/*
 * rhashtable comparison callback.  Returns 0 when @obj matches both the block
 * number and the length in the lookup key, non-zero otherwise.
 */
static int
_xfs_buf_obj_cmp(
	struct rhashtable_compare_arg	*arg,
	const void			*obj)
{
	const struct xfs_buf_map	*map = arg->key;
	const struct xfs_buf		*bp = obj;

	/*
	 * The key hashing in the lookup path depends on the key being the
	 * first element of the compare_arg, make sure to assert this.
	 */
	BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);

	if (bp->b_rhash_key != map->bm_bn)
		return 1;

	if (unlikely(bp->b_length != map->bm_len)) {
		/*
		 * found a block number match. If the range doesn't
		 * match, the only way this is allowed is if the buffer
		 * in the cache is stale and the transaction that made
		 * it stale has not yet committed. i.e. we are
		 * reallocating a busy extent. Skip this buffer and
		 * continue searching for an exact match.
		 *
		 * Note: If we're scanning for incore buffers to stale, don't
		 * complain if we find non-stale buffers.
		 */
		if (!(map->bm_flags & XBM_LIVESCAN))
			ASSERT(bp->b_flags & XBF_STALE);
		return 1;
	}
	return 0;
}

/* Hash table layout: buffers are keyed by their b_rhash_key disk address. */
static const struct rhashtable_params xfs_buf_hash_params = {
	.min_size		= 32,	/* empty AGs have minimal footprint */
	.nelem_hint		= 16,
	.key_len		= sizeof(xfs_daddr_t),
	.key_offset		= offsetof(struct xfs_buf, b_rhash_key),
	.head_offset		= offsetof(struct xfs_buf, b_rhash_head),
	.automatic_shrinking	= true,
	.obj_cmpfn		= _xfs_buf_obj_cmp,
};

/* Initialise the hash table of a buffer cache; returns 0 or an errno. */
int
xfs_buf_cache_init(
	struct xfs_buf_cache	*bch)
{
	return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
}

/* Tear down the hash table of a buffer cache. */
void
xfs_buf_cache_destroy(
	struct xfs_buf_cache	*bch)
{
	rhashtable_destroy(&bch->bc_hash);
}

/*
 * Sanity check a lookup key against the buffer target.  Returns 0 if the
 * start address lies within the filesystem, -EFSCORRUPTED otherwise.
 */
static int
xfs_buf_map_verify(
	struct xfs_buftarg	*btp,
	struct xfs_buf_map	*map)
{
	xfs_daddr_t		eofs;

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
	ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));

	/*
	 * Corrupted block numbers can get through to here, unfortunately, so we
	 * have to check that the buffer falls within the filesystem bounds.
	 */
	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
	if (map->bm_bn < 0 || map->bm_bn >= eofs) {
		xfs_alert(btp->bt_mount,
			  "%s: daddr 0x%llx out of range, EOFS 0x%llx",
			  __func__, map->bm_bn, eofs);
		WARN_ON(1);
		return -EFSCORRUPTED;
	}
	return 0;
}

/*
 * Lock a buffer that was just found in the cache.
 *
 * With XBF_TRYLOCK the lock attempt does not block and -EAGAIN is returned if
 * it fails.  If the buffer turns out to be stale, an XBF_LIVESCAN lookup
 * unlocks it again and returns -ENOENT; any other lookup resets the buffer
 * state for reuse.  Returns 0 with the buffer locked otherwise.
 */
static int
xfs_buf_find_lock(
	struct xfs_buf          *bp,
	xfs_buf_flags_t		flags)
{
	if (flags & XBF_TRYLOCK) {
		if (!xfs_buf_trylock(bp)) {
			XFS_STATS_INC(bp->b_mount, xb_busy_locked);
			return -EAGAIN;
		}
	} else {
		xfs_buf_lock(bp);
		XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
	if (bp->b_flags & XBF_STALE) {
		if (flags & XBF_LIVESCAN) {
			xfs_buf_unlock(bp);
			return -ENOENT;
		}
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		/* Drop every flag except the kmalloc backing marker. */
		bp->b_flags &= _XBF_KMEM;
		bp->b_ops = NULL;
	}
	return 0;
}

/*
 * Take an additional hold on a buffer unless its hold count has already
 * dropped to zero.  Returns true if the hold was taken.
 */
static bool
xfs_buf_try_hold(
	struct xfs_buf		*bp)
{
	spin_lock(&bp->b_lock);
	if (bp->b_hold == 0) {
		spin_unlock(&bp->b_lock);
		return false;
	}
	bp->b_hold++;
	spin_unlock(&bp->b_lock);
	return true;
}

/*
 * Look up a buffer in the cache under RCU protection.
 *
 * Returns -ENOENT if there is no matching buffer or a hold could not be
 * taken on it.  Otherwise the buffer is locked according to @flags; if that
 * fails the hold is dropped and the error returned.  On success the held and
 * locked buffer is returned in @bpp.
 */
static inline int
xfs_buf_lookup(
	struct xfs_buf_cache	*bch,
	struct xfs_buf_map	*map,
	xfs_buf_flags_t		flags,
	struct xfs_buf		**bpp)
{
	struct xfs_buf          *bp;
	int			error;

	rcu_read_lock();
	bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
	if (!bp || !xfs_buf_try_hold(bp)) {
		rcu_read_unlock();
		return -ENOENT;
	}
	rcu_read_unlock();

	error = xfs_buf_find_lock(bp, flags);
	if (error) {
		xfs_buf_rele(bp);
		return error;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	*bpp = bp;
	return 0;
}

/*
 * Insert the new_bp into the hash table.
This consumes the perag reference 487 * taken for the lookup regardless of the result of the insert. 488 */ 489 static int 490 xfs_buf_find_insert( 491 struct xfs_buftarg *btp, 492 struct xfs_buf_cache *bch, 493 struct xfs_perag *pag, 494 struct xfs_buf_map *cmap, 495 struct xfs_buf_map *map, 496 int nmaps, 497 xfs_buf_flags_t flags, 498 struct xfs_buf **bpp) 499 { 500 struct xfs_buf *new_bp; 501 struct xfs_buf *bp; 502 int error; 503 504 error = xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); 505 if (error) 506 goto out_drop_pag; 507 508 /* The new buffer keeps the perag reference until it is freed. */ 509 new_bp->b_pag = pag; 510 511 rcu_read_lock(); 512 bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, 513 &new_bp->b_rhash_head, xfs_buf_hash_params); 514 if (IS_ERR(bp)) { 515 rcu_read_unlock(); 516 error = PTR_ERR(bp); 517 goto out_free_buf; 518 } 519 if (bp && xfs_buf_try_hold(bp)) { 520 /* found an existing buffer */ 521 rcu_read_unlock(); 522 error = xfs_buf_find_lock(bp, flags); 523 if (error) 524 xfs_buf_rele(bp); 525 else 526 *bpp = bp; 527 goto out_free_buf; 528 } 529 rcu_read_unlock(); 530 531 *bpp = new_bp; 532 return 0; 533 534 out_free_buf: 535 xfs_buf_free(new_bp); 536 out_drop_pag: 537 if (pag) 538 xfs_perag_put(pag); 539 return error; 540 } 541 542 static inline struct xfs_perag * 543 xfs_buftarg_get_pag( 544 struct xfs_buftarg *btp, 545 const struct xfs_buf_map *map) 546 { 547 struct xfs_mount *mp = btp->bt_mount; 548 549 if (xfs_buftarg_is_mem(btp)) 550 return NULL; 551 return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn)); 552 } 553 554 static inline struct xfs_buf_cache * 555 xfs_buftarg_buf_cache( 556 struct xfs_buftarg *btp, 557 struct xfs_perag *pag) 558 { 559 if (pag) 560 return &pag->pag_bcache; 561 return btp->bt_cache; 562 } 563 564 /* 565 * Assembles a buffer covering the specified range. The code is optimised for 566 * cache hits, as metadata intensive workloads will see 3 orders of magnitude 567 * more hits than misses. 
568 */ 569 int 570 xfs_buf_get_map( 571 struct xfs_buftarg *btp, 572 struct xfs_buf_map *map, 573 int nmaps, 574 xfs_buf_flags_t flags, 575 struct xfs_buf **bpp) 576 { 577 struct xfs_buf_cache *bch; 578 struct xfs_perag *pag; 579 struct xfs_buf *bp = NULL; 580 struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; 581 int error; 582 int i; 583 584 if (flags & XBF_LIVESCAN) 585 cmap.bm_flags |= XBM_LIVESCAN; 586 for (i = 0; i < nmaps; i++) 587 cmap.bm_len += map[i].bm_len; 588 589 error = xfs_buf_map_verify(btp, &cmap); 590 if (error) 591 return error; 592 593 pag = xfs_buftarg_get_pag(btp, &cmap); 594 bch = xfs_buftarg_buf_cache(btp, pag); 595 596 error = xfs_buf_lookup(bch, &cmap, flags, &bp); 597 if (error && error != -ENOENT) 598 goto out_put_perag; 599 600 /* cache hits always outnumber misses by at least 10:1 */ 601 if (unlikely(!bp)) { 602 XFS_STATS_INC(btp->bt_mount, xb_miss_locked); 603 604 if (flags & XBF_INCORE) 605 goto out_put_perag; 606 607 /* xfs_buf_find_insert() consumes the perag reference. */ 608 error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps, 609 flags, &bp); 610 if (error) 611 return error; 612 } else { 613 XFS_STATS_INC(btp->bt_mount, xb_get_locked); 614 if (pag) 615 xfs_perag_put(pag); 616 } 617 618 /* 619 * Clear b_error if this is a lookup from a caller that doesn't expect 620 * valid data to be found in the buffer. 
621 */ 622 if (!(flags & XBF_READ)) 623 xfs_buf_ioerror(bp, 0); 624 625 XFS_STATS_INC(btp->bt_mount, xb_get); 626 trace_xfs_buf_get(bp, flags, _RET_IP_); 627 *bpp = bp; 628 return 0; 629 630 out_put_perag: 631 if (pag) 632 xfs_perag_put(pag); 633 return error; 634 } 635 636 int 637 _xfs_buf_read( 638 struct xfs_buf *bp) 639 { 640 ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); 641 642 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); 643 bp->b_flags |= XBF_READ; 644 xfs_buf_submit(bp); 645 return xfs_buf_iowait(bp); 646 } 647 648 /* 649 * Reverify a buffer found in cache without an attached ->b_ops. 650 * 651 * If the caller passed an ops structure and the buffer doesn't have ops 652 * assigned, set the ops and use it to verify the contents. If verification 653 * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is 654 * already in XBF_DONE state on entry. 655 * 656 * Under normal operations, every in-core buffer is verified on read I/O 657 * completion. There are two scenarios that can lead to in-core buffers without 658 * an assigned ->b_ops. The first is during log recovery of buffers on a V4 659 * filesystem, though these buffers are purged at the end of recovery. The 660 * other is online repair, which intentionally reads with a NULL buffer ops to 661 * run several verifiers across an in-core buffer in order to establish buffer 662 * type. If repair can't establish that, the buffer will be left in memory 663 * with NULL buffer ops. 
664 */ 665 int 666 xfs_buf_reverify( 667 struct xfs_buf *bp, 668 const struct xfs_buf_ops *ops) 669 { 670 ASSERT(bp->b_flags & XBF_DONE); 671 ASSERT(bp->b_error == 0); 672 673 if (!ops || bp->b_ops) 674 return 0; 675 676 bp->b_ops = ops; 677 bp->b_ops->verify_read(bp); 678 if (bp->b_error) 679 bp->b_flags &= ~XBF_DONE; 680 return bp->b_error; 681 } 682 683 int 684 xfs_buf_read_map( 685 struct xfs_buftarg *target, 686 struct xfs_buf_map *map, 687 int nmaps, 688 xfs_buf_flags_t flags, 689 struct xfs_buf **bpp, 690 const struct xfs_buf_ops *ops, 691 xfs_failaddr_t fa) 692 { 693 struct xfs_buf *bp; 694 int error; 695 696 ASSERT(!(flags & (XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD))); 697 698 flags |= XBF_READ; 699 *bpp = NULL; 700 701 error = xfs_buf_get_map(target, map, nmaps, flags, &bp); 702 if (error) 703 return error; 704 705 trace_xfs_buf_read(bp, flags, _RET_IP_); 706 707 if (!(bp->b_flags & XBF_DONE)) { 708 /* Initiate the buffer read and wait. */ 709 XFS_STATS_INC(target->bt_mount, xb_get_read); 710 bp->b_ops = ops; 711 error = _xfs_buf_read(bp); 712 } else { 713 /* Buffer already read; all we need to do is check it. */ 714 error = xfs_buf_reverify(bp, ops); 715 716 /* We do not want read in the flags */ 717 bp->b_flags &= ~XBF_READ; 718 ASSERT(bp->b_ops != NULL || ops == NULL); 719 } 720 721 /* 722 * If we've had a read error, then the contents of the buffer are 723 * invalid and should not be used. To ensure that a followup read tries 724 * to pull the buffer from disk again, we clear the XBF_DONE flag and 725 * mark the buffer stale. This ensures that anyone who has a current 726 * reference to the buffer will interpret it's contents correctly and 727 * future cache lookups will also treat it as an empty, uninitialised 728 * buffer. 
729 */ 730 if (error) { 731 /* 732 * Check against log shutdown for error reporting because 733 * metadata writeback may require a read first and we need to 734 * report errors in metadata writeback until the log is shut 735 * down. High level transaction read functions already check 736 * against mount shutdown, anyway, so we only need to be 737 * concerned about low level IO interactions here. 738 */ 739 if (!xlog_is_shutdown(target->bt_mount->m_log)) 740 xfs_buf_ioerror_alert(bp, fa); 741 742 bp->b_flags &= ~XBF_DONE; 743 xfs_buf_stale(bp); 744 xfs_buf_relse(bp); 745 746 /* bad CRC means corrupted metadata */ 747 if (error == -EFSBADCRC) 748 error = -EFSCORRUPTED; 749 return error; 750 } 751 752 *bpp = bp; 753 return 0; 754 } 755 756 /* 757 * If we are not low on memory then do the readahead in a deadlock 758 * safe manner. 759 */ 760 void 761 xfs_buf_readahead_map( 762 struct xfs_buftarg *target, 763 struct xfs_buf_map *map, 764 int nmaps, 765 const struct xfs_buf_ops *ops) 766 { 767 const xfs_buf_flags_t flags = XBF_READ | XBF_ASYNC | XBF_READ_AHEAD; 768 struct xfs_buf *bp; 769 770 /* 771 * Currently we don't have a good means or justification for performing 772 * xmbuf_map_page asynchronously, so we don't do readahead. 773 */ 774 if (xfs_buftarg_is_mem(target)) 775 return; 776 777 if (xfs_buf_get_map(target, map, nmaps, flags | XBF_TRYLOCK, &bp)) 778 return; 779 trace_xfs_buf_readahead(bp, 0, _RET_IP_); 780 781 if (bp->b_flags & XBF_DONE) { 782 xfs_buf_reverify(bp, ops); 783 xfs_buf_relse(bp); 784 return; 785 } 786 XFS_STATS_INC(target->bt_mount, xb_get_read); 787 bp->b_ops = ops; 788 bp->b_flags &= ~(XBF_WRITE | XBF_DONE); 789 bp->b_flags |= flags; 790 percpu_counter_inc(&target->bt_readahead_count); 791 xfs_buf_submit(bp); 792 } 793 794 /* 795 * Read an uncached buffer from disk. Allocates and returns a locked 796 * buffer containing the disk contents or nothing. 
Uncached buffers always have 797 * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer 798 * is cached or uncached during fault diagnosis. 799 */ 800 int 801 xfs_buf_read_uncached( 802 struct xfs_buftarg *target, 803 xfs_daddr_t daddr, 804 size_t numblks, 805 struct xfs_buf **bpp, 806 const struct xfs_buf_ops *ops) 807 { 808 struct xfs_buf *bp; 809 int error; 810 811 *bpp = NULL; 812 813 error = xfs_buf_get_uncached(target, numblks, &bp); 814 if (error) 815 return error; 816 817 /* set up the buffer for a read IO */ 818 ASSERT(bp->b_map_count == 1); 819 bp->b_rhash_key = XFS_BUF_DADDR_NULL; 820 bp->b_maps[0].bm_bn = daddr; 821 bp->b_flags |= XBF_READ; 822 bp->b_ops = ops; 823 824 xfs_buf_submit(bp); 825 error = xfs_buf_iowait(bp); 826 if (error) { 827 xfs_buf_relse(bp); 828 return error; 829 } 830 831 *bpp = bp; 832 return 0; 833 } 834 835 int 836 xfs_buf_get_uncached( 837 struct xfs_buftarg *target, 838 size_t numblks, 839 struct xfs_buf **bpp) 840 { 841 int error; 842 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); 843 844 error = xfs_buf_alloc(target, &map, 1, 0, bpp); 845 if (!error) 846 trace_xfs_buf_get_uncached(*bpp, _RET_IP_); 847 return error; 848 } 849 850 /* 851 * Increment reference count on buffer, to hold the buffer concurrently 852 * with another thread which may release (free) the buffer asynchronously. 853 * Must hold the buffer already to call this function. 
 */
void
xfs_buf_hold(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);

	/* b_hold is only modified under b_lock. */
	spin_lock(&bp->b_lock);
	bp->b_hold++;
	spin_unlock(&bp->b_lock);
}

/*
 * Drop a hold on an uncached buffer.  Uncached buffers are never on the LRU,
 * so the buffer is freed as soon as the last hold goes away.
 */
static void
xfs_buf_rele_uncached(
	struct xfs_buf		*bp)
{
	ASSERT(list_empty(&bp->b_lru));

	spin_lock(&bp->b_lock);
	if (--bp->b_hold) {
		spin_unlock(&bp->b_lock);
		return;
	}
	spin_unlock(&bp->b_lock);
	xfs_buf_free(bp);
}

/*
 * Drop a hold on a cached buffer.
 *
 * When the last hold is dropped the buffer is either parked on the LRU (if
 * b_lru_ref is non-zero), in which case the hold is transferred to the LRU,
 * or it is removed from the hash table and freed.
 */
static void
xfs_buf_rele_cached(
	struct xfs_buf		*bp)
{
	struct xfs_buftarg	*btp = bp->b_target;
	struct xfs_perag	*pag = bp->b_pag;
	struct xfs_buf_cache	*bch = xfs_buftarg_buf_cache(btp, pag);
	bool			freebuf = false;

	/*
	 * NOTE(review): xfs_buf_rele() emits this same trace event before
	 * calling us, so each cached release is traced twice - confirm this
	 * is intended.
	 */
	trace_xfs_buf_rele(bp, _RET_IP_);

	spin_lock(&bp->b_lock);
	ASSERT(bp->b_hold >= 1);
	if (bp->b_hold > 1) {
		/* Not the last hold: just drop ours. */
		bp->b_hold--;
		goto out_unlock;
	}

	/* we are asked to drop the last reference */
	if (atomic_read(&bp->b_lru_ref)) {
		/*
		 * If the buffer is added to the LRU, keep the reference to the
		 * buffer for the LRU and clear the (now stale) dispose list
		 * state flag, else drop the reference.
		 */
		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
			bp->b_state &= ~XFS_BSTATE_DISPOSE;
		else
			bp->b_hold--;
	} else {
		bp->b_hold--;
		/*
		 * most of the time buffers will already be removed from the
		 * LRU, so optimise that case by checking for the
		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
		 * was on was the disposal list
		 */
		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
			list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
		} else {
			ASSERT(list_empty(&bp->b_lru));
		}

		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
		/* Unhash the buffer and give back its perag reference. */
		rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
				xfs_buf_hash_params);
		if (pag)
			xfs_perag_put(pag);
		freebuf = true;
	}

out_unlock:
	spin_unlock(&bp->b_lock);

	/* Free outside of b_lock. */
	if (freebuf)
		xfs_buf_free(bp);
}

/*
 * Release a hold on the specified buffer.
 */
void
xfs_buf_rele(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_rele(bp, _RET_IP_);
	if (xfs_buf_is_uncached(bp))
		xfs_buf_rele_uncached(bp);
	else
		xfs_buf_rele_cached(bp);
}

/*
 *	Lock a buffer object, if it is not already locked.
 *
 *	If we come across a stale, pinned, locked buffer, we know that we are
 *	being asked to lock a buffer that has been reallocated. Because it is
 *	pinned, we know that the log has not been pushed to disk and hence it
 *	will still be locked.  Rather than continuing to have trylock attempts
 *	fail until someone else pushes the log, push it ourselves before
 *	returning.  This means that the xfsaild will not get stuck trying
 *	to push on stale inode buffers.
 *
 *	NOTE(review): the function below only attempts down_trylock() and
 *	traces the outcome; it does not push the log itself.  The paragraph
 *	above looks out of date - confirm.
 *
 *	Returns non-zero if the lock was obtained, zero otherwise.
 */
int
xfs_buf_trylock(
	struct xfs_buf		*bp)
{
	int			locked;

	locked = down_trylock(&bp->b_sema) == 0;
	if (locked)
		trace_xfs_buf_trylock(bp, _RET_IP_);
	else
		trace_xfs_buf_trylock_fail(bp, _RET_IP_);
	return locked;
}

/*
 *	Lock a buffer object.
980 * 981 * If we come across a stale, pinned, locked buffer, we know that we 982 * are being asked to lock a buffer that has been reallocated. Because 983 * it is pinned, we know that the log has not been pushed to disk and 984 * hence it will still be locked. Rather than sleeping until someone 985 * else pushes the log, push it ourselves before trying to get the lock. 986 */ 987 void 988 xfs_buf_lock( 989 struct xfs_buf *bp) 990 { 991 trace_xfs_buf_lock(bp, _RET_IP_); 992 993 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 994 xfs_log_force(bp->b_mount, 0); 995 down(&bp->b_sema); 996 997 trace_xfs_buf_lock_done(bp, _RET_IP_); 998 } 999 1000 void 1001 xfs_buf_unlock( 1002 struct xfs_buf *bp) 1003 { 1004 ASSERT(xfs_buf_islocked(bp)); 1005 1006 up(&bp->b_sema); 1007 trace_xfs_buf_unlock(bp, _RET_IP_); 1008 } 1009 1010 STATIC void 1011 xfs_buf_wait_unpin( 1012 struct xfs_buf *bp) 1013 { 1014 DECLARE_WAITQUEUE (wait, current); 1015 1016 if (atomic_read(&bp->b_pin_count) == 0) 1017 return; 1018 1019 add_wait_queue(&bp->b_waiters, &wait); 1020 for (;;) { 1021 set_current_state(TASK_UNINTERRUPTIBLE); 1022 if (atomic_read(&bp->b_pin_count) == 0) 1023 break; 1024 io_schedule(); 1025 } 1026 remove_wait_queue(&bp->b_waiters, &wait); 1027 set_current_state(TASK_RUNNING); 1028 } 1029 1030 static void 1031 xfs_buf_ioerror_alert_ratelimited( 1032 struct xfs_buf *bp) 1033 { 1034 static unsigned long lasttime; 1035 static struct xfs_buftarg *lasttarg; 1036 1037 if (bp->b_target != lasttarg || 1038 time_after(jiffies, (lasttime + 5*HZ))) { 1039 lasttime = jiffies; 1040 xfs_buf_ioerror_alert(bp, __this_address); 1041 } 1042 lasttarg = bp->b_target; 1043 } 1044 1045 /* 1046 * Account for this latest trip around the retry handler, and decide if 1047 * we've failed enough times to constitute a permanent failure. 
 */
static bool
xfs_buf_ioerror_permanent(
	struct xfs_buf		*bp,
	struct xfs_error_cfg	*cfg)
{
	struct xfs_mount	*mp = bp->b_mount;

	/*
	 * Note that the retry counter is only bumped when a retry limit is
	 * configured, as the increment sits behind the first test.
	 */
	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
	    ++bp->b_retries > cfg->max_retries)
		return true;
	if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
	    time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
		return true;

	/* At unmount we may treat errors differently */
	if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
		return true;

	return false;
}

/*
 * On a sync write or shutdown we just want to stale the buffer and let the
 * caller handle the error in bp->b_error appropriately.
 *
 * If the write was asynchronous then no one will be looking for the error.  If
 * this is the first failure of this type, clear the error state and write the
 * buffer out again. This means we always retry an async write failure at least
 * once, but we also need to set the buffer up to behave correctly now for
 * repeated failures.
 *
 * If we get repeated async write failures, then we take action according to the
 * error configuration we have been set up to use.
 *
 * Returns true if this function took care of error handling and the caller must
 * not touch the buffer again.  Return false if the caller should proceed with
 * normal I/O completion handling.
 */
static bool
xfs_buf_ioend_handle_error(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_error_cfg	*cfg;
	struct xfs_log_item	*lip;

	/*
	 * If we've already shutdown the journal because of I/O errors, there's
	 * no point in giving this a retry.
	 */
	if (xlog_is_shutdown(mp->m_log))
		goto out_stale;

	xfs_buf_ioerror_alert_ratelimited(bp);

	/*
	 * We're not going to bother about retrying this during recovery.
	 * One strike!
	 */
	if (bp->b_flags & _XBF_LOGRECOVERY) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		return false;
	}

	/*
	 * Synchronous writes will have callers process the error.
	 */
	if (!(bp->b_flags & XBF_ASYNC))
		goto out_stale;

	trace_xfs_buf_iodone_async(bp, _RET_IP_);

	/*
	 * A new kind of error, or a buffer that is neither stale nor already
	 * marked as a failed write, gets resubmitted straight away.
	 */
	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
	if (bp->b_last_error != bp->b_error ||
	    !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
		bp->b_last_error = bp->b_error;
		/* Start the retry clock only if a timeout is configured. */
		if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
		    !bp->b_first_retry_time)
			bp->b_first_retry_time = jiffies;
		goto resubmit;
	}

	/*
	 * Permanent error - we need to trigger a shutdown if we haven't already
	 * to indicate that inconsistency will result from this action.
	 */
	if (xfs_buf_ioerror_permanent(bp, cfg)) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out_stale;
	}

	/* Still considered a transient error. Caller will schedule retries. */
	list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
		set_bit(XFS_LI_FAILED, &lip->li_flags);
		clear_bit(XFS_LI_FLUSHING, &lip->li_flags);
	}

	xfs_buf_ioerror(bp, 0);
	xfs_buf_relse(bp);
	return true;

resubmit:
	/* Clear the error, mark the write as failed once and send it again. */
	xfs_buf_ioerror(bp, 0);
	bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
	reinit_completion(&bp->b_iowait);
	xfs_buf_submit(bp);
	return true;
out_stale:
	/* Leave b_error set for the caller and invalidate the buffer. */
	xfs_buf_stale(bp);
	bp->b_flags |= XBF_DONE;
	bp->b_flags &= ~XBF_WRITE;
	trace_xfs_buf_error_relse(bp, _RET_IP_);
	return false;
}

/*
 * Common I/O completion processing.
 *
 * Returns false if xfs_buf_ioend_handle_error() took over the buffer (it was
 * resubmitted or released there) and the caller must not touch it again,
 * else true.
 */
static bool
__xfs_buf_ioend(
	struct xfs_buf	*bp)
{
	trace_xfs_buf_iodone(bp, _RET_IP_);

	if (bp->b_flags & XBF_READ) {
		if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
			invalidate_kernel_vmap_range(bp->b_addr,
				roundup(BBTOB(bp->b_length), PAGE_SIZE));
		/* The read verifier may set b_error on bad contents. */
		if (!bp->b_error && bp->b_ops)
			bp->b_ops->verify_read(bp);
		if (!bp->b_error)
			bp->b_flags |= XBF_DONE;
		if (bp->b_flags & XBF_READ_AHEAD)
			percpu_counter_dec(&bp->b_target->bt_readahead_count);
	} else {
		if (!bp->b_error) {
			bp->b_flags &= ~XBF_WRITE_FAIL;
			bp->b_flags |= XBF_DONE;
		}

		if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
			return false;

		/* clear the retry state */
		bp->b_last_error = 0;
		bp->b_retries = 0;
		bp->b_first_retry_time = 0;

		/*
		 * Note that for things like remote attribute buffers, there may
		 * not be a buffer log item here, so processing the buffer log
		 * item must remain optional.
		 */
		if (bp->b_log_item)
			xfs_buf_item_done(bp);

		if (bp->b_iodone)
			bp->b_iodone(bp);
	}

	/* The I/O is over; drop the per-I/O direction and recovery flags. */
	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
			 _XBF_LOGRECOVERY);
	return true;
}

/*
 * Complete an I/O from the current context: async buffers are released,
 * synchronous ones wake up whoever waits on b_iowait.
 */
static void
xfs_buf_ioend(
	struct xfs_buf	*bp)
{
	if (!__xfs_buf_ioend(bp))
		return;
	if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
	else
		complete(&bp->b_iowait);
}

/*
 * Work item that runs I/O completion for the buffer embedding @work and then
 * releases the buffer, unless error handling already took it over.
 */
static void
xfs_buf_ioend_work(
	struct work_struct	*work)
{
	struct xfs_buf		*bp =
		container_of(work, struct xfs_buf, b_ioend_work);

	if (__xfs_buf_ioend(bp))
		xfs_buf_relse(bp);
}

/*
 * Record @error (zero or a negative errno) on the buffer and trace it
 * together with the address that reported it.
 */
void
__xfs_buf_ioerror(
	struct xfs_buf		*bp,
	int			error,
	xfs_failaddr_t		failaddr)
{
	ASSERT(error <= 0 && error >= -1000);
	bp->b_error = error;
	trace_xfs_buf_ioerror(bp, error, failaddr);
}

/*
 * Log a metadata I/O error for the buffer, naming the reporting function,
 * the disk address, the length and the (positive) error number.
 */
void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	xfs_failaddr_t		func)
{
	xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
		"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
				  func, (uint64_t)xfs_buf_daddr(bp),
				  bp->b_length, -bp->b_error);
}

/*
 * To simulate an I/O failure, the buffer must be locked and held with at least
 * three references. The LRU reference is dropped by the stale call. The buf
 * item reference is dropped via ioend processing. The third reference is owned
 * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
*/
void
xfs_buf_ioend_fail(
	struct xfs_buf	*bp)
{
	/* Mark the buffer contents invalid and stale before failing it. */
	bp->b_flags &= ~XBF_DONE;
	xfs_buf_stale(bp);
	xfs_buf_ioerror(bp, -EIO);
	xfs_buf_ioend(bp);
}

/*
 * Write a locked buffer synchronously and return the I/O error code.
 *
 * The buffer is flagged as a synchronous write (XBF_ASYNC, XBF_READ,
 * _XBF_DELWRI_Q and XBF_DONE are cleared), submitted and waited for.  Any
 * error reported by the wait shuts the filesystem down with
 * SHUTDOWN_META_IO_ERROR before it is returned to the caller.
 */
int
xfs_bwrite(
	struct xfs_buf		*bp)
{
	int			error;

	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_WRITE;
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
			 XBF_DONE);

	xfs_buf_submit(bp);
	error = xfs_buf_iowait(bp);
	if (error)
		xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
	return error;
}

/*
 * bio completion handler for buffer I/O.
 *
 * Translates a failed bio status into a buffer error.  For successful
 * asynchronous writes the XFS_ERRTAG_BUF_IOERROR error tag may inject -EIO
 * instead.  Completion processing for XBF_ASYNC buffers is deferred to the
 * mount's buffer workqueue; for synchronous buffers the waiter on b_iowait is
 * woken.  The bio reference is dropped last.
 */
static void
xfs_buf_bio_end_io(
	struct bio		*bio)
{
	struct xfs_buf		*bp = bio->bi_private;

	if (bio->bi_status)
		xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status));
	else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
		 XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
		xfs_buf_ioerror(bp, -EIO);

	if (bp->b_flags & XBF_ASYNC) {
		INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
		queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
	} else {
		complete(&bp->b_iowait);
	}

	bio_put(bio);
}

/*
 * Translate the buffer flags into block layer operation flags: a write for
 * XBF_WRITE, otherwise a read (with REQ_RAHEAD added for XBF_READ_AHEAD).
 * REQ_META is always set.
 */
static inline blk_opf_t
xfs_buf_bio_op(
	struct xfs_buf		*bp)
{
	blk_opf_t		op;

	if (bp->b_flags & XBF_WRITE) {
		op = REQ_OP_WRITE;
	} else {
		op = REQ_OP_READ;
		if (bp->b_flags & XBF_READ_AHEAD)
			op |= REQ_RAHEAD;
	}

	return op | REQ_META;
}

/*
 * Build the bio(s) for a buffer and submit them to the block device of the
 * buffer's target.
 *
 * A single bio covering the whole buffer is allocated first and then split
 * per buffer map, so that each map is issued to its own start sector.
 * Completion is reported through xfs_buf_bio_end_io().
 */
static void
xfs_buf_submit_bio(
	struct xfs_buf		*bp)
{
	unsigned int		map = 0;
	struct blk_plug		plug;
	struct bio		*bio;

	if (is_vmalloc_addr(bp->b_addr)) {
		unsigned int	size = BBTOB(bp->b_length);
		unsigned int	alloc_size = roundup(size, PAGE_SIZE);
		void		*data = bp->b_addr;

		bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT,
				xfs_buf_bio_op(bp), GFP_NOIO);

		/* Add the vmalloc area to the bio one page at a time. */
		do {
			unsigned int	len = min(size, PAGE_SIZE);

			ASSERT(offset_in_page(data) == 0);
			__bio_add_page(bio, vmalloc_to_page(data), len, 0);
			data += len;
			size -= len;
		} while (size);

		flush_kernel_vmap_range(bp->b_addr, alloc_size);
	} else {
		/*
		 * Single folio or slab allocation.  Must be contiguous and thus
		 * only a single bvec is needed.
		 *
		 * This uses the page based bio add helper for now as that is
		 * the lowest common denominator between folios and slab
		 * allocations.  To be replaced with a better block layer
		 * helper soon (hopefully).
		 */
		bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp),
				GFP_NOIO);
		__bio_add_page(bio, virt_to_page(bp->b_addr),
				BBTOB(bp->b_length),
				offset_in_page(bp->b_addr));
	}

	bio->bi_private = bp;
	bio->bi_end_io = xfs_buf_bio_end_io;

	/*
	 * If there is more than one map segment, split out a new bio for each
	 * map except for the last one.  The last map is handled by the
	 * remainder of the original bio outside the loop.
	 */
	blk_start_plug(&plug);
	for (map = 0; map < bp->b_map_count - 1; map++) {
		struct bio	*split;

		split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS,
				&fs_bio_set);
		split->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
		bio_chain(split, bio);
		submit_bio(split);
	}
	/* map now indexes the last map, which the remaining bio covers. */
	bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
	submit_bio(bio);
	blk_finish_plug(&plug);
}

/*
 * Wait for I/O completion of a sync buffer and return the I/O error code.
1399 */ 1400 static int 1401 xfs_buf_iowait( 1402 struct xfs_buf *bp) 1403 { 1404 ASSERT(!(bp->b_flags & XBF_ASYNC)); 1405 1406 do { 1407 trace_xfs_buf_iowait(bp, _RET_IP_); 1408 wait_for_completion(&bp->b_iowait); 1409 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1410 } while (!__xfs_buf_ioend(bp)); 1411 1412 return bp->b_error; 1413 } 1414 1415 /* 1416 * Run the write verifier callback function if it exists. If this fails, mark 1417 * the buffer with an error and do not dispatch the I/O. 1418 */ 1419 static bool 1420 xfs_buf_verify_write( 1421 struct xfs_buf *bp) 1422 { 1423 if (bp->b_ops) { 1424 bp->b_ops->verify_write(bp); 1425 if (bp->b_error) 1426 return false; 1427 } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { 1428 /* 1429 * Non-crc filesystems don't attach verifiers during log 1430 * recovery, so don't warn for such filesystems. 1431 */ 1432 if (xfs_has_crc(bp->b_mount)) { 1433 xfs_warn(bp->b_mount, 1434 "%s: no buf ops on daddr 0x%llx len %d", 1435 __func__, xfs_buf_daddr(bp), 1436 bp->b_length); 1437 xfs_hex_dump(bp->b_addr, XFS_CORRUPTION_DUMP_LEN); 1438 dump_stack(); 1439 } 1440 } 1441 1442 return true; 1443 } 1444 1445 /* 1446 * Buffer I/O submission path, read or write. Asynchronous submission transfers 1447 * the buffer lock ownership and the current reference to the IO. It is not 1448 * safe to reference the buffer after a call to this function unless the caller 1449 * holds an additional reference itself. 1450 */ 1451 static void 1452 xfs_buf_submit( 1453 struct xfs_buf *bp) 1454 { 1455 trace_xfs_buf_submit(bp, _RET_IP_); 1456 1457 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1458 1459 /* 1460 * On log shutdown we stale and complete the buffer immediately. We can 1461 * be called to read the superblock before the log has been set up, so 1462 * be careful checking the log state. 1463 * 1464 * Checking the mount shutdown state here can result in the log tail 1465 * moving inappropriately on disk as the log may not yet be shut down. 1466 * i.e. 
failing this buffer on mount shutdown can remove it from the AIL 1467 * and move the tail of the log forwards without having written this 1468 * buffer to disk. This corrupts the log tail state in memory, and 1469 * because the log may not be shut down yet, it can then be propagated 1470 * to disk before the log is shutdown. Hence we check log shutdown 1471 * state here rather than mount state to avoid corrupting the log tail 1472 * on shutdown. 1473 */ 1474 if (bp->b_mount->m_log && xlog_is_shutdown(bp->b_mount->m_log)) { 1475 xfs_buf_ioend_fail(bp); 1476 return; 1477 } 1478 1479 if (bp->b_flags & XBF_WRITE) 1480 xfs_buf_wait_unpin(bp); 1481 1482 /* 1483 * Make sure we capture only current IO errors rather than stale errors 1484 * left over from previous use of the buffer (e.g. failed readahead). 1485 */ 1486 bp->b_error = 0; 1487 1488 if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) { 1489 xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE); 1490 xfs_buf_ioend(bp); 1491 return; 1492 } 1493 1494 /* In-memory targets are directly mapped, no I/O required. */ 1495 if (xfs_buftarg_is_mem(bp->b_target)) { 1496 xfs_buf_ioend(bp); 1497 return; 1498 } 1499 1500 xfs_buf_submit_bio(bp); 1501 } 1502 1503 /* 1504 * Log a message about and stale a buffer that a caller has decided is corrupt. 1505 * 1506 * This function should be called for the kinds of metadata corruption that 1507 * cannot be detect from a verifier, such as incorrect inter-block relationship 1508 * data. Do /not/ call this function from a verifier function. 1509 * 1510 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will 1511 * be marked stale, but b_error will not be set. The caller is responsible for 1512 * releasing the buffer or fixing it. 
1513 */ 1514 void 1515 __xfs_buf_mark_corrupt( 1516 struct xfs_buf *bp, 1517 xfs_failaddr_t fa) 1518 { 1519 ASSERT(bp->b_flags & XBF_DONE); 1520 1521 xfs_buf_corruption_error(bp, fa); 1522 xfs_buf_stale(bp); 1523 } 1524 1525 /* 1526 * Handling of buffer targets (buftargs). 1527 */ 1528 1529 /* 1530 * Wait for any bufs with callbacks that have been submitted but have not yet 1531 * returned. These buffers will have an elevated hold count, so wait on those 1532 * while freeing all the buffers only held by the LRU. 1533 */ 1534 static enum lru_status 1535 xfs_buftarg_drain_rele( 1536 struct list_head *item, 1537 struct list_lru_one *lru, 1538 void *arg) 1539 1540 { 1541 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1542 struct list_head *dispose = arg; 1543 1544 if (!spin_trylock(&bp->b_lock)) 1545 return LRU_SKIP; 1546 if (bp->b_hold > 1) { 1547 /* need to wait, so skip it this pass */ 1548 spin_unlock(&bp->b_lock); 1549 trace_xfs_buf_drain_buftarg(bp, _RET_IP_); 1550 return LRU_SKIP; 1551 } 1552 1553 /* 1554 * clear the LRU reference count so the buffer doesn't get 1555 * ignored in xfs_buf_rele(). 1556 */ 1557 atomic_set(&bp->b_lru_ref, 0); 1558 bp->b_state |= XFS_BSTATE_DISPOSE; 1559 list_lru_isolate_move(lru, item, dispose); 1560 spin_unlock(&bp->b_lock); 1561 return LRU_REMOVED; 1562 } 1563 1564 /* 1565 * Wait for outstanding I/O on the buftarg to complete. 1566 */ 1567 void 1568 xfs_buftarg_wait( 1569 struct xfs_buftarg *btp) 1570 { 1571 /* 1572 * First wait for all in-flight readahead buffers to be released. This is 1573 * critical as new buffers do not make the LRU until they are released. 1574 * 1575 * Next, flush the buffer workqueue to ensure all completion processing 1576 * has finished. Just waiting on buffer locks is not sufficient for 1577 * async IO as the reference count held over IO is not released until 1578 * after the buffer lock is dropped. 
Hence we need to ensure here that 1579 * all reference counts have been dropped before we start walking the 1580 * LRU list. 1581 */ 1582 while (percpu_counter_sum(&btp->bt_readahead_count)) 1583 delay(100); 1584 flush_workqueue(btp->bt_mount->m_buf_workqueue); 1585 } 1586 1587 void 1588 xfs_buftarg_drain( 1589 struct xfs_buftarg *btp) 1590 { 1591 LIST_HEAD(dispose); 1592 int loop = 0; 1593 bool write_fail = false; 1594 1595 xfs_buftarg_wait(btp); 1596 1597 /* loop until there is nothing left on the lru list. */ 1598 while (list_lru_count(&btp->bt_lru)) { 1599 list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele, 1600 &dispose, LONG_MAX); 1601 1602 while (!list_empty(&dispose)) { 1603 struct xfs_buf *bp; 1604 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1605 list_del_init(&bp->b_lru); 1606 if (bp->b_flags & XBF_WRITE_FAIL) { 1607 write_fail = true; 1608 xfs_buf_alert_ratelimited(bp, 1609 "XFS: Corruption Alert", 1610 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", 1611 (long long)xfs_buf_daddr(bp)); 1612 } 1613 xfs_buf_rele(bp); 1614 } 1615 if (loop++ != 0) 1616 delay(100); 1617 } 1618 1619 /* 1620 * If one or more failed buffers were freed, that means dirty metadata 1621 * was thrown away. This should only ever happen after I/O completion 1622 * handling has elevated I/O error(s) to permanent failures and shuts 1623 * down the journal. 1624 */ 1625 if (write_fail) { 1626 ASSERT(xlog_is_shutdown(btp->bt_mount->m_log)); 1627 xfs_alert(btp->bt_mount, 1628 "Please run xfs_repair to determine the extent of the problem."); 1629 } 1630 } 1631 1632 static enum lru_status 1633 xfs_buftarg_isolate( 1634 struct list_head *item, 1635 struct list_lru_one *lru, 1636 void *arg) 1637 { 1638 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1639 struct list_head *dispose = arg; 1640 1641 /* 1642 * we are inverting the lru lock/bp->b_lock here, so use a trylock. 1643 * If we fail to get the lock, just skip it. 
1644 */ 1645 if (!spin_trylock(&bp->b_lock)) 1646 return LRU_SKIP; 1647 /* 1648 * Decrement the b_lru_ref count unless the value is already 1649 * zero. If the value is already zero, we need to reclaim the 1650 * buffer, otherwise it gets another trip through the LRU. 1651 */ 1652 if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) { 1653 spin_unlock(&bp->b_lock); 1654 return LRU_ROTATE; 1655 } 1656 1657 bp->b_state |= XFS_BSTATE_DISPOSE; 1658 list_lru_isolate_move(lru, item, dispose); 1659 spin_unlock(&bp->b_lock); 1660 return LRU_REMOVED; 1661 } 1662 1663 static unsigned long 1664 xfs_buftarg_shrink_scan( 1665 struct shrinker *shrink, 1666 struct shrink_control *sc) 1667 { 1668 struct xfs_buftarg *btp = shrink->private_data; 1669 LIST_HEAD(dispose); 1670 unsigned long freed; 1671 1672 freed = list_lru_shrink_walk(&btp->bt_lru, sc, 1673 xfs_buftarg_isolate, &dispose); 1674 1675 while (!list_empty(&dispose)) { 1676 struct xfs_buf *bp; 1677 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1678 list_del_init(&bp->b_lru); 1679 xfs_buf_rele(bp); 1680 } 1681 1682 return freed; 1683 } 1684 1685 static unsigned long 1686 xfs_buftarg_shrink_count( 1687 struct shrinker *shrink, 1688 struct shrink_control *sc) 1689 { 1690 struct xfs_buftarg *btp = shrink->private_data; 1691 return list_lru_shrink_count(&btp->bt_lru, sc); 1692 } 1693 1694 void 1695 xfs_destroy_buftarg( 1696 struct xfs_buftarg *btp) 1697 { 1698 shrinker_free(btp->bt_shrinker); 1699 ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); 1700 percpu_counter_destroy(&btp->bt_readahead_count); 1701 list_lru_destroy(&btp->bt_lru); 1702 } 1703 1704 void 1705 xfs_free_buftarg( 1706 struct xfs_buftarg *btp) 1707 { 1708 xfs_destroy_buftarg(btp); 1709 fs_put_dax(btp->bt_daxdev, btp->bt_mount); 1710 /* the main block device is closed by kill_block_super */ 1711 if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) 1712 bdev_fput(btp->bt_bdev_file); 1713 kfree(btp); 1714 } 1715 1716 int 1717 xfs_setsize_buftarg( 1718 
struct xfs_buftarg *btp, 1719 unsigned int sectorsize) 1720 { 1721 /* Set up metadata sector size info */ 1722 btp->bt_meta_sectorsize = sectorsize; 1723 btp->bt_meta_sectormask = sectorsize - 1; 1724 1725 if (set_blocksize(btp->bt_bdev_file, sectorsize)) { 1726 xfs_warn(btp->bt_mount, 1727 "Cannot set_blocksize to %u on device %pg", 1728 sectorsize, btp->bt_bdev); 1729 return -EINVAL; 1730 } 1731 1732 return 0; 1733 } 1734 1735 int 1736 xfs_init_buftarg( 1737 struct xfs_buftarg *btp, 1738 size_t logical_sectorsize, 1739 const char *descr) 1740 { 1741 /* Set up device logical sector size mask */ 1742 btp->bt_logical_sectorsize = logical_sectorsize; 1743 btp->bt_logical_sectormask = logical_sectorsize - 1; 1744 1745 /* 1746 * Buffer IO error rate limiting. Limit it to no more than 10 messages 1747 * per 30 seconds so as to not spam logs too much on repeated errors. 1748 */ 1749 ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, 1750 DEFAULT_RATELIMIT_BURST); 1751 1752 if (list_lru_init(&btp->bt_lru)) 1753 return -ENOMEM; 1754 if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL)) 1755 goto out_destroy_lru; 1756 1757 btp->bt_shrinker = 1758 shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr); 1759 if (!btp->bt_shrinker) 1760 goto out_destroy_io_count; 1761 btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; 1762 btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; 1763 btp->bt_shrinker->private_data = btp; 1764 shrinker_register(btp->bt_shrinker); 1765 return 0; 1766 1767 out_destroy_io_count: 1768 percpu_counter_destroy(&btp->bt_readahead_count); 1769 out_destroy_lru: 1770 list_lru_destroy(&btp->bt_lru); 1771 return -ENOMEM; 1772 } 1773 1774 struct xfs_buftarg * 1775 xfs_alloc_buftarg( 1776 struct xfs_mount *mp, 1777 struct file *bdev_file) 1778 { 1779 struct xfs_buftarg *btp; 1780 const struct dax_holder_operations *ops = NULL; 1781 1782 #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) 1783 ops = &xfs_dax_holder_operations; 
1784 #endif 1785 btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL); 1786 1787 btp->bt_mount = mp; 1788 btp->bt_bdev_file = bdev_file; 1789 btp->bt_bdev = file_bdev(bdev_file); 1790 btp->bt_dev = btp->bt_bdev->bd_dev; 1791 btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, 1792 mp, ops); 1793 1794 if (bdev_can_atomic_write(btp->bt_bdev)) { 1795 btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes( 1796 btp->bt_bdev); 1797 btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes( 1798 btp->bt_bdev); 1799 } 1800 1801 /* 1802 * When allocating the buftargs we have not yet read the super block and 1803 * thus don't know the file system sector size yet. 1804 */ 1805 if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev))) 1806 goto error_free; 1807 if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev), 1808 mp->m_super->s_id)) 1809 goto error_free; 1810 1811 return btp; 1812 1813 error_free: 1814 kfree(btp); 1815 return NULL; 1816 } 1817 1818 static inline void 1819 xfs_buf_list_del( 1820 struct xfs_buf *bp) 1821 { 1822 list_del_init(&bp->b_list); 1823 wake_up_var(&bp->b_list); 1824 } 1825 1826 /* 1827 * Cancel a delayed write list. 1828 * 1829 * Remove each buffer from the list, clear the delwri queue flag and drop the 1830 * associated buffer reference. 1831 */ 1832 void 1833 xfs_buf_delwri_cancel( 1834 struct list_head *list) 1835 { 1836 struct xfs_buf *bp; 1837 1838 while (!list_empty(list)) { 1839 bp = list_first_entry(list, struct xfs_buf, b_list); 1840 1841 xfs_buf_lock(bp); 1842 bp->b_flags &= ~_XBF_DELWRI_Q; 1843 xfs_buf_list_del(bp); 1844 xfs_buf_relse(bp); 1845 } 1846 } 1847 1848 /* 1849 * Add a buffer to the delayed write list. 1850 * 1851 * This queues a buffer for writeout if it hasn't already been. Note that 1852 * neither this routine nor the buffer list submission functions perform 1853 * any internal synchronization. It is expected that the lists are thread-local 1854 * to the callers. 
1855 * 1856 * Returns true if we queued up the buffer, or false if it already had 1857 * been on the buffer list. 1858 */ 1859 bool 1860 xfs_buf_delwri_queue( 1861 struct xfs_buf *bp, 1862 struct list_head *list) 1863 { 1864 ASSERT(xfs_buf_islocked(bp)); 1865 ASSERT(!(bp->b_flags & XBF_READ)); 1866 1867 /* 1868 * If the buffer is already marked delwri it already is queued up 1869 * by someone else for imediate writeout. Just ignore it in that 1870 * case. 1871 */ 1872 if (bp->b_flags & _XBF_DELWRI_Q) { 1873 trace_xfs_buf_delwri_queued(bp, _RET_IP_); 1874 return false; 1875 } 1876 1877 trace_xfs_buf_delwri_queue(bp, _RET_IP_); 1878 1879 /* 1880 * If a buffer gets written out synchronously or marked stale while it 1881 * is on a delwri list we lazily remove it. To do this, the other party 1882 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. 1883 * It remains referenced and on the list. In a rare corner case it 1884 * might get readded to a delwri list after the synchronous writeout, in 1885 * which case we need just need to re-add the flag here. 1886 */ 1887 bp->b_flags |= _XBF_DELWRI_Q; 1888 if (list_empty(&bp->b_list)) { 1889 xfs_buf_hold(bp); 1890 list_add_tail(&bp->b_list, list); 1891 } 1892 1893 return true; 1894 } 1895 1896 /* 1897 * Queue a buffer to this delwri list as part of a data integrity operation. 1898 * If the buffer is on any other delwri list, we'll wait for that to clear 1899 * so that the caller can submit the buffer for IO and wait for the result. 1900 * Callers must ensure the buffer is not already on the list. 1901 */ 1902 void 1903 xfs_buf_delwri_queue_here( 1904 struct xfs_buf *bp, 1905 struct list_head *buffer_list) 1906 { 1907 /* 1908 * We need this buffer to end up on the /caller's/ delwri list, not any 1909 * old list. This can happen if the buffer is marked stale (which 1910 * clears DELWRI_Q) after the AIL queues the buffer to its list but 1911 * before the AIL has a chance to submit the list. 
1912 */ 1913 while (!list_empty(&bp->b_list)) { 1914 xfs_buf_unlock(bp); 1915 wait_var_event(&bp->b_list, list_empty(&bp->b_list)); 1916 xfs_buf_lock(bp); 1917 } 1918 1919 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1920 1921 xfs_buf_delwri_queue(bp, buffer_list); 1922 } 1923 1924 /* 1925 * Compare function is more complex than it needs to be because 1926 * the return value is only 32 bits and we are doing comparisons 1927 * on 64 bit values 1928 */ 1929 static int 1930 xfs_buf_cmp( 1931 void *priv, 1932 const struct list_head *a, 1933 const struct list_head *b) 1934 { 1935 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); 1936 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); 1937 xfs_daddr_t diff; 1938 1939 diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; 1940 if (diff < 0) 1941 return -1; 1942 if (diff > 0) 1943 return 1; 1944 return 0; 1945 } 1946 1947 static bool 1948 xfs_buf_delwri_submit_prep( 1949 struct xfs_buf *bp) 1950 { 1951 /* 1952 * Someone else might have written the buffer synchronously or marked it 1953 * stale in the meantime. In that case only the _XBF_DELWRI_Q flag got 1954 * cleared, and we have to drop the reference and remove it from the 1955 * list here. 1956 */ 1957 if (!(bp->b_flags & _XBF_DELWRI_Q)) { 1958 xfs_buf_list_del(bp); 1959 xfs_buf_relse(bp); 1960 return false; 1961 } 1962 1963 trace_xfs_buf_delwri_split(bp, _RET_IP_); 1964 bp->b_flags &= ~_XBF_DELWRI_Q; 1965 bp->b_flags |= XBF_WRITE; 1966 return true; 1967 } 1968 1969 /* 1970 * Write out a buffer list asynchronously. 1971 * 1972 * This will take the @buffer_list, write all non-locked and non-pinned buffers 1973 * out and not wait for I/O completion on any of the buffers. This interface 1974 * is only safely useable for callers that can track I/O completion by higher 1975 * level means, e.g. AIL pushing as the @buffer_list is consumed in this 1976 * function. 
1977 * 1978 * Note: this function will skip buffers it would block on, and in doing so 1979 * leaves them on @buffer_list so they can be retried on a later pass. As such, 1980 * it is up to the caller to ensure that the buffer list is fully submitted or 1981 * cancelled appropriately when they are finished with the list. Failure to 1982 * cancel or resubmit the list until it is empty will result in leaked buffers 1983 * at unmount time. 1984 */ 1985 int 1986 xfs_buf_delwri_submit_nowait( 1987 struct list_head *buffer_list) 1988 { 1989 struct xfs_buf *bp, *n; 1990 int pinned = 0; 1991 struct blk_plug plug; 1992 1993 list_sort(NULL, buffer_list, xfs_buf_cmp); 1994 1995 blk_start_plug(&plug); 1996 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 1997 if (!xfs_buf_trylock(bp)) 1998 continue; 1999 if (xfs_buf_ispinned(bp)) { 2000 xfs_buf_unlock(bp); 2001 pinned++; 2002 continue; 2003 } 2004 if (!xfs_buf_delwri_submit_prep(bp)) 2005 continue; 2006 bp->b_flags |= XBF_ASYNC; 2007 xfs_buf_list_del(bp); 2008 xfs_buf_submit(bp); 2009 } 2010 blk_finish_plug(&plug); 2011 2012 return pinned; 2013 } 2014 2015 /* 2016 * Write out a buffer list synchronously. 2017 * 2018 * This will take the @buffer_list, write all buffers out and wait for I/O 2019 * completion on all of the buffers. @buffer_list is consumed by the function, 2020 * so callers must have some other way of tracking buffers if they require such 2021 * functionality. 
2022 */ 2023 int 2024 xfs_buf_delwri_submit( 2025 struct list_head *buffer_list) 2026 { 2027 LIST_HEAD (wait_list); 2028 int error = 0, error2; 2029 struct xfs_buf *bp, *n; 2030 struct blk_plug plug; 2031 2032 list_sort(NULL, buffer_list, xfs_buf_cmp); 2033 2034 blk_start_plug(&plug); 2035 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 2036 xfs_buf_lock(bp); 2037 if (!xfs_buf_delwri_submit_prep(bp)) 2038 continue; 2039 bp->b_flags &= ~XBF_ASYNC; 2040 list_move_tail(&bp->b_list, &wait_list); 2041 xfs_buf_submit(bp); 2042 } 2043 blk_finish_plug(&plug); 2044 2045 /* Wait for IO to complete. */ 2046 while (!list_empty(&wait_list)) { 2047 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 2048 2049 xfs_buf_list_del(bp); 2050 2051 /* 2052 * Wait on the locked buffer, check for errors and unlock and 2053 * release the delwri queue reference. 2054 */ 2055 error2 = xfs_buf_iowait(bp); 2056 xfs_buf_relse(bp); 2057 if (!error) 2058 error = error2; 2059 } 2060 2061 return error; 2062 } 2063 2064 /* 2065 * Push a single buffer on a delwri queue. 2066 * 2067 * The purpose of this function is to submit a single buffer of a delwri queue 2068 * and return with the buffer still on the original queue. 2069 * 2070 * The buffer locking and queue management logic between _delwri_pushbuf() and 2071 * _delwri_queue() guarantee that the buffer cannot be queued to another list 2072 * before returning. 2073 */ 2074 int 2075 xfs_buf_delwri_pushbuf( 2076 struct xfs_buf *bp, 2077 struct list_head *buffer_list) 2078 { 2079 int error; 2080 2081 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 2082 2083 trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); 2084 2085 xfs_buf_lock(bp); 2086 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); 2087 bp->b_flags |= XBF_WRITE; 2088 xfs_buf_submit(bp); 2089 2090 /* 2091 * The buffer is now locked, under I/O but still on the original delwri 2092 * queue. 
Wait for I/O completion, restore the DELWRI_Q flag and 2093 * return with the buffer unlocked and still on the original queue. 2094 */ 2095 error = xfs_buf_iowait(bp); 2096 bp->b_flags |= _XBF_DELWRI_Q; 2097 xfs_buf_unlock(bp); 2098 2099 return error; 2100 } 2101 2102 void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) 2103 { 2104 /* 2105 * Set the lru reference count to 0 based on the error injection tag. 2106 * This allows userspace to disrupt buffer caching for debug/testing 2107 * purposes. 2108 */ 2109 if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) 2110 lru_ref = 0; 2111 2112 atomic_set(&bp->b_lru_ref, lru_ref); 2113 } 2114 2115 /* 2116 * Verify an on-disk magic value against the magic value specified in the 2117 * verifier structure. The verifier magic is in disk byte order so the caller is 2118 * expected to pass the value directly from disk. 2119 */ 2120 bool 2121 xfs_verify_magic( 2122 struct xfs_buf *bp, 2123 __be32 dmagic) 2124 { 2125 struct xfs_mount *mp = bp->b_mount; 2126 int idx; 2127 2128 idx = xfs_has_crc(mp); 2129 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) 2130 return false; 2131 return dmagic == bp->b_ops->magic[idx]; 2132 } 2133 /* 2134 * Verify an on-disk magic value against the magic value specified in the 2135 * verifier structure. The verifier magic is in disk byte order so the caller is 2136 * expected to pass the value directly from disk. 2137 */ 2138 bool 2139 xfs_verify_magic16( 2140 struct xfs_buf *bp, 2141 __be16 dmagic) 2142 { 2143 struct xfs_mount *mp = bp->b_mount; 2144 int idx; 2145 2146 idx = xfs_has_crc(mp); 2147 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) 2148 return false; 2149 return dmagic == bp->b_ops->magic16[idx]; 2150 } 2151