1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include <linux/backing-dev.h> 8 #include <linux/dax.h> 9 10 #include "xfs_shared.h" 11 #include "xfs_format.h" 12 #include "xfs_log_format.h" 13 #include "xfs_trans_resv.h" 14 #include "xfs_mount.h" 15 #include "xfs_trace.h" 16 #include "xfs_log.h" 17 #include "xfs_log_recover.h" 18 #include "xfs_log_priv.h" 19 #include "xfs_trans.h" 20 #include "xfs_buf_item.h" 21 #include "xfs_errortag.h" 22 #include "xfs_error.h" 23 #include "xfs_ag.h" 24 #include "xfs_buf_mem.h" 25 #include "xfs_notify_failure.h" 26 27 struct kmem_cache *xfs_buf_cache; 28 29 /* 30 * Locking orders 31 * 32 * xfs_buf_stale: 33 * b_sema (caller holds) 34 * b_lock 35 * lru_lock 36 * 37 * xfs_buf_rele: 38 * b_lock 39 * lru_lock 40 * 41 * xfs_buftarg_drain_rele 42 * lru_lock 43 * b_lock (trylock due to inversion) 44 * 45 * xfs_buftarg_isolate 46 * lru_lock 47 * b_lock (trylock due to inversion) 48 */ 49 50 static void xfs_buf_submit(struct xfs_buf *bp); 51 static int xfs_buf_iowait(struct xfs_buf *bp); 52 53 static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) 54 { 55 return bp->b_rhash_key == XFS_BUF_DADDR_NULL; 56 } 57 58 /* 59 * When we mark a buffer stale, we remove the buffer from the LRU and clear the 60 * b_lru_ref count so that the buffer is freed immediately when the buffer 61 * reference count falls to zero. If the buffer is already on the LRU, we need 62 * to remove the reference that LRU holds on the buffer. 63 * 64 * This prevents build-up of stale buffers on the LRU. 65 */ 66 void 67 xfs_buf_stale( 68 struct xfs_buf *bp) 69 { 70 ASSERT(xfs_buf_islocked(bp)); 71 72 bp->b_flags |= XBF_STALE; 73 74 /* 75 * Clear the delwri status so that a delwri queue walker will not 76 * flush this buffer to disk now that it is stale. The delwri queue has 77 * a reference to the buffer, so this is safe to do. 
78 */ 79 bp->b_flags &= ~_XBF_DELWRI_Q; 80 81 spin_lock(&bp->b_lock); 82 atomic_set(&bp->b_lru_ref, 0); 83 if (!(bp->b_state & XFS_BSTATE_DISPOSE) && 84 (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) 85 bp->b_hold--; 86 87 ASSERT(bp->b_hold >= 1); 88 spin_unlock(&bp->b_lock); 89 } 90 91 static void 92 xfs_buf_free_callback( 93 struct callback_head *cb) 94 { 95 struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); 96 97 if (bp->b_maps != &bp->__b_map) 98 kfree(bp->b_maps); 99 kmem_cache_free(xfs_buf_cache, bp); 100 } 101 102 static void 103 xfs_buf_free( 104 struct xfs_buf *bp) 105 { 106 unsigned int size = BBTOB(bp->b_length); 107 108 might_sleep(); 109 trace_xfs_buf_free(bp, _RET_IP_); 110 111 ASSERT(list_empty(&bp->b_lru)); 112 113 if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE) 114 mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT)); 115 116 if (is_vmalloc_addr(bp->b_addr)) 117 vfree(bp->b_addr); 118 else if (bp->b_flags & _XBF_KMEM) 119 kfree(bp->b_addr); 120 else 121 folio_put(virt_to_folio(bp->b_addr)); 122 123 call_rcu(&bp->b_rcu, xfs_buf_free_callback); 124 } 125 126 static int 127 xfs_buf_alloc_kmem( 128 struct xfs_buf *bp, 129 size_t size, 130 gfp_t gfp_mask) 131 { 132 ASSERT(is_power_of_2(size)); 133 ASSERT(size < PAGE_SIZE); 134 135 bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL); 136 if (!bp->b_addr) 137 return -ENOMEM; 138 139 /* 140 * Slab guarantees that we get back naturally aligned allocations for 141 * power of two sizes. Keep this check as the canary in the coal mine 142 * if anything changes in slab. 143 */ 144 if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) { 145 kfree(bp->b_addr); 146 bp->b_addr = NULL; 147 return -ENOMEM; 148 } 149 bp->b_flags |= _XBF_KMEM; 150 trace_xfs_buf_backing_kmem(bp, _RET_IP_); 151 return 0; 152 } 153 154 /* 155 * Allocate backing memory for a buffer. 156 * 157 * For tmpfs-backed buffers used by in-memory btrees this directly maps the 158 * tmpfs page cache folios. 
159 * 160 * For real file system buffers there are three different kinds backing memory: 161 * 162 * The first type backs the buffer by a kmalloc allocation. This is done for 163 * less than PAGE_SIZE allocations to avoid wasting memory. 164 * 165 * The second type is a single folio buffer - this may be a high order folio or 166 * just a single page sized folio, but either way they get treated the same way 167 * by the rest of the code - the buffer memory spans a single contiguous memory 168 * region that we don't have to map and unmap to access the data directly. 169 * 170 * The third type of buffer is the vmalloc()d buffer. This provides the buffer 171 * with the required contiguous memory region but backed by discontiguous 172 * physical pages. 173 */ 174 static int 175 xfs_buf_alloc_backing_mem( 176 struct xfs_buf *bp, 177 xfs_buf_flags_t flags) 178 { 179 size_t size = BBTOB(bp->b_length); 180 gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; 181 struct folio *folio; 182 183 if (xfs_buftarg_is_mem(bp->b_target)) 184 return xmbuf_map_backing_mem(bp); 185 186 /* Assure zeroed buffer for non-read cases. */ 187 if (!(flags & XBF_READ)) 188 gfp_mask |= __GFP_ZERO; 189 190 if (flags & XBF_READ_AHEAD) 191 gfp_mask |= __GFP_NORETRY; 192 193 /* 194 * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that 195 * is properly aligned. The slab allocator now guarantees an aligned 196 * allocation for all power of two sizes, which matches most of the 197 * smaller than PAGE_SIZE buffers used by XFS. 198 */ 199 if (size < PAGE_SIZE && is_power_of_2(size)) 200 return xfs_buf_alloc_kmem(bp, size, gfp_mask); 201 202 /* 203 * Don't bother with the retry loop for single PAGE allocations: vmalloc 204 * won't do any better. 205 */ 206 if (size <= PAGE_SIZE) 207 gfp_mask |= __GFP_NOFAIL; 208 209 /* 210 * Optimistically attempt a single high order folio allocation for 211 * larger than PAGE_SIZE buffers. 
212 * 213 * Allocating a high order folio makes the assumption that buffers are a 214 * power-of-2 size, matching the power-of-2 folios sizes available. 215 * 216 * The exception here are user xattr data buffers, which can be arbitrarily 217 * sized up to 64kB plus structure metadata, skip straight to the vmalloc 218 * path for them instead of wasting memory here. 219 */ 220 if (size > PAGE_SIZE) { 221 if (!is_power_of_2(size)) 222 goto fallback; 223 gfp_mask &= ~__GFP_DIRECT_RECLAIM; 224 gfp_mask |= __GFP_NORETRY; 225 } 226 folio = folio_alloc(gfp_mask, get_order(size)); 227 if (!folio) { 228 if (size <= PAGE_SIZE) 229 return -ENOMEM; 230 trace_xfs_buf_backing_fallback(bp, _RET_IP_); 231 goto fallback; 232 } 233 bp->b_addr = folio_address(folio); 234 trace_xfs_buf_backing_folio(bp, _RET_IP_); 235 return 0; 236 237 fallback: 238 for (;;) { 239 bp->b_addr = __vmalloc(size, gfp_mask); 240 if (bp->b_addr) 241 break; 242 if (flags & XBF_READ_AHEAD) 243 return -ENOMEM; 244 XFS_STATS_INC(bp->b_mount, xb_page_retries); 245 memalloc_retry_wait(gfp_mask); 246 } 247 248 trace_xfs_buf_backing_vmalloc(bp, _RET_IP_); 249 return 0; 250 } 251 252 static int 253 xfs_buf_alloc( 254 struct xfs_buftarg *target, 255 struct xfs_buf_map *map, 256 int nmaps, 257 xfs_buf_flags_t flags, 258 struct xfs_buf **bpp) 259 { 260 struct xfs_buf *bp; 261 int error; 262 int i; 263 264 *bpp = NULL; 265 bp = kmem_cache_zalloc(xfs_buf_cache, 266 GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); 267 268 /* 269 * We don't want certain flags to appear in b_flags unless they are 270 * specifically set by later operations on the buffer. 271 */ 272 flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); 273 274 /* 275 * A new buffer is held and locked by the owner. This ensures that the 276 * buffer is owned by the caller and racing RCU lookups right after 277 * inserting into the hash table are safe (and will have to wait for 278 * the unlock to do anything non-trivial). 
279 */ 280 bp->b_hold = 1; 281 sema_init(&bp->b_sema, 0); /* held, no waiters */ 282 283 spin_lock_init(&bp->b_lock); 284 atomic_set(&bp->b_lru_ref, 1); 285 init_completion(&bp->b_iowait); 286 INIT_LIST_HEAD(&bp->b_lru); 287 INIT_LIST_HEAD(&bp->b_list); 288 INIT_LIST_HEAD(&bp->b_li_list); 289 bp->b_target = target; 290 bp->b_mount = target->bt_mount; 291 bp->b_flags = flags; 292 bp->b_rhash_key = map[0].bm_bn; 293 bp->b_length = 0; 294 bp->b_map_count = nmaps; 295 if (nmaps == 1) 296 bp->b_maps = &bp->__b_map; 297 else 298 bp->b_maps = kcalloc(nmaps, sizeof(struct xfs_buf_map), 299 GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); 300 for (i = 0; i < nmaps; i++) { 301 bp->b_maps[i].bm_bn = map[i].bm_bn; 302 bp->b_maps[i].bm_len = map[i].bm_len; 303 bp->b_length += map[i].bm_len; 304 } 305 306 atomic_set(&bp->b_pin_count, 0); 307 init_waitqueue_head(&bp->b_waiters); 308 309 XFS_STATS_INC(bp->b_mount, xb_create); 310 trace_xfs_buf_init(bp, _RET_IP_); 311 312 error = xfs_buf_alloc_backing_mem(bp, flags); 313 if (error) { 314 xfs_buf_free(bp); 315 return error; 316 } 317 318 *bpp = bp; 319 return 0; 320 } 321 322 /* 323 * Finding and Reading Buffers 324 */ 325 static int 326 _xfs_buf_obj_cmp( 327 struct rhashtable_compare_arg *arg, 328 const void *obj) 329 { 330 const struct xfs_buf_map *map = arg->key; 331 const struct xfs_buf *bp = obj; 332 333 /* 334 * The key hashing in the lookup path depends on the key being the 335 * first element of the compare_arg, make sure to assert this. 336 */ 337 BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); 338 339 if (bp->b_rhash_key != map->bm_bn) 340 return 1; 341 342 if (unlikely(bp->b_length != map->bm_len)) { 343 /* 344 * found a block number match. If the range doesn't 345 * match, the only way this is allowed is if the buffer 346 * in the cache is stale and the transaction that made 347 * it stale has not yet committed. i.e. we are 348 * reallocating a busy extent. 
Skip this buffer and 349 * continue searching for an exact match. 350 * 351 * Note: If we're scanning for incore buffers to stale, don't 352 * complain if we find non-stale buffers. 353 */ 354 if (!(map->bm_flags & XBM_LIVESCAN)) 355 ASSERT(bp->b_flags & XBF_STALE); 356 return 1; 357 } 358 return 0; 359 } 360 361 static const struct rhashtable_params xfs_buf_hash_params = { 362 .min_size = 32, /* empty AGs have minimal footprint */ 363 .nelem_hint = 16, 364 .key_len = sizeof(xfs_daddr_t), 365 .key_offset = offsetof(struct xfs_buf, b_rhash_key), 366 .head_offset = offsetof(struct xfs_buf, b_rhash_head), 367 .automatic_shrinking = true, 368 .obj_cmpfn = _xfs_buf_obj_cmp, 369 }; 370 371 int 372 xfs_buf_cache_init( 373 struct xfs_buf_cache *bch) 374 { 375 return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params); 376 } 377 378 void 379 xfs_buf_cache_destroy( 380 struct xfs_buf_cache *bch) 381 { 382 rhashtable_destroy(&bch->bc_hash); 383 } 384 385 static int 386 xfs_buf_map_verify( 387 struct xfs_buftarg *btp, 388 struct xfs_buf_map *map) 389 { 390 xfs_daddr_t eofs; 391 392 /* Check for IOs smaller than the sector size / not sector aligned */ 393 ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); 394 ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); 395 396 /* 397 * Corrupted block numbers can get through to here, unfortunately, so we 398 * have to check that the buffer falls within the filesystem bounds. 
399 */ 400 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); 401 if (map->bm_bn < 0 || map->bm_bn >= eofs) { 402 xfs_alert(btp->bt_mount, 403 "%s: daddr 0x%llx out of range, EOFS 0x%llx", 404 __func__, map->bm_bn, eofs); 405 WARN_ON(1); 406 return -EFSCORRUPTED; 407 } 408 return 0; 409 } 410 411 static int 412 xfs_buf_find_lock( 413 struct xfs_buf *bp, 414 xfs_buf_flags_t flags) 415 { 416 if (flags & XBF_TRYLOCK) { 417 if (!xfs_buf_trylock(bp)) { 418 XFS_STATS_INC(bp->b_mount, xb_busy_locked); 419 return -EAGAIN; 420 } 421 } else { 422 xfs_buf_lock(bp); 423 XFS_STATS_INC(bp->b_mount, xb_get_locked_waited); 424 } 425 426 /* 427 * if the buffer is stale, clear all the external state associated with 428 * it. We need to keep flags such as how we allocated the buffer memory 429 * intact here. 430 */ 431 if (bp->b_flags & XBF_STALE) { 432 if (flags & XBF_LIVESCAN) { 433 xfs_buf_unlock(bp); 434 return -ENOENT; 435 } 436 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 437 bp->b_flags &= _XBF_KMEM; 438 bp->b_ops = NULL; 439 } 440 return 0; 441 } 442 443 static bool 444 xfs_buf_try_hold( 445 struct xfs_buf *bp) 446 { 447 spin_lock(&bp->b_lock); 448 if (bp->b_hold == 0) { 449 spin_unlock(&bp->b_lock); 450 return false; 451 } 452 bp->b_hold++; 453 spin_unlock(&bp->b_lock); 454 return true; 455 } 456 457 static inline int 458 xfs_buf_lookup( 459 struct xfs_buf_cache *bch, 460 struct xfs_buf_map *map, 461 xfs_buf_flags_t flags, 462 struct xfs_buf **bpp) 463 { 464 struct xfs_buf *bp; 465 int error; 466 467 rcu_read_lock(); 468 bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params); 469 if (!bp || !xfs_buf_try_hold(bp)) { 470 rcu_read_unlock(); 471 return -ENOENT; 472 } 473 rcu_read_unlock(); 474 475 error = xfs_buf_find_lock(bp, flags); 476 if (error) { 477 xfs_buf_rele(bp); 478 return error; 479 } 480 481 trace_xfs_buf_find(bp, flags, _RET_IP_); 482 *bpp = bp; 483 return 0; 484 } 485 486 /* 487 * Insert the new_bp into the hash table. 
This consumes the perag reference 488 * taken for the lookup regardless of the result of the insert. 489 */ 490 static int 491 xfs_buf_find_insert( 492 struct xfs_buftarg *btp, 493 struct xfs_buf_cache *bch, 494 struct xfs_perag *pag, 495 struct xfs_buf_map *cmap, 496 struct xfs_buf_map *map, 497 int nmaps, 498 xfs_buf_flags_t flags, 499 struct xfs_buf **bpp) 500 { 501 struct xfs_buf *new_bp; 502 struct xfs_buf *bp; 503 int error; 504 505 error = xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); 506 if (error) 507 goto out_drop_pag; 508 509 /* The new buffer keeps the perag reference until it is freed. */ 510 new_bp->b_pag = pag; 511 512 rcu_read_lock(); 513 bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash, 514 &new_bp->b_rhash_head, xfs_buf_hash_params); 515 if (IS_ERR(bp)) { 516 rcu_read_unlock(); 517 error = PTR_ERR(bp); 518 goto out_free_buf; 519 } 520 if (bp && xfs_buf_try_hold(bp)) { 521 /* found an existing buffer */ 522 rcu_read_unlock(); 523 error = xfs_buf_find_lock(bp, flags); 524 if (error) 525 xfs_buf_rele(bp); 526 else 527 *bpp = bp; 528 goto out_free_buf; 529 } 530 rcu_read_unlock(); 531 532 *bpp = new_bp; 533 return 0; 534 535 out_free_buf: 536 xfs_buf_free(new_bp); 537 out_drop_pag: 538 if (pag) 539 xfs_perag_put(pag); 540 return error; 541 } 542 543 static inline struct xfs_perag * 544 xfs_buftarg_get_pag( 545 struct xfs_buftarg *btp, 546 const struct xfs_buf_map *map) 547 { 548 struct xfs_mount *mp = btp->bt_mount; 549 550 if (xfs_buftarg_is_mem(btp)) 551 return NULL; 552 return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn)); 553 } 554 555 static inline struct xfs_buf_cache * 556 xfs_buftarg_buf_cache( 557 struct xfs_buftarg *btp, 558 struct xfs_perag *pag) 559 { 560 if (pag) 561 return &pag->pag_bcache; 562 return btp->bt_cache; 563 } 564 565 /* 566 * Assembles a buffer covering the specified range. The code is optimised for 567 * cache hits, as metadata intensive workloads will see 3 orders of magnitude 568 * more hits than misses. 
569 */ 570 int 571 xfs_buf_get_map( 572 struct xfs_buftarg *btp, 573 struct xfs_buf_map *map, 574 int nmaps, 575 xfs_buf_flags_t flags, 576 struct xfs_buf **bpp) 577 { 578 struct xfs_buf_cache *bch; 579 struct xfs_perag *pag; 580 struct xfs_buf *bp = NULL; 581 struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; 582 int error; 583 int i; 584 585 if (flags & XBF_LIVESCAN) 586 cmap.bm_flags |= XBM_LIVESCAN; 587 for (i = 0; i < nmaps; i++) 588 cmap.bm_len += map[i].bm_len; 589 590 error = xfs_buf_map_verify(btp, &cmap); 591 if (error) 592 return error; 593 594 pag = xfs_buftarg_get_pag(btp, &cmap); 595 bch = xfs_buftarg_buf_cache(btp, pag); 596 597 error = xfs_buf_lookup(bch, &cmap, flags, &bp); 598 if (error && error != -ENOENT) 599 goto out_put_perag; 600 601 /* cache hits always outnumber misses by at least 10:1 */ 602 if (unlikely(!bp)) { 603 XFS_STATS_INC(btp->bt_mount, xb_miss_locked); 604 605 if (flags & XBF_INCORE) 606 goto out_put_perag; 607 608 /* xfs_buf_find_insert() consumes the perag reference. */ 609 error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps, 610 flags, &bp); 611 if (error) 612 return error; 613 } else { 614 XFS_STATS_INC(btp->bt_mount, xb_get_locked); 615 if (pag) 616 xfs_perag_put(pag); 617 } 618 619 /* 620 * Clear b_error if this is a lookup from a caller that doesn't expect 621 * valid data to be found in the buffer. 
622 */ 623 if (!(flags & XBF_READ)) 624 xfs_buf_ioerror(bp, 0); 625 626 XFS_STATS_INC(btp->bt_mount, xb_get); 627 trace_xfs_buf_get(bp, flags, _RET_IP_); 628 *bpp = bp; 629 return 0; 630 631 out_put_perag: 632 if (pag) 633 xfs_perag_put(pag); 634 return error; 635 } 636 637 int 638 _xfs_buf_read( 639 struct xfs_buf *bp) 640 { 641 ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); 642 643 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); 644 bp->b_flags |= XBF_READ; 645 xfs_buf_submit(bp); 646 return xfs_buf_iowait(bp); 647 } 648 649 /* 650 * Reverify a buffer found in cache without an attached ->b_ops. 651 * 652 * If the caller passed an ops structure and the buffer doesn't have ops 653 * assigned, set the ops and use it to verify the contents. If verification 654 * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is 655 * already in XBF_DONE state on entry. 656 * 657 * Under normal operations, every in-core buffer is verified on read I/O 658 * completion. There are two scenarios that can lead to in-core buffers without 659 * an assigned ->b_ops. The first is during log recovery of buffers on a V4 660 * filesystem, though these buffers are purged at the end of recovery. The 661 * other is online repair, which intentionally reads with a NULL buffer ops to 662 * run several verifiers across an in-core buffer in order to establish buffer 663 * type. If repair can't establish that, the buffer will be left in memory 664 * with NULL buffer ops. 
665 */ 666 int 667 xfs_buf_reverify( 668 struct xfs_buf *bp, 669 const struct xfs_buf_ops *ops) 670 { 671 ASSERT(bp->b_flags & XBF_DONE); 672 ASSERT(bp->b_error == 0); 673 674 if (!ops || bp->b_ops) 675 return 0; 676 677 bp->b_ops = ops; 678 bp->b_ops->verify_read(bp); 679 if (bp->b_error) 680 bp->b_flags &= ~XBF_DONE; 681 return bp->b_error; 682 } 683 684 int 685 xfs_buf_read_map( 686 struct xfs_buftarg *target, 687 struct xfs_buf_map *map, 688 int nmaps, 689 xfs_buf_flags_t flags, 690 struct xfs_buf **bpp, 691 const struct xfs_buf_ops *ops, 692 xfs_failaddr_t fa) 693 { 694 struct xfs_buf *bp; 695 int error; 696 697 ASSERT(!(flags & (XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD))); 698 699 flags |= XBF_READ; 700 *bpp = NULL; 701 702 error = xfs_buf_get_map(target, map, nmaps, flags, &bp); 703 if (error) 704 return error; 705 706 trace_xfs_buf_read(bp, flags, _RET_IP_); 707 708 if (!(bp->b_flags & XBF_DONE)) { 709 /* Initiate the buffer read and wait. */ 710 XFS_STATS_INC(target->bt_mount, xb_get_read); 711 bp->b_ops = ops; 712 error = _xfs_buf_read(bp); 713 } else { 714 /* Buffer already read; all we need to do is check it. */ 715 error = xfs_buf_reverify(bp, ops); 716 717 /* We do not want read in the flags */ 718 bp->b_flags &= ~XBF_READ; 719 ASSERT(bp->b_ops != NULL || ops == NULL); 720 } 721 722 /* 723 * If we've had a read error, then the contents of the buffer are 724 * invalid and should not be used. To ensure that a followup read tries 725 * to pull the buffer from disk again, we clear the XBF_DONE flag and 726 * mark the buffer stale. This ensures that anyone who has a current 727 * reference to the buffer will interpret it's contents correctly and 728 * future cache lookups will also treat it as an empty, uninitialised 729 * buffer. 
730 */ 731 if (error) { 732 /* 733 * Check against log shutdown for error reporting because 734 * metadata writeback may require a read first and we need to 735 * report errors in metadata writeback until the log is shut 736 * down. High level transaction read functions already check 737 * against mount shutdown, anyway, so we only need to be 738 * concerned about low level IO interactions here. 739 */ 740 if (!xlog_is_shutdown(target->bt_mount->m_log)) 741 xfs_buf_ioerror_alert(bp, fa); 742 743 bp->b_flags &= ~XBF_DONE; 744 xfs_buf_stale(bp); 745 xfs_buf_relse(bp); 746 747 /* bad CRC means corrupted metadata */ 748 if (error == -EFSBADCRC) 749 error = -EFSCORRUPTED; 750 return error; 751 } 752 753 *bpp = bp; 754 return 0; 755 } 756 757 /* 758 * If we are not low on memory then do the readahead in a deadlock 759 * safe manner. 760 */ 761 void 762 xfs_buf_readahead_map( 763 struct xfs_buftarg *target, 764 struct xfs_buf_map *map, 765 int nmaps, 766 const struct xfs_buf_ops *ops) 767 { 768 const xfs_buf_flags_t flags = XBF_READ | XBF_ASYNC | XBF_READ_AHEAD; 769 struct xfs_buf *bp; 770 771 /* 772 * Currently we don't have a good means or justification for performing 773 * xmbuf_map_page asynchronously, so we don't do readahead. 774 */ 775 if (xfs_buftarg_is_mem(target)) 776 return; 777 778 if (xfs_buf_get_map(target, map, nmaps, flags | XBF_TRYLOCK, &bp)) 779 return; 780 trace_xfs_buf_readahead(bp, 0, _RET_IP_); 781 782 if (bp->b_flags & XBF_DONE) { 783 xfs_buf_reverify(bp, ops); 784 xfs_buf_relse(bp); 785 return; 786 } 787 XFS_STATS_INC(target->bt_mount, xb_get_read); 788 bp->b_ops = ops; 789 bp->b_flags &= ~(XBF_WRITE | XBF_DONE); 790 bp->b_flags |= flags; 791 percpu_counter_inc(&target->bt_readahead_count); 792 xfs_buf_submit(bp); 793 } 794 795 /* 796 * Read an uncached buffer from disk. Allocates and returns a locked 797 * buffer containing the disk contents or nothing. 
Uncached buffers always have 798 * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer 799 * is cached or uncached during fault diagnosis. 800 */ 801 int 802 xfs_buf_read_uncached( 803 struct xfs_buftarg *target, 804 xfs_daddr_t daddr, 805 size_t numblks, 806 struct xfs_buf **bpp, 807 const struct xfs_buf_ops *ops) 808 { 809 struct xfs_buf *bp; 810 int error; 811 812 *bpp = NULL; 813 814 error = xfs_buf_get_uncached(target, numblks, &bp); 815 if (error) 816 return error; 817 818 /* set up the buffer for a read IO */ 819 ASSERT(bp->b_map_count == 1); 820 bp->b_rhash_key = XFS_BUF_DADDR_NULL; 821 bp->b_maps[0].bm_bn = daddr; 822 bp->b_flags |= XBF_READ; 823 bp->b_ops = ops; 824 825 xfs_buf_submit(bp); 826 error = xfs_buf_iowait(bp); 827 if (error) { 828 xfs_buf_relse(bp); 829 return error; 830 } 831 832 *bpp = bp; 833 return 0; 834 } 835 836 int 837 xfs_buf_get_uncached( 838 struct xfs_buftarg *target, 839 size_t numblks, 840 struct xfs_buf **bpp) 841 { 842 int error; 843 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); 844 845 error = xfs_buf_alloc(target, &map, 1, 0, bpp); 846 if (!error) 847 trace_xfs_buf_get_uncached(*bpp, _RET_IP_); 848 return error; 849 } 850 851 /* 852 * Increment reference count on buffer, to hold the buffer concurrently 853 * with another thread which may release (free) the buffer asynchronously. 854 * Must hold the buffer already to call this function. 
 */
void
xfs_buf_hold(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);

	/* b_hold is updated under b_lock. */
	spin_lock(&bp->b_lock);
	bp->b_hold++;
	spin_unlock(&bp->b_lock);
}

/*
 * Drop a hold on an uncached buffer and free the buffer when the last hold
 * goes away. The buffer is asserted not to be on the LRU.
 */
static void
xfs_buf_rele_uncached(
	struct xfs_buf		*bp)
{
	ASSERT(list_empty(&bp->b_lru));

	spin_lock(&bp->b_lock);
	if (--bp->b_hold) {
		/* Other holders remain; nothing more to do. */
		spin_unlock(&bp->b_lock);
		return;
	}
	spin_unlock(&bp->b_lock);
	xfs_buf_free(bp);
}

/*
 * Drop a hold on a cached buffer.
 *
 * If this is not the last hold, the count is simply decremented. When the
 * last hold is dropped and the buffer still has a non-zero b_lru_ref, the
 * buffer is parked on the LRU, which keeps the hold. Otherwise the buffer is
 * taken off the LRU (if needed), removed from the hash table, its perag
 * reference is dropped and the buffer is freed after b_lock is released.
 */
static void
xfs_buf_rele_cached(
	struct xfs_buf		*bp)
{
	struct xfs_buftarg	*btp = bp->b_target;
	struct xfs_perag	*pag = bp->b_pag;
	struct xfs_buf_cache	*bch = xfs_buftarg_buf_cache(btp, pag);
	bool			freebuf = false;

	/*
	 * NOTE(review): xfs_buf_rele() emits this same tracepoint before
	 * calling us, so a release through that path is traced twice.
	 */
	trace_xfs_buf_rele(bp, _RET_IP_);

	spin_lock(&bp->b_lock);
	ASSERT(bp->b_hold >= 1);
	if (bp->b_hold > 1) {
		bp->b_hold--;
		goto out_unlock;
	}

	/* we are asked to drop the last reference */
	if (atomic_read(&bp->b_lru_ref)) {
		/*
		 * If the buffer is added to the LRU, keep the reference to the
		 * buffer for the LRU and clear the (now stale) dispose list
		 * state flag, else drop the reference.
		 */
		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
			bp->b_state &= ~XFS_BSTATE_DISPOSE;
		else
			bp->b_hold--;
	} else {
		bp->b_hold--;
		/*
		 * most of the time buffers will already be removed from the
		 * LRU, so optimise that case by checking for the
		 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
		 * was on was the disposal list
		 */
		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
			list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
		} else {
			ASSERT(list_empty(&bp->b_lru));
		}

		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
		rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
				xfs_buf_hash_params);
		if (pag)
			xfs_perag_put(pag);
		freebuf = true;
	}

out_unlock:
	spin_unlock(&bp->b_lock);

	/* xfs_buf_free() may sleep, so it is called without b_lock held. */
	if (freebuf)
		xfs_buf_free(bp);
}

/*
 * Release a hold on the specified buffer.
 *
 * Dispatches to the uncached or cached release path depending on whether the
 * buffer has a cache key.
 */
void
xfs_buf_rele(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_rele(bp, _RET_IP_);
	if (xfs_buf_is_uncached(bp))
		xfs_buf_rele_uncached(bp);
	else
		xfs_buf_rele_cached(bp);
}

/*
 * Try to lock a buffer object without sleeping.
 *
 * Returns non-zero if the lock was acquired and zero if the buffer is already
 * locked. The outcome is traced either way.
 *
 * Unlike xfs_buf_lock(), this function does not force the log for stale,
 * pinned buffers; it only attempts the semaphore.
 */
int
xfs_buf_trylock(
	struct xfs_buf		*bp)
{
	int			locked;

	/* down_trylock() returns 0 when the semaphore was acquired. */
	locked = down_trylock(&bp->b_sema) == 0;
	if (locked)
		trace_xfs_buf_trylock(bp, _RET_IP_);
	else
		trace_xfs_buf_trylock_fail(bp, _RET_IP_);
	return locked;
}

/*
 * Lock a buffer object.
981 * 982 * If we come across a stale, pinned, locked buffer, we know that we 983 * are being asked to lock a buffer that has been reallocated. Because 984 * it is pinned, we know that the log has not been pushed to disk and 985 * hence it will still be locked. Rather than sleeping until someone 986 * else pushes the log, push it ourselves before trying to get the lock. 987 */ 988 void 989 xfs_buf_lock( 990 struct xfs_buf *bp) 991 { 992 trace_xfs_buf_lock(bp, _RET_IP_); 993 994 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 995 xfs_log_force(bp->b_mount, 0); 996 down(&bp->b_sema); 997 998 trace_xfs_buf_lock_done(bp, _RET_IP_); 999 } 1000 1001 void 1002 xfs_buf_unlock( 1003 struct xfs_buf *bp) 1004 { 1005 ASSERT(xfs_buf_islocked(bp)); 1006 1007 up(&bp->b_sema); 1008 trace_xfs_buf_unlock(bp, _RET_IP_); 1009 } 1010 1011 STATIC void 1012 xfs_buf_wait_unpin( 1013 struct xfs_buf *bp) 1014 { 1015 DECLARE_WAITQUEUE (wait, current); 1016 1017 if (atomic_read(&bp->b_pin_count) == 0) 1018 return; 1019 1020 add_wait_queue(&bp->b_waiters, &wait); 1021 for (;;) { 1022 set_current_state(TASK_UNINTERRUPTIBLE); 1023 if (atomic_read(&bp->b_pin_count) == 0) 1024 break; 1025 io_schedule(); 1026 } 1027 remove_wait_queue(&bp->b_waiters, &wait); 1028 set_current_state(TASK_RUNNING); 1029 } 1030 1031 static void 1032 xfs_buf_ioerror_alert_ratelimited( 1033 struct xfs_buf *bp) 1034 { 1035 static unsigned long lasttime; 1036 static struct xfs_buftarg *lasttarg; 1037 1038 if (bp->b_target != lasttarg || 1039 time_after(jiffies, (lasttime + 5*HZ))) { 1040 lasttime = jiffies; 1041 xfs_buf_ioerror_alert(bp, __this_address); 1042 } 1043 lasttarg = bp->b_target; 1044 } 1045 1046 /* 1047 * Account for this latest trip around the retry handler, and decide if 1048 * we've failed enough times to constitute a permanent failure. 
 */
static bool
xfs_buf_ioerror_permanent(
	struct xfs_buf		*bp,
	struct xfs_error_cfg	*cfg)
{
	struct xfs_mount	*mp = bp->b_mount;

	/* Note: the retry counter is bumped as a side effect of this check. */
	if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
	    ++bp->b_retries > cfg->max_retries)
		return true;
	if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
	    time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
		return true;

	/* At unmount we may treat errors differently */
	if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
		return true;

	return false;
}

/*
 * On a sync write or shutdown we just want to stale the buffer and let the
 * caller handle the error in bp->b_error appropriately.
 *
 * If the write was asynchronous then no one will be looking for the error.  If
 * this is the first failure of this type, clear the error state and write the
 * buffer out again. This means we always retry an async write failure at least
 * once, but we also need to set the buffer up to behave correctly now for
 * repeated failures.
 *
 * If we get repeated async write failures, then we take action according to the
 * error configuration we have been set up to use.
 *
 * Returns true if this function took care of error handling and the caller must
 * not touch the buffer again.  Return false if the caller should proceed with
 * normal I/O completion handling.
 */
static bool
xfs_buf_ioend_handle_error(
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_error_cfg	*cfg;
	struct xfs_log_item	*lip;

	/*
	 * If we've already shutdown the journal because of I/O errors, there's
	 * no point in giving this a retry.
	 */
	if (xlog_is_shutdown(mp->m_log))
		goto out_stale;

	xfs_buf_ioerror_alert_ratelimited(bp);

	/*
	 * We're not going to bother about retrying this during recovery.
	 * One strike!
	 */
	if (bp->b_flags & _XBF_LOGRECOVERY) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		return false;
	}

	/*
	 * Synchronous writes will have callers process the error.
	 */
	if (!(bp->b_flags & XBF_ASYNC))
		goto out_stale;

	trace_xfs_buf_iodone_async(bp, _RET_IP_);

	/*
	 * Resubmit immediately if this is a different error from last time, or
	 * if the buffer is neither stale nor already marked as a failed write.
	 */
	cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
	if (bp->b_last_error != bp->b_error ||
	    !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
		bp->b_last_error = bp->b_error;
		/* Start the retry timeout clock on the first failure. */
		if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
		    !bp->b_first_retry_time)
			bp->b_first_retry_time = jiffies;
		goto resubmit;
	}

	/*
	 * Permanent error - we need to trigger a shutdown if we haven't already
	 * to indicate that inconsistency will result from this action.
	 */
	if (xfs_buf_ioerror_permanent(bp, cfg)) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out_stale;
	}

	/* Still considered a transient error. Caller will schedule retries. */
	list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
		set_bit(XFS_LI_FAILED, &lip->li_flags);
		clear_bit(XFS_LI_FLUSHING, &lip->li_flags);
	}

	xfs_buf_ioerror(bp, 0);
	xfs_buf_relse(bp);
	return true;

resubmit:
	/* Clear the error, remember that a write failed and try again. */
	xfs_buf_ioerror(bp, 0);
	bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
	reinit_completion(&bp->b_iowait);
	xfs_buf_submit(bp);
	return true;
out_stale:
	/* Give up: stale the buffer and let normal completion run. */
	xfs_buf_stale(bp);
	bp->b_flags |= XBF_DONE;
	bp->b_flags &= ~XBF_WRITE;
	trace_xfs_buf_error_relse(bp, _RET_IP_);
	return false;
}

/*
 * Common I/O completion processing for reads and writes.
 *
 * Returns false when the write error handler has taken over the buffer (it
 * was resubmitted or released there), in which case the caller must not touch
 * the buffer again. Returns true when the caller should finish completion.
 */
static bool
__xfs_buf_ioend(
	struct xfs_buf	*bp)
{
	trace_xfs_buf_iodone(bp, _RET_IP_);

	if (bp->b_flags & XBF_READ) {
		if (!bp->b_error && is_vmalloc_addr(bp->b_addr))
			invalidate_kernel_vmap_range(bp->b_addr,
				roundup(BBTOB(bp->b_length), PAGE_SIZE));
		/* Run the read verifier; it may set b_error. */
		if (!bp->b_error && bp->b_ops)
			bp->b_ops->verify_read(bp);
		if (!bp->b_error)
			bp->b_flags |= XBF_DONE;
		if (bp->b_flags & XBF_READ_AHEAD)
			percpu_counter_dec(&bp->b_target->bt_readahead_count);
	} else {
		if (!bp->b_error) {
			bp->b_flags &= ~XBF_WRITE_FAIL;
			bp->b_flags |= XBF_DONE;
		}

		if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
			return false;

		/* clear the retry state */
		bp->b_last_error = 0;
		bp->b_retries = 0;
		bp->b_first_retry_time = 0;

		/*
		 * Note that for things like remote attribute buffers, there may
		 * not be a buffer log item here, so processing the buffer log
		 * item must remain optional.
		 */
		if (bp->b_log_item)
			xfs_buf_item_done(bp);

		if (bp->b_iodone)
			bp->b_iodone(bp);
	}

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
			 _XBF_LOGRECOVERY);
	return true;
}

/*
 * Complete I/O on a buffer from the current context: asynchronous buffers are
 * released, synchronous ones wake up the waiter on b_iowait.
 */
static void
xfs_buf_ioend(
	struct xfs_buf	*bp)
{
	if (!__xfs_buf_ioend(bp))
		return;
	if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
	else
		complete(&bp->b_iowait);
}

/*
 * Workqueue handler that completes I/O for a buffer and releases it unless
 * the error handler took the buffer over.
 */
static void
xfs_buf_ioend_work(
	struct work_struct	*work)
{
	struct xfs_buf		*bp =
		container_of(work, struct xfs_buf, b_ioend_work);

	if (__xfs_buf_ioend(bp))
		xfs_buf_relse(bp);
}

/*
 * Record @error in the buffer and trace it together with @failaddr. @error
 * must be zero or a negative errno no smaller than -1000 (asserted).
 */
void
__xfs_buf_ioerror(
	struct xfs_buf		*bp,
	int			error,
	xfs_failaddr_t		failaddr)
{
	ASSERT(error <= 0 && error >= -1000);
	bp->b_error = error;
	trace_xfs_buf_ioerror(bp, error, failaddr);
}

/*
 * Log a rate-limited alert describing the I/O error currently recorded in the
 * buffer; @func is printed as the location that detected it.
 */
void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	xfs_failaddr_t		func)
{
	xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
		"metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
				  func, (uint64_t)xfs_buf_daddr(bp),
				  bp->b_length, -bp->b_error);
}

/*
 * To simulate an I/O failure, the buffer must be locked and held with at least
 * three references. The LRU reference is dropped by the stale call. The buf
 * item reference is dropped via ioend processing. The third reference is owned
 * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
1263 */ 1264 void 1265 xfs_buf_ioend_fail( 1266 struct xfs_buf *bp) 1267 { 1268 bp->b_flags &= ~XBF_DONE; 1269 xfs_buf_stale(bp); 1270 xfs_buf_ioerror(bp, -EIO); 1271 xfs_buf_ioend(bp); 1272 } 1273 1274 int 1275 xfs_bwrite( 1276 struct xfs_buf *bp) 1277 { 1278 int error; 1279 1280 ASSERT(xfs_buf_islocked(bp)); 1281 1282 bp->b_flags |= XBF_WRITE; 1283 bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | 1284 XBF_DONE); 1285 1286 xfs_buf_submit(bp); 1287 error = xfs_buf_iowait(bp); 1288 if (error) 1289 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); 1290 return error; 1291 } 1292 1293 static void 1294 xfs_buf_bio_end_io( 1295 struct bio *bio) 1296 { 1297 struct xfs_buf *bp = bio->bi_private; 1298 1299 if (bio->bi_status) 1300 xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status)); 1301 else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && 1302 XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) 1303 xfs_buf_ioerror(bp, -EIO); 1304 1305 if (bp->b_flags & XBF_ASYNC) { 1306 INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); 1307 queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work); 1308 } else { 1309 complete(&bp->b_iowait); 1310 } 1311 1312 bio_put(bio); 1313 } 1314 1315 static inline blk_opf_t 1316 xfs_buf_bio_op( 1317 struct xfs_buf *bp) 1318 { 1319 blk_opf_t op; 1320 1321 if (bp->b_flags & XBF_WRITE) { 1322 op = REQ_OP_WRITE; 1323 } else { 1324 op = REQ_OP_READ; 1325 if (bp->b_flags & XBF_READ_AHEAD) 1326 op |= REQ_RAHEAD; 1327 } 1328 1329 return op | REQ_META; 1330 } 1331 1332 static void 1333 xfs_buf_submit_bio( 1334 struct xfs_buf *bp) 1335 { 1336 unsigned int map = 0; 1337 struct blk_plug plug; 1338 struct bio *bio; 1339 1340 if (is_vmalloc_addr(bp->b_addr)) { 1341 unsigned int size = BBTOB(bp->b_length); 1342 unsigned int alloc_size = roundup(size, PAGE_SIZE); 1343 void *data = bp->b_addr; 1344 1345 bio = bio_alloc(bp->b_target->bt_bdev, alloc_size >> PAGE_SHIFT, 1346 xfs_buf_bio_op(bp), GFP_NOIO); 1347 1348 do { 1349 
unsigned int len = min(size, PAGE_SIZE); 1350 1351 ASSERT(offset_in_page(data) == 0); 1352 __bio_add_page(bio, vmalloc_to_page(data), len, 0); 1353 data += len; 1354 size -= len; 1355 } while (size); 1356 1357 flush_kernel_vmap_range(bp->b_addr, alloc_size); 1358 } else { 1359 /* 1360 * Single folio or slab allocation. Must be contiguous and thus 1361 * only a single bvec is needed. 1362 * 1363 * This uses the page based bio add helper for now as that is 1364 * the lowest common denominator between folios and slab 1365 * allocations. To be replaced with a better block layer 1366 * helper soon (hopefully). 1367 */ 1368 bio = bio_alloc(bp->b_target->bt_bdev, 1, xfs_buf_bio_op(bp), 1369 GFP_NOIO); 1370 __bio_add_page(bio, virt_to_page(bp->b_addr), 1371 BBTOB(bp->b_length), 1372 offset_in_page(bp->b_addr)); 1373 } 1374 1375 bio->bi_private = bp; 1376 bio->bi_end_io = xfs_buf_bio_end_io; 1377 1378 /* 1379 * If there is more than one map segment, split out a new bio for each 1380 * map except of the last one. The last map is handled by the 1381 * remainder of the original bio outside the loop. 1382 */ 1383 blk_start_plug(&plug); 1384 for (map = 0; map < bp->b_map_count - 1; map++) { 1385 struct bio *split; 1386 1387 split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS, 1388 &fs_bio_set); 1389 split->bi_iter.bi_sector = bp->b_maps[map].bm_bn; 1390 bio_chain(split, bio); 1391 submit_bio(split); 1392 } 1393 bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn; 1394 submit_bio(bio); 1395 blk_finish_plug(&plug); 1396 } 1397 1398 /* 1399 * Wait for I/O completion of a sync buffer and return the I/O error code. 
1400 */ 1401 static int 1402 xfs_buf_iowait( 1403 struct xfs_buf *bp) 1404 { 1405 ASSERT(!(bp->b_flags & XBF_ASYNC)); 1406 1407 do { 1408 trace_xfs_buf_iowait(bp, _RET_IP_); 1409 wait_for_completion(&bp->b_iowait); 1410 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1411 } while (!__xfs_buf_ioend(bp)); 1412 1413 return bp->b_error; 1414 } 1415 1416 /* 1417 * Run the write verifier callback function if it exists. If this fails, mark 1418 * the buffer with an error and do not dispatch the I/O. 1419 */ 1420 static bool 1421 xfs_buf_verify_write( 1422 struct xfs_buf *bp) 1423 { 1424 if (bp->b_ops) { 1425 bp->b_ops->verify_write(bp); 1426 if (bp->b_error) 1427 return false; 1428 } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { 1429 /* 1430 * Non-crc filesystems don't attach verifiers during log 1431 * recovery, so don't warn for such filesystems. 1432 */ 1433 if (xfs_has_crc(bp->b_mount)) { 1434 xfs_warn(bp->b_mount, 1435 "%s: no buf ops on daddr 0x%llx len %d", 1436 __func__, xfs_buf_daddr(bp), 1437 bp->b_length); 1438 xfs_hex_dump(bp->b_addr, XFS_CORRUPTION_DUMP_LEN); 1439 dump_stack(); 1440 } 1441 } 1442 1443 return true; 1444 } 1445 1446 /* 1447 * Buffer I/O submission path, read or write. Asynchronous submission transfers 1448 * the buffer lock ownership and the current reference to the IO. It is not 1449 * safe to reference the buffer after a call to this function unless the caller 1450 * holds an additional reference itself. 1451 */ 1452 static void 1453 xfs_buf_submit( 1454 struct xfs_buf *bp) 1455 { 1456 trace_xfs_buf_submit(bp, _RET_IP_); 1457 1458 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1459 1460 /* 1461 * On log shutdown we stale and complete the buffer immediately. We can 1462 * be called to read the superblock before the log has been set up, so 1463 * be careful checking the log state. 1464 * 1465 * Checking the mount shutdown state here can result in the log tail 1466 * moving inappropriately on disk as the log may not yet be shut down. 1467 * i.e. 
failing this buffer on mount shutdown can remove it from the AIL 1468 * and move the tail of the log forwards without having written this 1469 * buffer to disk. This corrupts the log tail state in memory, and 1470 * because the log may not be shut down yet, it can then be propagated 1471 * to disk before the log is shutdown. Hence we check log shutdown 1472 * state here rather than mount state to avoid corrupting the log tail 1473 * on shutdown. 1474 */ 1475 if (bp->b_mount->m_log && xlog_is_shutdown(bp->b_mount->m_log)) { 1476 xfs_buf_ioend_fail(bp); 1477 return; 1478 } 1479 1480 if (bp->b_flags & XBF_WRITE) 1481 xfs_buf_wait_unpin(bp); 1482 1483 /* 1484 * Make sure we capture only current IO errors rather than stale errors 1485 * left over from previous use of the buffer (e.g. failed readahead). 1486 */ 1487 bp->b_error = 0; 1488 1489 if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) { 1490 xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE); 1491 xfs_buf_ioend(bp); 1492 return; 1493 } 1494 1495 /* In-memory targets are directly mapped, no I/O required. */ 1496 if (xfs_buftarg_is_mem(bp->b_target)) { 1497 xfs_buf_ioend(bp); 1498 return; 1499 } 1500 1501 xfs_buf_submit_bio(bp); 1502 } 1503 1504 /* 1505 * Log a message about and stale a buffer that a caller has decided is corrupt. 1506 * 1507 * This function should be called for the kinds of metadata corruption that 1508 * cannot be detect from a verifier, such as incorrect inter-block relationship 1509 * data. Do /not/ call this function from a verifier function. 1510 * 1511 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will 1512 * be marked stale, but b_error will not be set. The caller is responsible for 1513 * releasing the buffer or fixing it. 
1514 */ 1515 void 1516 __xfs_buf_mark_corrupt( 1517 struct xfs_buf *bp, 1518 xfs_failaddr_t fa) 1519 { 1520 ASSERT(bp->b_flags & XBF_DONE); 1521 1522 xfs_buf_corruption_error(bp, fa); 1523 xfs_buf_stale(bp); 1524 } 1525 1526 /* 1527 * Handling of buffer targets (buftargs). 1528 */ 1529 1530 /* 1531 * Wait for any bufs with callbacks that have been submitted but have not yet 1532 * returned. These buffers will have an elevated hold count, so wait on those 1533 * while freeing all the buffers only held by the LRU. 1534 */ 1535 static enum lru_status 1536 xfs_buftarg_drain_rele( 1537 struct list_head *item, 1538 struct list_lru_one *lru, 1539 void *arg) 1540 1541 { 1542 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1543 struct list_head *dispose = arg; 1544 1545 if (!spin_trylock(&bp->b_lock)) 1546 return LRU_SKIP; 1547 if (bp->b_hold > 1) { 1548 /* need to wait, so skip it this pass */ 1549 spin_unlock(&bp->b_lock); 1550 trace_xfs_buf_drain_buftarg(bp, _RET_IP_); 1551 return LRU_SKIP; 1552 } 1553 1554 /* 1555 * clear the LRU reference count so the buffer doesn't get 1556 * ignored in xfs_buf_rele(). 1557 */ 1558 atomic_set(&bp->b_lru_ref, 0); 1559 bp->b_state |= XFS_BSTATE_DISPOSE; 1560 list_lru_isolate_move(lru, item, dispose); 1561 spin_unlock(&bp->b_lock); 1562 return LRU_REMOVED; 1563 } 1564 1565 /* 1566 * Wait for outstanding I/O on the buftarg to complete. 1567 */ 1568 void 1569 xfs_buftarg_wait( 1570 struct xfs_buftarg *btp) 1571 { 1572 /* 1573 * First wait for all in-flight readahead buffers to be released. This is 1574 * critical as new buffers do not make the LRU until they are released. 1575 * 1576 * Next, flush the buffer workqueue to ensure all completion processing 1577 * has finished. Just waiting on buffer locks is not sufficient for 1578 * async IO as the reference count held over IO is not released until 1579 * after the buffer lock is dropped. 
Hence we need to ensure here that 1580 * all reference counts have been dropped before we start walking the 1581 * LRU list. 1582 */ 1583 while (percpu_counter_sum(&btp->bt_readahead_count)) 1584 delay(100); 1585 flush_workqueue(btp->bt_mount->m_buf_workqueue); 1586 } 1587 1588 void 1589 xfs_buftarg_drain( 1590 struct xfs_buftarg *btp) 1591 { 1592 LIST_HEAD(dispose); 1593 int loop = 0; 1594 bool write_fail = false; 1595 1596 xfs_buftarg_wait(btp); 1597 1598 /* loop until there is nothing left on the lru list. */ 1599 while (list_lru_count(&btp->bt_lru)) { 1600 list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele, 1601 &dispose, LONG_MAX); 1602 1603 while (!list_empty(&dispose)) { 1604 struct xfs_buf *bp; 1605 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1606 list_del_init(&bp->b_lru); 1607 if (bp->b_flags & XBF_WRITE_FAIL) { 1608 write_fail = true; 1609 xfs_buf_alert_ratelimited(bp, 1610 "XFS: Corruption Alert", 1611 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", 1612 (long long)xfs_buf_daddr(bp)); 1613 } 1614 xfs_buf_rele(bp); 1615 } 1616 if (loop++ != 0) 1617 delay(100); 1618 } 1619 1620 /* 1621 * If one or more failed buffers were freed, that means dirty metadata 1622 * was thrown away. This should only ever happen after I/O completion 1623 * handling has elevated I/O error(s) to permanent failures and shuts 1624 * down the journal. 1625 */ 1626 if (write_fail) { 1627 ASSERT(xlog_is_shutdown(btp->bt_mount->m_log)); 1628 xfs_alert(btp->bt_mount, 1629 "Please run xfs_repair to determine the extent of the problem."); 1630 } 1631 } 1632 1633 static enum lru_status 1634 xfs_buftarg_isolate( 1635 struct list_head *item, 1636 struct list_lru_one *lru, 1637 void *arg) 1638 { 1639 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1640 struct list_head *dispose = arg; 1641 1642 /* 1643 * we are inverting the lru lock/bp->b_lock here, so use a trylock. 1644 * If we fail to get the lock, just skip it. 
1645 */ 1646 if (!spin_trylock(&bp->b_lock)) 1647 return LRU_SKIP; 1648 /* 1649 * Decrement the b_lru_ref count unless the value is already 1650 * zero. If the value is already zero, we need to reclaim the 1651 * buffer, otherwise it gets another trip through the LRU. 1652 */ 1653 if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) { 1654 spin_unlock(&bp->b_lock); 1655 return LRU_ROTATE; 1656 } 1657 1658 bp->b_state |= XFS_BSTATE_DISPOSE; 1659 list_lru_isolate_move(lru, item, dispose); 1660 spin_unlock(&bp->b_lock); 1661 return LRU_REMOVED; 1662 } 1663 1664 static unsigned long 1665 xfs_buftarg_shrink_scan( 1666 struct shrinker *shrink, 1667 struct shrink_control *sc) 1668 { 1669 struct xfs_buftarg *btp = shrink->private_data; 1670 LIST_HEAD(dispose); 1671 unsigned long freed; 1672 1673 freed = list_lru_shrink_walk(&btp->bt_lru, sc, 1674 xfs_buftarg_isolate, &dispose); 1675 1676 while (!list_empty(&dispose)) { 1677 struct xfs_buf *bp; 1678 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1679 list_del_init(&bp->b_lru); 1680 xfs_buf_rele(bp); 1681 } 1682 1683 return freed; 1684 } 1685 1686 static unsigned long 1687 xfs_buftarg_shrink_count( 1688 struct shrinker *shrink, 1689 struct shrink_control *sc) 1690 { 1691 struct xfs_buftarg *btp = shrink->private_data; 1692 return list_lru_shrink_count(&btp->bt_lru, sc); 1693 } 1694 1695 void 1696 xfs_destroy_buftarg( 1697 struct xfs_buftarg *btp) 1698 { 1699 shrinker_free(btp->bt_shrinker); 1700 ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); 1701 percpu_counter_destroy(&btp->bt_readahead_count); 1702 list_lru_destroy(&btp->bt_lru); 1703 } 1704 1705 void 1706 xfs_free_buftarg( 1707 struct xfs_buftarg *btp) 1708 { 1709 xfs_destroy_buftarg(btp); 1710 fs_put_dax(btp->bt_daxdev, btp->bt_mount); 1711 /* the main block device is closed by kill_block_super */ 1712 if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) 1713 bdev_fput(btp->bt_bdev_file); 1714 kfree(btp); 1715 } 1716 1717 int 1718 xfs_setsize_buftarg( 1719 
struct xfs_buftarg *btp, 1720 unsigned int sectorsize) 1721 { 1722 /* Set up metadata sector size info */ 1723 btp->bt_meta_sectorsize = sectorsize; 1724 btp->bt_meta_sectormask = sectorsize - 1; 1725 1726 if (set_blocksize(btp->bt_bdev_file, sectorsize)) { 1727 xfs_warn(btp->bt_mount, 1728 "Cannot set_blocksize to %u on device %pg", 1729 sectorsize, btp->bt_bdev); 1730 return -EINVAL; 1731 } 1732 1733 return 0; 1734 } 1735 1736 int 1737 xfs_init_buftarg( 1738 struct xfs_buftarg *btp, 1739 size_t logical_sectorsize, 1740 const char *descr) 1741 { 1742 /* Set up device logical sector size mask */ 1743 btp->bt_logical_sectorsize = logical_sectorsize; 1744 btp->bt_logical_sectormask = logical_sectorsize - 1; 1745 1746 /* 1747 * Buffer IO error rate limiting. Limit it to no more than 10 messages 1748 * per 30 seconds so as to not spam logs too much on repeated errors. 1749 */ 1750 ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, 1751 DEFAULT_RATELIMIT_BURST); 1752 1753 if (list_lru_init(&btp->bt_lru)) 1754 return -ENOMEM; 1755 if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL)) 1756 goto out_destroy_lru; 1757 1758 btp->bt_shrinker = 1759 shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr); 1760 if (!btp->bt_shrinker) 1761 goto out_destroy_io_count; 1762 btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; 1763 btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; 1764 btp->bt_shrinker->private_data = btp; 1765 shrinker_register(btp->bt_shrinker); 1766 return 0; 1767 1768 out_destroy_io_count: 1769 percpu_counter_destroy(&btp->bt_readahead_count); 1770 out_destroy_lru: 1771 list_lru_destroy(&btp->bt_lru); 1772 return -ENOMEM; 1773 } 1774 1775 struct xfs_buftarg * 1776 xfs_alloc_buftarg( 1777 struct xfs_mount *mp, 1778 struct file *bdev_file) 1779 { 1780 struct xfs_buftarg *btp; 1781 const struct dax_holder_operations *ops = NULL; 1782 1783 #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) 1784 ops = &xfs_dax_holder_operations; 
1785 #endif 1786 btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL); 1787 1788 btp->bt_mount = mp; 1789 btp->bt_bdev_file = bdev_file; 1790 btp->bt_bdev = file_bdev(bdev_file); 1791 btp->bt_dev = btp->bt_bdev->bd_dev; 1792 btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, 1793 mp, ops); 1794 1795 if (bdev_can_atomic_write(btp->bt_bdev)) { 1796 btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes( 1797 btp->bt_bdev); 1798 btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes( 1799 btp->bt_bdev); 1800 } 1801 1802 /* 1803 * When allocating the buftargs we have not yet read the super block and 1804 * thus don't know the file system sector size yet. 1805 */ 1806 if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev))) 1807 goto error_free; 1808 if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev), 1809 mp->m_super->s_id)) 1810 goto error_free; 1811 1812 return btp; 1813 1814 error_free: 1815 kfree(btp); 1816 return NULL; 1817 } 1818 1819 static inline void 1820 xfs_buf_list_del( 1821 struct xfs_buf *bp) 1822 { 1823 list_del_init(&bp->b_list); 1824 wake_up_var(&bp->b_list); 1825 } 1826 1827 /* 1828 * Cancel a delayed write list. 1829 * 1830 * Remove each buffer from the list, clear the delwri queue flag and drop the 1831 * associated buffer reference. 1832 */ 1833 void 1834 xfs_buf_delwri_cancel( 1835 struct list_head *list) 1836 { 1837 struct xfs_buf *bp; 1838 1839 while (!list_empty(list)) { 1840 bp = list_first_entry(list, struct xfs_buf, b_list); 1841 1842 xfs_buf_lock(bp); 1843 bp->b_flags &= ~_XBF_DELWRI_Q; 1844 xfs_buf_list_del(bp); 1845 xfs_buf_relse(bp); 1846 } 1847 } 1848 1849 /* 1850 * Add a buffer to the delayed write list. 1851 * 1852 * This queues a buffer for writeout if it hasn't already been. Note that 1853 * neither this routine nor the buffer list submission functions perform 1854 * any internal synchronization. It is expected that the lists are thread-local 1855 * to the callers. 
1856 * 1857 * Returns true if we queued up the buffer, or false if it already had 1858 * been on the buffer list. 1859 */ 1860 bool 1861 xfs_buf_delwri_queue( 1862 struct xfs_buf *bp, 1863 struct list_head *list) 1864 { 1865 ASSERT(xfs_buf_islocked(bp)); 1866 ASSERT(!(bp->b_flags & XBF_READ)); 1867 1868 /* 1869 * If the buffer is already marked delwri it already is queued up 1870 * by someone else for imediate writeout. Just ignore it in that 1871 * case. 1872 */ 1873 if (bp->b_flags & _XBF_DELWRI_Q) { 1874 trace_xfs_buf_delwri_queued(bp, _RET_IP_); 1875 return false; 1876 } 1877 1878 trace_xfs_buf_delwri_queue(bp, _RET_IP_); 1879 1880 /* 1881 * If a buffer gets written out synchronously or marked stale while it 1882 * is on a delwri list we lazily remove it. To do this, the other party 1883 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. 1884 * It remains referenced and on the list. In a rare corner case it 1885 * might get readded to a delwri list after the synchronous writeout, in 1886 * which case we need just need to re-add the flag here. 1887 */ 1888 bp->b_flags |= _XBF_DELWRI_Q; 1889 if (list_empty(&bp->b_list)) { 1890 xfs_buf_hold(bp); 1891 list_add_tail(&bp->b_list, list); 1892 } 1893 1894 return true; 1895 } 1896 1897 /* 1898 * Queue a buffer to this delwri list as part of a data integrity operation. 1899 * If the buffer is on any other delwri list, we'll wait for that to clear 1900 * so that the caller can submit the buffer for IO and wait for the result. 1901 * Callers must ensure the buffer is not already on the list. 1902 */ 1903 void 1904 xfs_buf_delwri_queue_here( 1905 struct xfs_buf *bp, 1906 struct list_head *buffer_list) 1907 { 1908 /* 1909 * We need this buffer to end up on the /caller's/ delwri list, not any 1910 * old list. This can happen if the buffer is marked stale (which 1911 * clears DELWRI_Q) after the AIL queues the buffer to its list but 1912 * before the AIL has a chance to submit the list. 
1913 */ 1914 while (!list_empty(&bp->b_list)) { 1915 xfs_buf_unlock(bp); 1916 wait_var_event(&bp->b_list, list_empty(&bp->b_list)); 1917 xfs_buf_lock(bp); 1918 } 1919 1920 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1921 1922 xfs_buf_delwri_queue(bp, buffer_list); 1923 } 1924 1925 /* 1926 * Compare function is more complex than it needs to be because 1927 * the return value is only 32 bits and we are doing comparisons 1928 * on 64 bit values 1929 */ 1930 static int 1931 xfs_buf_cmp( 1932 void *priv, 1933 const struct list_head *a, 1934 const struct list_head *b) 1935 { 1936 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); 1937 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); 1938 xfs_daddr_t diff; 1939 1940 diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; 1941 if (diff < 0) 1942 return -1; 1943 if (diff > 0) 1944 return 1; 1945 return 0; 1946 } 1947 1948 static bool 1949 xfs_buf_delwri_submit_prep( 1950 struct xfs_buf *bp) 1951 { 1952 /* 1953 * Someone else might have written the buffer synchronously or marked it 1954 * stale in the meantime. In that case only the _XBF_DELWRI_Q flag got 1955 * cleared, and we have to drop the reference and remove it from the 1956 * list here. 1957 */ 1958 if (!(bp->b_flags & _XBF_DELWRI_Q)) { 1959 xfs_buf_list_del(bp); 1960 xfs_buf_relse(bp); 1961 return false; 1962 } 1963 1964 trace_xfs_buf_delwri_split(bp, _RET_IP_); 1965 bp->b_flags &= ~_XBF_DELWRI_Q; 1966 bp->b_flags |= XBF_WRITE; 1967 return true; 1968 } 1969 1970 /* 1971 * Write out a buffer list asynchronously. 1972 * 1973 * This will take the @buffer_list, write all non-locked and non-pinned buffers 1974 * out and not wait for I/O completion on any of the buffers. This interface 1975 * is only safely useable for callers that can track I/O completion by higher 1976 * level means, e.g. AIL pushing as the @buffer_list is consumed in this 1977 * function. 
1978 * 1979 * Note: this function will skip buffers it would block on, and in doing so 1980 * leaves them on @buffer_list so they can be retried on a later pass. As such, 1981 * it is up to the caller to ensure that the buffer list is fully submitted or 1982 * cancelled appropriately when they are finished with the list. Failure to 1983 * cancel or resubmit the list until it is empty will result in leaked buffers 1984 * at unmount time. 1985 */ 1986 int 1987 xfs_buf_delwri_submit_nowait( 1988 struct list_head *buffer_list) 1989 { 1990 struct xfs_buf *bp, *n; 1991 int pinned = 0; 1992 struct blk_plug plug; 1993 1994 list_sort(NULL, buffer_list, xfs_buf_cmp); 1995 1996 blk_start_plug(&plug); 1997 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 1998 if (!xfs_buf_trylock(bp)) 1999 continue; 2000 if (xfs_buf_ispinned(bp)) { 2001 xfs_buf_unlock(bp); 2002 pinned++; 2003 continue; 2004 } 2005 if (!xfs_buf_delwri_submit_prep(bp)) 2006 continue; 2007 bp->b_flags |= XBF_ASYNC; 2008 xfs_buf_list_del(bp); 2009 xfs_buf_submit(bp); 2010 } 2011 blk_finish_plug(&plug); 2012 2013 return pinned; 2014 } 2015 2016 /* 2017 * Write out a buffer list synchronously. 2018 * 2019 * This will take the @buffer_list, write all buffers out and wait for I/O 2020 * completion on all of the buffers. @buffer_list is consumed by the function, 2021 * so callers must have some other way of tracking buffers if they require such 2022 * functionality. 
2023 */ 2024 int 2025 xfs_buf_delwri_submit( 2026 struct list_head *buffer_list) 2027 { 2028 LIST_HEAD (wait_list); 2029 int error = 0, error2; 2030 struct xfs_buf *bp, *n; 2031 struct blk_plug plug; 2032 2033 list_sort(NULL, buffer_list, xfs_buf_cmp); 2034 2035 blk_start_plug(&plug); 2036 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 2037 xfs_buf_lock(bp); 2038 if (!xfs_buf_delwri_submit_prep(bp)) 2039 continue; 2040 bp->b_flags &= ~XBF_ASYNC; 2041 list_move_tail(&bp->b_list, &wait_list); 2042 xfs_buf_submit(bp); 2043 } 2044 blk_finish_plug(&plug); 2045 2046 /* Wait for IO to complete. */ 2047 while (!list_empty(&wait_list)) { 2048 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 2049 2050 xfs_buf_list_del(bp); 2051 2052 /* 2053 * Wait on the locked buffer, check for errors and unlock and 2054 * release the delwri queue reference. 2055 */ 2056 error2 = xfs_buf_iowait(bp); 2057 xfs_buf_relse(bp); 2058 if (!error) 2059 error = error2; 2060 } 2061 2062 return error; 2063 } 2064 2065 /* 2066 * Push a single buffer on a delwri queue. 2067 * 2068 * The purpose of this function is to submit a single buffer of a delwri queue 2069 * and return with the buffer still on the original queue. 2070 * 2071 * The buffer locking and queue management logic between _delwri_pushbuf() and 2072 * _delwri_queue() guarantee that the buffer cannot be queued to another list 2073 * before returning. 2074 */ 2075 int 2076 xfs_buf_delwri_pushbuf( 2077 struct xfs_buf *bp, 2078 struct list_head *buffer_list) 2079 { 2080 int error; 2081 2082 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 2083 2084 trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); 2085 2086 xfs_buf_lock(bp); 2087 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC); 2088 bp->b_flags |= XBF_WRITE; 2089 xfs_buf_submit(bp); 2090 2091 /* 2092 * The buffer is now locked, under I/O but still on the original delwri 2093 * queue. 
Wait for I/O completion, restore the DELWRI_Q flag and 2094 * return with the buffer unlocked and still on the original queue. 2095 */ 2096 error = xfs_buf_iowait(bp); 2097 bp->b_flags |= _XBF_DELWRI_Q; 2098 xfs_buf_unlock(bp); 2099 2100 return error; 2101 } 2102 2103 void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) 2104 { 2105 /* 2106 * Set the lru reference count to 0 based on the error injection tag. 2107 * This allows userspace to disrupt buffer caching for debug/testing 2108 * purposes. 2109 */ 2110 if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) 2111 lru_ref = 0; 2112 2113 atomic_set(&bp->b_lru_ref, lru_ref); 2114 } 2115 2116 /* 2117 * Verify an on-disk magic value against the magic value specified in the 2118 * verifier structure. The verifier magic is in disk byte order so the caller is 2119 * expected to pass the value directly from disk. 2120 */ 2121 bool 2122 xfs_verify_magic( 2123 struct xfs_buf *bp, 2124 __be32 dmagic) 2125 { 2126 struct xfs_mount *mp = bp->b_mount; 2127 int idx; 2128 2129 idx = xfs_has_crc(mp); 2130 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) 2131 return false; 2132 return dmagic == bp->b_ops->magic[idx]; 2133 } 2134 /* 2135 * Verify an on-disk magic value against the magic value specified in the 2136 * verifier structure. The verifier magic is in disk byte order so the caller is 2137 * expected to pass the value directly from disk. 2138 */ 2139 bool 2140 xfs_verify_magic16( 2141 struct xfs_buf *bp, 2142 __be16 dmagic) 2143 { 2144 struct xfs_mount *mp = bp->b_mount; 2145 int idx; 2146 2147 idx = xfs_has_crc(mp); 2148 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) 2149 return false; 2150 return dmagic == bp->b_ops->magic16[idx]; 2151 } 2152