1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include <linux/backing-dev.h> 8 #include <linux/dax.h> 9 10 #include "xfs_shared.h" 11 #include "xfs_format.h" 12 #include "xfs_log_format.h" 13 #include "xfs_trans_resv.h" 14 #include "xfs_mount.h" 15 #include "xfs_trace.h" 16 #include "xfs_log.h" 17 #include "xfs_log_recover.h" 18 #include "xfs_log_priv.h" 19 #include "xfs_trans.h" 20 #include "xfs_buf_item.h" 21 #include "xfs_errortag.h" 22 #include "xfs_error.h" 23 #include "xfs_ag.h" 24 25 struct kmem_cache *xfs_buf_cache; 26 27 /* 28 * Locking orders 29 * 30 * xfs_buf_ioacct_inc: 31 * xfs_buf_ioacct_dec: 32 * b_sema (caller holds) 33 * b_lock 34 * 35 * xfs_buf_stale: 36 * b_sema (caller holds) 37 * b_lock 38 * lru_lock 39 * 40 * xfs_buf_rele: 41 * b_lock 42 * pag_buf_lock 43 * lru_lock 44 * 45 * xfs_buftarg_drain_rele 46 * lru_lock 47 * b_lock (trylock due to inversion) 48 * 49 * xfs_buftarg_isolate 50 * lru_lock 51 * b_lock (trylock due to inversion) 52 */ 53 54 static int __xfs_buf_submit(struct xfs_buf *bp, bool wait); 55 56 static inline int 57 xfs_buf_submit( 58 struct xfs_buf *bp) 59 { 60 return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC)); 61 } 62 63 static inline int 64 xfs_buf_is_vmapped( 65 struct xfs_buf *bp) 66 { 67 /* 68 * Return true if the buffer is vmapped. 69 * 70 * b_addr is null if the buffer is not mapped, but the code is clever 71 * enough to know it doesn't have to map a single page, so the check has 72 * to be both for b_addr and bp->b_page_count > 1. 73 */ 74 return bp->b_addr && bp->b_page_count > 1; 75 } 76 77 static inline int 78 xfs_buf_vmap_len( 79 struct xfs_buf *bp) 80 { 81 return (bp->b_page_count * PAGE_SIZE); 82 } 83 84 /* 85 * Bump the I/O in flight count on the buftarg if we haven't yet done so for 86 * this buffer. The count is incremented once per buffer (per hold cycle) 87 * because the corresponding decrement is deferred to buffer release. Buffers 88 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O 89 * tracking adds unnecessary overhead. This is used for sychronization purposes 90 * with unmount (see xfs_buftarg_drain()), so all we really need is a count of 91 * in-flight buffers. 92 * 93 * Buffers that are never released (e.g., superblock, iclog buffers) must set 94 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count 95 * never reaches zero and unmount hangs indefinitely. 96 */ 97 static inline void 98 xfs_buf_ioacct_inc( 99 struct xfs_buf *bp) 100 { 101 if (bp->b_flags & XBF_NO_IOACCT) 102 return; 103 104 ASSERT(bp->b_flags & XBF_ASYNC); 105 spin_lock(&bp->b_lock); 106 if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) { 107 bp->b_state |= XFS_BSTATE_IN_FLIGHT; 108 percpu_counter_inc(&bp->b_target->bt_io_count); 109 } 110 spin_unlock(&bp->b_lock); 111 } 112 113 /* 114 * Clear the in-flight state on a buffer about to be released to the LRU or 115 * freed and unaccount from the buftarg. 116 */ 117 static inline void 118 __xfs_buf_ioacct_dec( 119 struct xfs_buf *bp) 120 { 121 lockdep_assert_held(&bp->b_lock); 122 123 if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { 124 bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; 125 percpu_counter_dec(&bp->b_target->bt_io_count); 126 } 127 } 128 129 static inline void 130 xfs_buf_ioacct_dec( 131 struct xfs_buf *bp) 132 { 133 spin_lock(&bp->b_lock); 134 __xfs_buf_ioacct_dec(bp); 135 spin_unlock(&bp->b_lock); 136 } 137 138 /* 139 * When we mark a buffer stale, we remove the buffer from the LRU and clear the 140 * b_lru_ref count so that the buffer is freed immediately when the buffer 141 * reference count falls to zero. If the buffer is already on the LRU, we need 142 * to remove the reference that LRU holds on the buffer. 143 * 144 * This prevents build-up of stale buffers on the LRU. 145 */ 146 void 147 xfs_buf_stale( 148 struct xfs_buf *bp) 149 { 150 ASSERT(xfs_buf_islocked(bp)); 151 152 bp->b_flags |= XBF_STALE; 153 154 /* 155 * Clear the delwri status so that a delwri queue walker will not 156 * flush this buffer to disk now that it is stale. The delwri queue has 157 * a reference to the buffer, so this is safe to do. 158 */ 159 bp->b_flags &= ~_XBF_DELWRI_Q; 160 161 /* 162 * Once the buffer is marked stale and unlocked, a subsequent lookup 163 * could reset b_flags. There is no guarantee that the buffer is 164 * unaccounted (released to LRU) before that occurs. Drop in-flight 165 * status now to preserve accounting consistency. 166 */ 167 spin_lock(&bp->b_lock); 168 __xfs_buf_ioacct_dec(bp); 169 170 atomic_set(&bp->b_lru_ref, 0); 171 if (!(bp->b_state & XFS_BSTATE_DISPOSE) && 172 (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) 173 atomic_dec(&bp->b_hold); 174 175 ASSERT(atomic_read(&bp->b_hold) >= 1); 176 spin_unlock(&bp->b_lock); 177 } 178 179 static int 180 xfs_buf_get_maps( 181 struct xfs_buf *bp, 182 int map_count) 183 { 184 ASSERT(bp->b_maps == NULL); 185 bp->b_map_count = map_count; 186 187 if (map_count == 1) { 188 bp->b_maps = &bp->__b_map; 189 return 0; 190 } 191 192 bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map), 193 KM_NOFS); 194 if (!bp->b_maps) 195 return -ENOMEM; 196 return 0; 197 } 198 199 /* 200 * Frees b_pages if it was allocated. 201 */ 202 static void 203 xfs_buf_free_maps( 204 struct xfs_buf *bp) 205 { 206 if (bp->b_maps != &bp->__b_map) { 207 kmem_free(bp->b_maps); 208 bp->b_maps = NULL; 209 } 210 } 211 212 static int 213 _xfs_buf_alloc( 214 struct xfs_buftarg *target, 215 struct xfs_buf_map *map, 216 int nmaps, 217 xfs_buf_flags_t flags, 218 struct xfs_buf **bpp) 219 { 220 struct xfs_buf *bp; 221 int error; 222 int i; 223 224 *bpp = NULL; 225 bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL); 226 227 /* 228 * We don't want certain flags to appear in b_flags unless they are 229 * specifically set by later operations on the buffer. 230 */ 231 flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); 232 233 atomic_set(&bp->b_hold, 1); 234 atomic_set(&bp->b_lru_ref, 1); 235 init_completion(&bp->b_iowait); 236 INIT_LIST_HEAD(&bp->b_lru); 237 INIT_LIST_HEAD(&bp->b_list); 238 INIT_LIST_HEAD(&bp->b_li_list); 239 sema_init(&bp->b_sema, 0); /* held, no waiters */ 240 spin_lock_init(&bp->b_lock); 241 bp->b_target = target; 242 bp->b_mount = target->bt_mount; 243 bp->b_flags = flags; 244 245 /* 246 * Set length and io_length to the same value initially. 247 * I/O routines should use io_length, which will be the same in 248 * most cases but may be reset (e.g. XFS recovery). 249 */ 250 error = xfs_buf_get_maps(bp, nmaps); 251 if (error) { 252 kmem_cache_free(xfs_buf_cache, bp); 253 return error; 254 } 255 256 bp->b_rhash_key = map[0].bm_bn; 257 bp->b_length = 0; 258 for (i = 0; i < nmaps; i++) { 259 bp->b_maps[i].bm_bn = map[i].bm_bn; 260 bp->b_maps[i].bm_len = map[i].bm_len; 261 bp->b_length += map[i].bm_len; 262 } 263 264 atomic_set(&bp->b_pin_count, 0); 265 init_waitqueue_head(&bp->b_waiters); 266 267 XFS_STATS_INC(bp->b_mount, xb_create); 268 trace_xfs_buf_init(bp, _RET_IP_); 269 270 *bpp = bp; 271 return 0; 272 } 273 274 static void 275 xfs_buf_free_pages( 276 struct xfs_buf *bp) 277 { 278 uint i; 279 280 ASSERT(bp->b_flags & _XBF_PAGES); 281 282 if (xfs_buf_is_vmapped(bp)) 283 vm_unmap_ram(bp->b_addr, bp->b_page_count); 284 285 for (i = 0; i < bp->b_page_count; i++) { 286 if (bp->b_pages[i]) 287 __free_page(bp->b_pages[i]); 288 } 289 mm_account_reclaimed_pages(bp->b_page_count); 290 291 if (bp->b_pages != bp->b_page_array) 292 kmem_free(bp->b_pages); 293 bp->b_pages = NULL; 294 bp->b_flags &= ~_XBF_PAGES; 295 } 296 297 static void 298 xfs_buf_free_callback( 299 struct callback_head *cb) 300 { 301 struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); 302 303 xfs_buf_free_maps(bp); 304 kmem_cache_free(xfs_buf_cache, bp); 305 } 306 307 static void 308 xfs_buf_free( 309 struct xfs_buf *bp) 310 { 311 trace_xfs_buf_free(bp, _RET_IP_); 312 313 ASSERT(list_empty(&bp->b_lru)); 314 315 if (bp->b_flags & _XBF_PAGES) 316 xfs_buf_free_pages(bp); 317 else if (bp->b_flags & _XBF_KMEM) 318 kmem_free(bp->b_addr); 319 320 call_rcu(&bp->b_rcu, xfs_buf_free_callback); 321 } 322 323 static int 324 xfs_buf_alloc_kmem( 325 struct xfs_buf *bp, 326 xfs_buf_flags_t flags) 327 { 328 xfs_km_flags_t kmflag_mask = KM_NOFS; 329 size_t size = BBTOB(bp->b_length); 330 331 /* Assure zeroed buffer for non-read cases. */ 332 if (!(flags & XBF_READ)) 333 kmflag_mask |= KM_ZERO; 334 335 bp->b_addr = kmem_alloc(size, kmflag_mask); 336 if (!bp->b_addr) 337 return -ENOMEM; 338 339 if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != 340 ((unsigned long)bp->b_addr & PAGE_MASK)) { 341 /* b_addr spans two pages - use alloc_page instead */ 342 kmem_free(bp->b_addr); 343 bp->b_addr = NULL; 344 return -ENOMEM; 345 } 346 bp->b_offset = offset_in_page(bp->b_addr); 347 bp->b_pages = bp->b_page_array; 348 bp->b_pages[0] = kmem_to_page(bp->b_addr); 349 bp->b_page_count = 1; 350 bp->b_flags |= _XBF_KMEM; 351 return 0; 352 } 353 354 static int 355 xfs_buf_alloc_pages( 356 struct xfs_buf *bp, 357 xfs_buf_flags_t flags) 358 { 359 gfp_t gfp_mask = __GFP_NOWARN; 360 long filled = 0; 361 362 if (flags & XBF_READ_AHEAD) 363 gfp_mask |= __GFP_NORETRY; 364 else 365 gfp_mask |= GFP_NOFS; 366 367 /* Make sure that we have a page list */ 368 bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); 369 if (bp->b_page_count <= XB_PAGES) { 370 bp->b_pages = bp->b_page_array; 371 } else { 372 bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count, 373 gfp_mask); 374 if (!bp->b_pages) 375 return -ENOMEM; 376 } 377 bp->b_flags |= _XBF_PAGES; 378 379 /* Assure zeroed buffer for non-read cases. */ 380 if (!(flags & XBF_READ)) 381 gfp_mask |= __GFP_ZERO; 382 383 /* 384 * Bulk filling of pages can take multiple calls. Not filling the entire 385 * array is not an allocation failure, so don't back off if we get at 386 * least one extra page. 387 */ 388 for (;;) { 389 long last = filled; 390 391 filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count, 392 bp->b_pages); 393 if (filled == bp->b_page_count) { 394 XFS_STATS_INC(bp->b_mount, xb_page_found); 395 break; 396 } 397 398 if (filled != last) 399 continue; 400 401 if (flags & XBF_READ_AHEAD) { 402 xfs_buf_free_pages(bp); 403 return -ENOMEM; 404 } 405 406 XFS_STATS_INC(bp->b_mount, xb_page_retries); 407 memalloc_retry_wait(gfp_mask); 408 } 409 return 0; 410 } 411 412 /* 413 * Map buffer into kernel address-space if necessary. 414 */ 415 STATIC int 416 _xfs_buf_map_pages( 417 struct xfs_buf *bp, 418 xfs_buf_flags_t flags) 419 { 420 ASSERT(bp->b_flags & _XBF_PAGES); 421 if (bp->b_page_count == 1) { 422 /* A single page buffer is always mappable */ 423 bp->b_addr = page_address(bp->b_pages[0]); 424 } else if (flags & XBF_UNMAPPED) { 425 bp->b_addr = NULL; 426 } else { 427 int retried = 0; 428 unsigned nofs_flag; 429 430 /* 431 * vm_map_ram() will allocate auxiliary structures (e.g. 432 * pagetables) with GFP_KERNEL, yet we are likely to be under 433 * GFP_NOFS context here. Hence we need to tell memory reclaim 434 * that we are in such a context via PF_MEMALLOC_NOFS to prevent 435 * memory reclaim re-entering the filesystem here and 436 * potentially deadlocking. 437 */ 438 nofs_flag = memalloc_nofs_save(); 439 do { 440 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 441 -1); 442 if (bp->b_addr) 443 break; 444 vm_unmap_aliases(); 445 } while (retried++ <= 1); 446 memalloc_nofs_restore(nofs_flag); 447 448 if (!bp->b_addr) 449 return -ENOMEM; 450 } 451 452 return 0; 453 } 454 455 /* 456 * Finding and Reading Buffers 457 */ 458 static int 459 _xfs_buf_obj_cmp( 460 struct rhashtable_compare_arg *arg, 461 const void *obj) 462 { 463 const struct xfs_buf_map *map = arg->key; 464 const struct xfs_buf *bp = obj; 465 466 /* 467 * The key hashing in the lookup path depends on the key being the 468 * first element of the compare_arg, make sure to assert this. 469 */ 470 BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); 471 472 if (bp->b_rhash_key != map->bm_bn) 473 return 1; 474 475 if (unlikely(bp->b_length != map->bm_len)) { 476 /* 477 * found a block number match. If the range doesn't 478 * match, the only way this is allowed is if the buffer 479 * in the cache is stale and the transaction that made 480 * it stale has not yet committed. i.e. we are 481 * reallocating a busy extent. Skip this buffer and 482 * continue searching for an exact match. 483 */ 484 if (!(map->bm_flags & XBM_LIVESCAN)) 485 ASSERT(bp->b_flags & XBF_STALE); 486 return 1; 487 } 488 return 0; 489 } 490 491 static const struct rhashtable_params xfs_buf_hash_params = { 492 .min_size = 32, /* empty AGs have minimal footprint */ 493 .nelem_hint = 16, 494 .key_len = sizeof(xfs_daddr_t), 495 .key_offset = offsetof(struct xfs_buf, b_rhash_key), 496 .head_offset = offsetof(struct xfs_buf, b_rhash_head), 497 .automatic_shrinking = true, 498 .obj_cmpfn = _xfs_buf_obj_cmp, 499 }; 500 501 int 502 xfs_buf_hash_init( 503 struct xfs_perag *pag) 504 { 505 spin_lock_init(&pag->pag_buf_lock); 506 return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params); 507 } 508 509 void 510 xfs_buf_hash_destroy( 511 struct xfs_perag *pag) 512 { 513 rhashtable_destroy(&pag->pag_buf_hash); 514 } 515 516 static int 517 xfs_buf_map_verify( 518 struct xfs_buftarg *btp, 519 struct xfs_buf_map *map) 520 { 521 xfs_daddr_t eofs; 522 523 /* Check for IOs smaller than the sector size / not sector aligned */ 524 ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); 525 ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); 526 527 /* 528 * Corrupted block numbers can get through to here, unfortunately, so we 529 * have to check that the buffer falls within the filesystem bounds. 530 */ 531 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); 532 if (map->bm_bn < 0 || map->bm_bn >= eofs) { 533 xfs_alert(btp->bt_mount, 534 "%s: daddr 0x%llx out of range, EOFS 0x%llx", 535 __func__, map->bm_bn, eofs); 536 WARN_ON(1); 537 return -EFSCORRUPTED; 538 } 539 return 0; 540 } 541 542 static int 543 xfs_buf_find_lock( 544 struct xfs_buf *bp, 545 xfs_buf_flags_t flags) 546 { 547 if (flags & XBF_TRYLOCK) { 548 if (!xfs_buf_trylock(bp)) { 549 XFS_STATS_INC(bp->b_mount, xb_busy_locked); 550 return -EAGAIN; 551 } 552 } else { 553 xfs_buf_lock(bp); 554 XFS_STATS_INC(bp->b_mount, xb_get_locked_waited); 555 } 556 557 /* 558 * if the buffer is stale, clear all the external state associated with 559 * it. We need to keep flags such as how we allocated the buffer memory 560 * intact here. 561 */ 562 if (bp->b_flags & XBF_STALE) { 563 if (flags & XBF_LIVESCAN) { 564 xfs_buf_unlock(bp); 565 return -ENOENT; 566 } 567 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 568 bp->b_flags &= _XBF_KMEM | _XBF_PAGES; 569 bp->b_ops = NULL; 570 } 571 return 0; 572 } 573 574 static inline int 575 xfs_buf_lookup( 576 struct xfs_perag *pag, 577 struct xfs_buf_map *map, 578 xfs_buf_flags_t flags, 579 struct xfs_buf **bpp) 580 { 581 struct xfs_buf *bp; 582 int error; 583 584 rcu_read_lock(); 585 bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params); 586 if (!bp || !atomic_inc_not_zero(&bp->b_hold)) { 587 rcu_read_unlock(); 588 return -ENOENT; 589 } 590 rcu_read_unlock(); 591 592 error = xfs_buf_find_lock(bp, flags); 593 if (error) { 594 xfs_buf_rele(bp); 595 return error; 596 } 597 598 trace_xfs_buf_find(bp, flags, _RET_IP_); 599 *bpp = bp; 600 return 0; 601 } 602 603 /* 604 * Insert the new_bp into the hash table. This consumes the perag reference 605 * taken for the lookup regardless of the result of the insert. 606 */ 607 static int 608 xfs_buf_find_insert( 609 struct xfs_buftarg *btp, 610 struct xfs_perag *pag, 611 struct xfs_buf_map *cmap, 612 struct xfs_buf_map *map, 613 int nmaps, 614 xfs_buf_flags_t flags, 615 struct xfs_buf **bpp) 616 { 617 struct xfs_buf *new_bp; 618 struct xfs_buf *bp; 619 int error; 620 621 error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); 622 if (error) 623 goto out_drop_pag; 624 625 /* 626 * For buffers that fit entirely within a single page, first attempt to 627 * allocate the memory from the heap to minimise memory usage. If we 628 * can't get heap memory for these small buffers, we fall back to using 629 * the page allocator. 630 */ 631 if (BBTOB(new_bp->b_length) >= PAGE_SIZE || 632 xfs_buf_alloc_kmem(new_bp, flags) < 0) { 633 error = xfs_buf_alloc_pages(new_bp, flags); 634 if (error) 635 goto out_free_buf; 636 } 637 638 spin_lock(&pag->pag_buf_lock); 639 bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash, 640 &new_bp->b_rhash_head, xfs_buf_hash_params); 641 if (IS_ERR(bp)) { 642 error = PTR_ERR(bp); 643 spin_unlock(&pag->pag_buf_lock); 644 goto out_free_buf; 645 } 646 if (bp) { 647 /* found an existing buffer */ 648 atomic_inc(&bp->b_hold); 649 spin_unlock(&pag->pag_buf_lock); 650 error = xfs_buf_find_lock(bp, flags); 651 if (error) 652 xfs_buf_rele(bp); 653 else 654 *bpp = bp; 655 goto out_free_buf; 656 } 657 658 /* The new buffer keeps the perag reference until it is freed. */ 659 new_bp->b_pag = pag; 660 spin_unlock(&pag->pag_buf_lock); 661 *bpp = new_bp; 662 return 0; 663 664 out_free_buf: 665 xfs_buf_free(new_bp); 666 out_drop_pag: 667 xfs_perag_put(pag); 668 return error; 669 } 670 671 /* 672 * Assembles a buffer covering the specified range. The code is optimised for 673 * cache hits, as metadata intensive workloads will see 3 orders of magnitude 674 * more hits than misses. 675 */ 676 int 677 xfs_buf_get_map( 678 struct xfs_buftarg *btp, 679 struct xfs_buf_map *map, 680 int nmaps, 681 xfs_buf_flags_t flags, 682 struct xfs_buf **bpp) 683 { 684 struct xfs_perag *pag; 685 struct xfs_buf *bp = NULL; 686 struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; 687 int error; 688 int i; 689 690 if (flags & XBF_LIVESCAN) 691 cmap.bm_flags |= XBM_LIVESCAN; 692 for (i = 0; i < nmaps; i++) 693 cmap.bm_len += map[i].bm_len; 694 695 error = xfs_buf_map_verify(btp, &cmap); 696 if (error) 697 return error; 698 699 pag = xfs_perag_get(btp->bt_mount, 700 xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); 701 702 error = xfs_buf_lookup(pag, &cmap, flags, &bp); 703 if (error && error != -ENOENT) 704 goto out_put_perag; 705 706 /* cache hits always outnumber misses by at least 10:1 */ 707 if (unlikely(!bp)) { 708 XFS_STATS_INC(btp->bt_mount, xb_miss_locked); 709 710 if (flags & XBF_INCORE) 711 goto out_put_perag; 712 713 /* xfs_buf_find_insert() consumes the perag reference. */ 714 error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps, 715 flags, &bp); 716 if (error) 717 return error; 718 } else { 719 XFS_STATS_INC(btp->bt_mount, xb_get_locked); 720 xfs_perag_put(pag); 721 } 722 723 /* We do not hold a perag reference anymore. */ 724 if (!bp->b_addr) { 725 error = _xfs_buf_map_pages(bp, flags); 726 if (unlikely(error)) { 727 xfs_warn_ratelimited(btp->bt_mount, 728 "%s: failed to map %u pages", __func__, 729 bp->b_page_count); 730 xfs_buf_relse(bp); 731 return error; 732 } 733 } 734 735 /* 736 * Clear b_error if this is a lookup from a caller that doesn't expect 737 * valid data to be found in the buffer. 738 */ 739 if (!(flags & XBF_READ)) 740 xfs_buf_ioerror(bp, 0); 741 742 XFS_STATS_INC(btp->bt_mount, xb_get); 743 trace_xfs_buf_get(bp, flags, _RET_IP_); 744 *bpp = bp; 745 return 0; 746 747 out_put_perag: 748 xfs_perag_put(pag); 749 return error; 750 } 751 752 int 753 _xfs_buf_read( 754 struct xfs_buf *bp, 755 xfs_buf_flags_t flags) 756 { 757 ASSERT(!(flags & XBF_WRITE)); 758 ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); 759 760 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); 761 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); 762 763 return xfs_buf_submit(bp); 764 } 765 766 /* 767 * Reverify a buffer found in cache without an attached ->b_ops. 768 * 769 * If the caller passed an ops structure and the buffer doesn't have ops 770 * assigned, set the ops and use it to verify the contents. If verification 771 * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is 772 * already in XBF_DONE state on entry. 773 * 774 * Under normal operations, every in-core buffer is verified on read I/O 775 * completion. There are two scenarios that can lead to in-core buffers without 776 * an assigned ->b_ops. The first is during log recovery of buffers on a V4 777 * filesystem, though these buffers are purged at the end of recovery. The 778 * other is online repair, which intentionally reads with a NULL buffer ops to 779 * run several verifiers across an in-core buffer in order to establish buffer 780 * type. If repair can't establish that, the buffer will be left in memory 781 * with NULL buffer ops. 782 */ 783 int 784 xfs_buf_reverify( 785 struct xfs_buf *bp, 786 const struct xfs_buf_ops *ops) 787 { 788 ASSERT(bp->b_flags & XBF_DONE); 789 ASSERT(bp->b_error == 0); 790 791 if (!ops || bp->b_ops) 792 return 0; 793 794 bp->b_ops = ops; 795 bp->b_ops->verify_read(bp); 796 if (bp->b_error) 797 bp->b_flags &= ~XBF_DONE; 798 return bp->b_error; 799 } 800 801 int 802 xfs_buf_read_map( 803 struct xfs_buftarg *target, 804 struct xfs_buf_map *map, 805 int nmaps, 806 xfs_buf_flags_t flags, 807 struct xfs_buf **bpp, 808 const struct xfs_buf_ops *ops, 809 xfs_failaddr_t fa) 810 { 811 struct xfs_buf *bp; 812 int error; 813 814 flags |= XBF_READ; 815 *bpp = NULL; 816 817 error = xfs_buf_get_map(target, map, nmaps, flags, &bp); 818 if (error) 819 return error; 820 821 trace_xfs_buf_read(bp, flags, _RET_IP_); 822 823 if (!(bp->b_flags & XBF_DONE)) { 824 /* Initiate the buffer read and wait. */ 825 XFS_STATS_INC(target->bt_mount, xb_get_read); 826 bp->b_ops = ops; 827 error = _xfs_buf_read(bp, flags); 828 829 /* Readahead iodone already dropped the buffer, so exit. */ 830 if (flags & XBF_ASYNC) 831 return 0; 832 } else { 833 /* Buffer already read; all we need to do is check it. */ 834 error = xfs_buf_reverify(bp, ops); 835 836 /* Readahead already finished; drop the buffer and exit. */ 837 if (flags & XBF_ASYNC) { 838 xfs_buf_relse(bp); 839 return 0; 840 } 841 842 /* We do not want read in the flags */ 843 bp->b_flags &= ~XBF_READ; 844 ASSERT(bp->b_ops != NULL || ops == NULL); 845 } 846 847 /* 848 * If we've had a read error, then the contents of the buffer are 849 * invalid and should not be used. To ensure that a followup read tries 850 * to pull the buffer from disk again, we clear the XBF_DONE flag and 851 * mark the buffer stale. This ensures that anyone who has a current 852 * reference to the buffer will interpret it's contents correctly and 853 * future cache lookups will also treat it as an empty, uninitialised 854 * buffer. 855 */ 856 if (error) { 857 /* 858 * Check against log shutdown for error reporting because 859 * metadata writeback may require a read first and we need to 860 * report errors in metadata writeback until the log is shut 861 * down. High level transaction read functions already check 862 * against mount shutdown, anyway, so we only need to be 863 * concerned about low level IO interactions here. 864 */ 865 if (!xlog_is_shutdown(target->bt_mount->m_log)) 866 xfs_buf_ioerror_alert(bp, fa); 867 868 bp->b_flags &= ~XBF_DONE; 869 xfs_buf_stale(bp); 870 xfs_buf_relse(bp); 871 872 /* bad CRC means corrupted metadata */ 873 if (error == -EFSBADCRC) 874 error = -EFSCORRUPTED; 875 return error; 876 } 877 878 *bpp = bp; 879 return 0; 880 } 881 882 /* 883 * If we are not low on memory then do the readahead in a deadlock 884 * safe manner. 885 */ 886 void 887 xfs_buf_readahead_map( 888 struct xfs_buftarg *target, 889 struct xfs_buf_map *map, 890 int nmaps, 891 const struct xfs_buf_ops *ops) 892 { 893 struct xfs_buf *bp; 894 895 xfs_buf_read_map(target, map, nmaps, 896 XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, 897 __this_address); 898 } 899 900 /* 901 * Read an uncached buffer from disk. Allocates and returns a locked 902 * buffer containing the disk contents or nothing. Uncached buffers always have 903 * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer 904 * is cached or uncached during fault diagnosis. 905 */ 906 int 907 xfs_buf_read_uncached( 908 struct xfs_buftarg *target, 909 xfs_daddr_t daddr, 910 size_t numblks, 911 xfs_buf_flags_t flags, 912 struct xfs_buf **bpp, 913 const struct xfs_buf_ops *ops) 914 { 915 struct xfs_buf *bp; 916 int error; 917 918 *bpp = NULL; 919 920 error = xfs_buf_get_uncached(target, numblks, flags, &bp); 921 if (error) 922 return error; 923 924 /* set up the buffer for a read IO */ 925 ASSERT(bp->b_map_count == 1); 926 bp->b_rhash_key = XFS_BUF_DADDR_NULL; 927 bp->b_maps[0].bm_bn = daddr; 928 bp->b_flags |= XBF_READ; 929 bp->b_ops = ops; 930 931 xfs_buf_submit(bp); 932 if (bp->b_error) { 933 error = bp->b_error; 934 xfs_buf_relse(bp); 935 return error; 936 } 937 938 *bpp = bp; 939 return 0; 940 } 941 942 int 943 xfs_buf_get_uncached( 944 struct xfs_buftarg *target, 945 size_t numblks, 946 xfs_buf_flags_t flags, 947 struct xfs_buf **bpp) 948 { 949 int error; 950 struct xfs_buf *bp; 951 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); 952 953 *bpp = NULL; 954 955 /* flags might contain irrelevant bits, pass only what we care about */ 956 error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp); 957 if (error) 958 return error; 959 960 error = xfs_buf_alloc_pages(bp, flags); 961 if (error) 962 goto fail_free_buf; 963 964 error = _xfs_buf_map_pages(bp, 0); 965 if (unlikely(error)) { 966 xfs_warn(target->bt_mount, 967 "%s: failed to map pages", __func__); 968 goto fail_free_buf; 969 } 970 971 trace_xfs_buf_get_uncached(bp, _RET_IP_); 972 *bpp = bp; 973 return 0; 974 975 fail_free_buf: 976 xfs_buf_free(bp); 977 return error; 978 } 979 980 /* 981 * Increment reference count on buffer, to hold the buffer concurrently 982 * with another thread which may release (free) the buffer asynchronously. 983 * Must hold the buffer already to call this function. 984 */ 985 void 986 xfs_buf_hold( 987 struct xfs_buf *bp) 988 { 989 trace_xfs_buf_hold(bp, _RET_IP_); 990 atomic_inc(&bp->b_hold); 991 } 992 993 /* 994 * Release a hold on the specified buffer. If the hold count is 1, the buffer is 995 * placed on LRU or freed (depending on b_lru_ref). 996 */ 997 void 998 xfs_buf_rele( 999 struct xfs_buf *bp) 1000 { 1001 struct xfs_perag *pag = bp->b_pag; 1002 bool release; 1003 bool freebuf = false; 1004 1005 trace_xfs_buf_rele(bp, _RET_IP_); 1006 1007 if (!pag) { 1008 ASSERT(list_empty(&bp->b_lru)); 1009 if (atomic_dec_and_test(&bp->b_hold)) { 1010 xfs_buf_ioacct_dec(bp); 1011 xfs_buf_free(bp); 1012 } 1013 return; 1014 } 1015 1016 ASSERT(atomic_read(&bp->b_hold) > 0); 1017 1018 /* 1019 * We grab the b_lock here first to serialise racing xfs_buf_rele() 1020 * calls. The pag_buf_lock being taken on the last reference only 1021 * serialises against racing lookups in xfs_buf_find(). IOWs, the second 1022 * to last reference we drop here is not serialised against the last 1023 * reference until we take bp->b_lock. Hence if we don't grab b_lock 1024 * first, the last "release" reference can win the race to the lock and 1025 * free the buffer before the second-to-last reference is processed, 1026 * leading to a use-after-free scenario. 1027 */ 1028 spin_lock(&bp->b_lock); 1029 release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); 1030 if (!release) { 1031 /* 1032 * Drop the in-flight state if the buffer is already on the LRU 1033 * and it holds the only reference. This is racy because we 1034 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT 1035 * ensures the decrement occurs only once per-buf. 1036 */ 1037 if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) 1038 __xfs_buf_ioacct_dec(bp); 1039 goto out_unlock; 1040 } 1041 1042 /* the last reference has been dropped ... */ 1043 __xfs_buf_ioacct_dec(bp); 1044 if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { 1045 /* 1046 * If the buffer is added to the LRU take a new reference to the 1047 * buffer for the LRU and clear the (now stale) dispose list 1048 * state flag 1049 */ 1050 if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) { 1051 bp->b_state &= ~XFS_BSTATE_DISPOSE; 1052 atomic_inc(&bp->b_hold); 1053 } 1054 spin_unlock(&pag->pag_buf_lock); 1055 } else { 1056 /* 1057 * most of the time buffers will already be removed from the 1058 * LRU, so optimise that case by checking for the 1059 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer 1060 * was on was the disposal list 1061 */ 1062 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { 1063 list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); 1064 } else { 1065 ASSERT(list_empty(&bp->b_lru)); 1066 } 1067 1068 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1069 rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head, 1070 xfs_buf_hash_params); 1071 spin_unlock(&pag->pag_buf_lock); 1072 xfs_perag_put(pag); 1073 freebuf = true; 1074 } 1075 1076 out_unlock: 1077 spin_unlock(&bp->b_lock); 1078 1079 if (freebuf) 1080 xfs_buf_free(bp); 1081 } 1082 1083 1084 /* 1085 * Lock a buffer object, if it is not already locked. 1086 * 1087 * If we come across a stale, pinned, locked buffer, we know that we are 1088 * being asked to lock a buffer that has been reallocated. Because it is 1089 * pinned, we know that the log has not been pushed to disk and hence it 1090 * will still be locked. Rather than continuing to have trylock attempts 1091 * fail until someone else pushes the log, push it ourselves before 1092 * returning. This means that the xfsaild will not get stuck trying 1093 * to push on stale inode buffers. 1094 */ 1095 int 1096 xfs_buf_trylock( 1097 struct xfs_buf *bp) 1098 { 1099 int locked; 1100 1101 locked = down_trylock(&bp->b_sema) == 0; 1102 if (locked) 1103 trace_xfs_buf_trylock(bp, _RET_IP_); 1104 else 1105 trace_xfs_buf_trylock_fail(bp, _RET_IP_); 1106 return locked; 1107 } 1108 1109 /* 1110 * Lock a buffer object. 1111 * 1112 * If we come across a stale, pinned, locked buffer, we know that we 1113 * are being asked to lock a buffer that has been reallocated. Because 1114 * it is pinned, we know that the log has not been pushed to disk and 1115 * hence it will still be locked. Rather than sleeping until someone 1116 * else pushes the log, push it ourselves before trying to get the lock. 1117 */ 1118 void 1119 xfs_buf_lock( 1120 struct xfs_buf *bp) 1121 { 1122 trace_xfs_buf_lock(bp, _RET_IP_); 1123 1124 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 1125 xfs_log_force(bp->b_mount, 0); 1126 down(&bp->b_sema); 1127 1128 trace_xfs_buf_lock_done(bp, _RET_IP_); 1129 } 1130 1131 void 1132 xfs_buf_unlock( 1133 struct xfs_buf *bp) 1134 { 1135 ASSERT(xfs_buf_islocked(bp)); 1136 1137 up(&bp->b_sema); 1138 trace_xfs_buf_unlock(bp, _RET_IP_); 1139 } 1140 1141 STATIC void 1142 xfs_buf_wait_unpin( 1143 struct xfs_buf *bp) 1144 { 1145 DECLARE_WAITQUEUE (wait, current); 1146 1147 if (atomic_read(&bp->b_pin_count) == 0) 1148 return; 1149 1150 add_wait_queue(&bp->b_waiters, &wait); 1151 for (;;) { 1152 set_current_state(TASK_UNINTERRUPTIBLE); 1153 if (atomic_read(&bp->b_pin_count) == 0) 1154 break; 1155 io_schedule(); 1156 } 1157 remove_wait_queue(&bp->b_waiters, &wait); 1158 set_current_state(TASK_RUNNING); 1159 } 1160 1161 static void 1162 xfs_buf_ioerror_alert_ratelimited( 1163 struct xfs_buf *bp) 1164 { 1165 static unsigned long lasttime; 1166 static struct xfs_buftarg *lasttarg; 1167 1168 if (bp->b_target != lasttarg || 1169 time_after(jiffies, (lasttime + 5*HZ))) { 1170 lasttime = jiffies; 1171 xfs_buf_ioerror_alert(bp, __this_address); 1172 } 1173 lasttarg = bp->b_target; 1174 } 1175 1176 /* 1177 * Account for this latest trip around the retry handler, and decide if 1178 * we've failed enough times to constitute a permanent failure. 1179 */ 1180 static bool 1181 xfs_buf_ioerror_permanent( 1182 struct xfs_buf *bp, 1183 struct xfs_error_cfg *cfg) 1184 { 1185 struct xfs_mount *mp = bp->b_mount; 1186 1187 if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && 1188 ++bp->b_retries > cfg->max_retries) 1189 return true; 1190 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && 1191 time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) 1192 return true; 1193 1194 /* At unmount we may treat errors differently */ 1195 if (xfs_is_unmounting(mp) && mp->m_fail_unmount) 1196 return true; 1197 1198 return false; 1199 } 1200 1201 /* 1202 * On a sync write or shutdown we just want to stale the buffer and let the 1203 * caller handle the error in bp->b_error appropriately. 1204 * 1205 * If the write was asynchronous then no one will be looking for the error. If 1206 * this is the first failure of this type, clear the error state and write the 1207 * buffer out again. This means we always retry an async write failure at least 1208 * once, but we also need to set the buffer up to behave correctly now for 1209 * repeated failures. 1210 * 1211 * If we get repeated async write failures, then we take action according to the 1212 * error configuration we have been set up to use. 1213 * 1214 * Returns true if this function took care of error handling and the caller must 1215 * not touch the buffer again. Return false if the caller should proceed with 1216 * normal I/O completion handling. 1217 */ 1218 static bool 1219 xfs_buf_ioend_handle_error( 1220 struct xfs_buf *bp) 1221 { 1222 struct xfs_mount *mp = bp->b_mount; 1223 struct xfs_error_cfg *cfg; 1224 1225 /* 1226 * If we've already shutdown the journal because of I/O errors, there's 1227 * no point in giving this a retry. 1228 */ 1229 if (xlog_is_shutdown(mp->m_log)) 1230 goto out_stale; 1231 1232 xfs_buf_ioerror_alert_ratelimited(bp); 1233 1234 /* 1235 * We're not going to bother about retrying this during recovery. 1236 * One strike! 1237 */ 1238 if (bp->b_flags & _XBF_LOGRECOVERY) { 1239 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1240 return false; 1241 } 1242 1243 /* 1244 * Synchronous writes will have callers process the error. 1245 */ 1246 if (!(bp->b_flags & XBF_ASYNC)) 1247 goto out_stale; 1248 1249 trace_xfs_buf_iodone_async(bp, _RET_IP_); 1250 1251 cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); 1252 if (bp->b_last_error != bp->b_error || 1253 !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) { 1254 bp->b_last_error = bp->b_error; 1255 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && 1256 !bp->b_first_retry_time) 1257 bp->b_first_retry_time = jiffies; 1258 goto resubmit; 1259 } 1260 1261 /* 1262 * Permanent error - we need to trigger a shutdown if we haven't already 1263 * to indicate that inconsistency will result from this action. 1264 */ 1265 if (xfs_buf_ioerror_permanent(bp, cfg)) { 1266 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1267 goto out_stale; 1268 } 1269 1270 /* Still considered a transient error. Caller will schedule retries. */ 1271 if (bp->b_flags & _XBF_INODES) 1272 xfs_buf_inode_io_fail(bp); 1273 else if (bp->b_flags & _XBF_DQUOTS) 1274 xfs_buf_dquot_io_fail(bp); 1275 else 1276 ASSERT(list_empty(&bp->b_li_list)); 1277 xfs_buf_ioerror(bp, 0); 1278 xfs_buf_relse(bp); 1279 return true; 1280 1281 resubmit: 1282 xfs_buf_ioerror(bp, 0); 1283 bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL); 1284 xfs_buf_submit(bp); 1285 return true; 1286 out_stale: 1287 xfs_buf_stale(bp); 1288 bp->b_flags |= XBF_DONE; 1289 bp->b_flags &= ~XBF_WRITE; 1290 trace_xfs_buf_error_relse(bp, _RET_IP_); 1291 return false; 1292 } 1293 1294 static void 1295 xfs_buf_ioend( 1296 struct xfs_buf *bp) 1297 { 1298 trace_xfs_buf_iodone(bp, _RET_IP_); 1299 1300 /* 1301 * Pull in IO completion errors now. We are guaranteed to be running 1302 * single threaded, so we don't need the lock to read b_io_error. 1303 */ 1304 if (!bp->b_error && bp->b_io_error) 1305 xfs_buf_ioerror(bp, bp->b_io_error); 1306 1307 if (bp->b_flags & XBF_READ) { 1308 if (!bp->b_error && bp->b_ops) 1309 bp->b_ops->verify_read(bp); 1310 if (!bp->b_error) 1311 bp->b_flags |= XBF_DONE; 1312 } else { 1313 if (!bp->b_error) { 1314 bp->b_flags &= ~XBF_WRITE_FAIL; 1315 bp->b_flags |= XBF_DONE; 1316 } 1317 1318 if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp)) 1319 return; 1320 1321 /* clear the retry state */ 1322 bp->b_last_error = 0; 1323 bp->b_retries = 0; 1324 bp->b_first_retry_time = 0; 1325 1326 /* 1327 * Note that for things like remote attribute buffers, there may 1328 * not be a buffer log item here, so processing the buffer log 1329 * item must remain optional. 1330 */ 1331 if (bp->b_log_item) 1332 xfs_buf_item_done(bp); 1333 1334 if (bp->b_flags & _XBF_INODES) 1335 xfs_buf_inode_iodone(bp); 1336 else if (bp->b_flags & _XBF_DQUOTS) 1337 xfs_buf_dquot_iodone(bp); 1338 1339 } 1340 1341 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | 1342 _XBF_LOGRECOVERY); 1343 1344 if (bp->b_flags & XBF_ASYNC) 1345 xfs_buf_relse(bp); 1346 else 1347 complete(&bp->b_iowait); 1348 } 1349 1350 static void 1351 xfs_buf_ioend_work( 1352 struct work_struct *work) 1353 { 1354 struct xfs_buf *bp = 1355 container_of(work, struct xfs_buf, b_ioend_work); 1356 1357 xfs_buf_ioend(bp); 1358 } 1359 1360 static void 1361 xfs_buf_ioend_async( 1362 struct xfs_buf *bp) 1363 { 1364 INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); 1365 queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work); 1366 } 1367 1368 void 1369 __xfs_buf_ioerror( 1370 struct xfs_buf *bp, 1371 int error, 1372 xfs_failaddr_t failaddr) 1373 { 1374 ASSERT(error <= 0 && error >= -1000); 1375 bp->b_error = error; 1376 trace_xfs_buf_ioerror(bp, error, failaddr); 1377 } 1378 1379 void 1380 xfs_buf_ioerror_alert( 1381 struct xfs_buf *bp, 1382 xfs_failaddr_t func) 1383 { 1384 xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error", 1385 "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", 1386 func, (uint64_t)xfs_buf_daddr(bp), 1387 bp->b_length, -bp->b_error); 1388 } 1389 1390 /* 1391 * To simulate an I/O failure, the buffer must be locked and held with at least 1392 * three references. The LRU reference is dropped by the stale call. The buf 1393 * item reference is dropped via ioend processing. The third reference is owned 1394 * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC. 1395 */ 1396 void 1397 xfs_buf_ioend_fail( 1398 struct xfs_buf *bp) 1399 { 1400 bp->b_flags &= ~XBF_DONE; 1401 xfs_buf_stale(bp); 1402 xfs_buf_ioerror(bp, -EIO); 1403 xfs_buf_ioend(bp); 1404 } 1405 1406 int 1407 xfs_bwrite( 1408 struct xfs_buf *bp) 1409 { 1410 int error; 1411 1412 ASSERT(xfs_buf_islocked(bp)); 1413 1414 bp->b_flags |= XBF_WRITE; 1415 bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | 1416 XBF_DONE); 1417 1418 error = xfs_buf_submit(bp); 1419 if (error) 1420 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); 1421 return error; 1422 } 1423 1424 static void 1425 xfs_buf_bio_end_io( 1426 struct bio *bio) 1427 { 1428 struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private; 1429 1430 if (!bio->bi_status && 1431 (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && 1432 XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) 1433 bio->bi_status = BLK_STS_IOERR; 1434 1435 /* 1436 * don't overwrite existing errors - otherwise we can lose errors on 1437 * buffers that require multiple bios to complete. 1438 */ 1439 if (bio->bi_status) { 1440 int error = blk_status_to_errno(bio->bi_status); 1441 1442 cmpxchg(&bp->b_io_error, 0, error); 1443 } 1444 1445 if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1446 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1447 1448 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) 1449 xfs_buf_ioend_async(bp); 1450 bio_put(bio); 1451 } 1452 1453 static void 1454 xfs_buf_ioapply_map( 1455 struct xfs_buf *bp, 1456 int map, 1457 int *buf_offset, 1458 int *count, 1459 blk_opf_t op) 1460 { 1461 int page_index; 1462 unsigned int total_nr_pages = bp->b_page_count; 1463 int nr_pages; 1464 struct bio *bio; 1465 sector_t sector = bp->b_maps[map].bm_bn; 1466 int size; 1467 int offset; 1468 1469 /* skip the pages in the buffer before the start offset */ 1470 page_index = 0; 1471 offset = *buf_offset; 1472 while (offset >= PAGE_SIZE) { 1473 page_index++; 1474 offset -= PAGE_SIZE; 1475 } 1476 1477 /* 1478 * Limit the IO size to the length of the current vector, and update the 1479 * remaining IO count for the next time around. 1480 */ 1481 size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count); 1482 *count -= size; 1483 *buf_offset += size; 1484 1485 next_chunk: 1486 atomic_inc(&bp->b_io_remaining); 1487 nr_pages = bio_max_segs(total_nr_pages); 1488 1489 bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO); 1490 bio->bi_iter.bi_sector = sector; 1491 bio->bi_end_io = xfs_buf_bio_end_io; 1492 bio->bi_private = bp; 1493 1494 for (; size && nr_pages; nr_pages--, page_index++) { 1495 int rbytes, nbytes = PAGE_SIZE - offset; 1496 1497 if (nbytes > size) 1498 nbytes = size; 1499 1500 rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes, 1501 offset); 1502 if (rbytes < nbytes) 1503 break; 1504 1505 offset = 0; 1506 sector += BTOBB(nbytes); 1507 size -= nbytes; 1508 total_nr_pages--; 1509 } 1510 1511 if (likely(bio->bi_iter.bi_size)) { 1512 if (xfs_buf_is_vmapped(bp)) { 1513 flush_kernel_vmap_range(bp->b_addr, 1514 xfs_buf_vmap_len(bp)); 1515 } 1516 submit_bio(bio); 1517 if (size) 1518 goto next_chunk; 1519 } else { 1520 /* 1521 * This is guaranteed not to be the last io reference count 1522 * because the caller (xfs_buf_submit) holds a count itself. 1523 */ 1524 atomic_dec(&bp->b_io_remaining); 1525 xfs_buf_ioerror(bp, -EIO); 1526 bio_put(bio); 1527 } 1528 1529 } 1530 1531 STATIC void 1532 _xfs_buf_ioapply( 1533 struct xfs_buf *bp) 1534 { 1535 struct blk_plug plug; 1536 blk_opf_t op; 1537 int offset; 1538 int size; 1539 int i; 1540 1541 /* 1542 * Make sure we capture only current IO errors rather than stale errors 1543 * left over from previous use of the buffer (e.g. failed readahead). 1544 */ 1545 bp->b_error = 0; 1546 1547 if (bp->b_flags & XBF_WRITE) { 1548 op = REQ_OP_WRITE; 1549 1550 /* 1551 * Run the write verifier callback function if it exists. If 1552 * this function fails it will mark the buffer with an error and 1553 * the IO should not be dispatched. 1554 */ 1555 if (bp->b_ops) { 1556 bp->b_ops->verify_write(bp); 1557 if (bp->b_error) { 1558 xfs_force_shutdown(bp->b_mount, 1559 SHUTDOWN_CORRUPT_INCORE); 1560 return; 1561 } 1562 } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { 1563 struct xfs_mount *mp = bp->b_mount; 1564 1565 /* 1566 * non-crc filesystems don't attach verifiers during 1567 * log recovery, so don't warn for such filesystems. 1568 */ 1569 if (xfs_has_crc(mp)) { 1570 xfs_warn(mp, 1571 "%s: no buf ops on daddr 0x%llx len %d", 1572 __func__, xfs_buf_daddr(bp), 1573 bp->b_length); 1574 xfs_hex_dump(bp->b_addr, 1575 XFS_CORRUPTION_DUMP_LEN); 1576 dump_stack(); 1577 } 1578 } 1579 } else { 1580 op = REQ_OP_READ; 1581 if (bp->b_flags & XBF_READ_AHEAD) 1582 op |= REQ_RAHEAD; 1583 } 1584 1585 /* we only use the buffer cache for meta-data */ 1586 op |= REQ_META; 1587 1588 /* 1589 * Walk all the vectors issuing IO on them. Set up the initial offset 1590 * into the buffer and the desired IO size before we start - 1591 * _xfs_buf_ioapply_vec() will modify them appropriately for each 1592 * subsequent call. 1593 */ 1594 offset = bp->b_offset; 1595 size = BBTOB(bp->b_length); 1596 blk_start_plug(&plug); 1597 for (i = 0; i < bp->b_map_count; i++) { 1598 xfs_buf_ioapply_map(bp, i, &offset, &size, op); 1599 if (bp->b_error) 1600 break; 1601 if (size <= 0) 1602 break; /* all done */ 1603 } 1604 blk_finish_plug(&plug); 1605 } 1606 1607 /* 1608 * Wait for I/O completion of a sync buffer and return the I/O error code. 1609 */ 1610 static int 1611 xfs_buf_iowait( 1612 struct xfs_buf *bp) 1613 { 1614 ASSERT(!(bp->b_flags & XBF_ASYNC)); 1615 1616 trace_xfs_buf_iowait(bp, _RET_IP_); 1617 wait_for_completion(&bp->b_iowait); 1618 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1619 1620 return bp->b_error; 1621 } 1622 1623 /* 1624 * Buffer I/O submission path, read or write. Asynchronous submission transfers 1625 * the buffer lock ownership and the current reference to the IO. It is not 1626 * safe to reference the buffer after a call to this function unless the caller 1627 * holds an additional reference itself. 1628 */ 1629 static int 1630 __xfs_buf_submit( 1631 struct xfs_buf *bp, 1632 bool wait) 1633 { 1634 int error = 0; 1635 1636 trace_xfs_buf_submit(bp, _RET_IP_); 1637 1638 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1639 1640 /* 1641 * On log shutdown we stale and complete the buffer immediately. We can 1642 * be called to read the superblock before the log has been set up, so 1643 * be careful checking the log state. 1644 * 1645 * Checking the mount shutdown state here can result in the log tail 1646 * moving inappropriately on disk as the log may not yet be shut down. 1647 * i.e. failing this buffer on mount shutdown can remove it from the AIL 1648 * and move the tail of the log forwards without having written this 1649 * buffer to disk. This corrupts the log tail state in memory, and 1650 * because the log may not be shut down yet, it can then be propagated 1651 * to disk before the log is shutdown. Hence we check log shutdown 1652 * state here rather than mount state to avoid corrupting the log tail 1653 * on shutdown. 1654 */ 1655 if (bp->b_mount->m_log && 1656 xlog_is_shutdown(bp->b_mount->m_log)) { 1657 xfs_buf_ioend_fail(bp); 1658 return -EIO; 1659 } 1660 1661 /* 1662 * Grab a reference so the buffer does not go away underneath us. For 1663 * async buffers, I/O completion drops the callers reference, which 1664 * could occur before submission returns. 1665 */ 1666 xfs_buf_hold(bp); 1667 1668 if (bp->b_flags & XBF_WRITE) 1669 xfs_buf_wait_unpin(bp); 1670 1671 /* clear the internal error state to avoid spurious errors */ 1672 bp->b_io_error = 0; 1673 1674 /* 1675 * Set the count to 1 initially, this will stop an I/O completion 1676 * callout which happens before we have started all the I/O from calling 1677 * xfs_buf_ioend too early. 1678 */ 1679 atomic_set(&bp->b_io_remaining, 1); 1680 if (bp->b_flags & XBF_ASYNC) 1681 xfs_buf_ioacct_inc(bp); 1682 _xfs_buf_ioapply(bp); 1683 1684 /* 1685 * If _xfs_buf_ioapply failed, we can get back here with only the IO 1686 * reference we took above. If we drop it to zero, run completion so 1687 * that we don't return to the caller with completion still pending. 1688 */ 1689 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1690 if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) 1691 xfs_buf_ioend(bp); 1692 else 1693 xfs_buf_ioend_async(bp); 1694 } 1695 1696 if (wait) 1697 error = xfs_buf_iowait(bp); 1698 1699 /* 1700 * Release the hold that keeps the buffer referenced for the entire 1701 * I/O. Note that if the buffer is async, it is not safe to reference 1702 * after this release. 1703 */ 1704 xfs_buf_rele(bp); 1705 return error; 1706 } 1707 1708 void * 1709 xfs_buf_offset( 1710 struct xfs_buf *bp, 1711 size_t offset) 1712 { 1713 struct page *page; 1714 1715 if (bp->b_addr) 1716 return bp->b_addr + offset; 1717 1718 page = bp->b_pages[offset >> PAGE_SHIFT]; 1719 return page_address(page) + (offset & (PAGE_SIZE-1)); 1720 } 1721 1722 void 1723 xfs_buf_zero( 1724 struct xfs_buf *bp, 1725 size_t boff, 1726 size_t bsize) 1727 { 1728 size_t bend; 1729 1730 bend = boff + bsize; 1731 while (boff < bend) { 1732 struct page *page; 1733 int page_index, page_offset, csize; 1734 1735 page_index = (boff + bp->b_offset) >> PAGE_SHIFT; 1736 page_offset = (boff + bp->b_offset) & ~PAGE_MASK; 1737 page = bp->b_pages[page_index]; 1738 csize = min_t(size_t, PAGE_SIZE - page_offset, 1739 BBTOB(bp->b_length) - boff); 1740 1741 ASSERT((csize + page_offset) <= PAGE_SIZE); 1742 1743 memset(page_address(page) + page_offset, 0, csize); 1744 1745 boff += csize; 1746 } 1747 } 1748 1749 /* 1750 * Log a message about and stale a buffer that a caller has decided is corrupt. 1751 * 1752 * This function should be called for the kinds of metadata corruption that 1753 * cannot be detect from a verifier, such as incorrect inter-block relationship 1754 * data. Do /not/ call this function from a verifier function. 1755 * 1756 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will 1757 * be marked stale, but b_error will not be set. The caller is responsible for 1758 * releasing the buffer or fixing it. 1759 */ 1760 void 1761 __xfs_buf_mark_corrupt( 1762 struct xfs_buf *bp, 1763 xfs_failaddr_t fa) 1764 { 1765 ASSERT(bp->b_flags & XBF_DONE); 1766 1767 xfs_buf_corruption_error(bp, fa); 1768 xfs_buf_stale(bp); 1769 } 1770 1771 /* 1772 * Handling of buffer targets (buftargs). 1773 */ 1774 1775 /* 1776 * Wait for any bufs with callbacks that have been submitted but have not yet 1777 * returned. These buffers will have an elevated hold count, so wait on those 1778 * while freeing all the buffers only held by the LRU. 1779 */ 1780 static enum lru_status 1781 xfs_buftarg_drain_rele( 1782 struct list_head *item, 1783 struct list_lru_one *lru, 1784 spinlock_t *lru_lock, 1785 void *arg) 1786 1787 { 1788 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1789 struct list_head *dispose = arg; 1790 1791 if (atomic_read(&bp->b_hold) > 1) { 1792 /* need to wait, so skip it this pass */ 1793 trace_xfs_buf_drain_buftarg(bp, _RET_IP_); 1794 return LRU_SKIP; 1795 } 1796 if (!spin_trylock(&bp->b_lock)) 1797 return LRU_SKIP; 1798 1799 /* 1800 * clear the LRU reference count so the buffer doesn't get 1801 * ignored in xfs_buf_rele(). 1802 */ 1803 atomic_set(&bp->b_lru_ref, 0); 1804 bp->b_state |= XFS_BSTATE_DISPOSE; 1805 list_lru_isolate_move(lru, item, dispose); 1806 spin_unlock(&bp->b_lock); 1807 return LRU_REMOVED; 1808 } 1809 1810 /* 1811 * Wait for outstanding I/O on the buftarg to complete. 1812 */ 1813 void 1814 xfs_buftarg_wait( 1815 struct xfs_buftarg *btp) 1816 { 1817 /* 1818 * First wait on the buftarg I/O count for all in-flight buffers to be 1819 * released. This is critical as new buffers do not make the LRU until 1820 * they are released. 1821 * 1822 * Next, flush the buffer workqueue to ensure all completion processing 1823 * has finished. Just waiting on buffer locks is not sufficient for 1824 * async IO as the reference count held over IO is not released until 1825 * after the buffer lock is dropped. Hence we need to ensure here that 1826 * all reference counts have been dropped before we start walking the 1827 * LRU list. 1828 */ 1829 while (percpu_counter_sum(&btp->bt_io_count)) 1830 delay(100); 1831 flush_workqueue(btp->bt_mount->m_buf_workqueue); 1832 } 1833 1834 void 1835 xfs_buftarg_drain( 1836 struct xfs_buftarg *btp) 1837 { 1838 LIST_HEAD(dispose); 1839 int loop = 0; 1840 bool write_fail = false; 1841 1842 xfs_buftarg_wait(btp); 1843 1844 /* loop until there is nothing left on the lru list. */ 1845 while (list_lru_count(&btp->bt_lru)) { 1846 list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele, 1847 &dispose, LONG_MAX); 1848 1849 while (!list_empty(&dispose)) { 1850 struct xfs_buf *bp; 1851 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1852 list_del_init(&bp->b_lru); 1853 if (bp->b_flags & XBF_WRITE_FAIL) { 1854 write_fail = true; 1855 xfs_buf_alert_ratelimited(bp, 1856 "XFS: Corruption Alert", 1857 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", 1858 (long long)xfs_buf_daddr(bp)); 1859 } 1860 xfs_buf_rele(bp); 1861 } 1862 if (loop++ != 0) 1863 delay(100); 1864 } 1865 1866 /* 1867 * If one or more failed buffers were freed, that means dirty metadata 1868 * was thrown away. This should only ever happen after I/O completion 1869 * handling has elevated I/O error(s) to permanent failures and shuts 1870 * down the journal. 1871 */ 1872 if (write_fail) { 1873 ASSERT(xlog_is_shutdown(btp->bt_mount->m_log)); 1874 xfs_alert(btp->bt_mount, 1875 "Please run xfs_repair to determine the extent of the problem."); 1876 } 1877 } 1878 1879 static enum lru_status 1880 xfs_buftarg_isolate( 1881 struct list_head *item, 1882 struct list_lru_one *lru, 1883 spinlock_t *lru_lock, 1884 void *arg) 1885 { 1886 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1887 struct list_head *dispose = arg; 1888 1889 /* 1890 * we are inverting the lru lock/bp->b_lock here, so use a trylock. 1891 * If we fail to get the lock, just skip it. 1892 */ 1893 if (!spin_trylock(&bp->b_lock)) 1894 return LRU_SKIP; 1895 /* 1896 * Decrement the b_lru_ref count unless the value is already 1897 * zero. If the value is already zero, we need to reclaim the 1898 * buffer, otherwise it gets another trip through the LRU. 1899 */ 1900 if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) { 1901 spin_unlock(&bp->b_lock); 1902 return LRU_ROTATE; 1903 } 1904 1905 bp->b_state |= XFS_BSTATE_DISPOSE; 1906 list_lru_isolate_move(lru, item, dispose); 1907 spin_unlock(&bp->b_lock); 1908 return LRU_REMOVED; 1909 } 1910 1911 static unsigned long 1912 xfs_buftarg_shrink_scan( 1913 struct shrinker *shrink, 1914 struct shrink_control *sc) 1915 { 1916 struct xfs_buftarg *btp = shrink->private_data; 1917 LIST_HEAD(dispose); 1918 unsigned long freed; 1919 1920 freed = list_lru_shrink_walk(&btp->bt_lru, sc, 1921 xfs_buftarg_isolate, &dispose); 1922 1923 while (!list_empty(&dispose)) { 1924 struct xfs_buf *bp; 1925 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1926 list_del_init(&bp->b_lru); 1927 xfs_buf_rele(bp); 1928 } 1929 1930 return freed; 1931 } 1932 1933 static unsigned long 1934 xfs_buftarg_shrink_count( 1935 struct shrinker *shrink, 1936 struct shrink_control *sc) 1937 { 1938 struct xfs_buftarg *btp = shrink->private_data; 1939 return list_lru_shrink_count(&btp->bt_lru, sc); 1940 } 1941 1942 void 1943 xfs_free_buftarg( 1944 struct xfs_buftarg *btp) 1945 { 1946 shrinker_free(btp->bt_shrinker); 1947 ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); 1948 percpu_counter_destroy(&btp->bt_io_count); 1949 list_lru_destroy(&btp->bt_lru); 1950 1951 fs_put_dax(btp->bt_daxdev, btp->bt_mount); 1952 /* the main block device is closed by kill_block_super */ 1953 if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) 1954 bdev_release(btp->bt_bdev_handle); 1955 1956 kmem_free(btp); 1957 } 1958 1959 int 1960 xfs_setsize_buftarg( 1961 xfs_buftarg_t *btp, 1962 unsigned int sectorsize) 1963 { 1964 /* Set up metadata sector size info */ 1965 btp->bt_meta_sectorsize = sectorsize; 1966 btp->bt_meta_sectormask = sectorsize - 1; 1967 1968 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1969 xfs_warn(btp->bt_mount, 1970 "Cannot set_blocksize to %u on device %pg", 1971 sectorsize, btp->bt_bdev); 1972 return -EINVAL; 1973 } 1974 1975 /* Set up device logical sector size mask */ 1976 btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); 1977 btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; 1978 1979 return 0; 1980 } 1981 1982 /* 1983 * When allocating the initial buffer target we have not yet 1984 * read in the superblock, so don't know what sized sectors 1985 * are being used at this early stage. Play safe. 1986 */ 1987 STATIC int 1988 xfs_setsize_buftarg_early( 1989 xfs_buftarg_t *btp) 1990 { 1991 return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)); 1992 } 1993 1994 struct xfs_buftarg * 1995 xfs_alloc_buftarg( 1996 struct xfs_mount *mp, 1997 struct bdev_handle *bdev_handle) 1998 { 1999 xfs_buftarg_t *btp; 2000 const struct dax_holder_operations *ops = NULL; 2001 2002 #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) 2003 ops = &xfs_dax_holder_operations; 2004 #endif 2005 btp = kmem_zalloc(sizeof(*btp), KM_NOFS); 2006 2007 btp->bt_mount = mp; 2008 btp->bt_bdev_handle = bdev_handle; 2009 btp->bt_dev = bdev_handle->bdev->bd_dev; 2010 btp->bt_bdev = bdev_handle->bdev; 2011 btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, 2012 mp, ops); 2013 2014 /* 2015 * Buffer IO error rate limiting. Limit it to no more than 10 messages 2016 * per 30 seconds so as to not spam logs too much on repeated errors. 2017 */ 2018 ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, 2019 DEFAULT_RATELIMIT_BURST); 2020 2021 if (xfs_setsize_buftarg_early(btp)) 2022 goto error_free; 2023 2024 if (list_lru_init(&btp->bt_lru)) 2025 goto error_free; 2026 2027 if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) 2028 goto error_lru; 2029 2030 btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", 2031 mp->m_super->s_id); 2032 if (!btp->bt_shrinker) 2033 goto error_pcpu; 2034 2035 btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; 2036 btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; 2037 btp->bt_shrinker->private_data = btp; 2038 2039 shrinker_register(btp->bt_shrinker); 2040 2041 return btp; 2042 2043 error_pcpu: 2044 percpu_counter_destroy(&btp->bt_io_count); 2045 error_lru: 2046 list_lru_destroy(&btp->bt_lru); 2047 error_free: 2048 kmem_free(btp); 2049 return NULL; 2050 } 2051 2052 static inline void 2053 xfs_buf_list_del( 2054 struct xfs_buf *bp) 2055 { 2056 list_del_init(&bp->b_list); 2057 wake_up_var(&bp->b_list); 2058 } 2059 2060 /* 2061 * Cancel a delayed write list. 2062 * 2063 * Remove each buffer from the list, clear the delwri queue flag and drop the 2064 * associated buffer reference. 2065 */ 2066 void 2067 xfs_buf_delwri_cancel( 2068 struct list_head *list) 2069 { 2070 struct xfs_buf *bp; 2071 2072 while (!list_empty(list)) { 2073 bp = list_first_entry(list, struct xfs_buf, b_list); 2074 2075 xfs_buf_lock(bp); 2076 bp->b_flags &= ~_XBF_DELWRI_Q; 2077 xfs_buf_list_del(bp); 2078 xfs_buf_relse(bp); 2079 } 2080 } 2081 2082 /* 2083 * Add a buffer to the delayed write list. 2084 * 2085 * This queues a buffer for writeout if it hasn't already been. Note that 2086 * neither this routine nor the buffer list submission functions perform 2087 * any internal synchronization. It is expected that the lists are thread-local 2088 * to the callers. 2089 * 2090 * Returns true if we queued up the buffer, or false if it already had 2091 * been on the buffer list. 2092 */ 2093 bool 2094 xfs_buf_delwri_queue( 2095 struct xfs_buf *bp, 2096 struct list_head *list) 2097 { 2098 ASSERT(xfs_buf_islocked(bp)); 2099 ASSERT(!(bp->b_flags & XBF_READ)); 2100 2101 /* 2102 * If the buffer is already marked delwri it already is queued up 2103 * by someone else for imediate writeout. Just ignore it in that 2104 * case. 2105 */ 2106 if (bp->b_flags & _XBF_DELWRI_Q) { 2107 trace_xfs_buf_delwri_queued(bp, _RET_IP_); 2108 return false; 2109 } 2110 2111 trace_xfs_buf_delwri_queue(bp, _RET_IP_); 2112 2113 /* 2114 * If a buffer gets written out synchronously or marked stale while it 2115 * is on a delwri list we lazily remove it. To do this, the other party 2116 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. 2117 * It remains referenced and on the list. In a rare corner case it 2118 * might get readded to a delwri list after the synchronous writeout, in 2119 * which case we need just need to re-add the flag here. 2120 */ 2121 bp->b_flags |= _XBF_DELWRI_Q; 2122 if (list_empty(&bp->b_list)) { 2123 atomic_inc(&bp->b_hold); 2124 list_add_tail(&bp->b_list, list); 2125 } 2126 2127 return true; 2128 } 2129 2130 /* 2131 * Queue a buffer to this delwri list as part of a data integrity operation. 2132 * If the buffer is on any other delwri list, we'll wait for that to clear 2133 * so that the caller can submit the buffer for IO and wait for the result. 2134 * Callers must ensure the buffer is not already on the list. 2135 */ 2136 void 2137 xfs_buf_delwri_queue_here( 2138 struct xfs_buf *bp, 2139 struct list_head *buffer_list) 2140 { 2141 /* 2142 * We need this buffer to end up on the /caller's/ delwri list, not any 2143 * old list. This can happen if the buffer is marked stale (which 2144 * clears DELWRI_Q) after the AIL queues the buffer to its list but 2145 * before the AIL has a chance to submit the list. 2146 */ 2147 while (!list_empty(&bp->b_list)) { 2148 xfs_buf_unlock(bp); 2149 wait_var_event(&bp->b_list, list_empty(&bp->b_list)); 2150 xfs_buf_lock(bp); 2151 } 2152 2153 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 2154 2155 xfs_buf_delwri_queue(bp, buffer_list); 2156 } 2157 2158 /* 2159 * Compare function is more complex than it needs to be because 2160 * the return value is only 32 bits and we are doing comparisons 2161 * on 64 bit values 2162 */ 2163 static int 2164 xfs_buf_cmp( 2165 void *priv, 2166 const struct list_head *a, 2167 const struct list_head *b) 2168 { 2169 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); 2170 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); 2171 xfs_daddr_t diff; 2172 2173 diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; 2174 if (diff < 0) 2175 return -1; 2176 if (diff > 0) 2177 return 1; 2178 return 0; 2179 } 2180 2181 /* 2182 * Submit buffers for write. If wait_list is specified, the buffers are 2183 * submitted using sync I/O and placed on the wait list such that the caller can 2184 * iowait each buffer. Otherwise async I/O is used and the buffers are released 2185 * at I/O completion time. In either case, buffers remain locked until I/O 2186 * completes and the buffer is released from the queue. 2187 */ 2188 static int 2189 xfs_buf_delwri_submit_buffers( 2190 struct list_head *buffer_list, 2191 struct list_head *wait_list) 2192 { 2193 struct xfs_buf *bp, *n; 2194 int pinned = 0; 2195 struct blk_plug plug; 2196 2197 list_sort(NULL, buffer_list, xfs_buf_cmp); 2198 2199 blk_start_plug(&plug); 2200 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 2201 if (!wait_list) { 2202 if (!xfs_buf_trylock(bp)) 2203 continue; 2204 if (xfs_buf_ispinned(bp)) { 2205 xfs_buf_unlock(bp); 2206 pinned++; 2207 continue; 2208 } 2209 } else { 2210 xfs_buf_lock(bp); 2211 } 2212 2213 /* 2214 * Someone else might have written the buffer synchronously or 2215 * marked it stale in the meantime. In that case only the 2216 * _XBF_DELWRI_Q flag got cleared, and we have to drop the 2217 * reference and remove it from the list here. 2218 */ 2219 if (!(bp->b_flags & _XBF_DELWRI_Q)) { 2220 xfs_buf_list_del(bp); 2221 xfs_buf_relse(bp); 2222 continue; 2223 } 2224 2225 trace_xfs_buf_delwri_split(bp, _RET_IP_); 2226 2227 /* 2228 * If we have a wait list, each buffer (and associated delwri 2229 * queue reference) transfers to it and is submitted 2230 * synchronously. Otherwise, drop the buffer from the delwri 2231 * queue and submit async. 2232 */ 2233 bp->b_flags &= ~_XBF_DELWRI_Q; 2234 bp->b_flags |= XBF_WRITE; 2235 if (wait_list) { 2236 bp->b_flags &= ~XBF_ASYNC; 2237 list_move_tail(&bp->b_list, wait_list); 2238 } else { 2239 bp->b_flags |= XBF_ASYNC; 2240 xfs_buf_list_del(bp); 2241 } 2242 __xfs_buf_submit(bp, false); 2243 } 2244 blk_finish_plug(&plug); 2245 2246 return pinned; 2247 } 2248 2249 /* 2250 * Write out a buffer list asynchronously. 2251 * 2252 * This will take the @buffer_list, write all non-locked and non-pinned buffers 2253 * out and not wait for I/O completion on any of the buffers. This interface 2254 * is only safely useable for callers that can track I/O completion by higher 2255 * level means, e.g. AIL pushing as the @buffer_list is consumed in this 2256 * function. 2257 * 2258 * Note: this function will skip buffers it would block on, and in doing so 2259 * leaves them on @buffer_list so they can be retried on a later pass. As such, 2260 * it is up to the caller to ensure that the buffer list is fully submitted or 2261 * cancelled appropriately when they are finished with the list. Failure to 2262 * cancel or resubmit the list until it is empty will result in leaked buffers 2263 * at unmount time. 2264 */ 2265 int 2266 xfs_buf_delwri_submit_nowait( 2267 struct list_head *buffer_list) 2268 { 2269 return xfs_buf_delwri_submit_buffers(buffer_list, NULL); 2270 } 2271 2272 /* 2273 * Write out a buffer list synchronously. 2274 * 2275 * This will take the @buffer_list, write all buffers out and wait for I/O 2276 * completion on all of the buffers. @buffer_list is consumed by the function, 2277 * so callers must have some other way of tracking buffers if they require such 2278 * functionality. 2279 */ 2280 int 2281 xfs_buf_delwri_submit( 2282 struct list_head *buffer_list) 2283 { 2284 LIST_HEAD (wait_list); 2285 int error = 0, error2; 2286 struct xfs_buf *bp; 2287 2288 xfs_buf_delwri_submit_buffers(buffer_list, &wait_list); 2289 2290 /* Wait for IO to complete. */ 2291 while (!list_empty(&wait_list)) { 2292 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 2293 2294 xfs_buf_list_del(bp); 2295 2296 /* 2297 * Wait on the locked buffer, check for errors and unlock and 2298 * release the delwri queue reference. 2299 */ 2300 error2 = xfs_buf_iowait(bp); 2301 xfs_buf_relse(bp); 2302 if (!error) 2303 error = error2; 2304 } 2305 2306 return error; 2307 } 2308 2309 /* 2310 * Push a single buffer on a delwri queue. 2311 * 2312 * The purpose of this function is to submit a single buffer of a delwri queue 2313 * and return with the buffer still on the original queue. The waiting delwri 2314 * buffer submission infrastructure guarantees transfer of the delwri queue 2315 * buffer reference to a temporary wait list. We reuse this infrastructure to 2316 * transfer the buffer back to the original queue. 2317 * 2318 * Note the buffer transitions from the queued state, to the submitted and wait 2319 * listed state and back to the queued state during this call. The buffer 2320 * locking and queue management logic between _delwri_pushbuf() and 2321 * _delwri_queue() guarantee that the buffer cannot be queued to another list 2322 * before returning. 2323 */ 2324 int 2325 xfs_buf_delwri_pushbuf( 2326 struct xfs_buf *bp, 2327 struct list_head *buffer_list) 2328 { 2329 LIST_HEAD (submit_list); 2330 int error; 2331 2332 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 2333 2334 trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); 2335 2336 /* 2337 * Isolate the buffer to a new local list so we can submit it for I/O 2338 * independently from the rest of the original list. 2339 */ 2340 xfs_buf_lock(bp); 2341 list_move(&bp->b_list, &submit_list); 2342 xfs_buf_unlock(bp); 2343 2344 /* 2345 * Delwri submission clears the DELWRI_Q buffer flag and returns with 2346 * the buffer on the wait list with the original reference. Rather than 2347 * bounce the buffer from a local wait list back to the original list 2348 * after I/O completion, reuse the original list as the wait list. 2349 */ 2350 xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); 2351 2352 /* 2353 * The buffer is now locked, under I/O and wait listed on the original 2354 * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and 2355 * return with the buffer unlocked and on the original queue. 2356 */ 2357 error = xfs_buf_iowait(bp); 2358 bp->b_flags |= _XBF_DELWRI_Q; 2359 xfs_buf_unlock(bp); 2360 2361 return error; 2362 } 2363 2364 void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) 2365 { 2366 /* 2367 * Set the lru reference count to 0 based on the error injection tag. 2368 * This allows userspace to disrupt buffer caching for debug/testing 2369 * purposes. 2370 */ 2371 if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) 2372 lru_ref = 0; 2373 2374 atomic_set(&bp->b_lru_ref, lru_ref); 2375 } 2376 2377 /* 2378 * Verify an on-disk magic value against the magic value specified in the 2379 * verifier structure. The verifier magic is in disk byte order so the caller is 2380 * expected to pass the value directly from disk. 2381 */ 2382 bool 2383 xfs_verify_magic( 2384 struct xfs_buf *bp, 2385 __be32 dmagic) 2386 { 2387 struct xfs_mount *mp = bp->b_mount; 2388 int idx; 2389 2390 idx = xfs_has_crc(mp); 2391 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) 2392 return false; 2393 return dmagic == bp->b_ops->magic[idx]; 2394 } 2395 /* 2396 * Verify an on-disk magic value against the magic value specified in the 2397 * verifier structure. The verifier magic is in disk byte order so the caller is 2398 * expected to pass the value directly from disk. 2399 */ 2400 bool 2401 xfs_verify_magic16( 2402 struct xfs_buf *bp, 2403 __be16 dmagic) 2404 { 2405 struct xfs_mount *mp = bp->b_mount; 2406 int idx; 2407 2408 idx = xfs_has_crc(mp); 2409 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) 2410 return false; 2411 return dmagic == bp->b_ops->magic16[idx]; 2412 } 2413