1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include <linux/backing-dev.h> 8 #include <linux/dax.h> 9 10 #include "xfs_shared.h" 11 #include "xfs_format.h" 12 #include "xfs_log_format.h" 13 #include "xfs_trans_resv.h" 14 #include "xfs_mount.h" 15 #include "xfs_trace.h" 16 #include "xfs_log.h" 17 #include "xfs_log_recover.h" 18 #include "xfs_log_priv.h" 19 #include "xfs_trans.h" 20 #include "xfs_buf_item.h" 21 #include "xfs_errortag.h" 22 #include "xfs_error.h" 23 #include "xfs_ag.h" 24 25 struct kmem_cache *xfs_buf_cache; 26 27 /* 28 * Locking orders 29 * 30 * xfs_buf_ioacct_inc: 31 * xfs_buf_ioacct_dec: 32 * b_sema (caller holds) 33 * b_lock 34 * 35 * xfs_buf_stale: 36 * b_sema (caller holds) 37 * b_lock 38 * lru_lock 39 * 40 * xfs_buf_rele: 41 * b_lock 42 * pag_buf_lock 43 * lru_lock 44 * 45 * xfs_buftarg_drain_rele 46 * lru_lock 47 * b_lock (trylock due to inversion) 48 * 49 * xfs_buftarg_isolate 50 * lru_lock 51 * b_lock (trylock due to inversion) 52 */ 53 54 static int __xfs_buf_submit(struct xfs_buf *bp, bool wait); 55 56 static inline int 57 xfs_buf_submit( 58 struct xfs_buf *bp) 59 { 60 return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC)); 61 } 62 63 static inline int 64 xfs_buf_is_vmapped( 65 struct xfs_buf *bp) 66 { 67 /* 68 * Return true if the buffer is vmapped. 69 * 70 * b_addr is null if the buffer is not mapped, but the code is clever 71 * enough to know it doesn't have to map a single page, so the check has 72 * to be both for b_addr and bp->b_page_count > 1. 73 */ 74 return bp->b_addr && bp->b_page_count > 1; 75 } 76 77 static inline int 78 xfs_buf_vmap_len( 79 struct xfs_buf *bp) 80 { 81 return (bp->b_page_count * PAGE_SIZE); 82 } 83 84 /* 85 * Bump the I/O in flight count on the buftarg if we haven't yet done so for 86 * this buffer. The count is incremented once per buffer (per hold cycle) 87 * because the corresponding decrement is deferred to buffer release. Buffers 88 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O 89 * tracking adds unnecessary overhead. This is used for sychronization purposes 90 * with unmount (see xfs_buftarg_drain()), so all we really need is a count of 91 * in-flight buffers. 92 * 93 * Buffers that are never released (e.g., superblock, iclog buffers) must set 94 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count 95 * never reaches zero and unmount hangs indefinitely. 96 */ 97 static inline void 98 xfs_buf_ioacct_inc( 99 struct xfs_buf *bp) 100 { 101 if (bp->b_flags & XBF_NO_IOACCT) 102 return; 103 104 ASSERT(bp->b_flags & XBF_ASYNC); 105 spin_lock(&bp->b_lock); 106 if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) { 107 bp->b_state |= XFS_BSTATE_IN_FLIGHT; 108 percpu_counter_inc(&bp->b_target->bt_io_count); 109 } 110 spin_unlock(&bp->b_lock); 111 } 112 113 /* 114 * Clear the in-flight state on a buffer about to be released to the LRU or 115 * freed and unaccount from the buftarg. 116 */ 117 static inline void 118 __xfs_buf_ioacct_dec( 119 struct xfs_buf *bp) 120 { 121 lockdep_assert_held(&bp->b_lock); 122 123 if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { 124 bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; 125 percpu_counter_dec(&bp->b_target->bt_io_count); 126 } 127 } 128 129 static inline void 130 xfs_buf_ioacct_dec( 131 struct xfs_buf *bp) 132 { 133 spin_lock(&bp->b_lock); 134 __xfs_buf_ioacct_dec(bp); 135 spin_unlock(&bp->b_lock); 136 } 137 138 /* 139 * When we mark a buffer stale, we remove the buffer from the LRU and clear the 140 * b_lru_ref count so that the buffer is freed immediately when the buffer 141 * reference count falls to zero. If the buffer is already on the LRU, we need 142 * to remove the reference that LRU holds on the buffer. 143 * 144 * This prevents build-up of stale buffers on the LRU. 145 */ 146 void 147 xfs_buf_stale( 148 struct xfs_buf *bp) 149 { 150 ASSERT(xfs_buf_islocked(bp)); 151 152 bp->b_flags |= XBF_STALE; 153 154 /* 155 * Clear the delwri status so that a delwri queue walker will not 156 * flush this buffer to disk now that it is stale. The delwri queue has 157 * a reference to the buffer, so this is safe to do. 158 */ 159 bp->b_flags &= ~_XBF_DELWRI_Q; 160 161 /* 162 * Once the buffer is marked stale and unlocked, a subsequent lookup 163 * could reset b_flags. There is no guarantee that the buffer is 164 * unaccounted (released to LRU) before that occurs. Drop in-flight 165 * status now to preserve accounting consistency. 166 */ 167 spin_lock(&bp->b_lock); 168 __xfs_buf_ioacct_dec(bp); 169 170 atomic_set(&bp->b_lru_ref, 0); 171 if (!(bp->b_state & XFS_BSTATE_DISPOSE) && 172 (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) 173 atomic_dec(&bp->b_hold); 174 175 ASSERT(atomic_read(&bp->b_hold) >= 1); 176 spin_unlock(&bp->b_lock); 177 } 178 179 static int 180 xfs_buf_get_maps( 181 struct xfs_buf *bp, 182 int map_count) 183 { 184 ASSERT(bp->b_maps == NULL); 185 bp->b_map_count = map_count; 186 187 if (map_count == 1) { 188 bp->b_maps = &bp->__b_map; 189 return 0; 190 } 191 192 bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map), 193 GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); 194 if (!bp->b_maps) 195 return -ENOMEM; 196 return 0; 197 } 198 199 /* 200 * Frees b_pages if it was allocated. 201 */ 202 static void 203 xfs_buf_free_maps( 204 struct xfs_buf *bp) 205 { 206 if (bp->b_maps != &bp->__b_map) { 207 kfree(bp->b_maps); 208 bp->b_maps = NULL; 209 } 210 } 211 212 static int 213 _xfs_buf_alloc( 214 struct xfs_buftarg *target, 215 struct xfs_buf_map *map, 216 int nmaps, 217 xfs_buf_flags_t flags, 218 struct xfs_buf **bpp) 219 { 220 struct xfs_buf *bp; 221 int error; 222 int i; 223 224 *bpp = NULL; 225 bp = kmem_cache_zalloc(xfs_buf_cache, 226 GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); 227 228 /* 229 * We don't want certain flags to appear in b_flags unless they are 230 * specifically set by later operations on the buffer. 231 */ 232 flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); 233 234 atomic_set(&bp->b_hold, 1); 235 atomic_set(&bp->b_lru_ref, 1); 236 init_completion(&bp->b_iowait); 237 INIT_LIST_HEAD(&bp->b_lru); 238 INIT_LIST_HEAD(&bp->b_list); 239 INIT_LIST_HEAD(&bp->b_li_list); 240 sema_init(&bp->b_sema, 0); /* held, no waiters */ 241 spin_lock_init(&bp->b_lock); 242 bp->b_target = target; 243 bp->b_mount = target->bt_mount; 244 bp->b_flags = flags; 245 246 /* 247 * Set length and io_length to the same value initially. 248 * I/O routines should use io_length, which will be the same in 249 * most cases but may be reset (e.g. XFS recovery). 250 */ 251 error = xfs_buf_get_maps(bp, nmaps); 252 if (error) { 253 kmem_cache_free(xfs_buf_cache, bp); 254 return error; 255 } 256 257 bp->b_rhash_key = map[0].bm_bn; 258 bp->b_length = 0; 259 for (i = 0; i < nmaps; i++) { 260 bp->b_maps[i].bm_bn = map[i].bm_bn; 261 bp->b_maps[i].bm_len = map[i].bm_len; 262 bp->b_length += map[i].bm_len; 263 } 264 265 atomic_set(&bp->b_pin_count, 0); 266 init_waitqueue_head(&bp->b_waiters); 267 268 XFS_STATS_INC(bp->b_mount, xb_create); 269 trace_xfs_buf_init(bp, _RET_IP_); 270 271 *bpp = bp; 272 return 0; 273 } 274 275 static void 276 xfs_buf_free_pages( 277 struct xfs_buf *bp) 278 { 279 uint i; 280 281 ASSERT(bp->b_flags & _XBF_PAGES); 282 283 if (xfs_buf_is_vmapped(bp)) 284 vm_unmap_ram(bp->b_addr, bp->b_page_count); 285 286 for (i = 0; i < bp->b_page_count; i++) { 287 if (bp->b_pages[i]) 288 __free_page(bp->b_pages[i]); 289 } 290 mm_account_reclaimed_pages(bp->b_page_count); 291 292 if (bp->b_pages != bp->b_page_array) 293 kfree(bp->b_pages); 294 bp->b_pages = NULL; 295 bp->b_flags &= ~_XBF_PAGES; 296 } 297 298 static void 299 xfs_buf_free_callback( 300 struct callback_head *cb) 301 { 302 struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); 303 304 xfs_buf_free_maps(bp); 305 kmem_cache_free(xfs_buf_cache, bp); 306 } 307 308 static void 309 xfs_buf_free( 310 struct xfs_buf *bp) 311 { 312 trace_xfs_buf_free(bp, _RET_IP_); 313 314 ASSERT(list_empty(&bp->b_lru)); 315 316 if (bp->b_flags & _XBF_PAGES) 317 xfs_buf_free_pages(bp); 318 else if (bp->b_flags & _XBF_KMEM) 319 kfree(bp->b_addr); 320 321 call_rcu(&bp->b_rcu, xfs_buf_free_callback); 322 } 323 324 static int 325 xfs_buf_alloc_kmem( 326 struct xfs_buf *bp, 327 xfs_buf_flags_t flags) 328 { 329 gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL; 330 size_t size = BBTOB(bp->b_length); 331 332 /* Assure zeroed buffer for non-read cases. */ 333 if (!(flags & XBF_READ)) 334 gfp_mask |= __GFP_ZERO; 335 336 bp->b_addr = kmalloc(size, gfp_mask); 337 if (!bp->b_addr) 338 return -ENOMEM; 339 340 if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != 341 ((unsigned long)bp->b_addr & PAGE_MASK)) { 342 /* b_addr spans two pages - use alloc_page instead */ 343 kfree(bp->b_addr); 344 bp->b_addr = NULL; 345 return -ENOMEM; 346 } 347 bp->b_offset = offset_in_page(bp->b_addr); 348 bp->b_pages = bp->b_page_array; 349 bp->b_pages[0] = kmem_to_page(bp->b_addr); 350 bp->b_page_count = 1; 351 bp->b_flags |= _XBF_KMEM; 352 return 0; 353 } 354 355 static int 356 xfs_buf_alloc_pages( 357 struct xfs_buf *bp, 358 xfs_buf_flags_t flags) 359 { 360 gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; 361 long filled = 0; 362 363 if (flags & XBF_READ_AHEAD) 364 gfp_mask |= __GFP_NORETRY; 365 366 /* Make sure that we have a page list */ 367 bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); 368 if (bp->b_page_count <= XB_PAGES) { 369 bp->b_pages = bp->b_page_array; 370 } else { 371 bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count, 372 gfp_mask); 373 if (!bp->b_pages) 374 return -ENOMEM; 375 } 376 bp->b_flags |= _XBF_PAGES; 377 378 /* Assure zeroed buffer for non-read cases. */ 379 if (!(flags & XBF_READ)) 380 gfp_mask |= __GFP_ZERO; 381 382 /* 383 * Bulk filling of pages can take multiple calls. Not filling the entire 384 * array is not an allocation failure, so don't back off if we get at 385 * least one extra page. 386 */ 387 for (;;) { 388 long last = filled; 389 390 filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count, 391 bp->b_pages); 392 if (filled == bp->b_page_count) { 393 XFS_STATS_INC(bp->b_mount, xb_page_found); 394 break; 395 } 396 397 if (filled != last) 398 continue; 399 400 if (flags & XBF_READ_AHEAD) { 401 xfs_buf_free_pages(bp); 402 return -ENOMEM; 403 } 404 405 XFS_STATS_INC(bp->b_mount, xb_page_retries); 406 memalloc_retry_wait(gfp_mask); 407 } 408 return 0; 409 } 410 411 /* 412 * Map buffer into kernel address-space if necessary. 413 */ 414 STATIC int 415 _xfs_buf_map_pages( 416 struct xfs_buf *bp, 417 xfs_buf_flags_t flags) 418 { 419 ASSERT(bp->b_flags & _XBF_PAGES); 420 if (bp->b_page_count == 1) { 421 /* A single page buffer is always mappable */ 422 bp->b_addr = page_address(bp->b_pages[0]); 423 } else if (flags & XBF_UNMAPPED) { 424 bp->b_addr = NULL; 425 } else { 426 int retried = 0; 427 unsigned nofs_flag; 428 429 /* 430 * vm_map_ram() will allocate auxiliary structures (e.g. 431 * pagetables) with GFP_KERNEL, yet we often under a scoped nofs 432 * context here. Mixing GFP_KERNEL with GFP_NOFS allocations 433 * from the same call site that can be run from both above and 434 * below memory reclaim causes lockdep false positives. Hence we 435 * always need to force this allocation to nofs context because 436 * we can't pass __GFP_NOLOCKDEP down to auxillary structures to 437 * prevent false positive lockdep reports. 438 * 439 * XXX(dgc): I think dquot reclaim is the only place we can get 440 * to this function from memory reclaim context now. If we fix 441 * that like we've fixed inode reclaim to avoid writeback from 442 * reclaim, this nofs wrapping can go away. 443 */ 444 nofs_flag = memalloc_nofs_save(); 445 do { 446 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, 447 -1); 448 if (bp->b_addr) 449 break; 450 vm_unmap_aliases(); 451 } while (retried++ <= 1); 452 memalloc_nofs_restore(nofs_flag); 453 454 if (!bp->b_addr) 455 return -ENOMEM; 456 } 457 458 return 0; 459 } 460 461 /* 462 * Finding and Reading Buffers 463 */ 464 static int 465 _xfs_buf_obj_cmp( 466 struct rhashtable_compare_arg *arg, 467 const void *obj) 468 { 469 const struct xfs_buf_map *map = arg->key; 470 const struct xfs_buf *bp = obj; 471 472 /* 473 * The key hashing in the lookup path depends on the key being the 474 * first element of the compare_arg, make sure to assert this. 475 */ 476 BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); 477 478 if (bp->b_rhash_key != map->bm_bn) 479 return 1; 480 481 if (unlikely(bp->b_length != map->bm_len)) { 482 /* 483 * found a block number match. If the range doesn't 484 * match, the only way this is allowed is if the buffer 485 * in the cache is stale and the transaction that made 486 * it stale has not yet committed. i.e. we are 487 * reallocating a busy extent. Skip this buffer and 488 * continue searching for an exact match. 489 */ 490 if (!(map->bm_flags & XBM_LIVESCAN)) 491 ASSERT(bp->b_flags & XBF_STALE); 492 return 1; 493 } 494 return 0; 495 } 496 497 static const struct rhashtable_params xfs_buf_hash_params = { 498 .min_size = 32, /* empty AGs have minimal footprint */ 499 .nelem_hint = 16, 500 .key_len = sizeof(xfs_daddr_t), 501 .key_offset = offsetof(struct xfs_buf, b_rhash_key), 502 .head_offset = offsetof(struct xfs_buf, b_rhash_head), 503 .automatic_shrinking = true, 504 .obj_cmpfn = _xfs_buf_obj_cmp, 505 }; 506 507 int 508 xfs_buf_hash_init( 509 struct xfs_perag *pag) 510 { 511 spin_lock_init(&pag->pag_buf_lock); 512 return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params); 513 } 514 515 void 516 xfs_buf_hash_destroy( 517 struct xfs_perag *pag) 518 { 519 rhashtable_destroy(&pag->pag_buf_hash); 520 } 521 522 static int 523 xfs_buf_map_verify( 524 struct xfs_buftarg *btp, 525 struct xfs_buf_map *map) 526 { 527 xfs_daddr_t eofs; 528 529 /* Check for IOs smaller than the sector size / not sector aligned */ 530 ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize)); 531 ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); 532 533 /* 534 * Corrupted block numbers can get through to here, unfortunately, so we 535 * have to check that the buffer falls within the filesystem bounds. 536 */ 537 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); 538 if (map->bm_bn < 0 || map->bm_bn >= eofs) { 539 xfs_alert(btp->bt_mount, 540 "%s: daddr 0x%llx out of range, EOFS 0x%llx", 541 __func__, map->bm_bn, eofs); 542 WARN_ON(1); 543 return -EFSCORRUPTED; 544 } 545 return 0; 546 } 547 548 static int 549 xfs_buf_find_lock( 550 struct xfs_buf *bp, 551 xfs_buf_flags_t flags) 552 { 553 if (flags & XBF_TRYLOCK) { 554 if (!xfs_buf_trylock(bp)) { 555 XFS_STATS_INC(bp->b_mount, xb_busy_locked); 556 return -EAGAIN; 557 } 558 } else { 559 xfs_buf_lock(bp); 560 XFS_STATS_INC(bp->b_mount, xb_get_locked_waited); 561 } 562 563 /* 564 * if the buffer is stale, clear all the external state associated with 565 * it. We need to keep flags such as how we allocated the buffer memory 566 * intact here. 567 */ 568 if (bp->b_flags & XBF_STALE) { 569 if (flags & XBF_LIVESCAN) { 570 xfs_buf_unlock(bp); 571 return -ENOENT; 572 } 573 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); 574 bp->b_flags &= _XBF_KMEM | _XBF_PAGES; 575 bp->b_ops = NULL; 576 } 577 return 0; 578 } 579 580 static inline int 581 xfs_buf_lookup( 582 struct xfs_perag *pag, 583 struct xfs_buf_map *map, 584 xfs_buf_flags_t flags, 585 struct xfs_buf **bpp) 586 { 587 struct xfs_buf *bp; 588 int error; 589 590 rcu_read_lock(); 591 bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params); 592 if (!bp || !atomic_inc_not_zero(&bp->b_hold)) { 593 rcu_read_unlock(); 594 return -ENOENT; 595 } 596 rcu_read_unlock(); 597 598 error = xfs_buf_find_lock(bp, flags); 599 if (error) { 600 xfs_buf_rele(bp); 601 return error; 602 } 603 604 trace_xfs_buf_find(bp, flags, _RET_IP_); 605 *bpp = bp; 606 return 0; 607 } 608 609 /* 610 * Insert the new_bp into the hash table. This consumes the perag reference 611 * taken for the lookup regardless of the result of the insert. 612 */ 613 static int 614 xfs_buf_find_insert( 615 struct xfs_buftarg *btp, 616 struct xfs_perag *pag, 617 struct xfs_buf_map *cmap, 618 struct xfs_buf_map *map, 619 int nmaps, 620 xfs_buf_flags_t flags, 621 struct xfs_buf **bpp) 622 { 623 struct xfs_buf *new_bp; 624 struct xfs_buf *bp; 625 int error; 626 627 error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); 628 if (error) 629 goto out_drop_pag; 630 631 /* 632 * For buffers that fit entirely within a single page, first attempt to 633 * allocate the memory from the heap to minimise memory usage. If we 634 * can't get heap memory for these small buffers, we fall back to using 635 * the page allocator. 636 */ 637 if (BBTOB(new_bp->b_length) >= PAGE_SIZE || 638 xfs_buf_alloc_kmem(new_bp, flags) < 0) { 639 error = xfs_buf_alloc_pages(new_bp, flags); 640 if (error) 641 goto out_free_buf; 642 } 643 644 spin_lock(&pag->pag_buf_lock); 645 bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash, 646 &new_bp->b_rhash_head, xfs_buf_hash_params); 647 if (IS_ERR(bp)) { 648 error = PTR_ERR(bp); 649 spin_unlock(&pag->pag_buf_lock); 650 goto out_free_buf; 651 } 652 if (bp) { 653 /* found an existing buffer */ 654 atomic_inc(&bp->b_hold); 655 spin_unlock(&pag->pag_buf_lock); 656 error = xfs_buf_find_lock(bp, flags); 657 if (error) 658 xfs_buf_rele(bp); 659 else 660 *bpp = bp; 661 goto out_free_buf; 662 } 663 664 /* The new buffer keeps the perag reference until it is freed. */ 665 new_bp->b_pag = pag; 666 spin_unlock(&pag->pag_buf_lock); 667 *bpp = new_bp; 668 return 0; 669 670 out_free_buf: 671 xfs_buf_free(new_bp); 672 out_drop_pag: 673 xfs_perag_put(pag); 674 return error; 675 } 676 677 /* 678 * Assembles a buffer covering the specified range. The code is optimised for 679 * cache hits, as metadata intensive workloads will see 3 orders of magnitude 680 * more hits than misses. 681 */ 682 int 683 xfs_buf_get_map( 684 struct xfs_buftarg *btp, 685 struct xfs_buf_map *map, 686 int nmaps, 687 xfs_buf_flags_t flags, 688 struct xfs_buf **bpp) 689 { 690 struct xfs_perag *pag; 691 struct xfs_buf *bp = NULL; 692 struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; 693 int error; 694 int i; 695 696 if (flags & XBF_LIVESCAN) 697 cmap.bm_flags |= XBM_LIVESCAN; 698 for (i = 0; i < nmaps; i++) 699 cmap.bm_len += map[i].bm_len; 700 701 error = xfs_buf_map_verify(btp, &cmap); 702 if (error) 703 return error; 704 705 pag = xfs_perag_get(btp->bt_mount, 706 xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); 707 708 error = xfs_buf_lookup(pag, &cmap, flags, &bp); 709 if (error && error != -ENOENT) 710 goto out_put_perag; 711 712 /* cache hits always outnumber misses by at least 10:1 */ 713 if (unlikely(!bp)) { 714 XFS_STATS_INC(btp->bt_mount, xb_miss_locked); 715 716 if (flags & XBF_INCORE) 717 goto out_put_perag; 718 719 /* xfs_buf_find_insert() consumes the perag reference. */ 720 error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps, 721 flags, &bp); 722 if (error) 723 return error; 724 } else { 725 XFS_STATS_INC(btp->bt_mount, xb_get_locked); 726 xfs_perag_put(pag); 727 } 728 729 /* We do not hold a perag reference anymore. */ 730 if (!bp->b_addr) { 731 error = _xfs_buf_map_pages(bp, flags); 732 if (unlikely(error)) { 733 xfs_warn_ratelimited(btp->bt_mount, 734 "%s: failed to map %u pages", __func__, 735 bp->b_page_count); 736 xfs_buf_relse(bp); 737 return error; 738 } 739 } 740 741 /* 742 * Clear b_error if this is a lookup from a caller that doesn't expect 743 * valid data to be found in the buffer. 744 */ 745 if (!(flags & XBF_READ)) 746 xfs_buf_ioerror(bp, 0); 747 748 XFS_STATS_INC(btp->bt_mount, xb_get); 749 trace_xfs_buf_get(bp, flags, _RET_IP_); 750 *bpp = bp; 751 return 0; 752 753 out_put_perag: 754 xfs_perag_put(pag); 755 return error; 756 } 757 758 int 759 _xfs_buf_read( 760 struct xfs_buf *bp, 761 xfs_buf_flags_t flags) 762 { 763 ASSERT(!(flags & XBF_WRITE)); 764 ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); 765 766 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); 767 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); 768 769 return xfs_buf_submit(bp); 770 } 771 772 /* 773 * Reverify a buffer found in cache without an attached ->b_ops. 774 * 775 * If the caller passed an ops structure and the buffer doesn't have ops 776 * assigned, set the ops and use it to verify the contents. If verification 777 * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is 778 * already in XBF_DONE state on entry. 779 * 780 * Under normal operations, every in-core buffer is verified on read I/O 781 * completion. There are two scenarios that can lead to in-core buffers without 782 * an assigned ->b_ops. The first is during log recovery of buffers on a V4 783 * filesystem, though these buffers are purged at the end of recovery. The 784 * other is online repair, which intentionally reads with a NULL buffer ops to 785 * run several verifiers across an in-core buffer in order to establish buffer 786 * type. If repair can't establish that, the buffer will be left in memory 787 * with NULL buffer ops. 788 */ 789 int 790 xfs_buf_reverify( 791 struct xfs_buf *bp, 792 const struct xfs_buf_ops *ops) 793 { 794 ASSERT(bp->b_flags & XBF_DONE); 795 ASSERT(bp->b_error == 0); 796 797 if (!ops || bp->b_ops) 798 return 0; 799 800 bp->b_ops = ops; 801 bp->b_ops->verify_read(bp); 802 if (bp->b_error) 803 bp->b_flags &= ~XBF_DONE; 804 return bp->b_error; 805 } 806 807 int 808 xfs_buf_read_map( 809 struct xfs_buftarg *target, 810 struct xfs_buf_map *map, 811 int nmaps, 812 xfs_buf_flags_t flags, 813 struct xfs_buf **bpp, 814 const struct xfs_buf_ops *ops, 815 xfs_failaddr_t fa) 816 { 817 struct xfs_buf *bp; 818 int error; 819 820 flags |= XBF_READ; 821 *bpp = NULL; 822 823 error = xfs_buf_get_map(target, map, nmaps, flags, &bp); 824 if (error) 825 return error; 826 827 trace_xfs_buf_read(bp, flags, _RET_IP_); 828 829 if (!(bp->b_flags & XBF_DONE)) { 830 /* Initiate the buffer read and wait. */ 831 XFS_STATS_INC(target->bt_mount, xb_get_read); 832 bp->b_ops = ops; 833 error = _xfs_buf_read(bp, flags); 834 835 /* Readahead iodone already dropped the buffer, so exit. */ 836 if (flags & XBF_ASYNC) 837 return 0; 838 } else { 839 /* Buffer already read; all we need to do is check it. */ 840 error = xfs_buf_reverify(bp, ops); 841 842 /* Readahead already finished; drop the buffer and exit. */ 843 if (flags & XBF_ASYNC) { 844 xfs_buf_relse(bp); 845 return 0; 846 } 847 848 /* We do not want read in the flags */ 849 bp->b_flags &= ~XBF_READ; 850 ASSERT(bp->b_ops != NULL || ops == NULL); 851 } 852 853 /* 854 * If we've had a read error, then the contents of the buffer are 855 * invalid and should not be used. To ensure that a followup read tries 856 * to pull the buffer from disk again, we clear the XBF_DONE flag and 857 * mark the buffer stale. This ensures that anyone who has a current 858 * reference to the buffer will interpret it's contents correctly and 859 * future cache lookups will also treat it as an empty, uninitialised 860 * buffer. 861 */ 862 if (error) { 863 /* 864 * Check against log shutdown for error reporting because 865 * metadata writeback may require a read first and we need to 866 * report errors in metadata writeback until the log is shut 867 * down. High level transaction read functions already check 868 * against mount shutdown, anyway, so we only need to be 869 * concerned about low level IO interactions here. 870 */ 871 if (!xlog_is_shutdown(target->bt_mount->m_log)) 872 xfs_buf_ioerror_alert(bp, fa); 873 874 bp->b_flags &= ~XBF_DONE; 875 xfs_buf_stale(bp); 876 xfs_buf_relse(bp); 877 878 /* bad CRC means corrupted metadata */ 879 if (error == -EFSBADCRC) 880 error = -EFSCORRUPTED; 881 return error; 882 } 883 884 *bpp = bp; 885 return 0; 886 } 887 888 /* 889 * If we are not low on memory then do the readahead in a deadlock 890 * safe manner. 891 */ 892 void 893 xfs_buf_readahead_map( 894 struct xfs_buftarg *target, 895 struct xfs_buf_map *map, 896 int nmaps, 897 const struct xfs_buf_ops *ops) 898 { 899 struct xfs_buf *bp; 900 901 xfs_buf_read_map(target, map, nmaps, 902 XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, 903 __this_address); 904 } 905 906 /* 907 * Read an uncached buffer from disk. Allocates and returns a locked 908 * buffer containing the disk contents or nothing. Uncached buffers always have 909 * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer 910 * is cached or uncached during fault diagnosis. 911 */ 912 int 913 xfs_buf_read_uncached( 914 struct xfs_buftarg *target, 915 xfs_daddr_t daddr, 916 size_t numblks, 917 xfs_buf_flags_t flags, 918 struct xfs_buf **bpp, 919 const struct xfs_buf_ops *ops) 920 { 921 struct xfs_buf *bp; 922 int error; 923 924 *bpp = NULL; 925 926 error = xfs_buf_get_uncached(target, numblks, flags, &bp); 927 if (error) 928 return error; 929 930 /* set up the buffer for a read IO */ 931 ASSERT(bp->b_map_count == 1); 932 bp->b_rhash_key = XFS_BUF_DADDR_NULL; 933 bp->b_maps[0].bm_bn = daddr; 934 bp->b_flags |= XBF_READ; 935 bp->b_ops = ops; 936 937 xfs_buf_submit(bp); 938 if (bp->b_error) { 939 error = bp->b_error; 940 xfs_buf_relse(bp); 941 return error; 942 } 943 944 *bpp = bp; 945 return 0; 946 } 947 948 int 949 xfs_buf_get_uncached( 950 struct xfs_buftarg *target, 951 size_t numblks, 952 xfs_buf_flags_t flags, 953 struct xfs_buf **bpp) 954 { 955 int error; 956 struct xfs_buf *bp; 957 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); 958 959 *bpp = NULL; 960 961 /* flags might contain irrelevant bits, pass only what we care about */ 962 error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp); 963 if (error) 964 return error; 965 966 error = xfs_buf_alloc_pages(bp, flags); 967 if (error) 968 goto fail_free_buf; 969 970 error = _xfs_buf_map_pages(bp, 0); 971 if (unlikely(error)) { 972 xfs_warn(target->bt_mount, 973 "%s: failed to map pages", __func__); 974 goto fail_free_buf; 975 } 976 977 trace_xfs_buf_get_uncached(bp, _RET_IP_); 978 *bpp = bp; 979 return 0; 980 981 fail_free_buf: 982 xfs_buf_free(bp); 983 return error; 984 } 985 986 /* 987 * Increment reference count on buffer, to hold the buffer concurrently 988 * with another thread which may release (free) the buffer asynchronously. 989 * Must hold the buffer already to call this function. 990 */ 991 void 992 xfs_buf_hold( 993 struct xfs_buf *bp) 994 { 995 trace_xfs_buf_hold(bp, _RET_IP_); 996 atomic_inc(&bp->b_hold); 997 } 998 999 /* 1000 * Release a hold on the specified buffer. If the hold count is 1, the buffer is 1001 * placed on LRU or freed (depending on b_lru_ref). 1002 */ 1003 void 1004 xfs_buf_rele( 1005 struct xfs_buf *bp) 1006 { 1007 struct xfs_perag *pag = bp->b_pag; 1008 bool release; 1009 bool freebuf = false; 1010 1011 trace_xfs_buf_rele(bp, _RET_IP_); 1012 1013 if (!pag) { 1014 ASSERT(list_empty(&bp->b_lru)); 1015 if (atomic_dec_and_test(&bp->b_hold)) { 1016 xfs_buf_ioacct_dec(bp); 1017 xfs_buf_free(bp); 1018 } 1019 return; 1020 } 1021 1022 ASSERT(atomic_read(&bp->b_hold) > 0); 1023 1024 /* 1025 * We grab the b_lock here first to serialise racing xfs_buf_rele() 1026 * calls. The pag_buf_lock being taken on the last reference only 1027 * serialises against racing lookups in xfs_buf_find(). IOWs, the second 1028 * to last reference we drop here is not serialised against the last 1029 * reference until we take bp->b_lock. Hence if we don't grab b_lock 1030 * first, the last "release" reference can win the race to the lock and 1031 * free the buffer before the second-to-last reference is processed, 1032 * leading to a use-after-free scenario. 1033 */ 1034 spin_lock(&bp->b_lock); 1035 release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); 1036 if (!release) { 1037 /* 1038 * Drop the in-flight state if the buffer is already on the LRU 1039 * and it holds the only reference. This is racy because we 1040 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT 1041 * ensures the decrement occurs only once per-buf. 1042 */ 1043 if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru)) 1044 __xfs_buf_ioacct_dec(bp); 1045 goto out_unlock; 1046 } 1047 1048 /* the last reference has been dropped ... */ 1049 __xfs_buf_ioacct_dec(bp); 1050 if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { 1051 /* 1052 * If the buffer is added to the LRU take a new reference to the 1053 * buffer for the LRU and clear the (now stale) dispose list 1054 * state flag 1055 */ 1056 if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) { 1057 bp->b_state &= ~XFS_BSTATE_DISPOSE; 1058 atomic_inc(&bp->b_hold); 1059 } 1060 spin_unlock(&pag->pag_buf_lock); 1061 } else { 1062 /* 1063 * most of the time buffers will already be removed from the 1064 * LRU, so optimise that case by checking for the 1065 * XFS_BSTATE_DISPOSE flag indicating the last list the buffer 1066 * was on was the disposal list 1067 */ 1068 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) { 1069 list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru); 1070 } else { 1071 ASSERT(list_empty(&bp->b_lru)); 1072 } 1073 1074 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1075 rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head, 1076 xfs_buf_hash_params); 1077 spin_unlock(&pag->pag_buf_lock); 1078 xfs_perag_put(pag); 1079 freebuf = true; 1080 } 1081 1082 out_unlock: 1083 spin_unlock(&bp->b_lock); 1084 1085 if (freebuf) 1086 xfs_buf_free(bp); 1087 } 1088 1089 1090 /* 1091 * Lock a buffer object, if it is not already locked. 1092 * 1093 * If we come across a stale, pinned, locked buffer, we know that we are 1094 * being asked to lock a buffer that has been reallocated. Because it is 1095 * pinned, we know that the log has not been pushed to disk and hence it 1096 * will still be locked. Rather than continuing to have trylock attempts 1097 * fail until someone else pushes the log, push it ourselves before 1098 * returning. This means that the xfsaild will not get stuck trying 1099 * to push on stale inode buffers. 1100 */ 1101 int 1102 xfs_buf_trylock( 1103 struct xfs_buf *bp) 1104 { 1105 int locked; 1106 1107 locked = down_trylock(&bp->b_sema) == 0; 1108 if (locked) 1109 trace_xfs_buf_trylock(bp, _RET_IP_); 1110 else 1111 trace_xfs_buf_trylock_fail(bp, _RET_IP_); 1112 return locked; 1113 } 1114 1115 /* 1116 * Lock a buffer object. 1117 * 1118 * If we come across a stale, pinned, locked buffer, we know that we 1119 * are being asked to lock a buffer that has been reallocated. Because 1120 * it is pinned, we know that the log has not been pushed to disk and 1121 * hence it will still be locked. Rather than sleeping until someone 1122 * else pushes the log, push it ourselves before trying to get the lock. 1123 */ 1124 void 1125 xfs_buf_lock( 1126 struct xfs_buf *bp) 1127 { 1128 trace_xfs_buf_lock(bp, _RET_IP_); 1129 1130 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) 1131 xfs_log_force(bp->b_mount, 0); 1132 down(&bp->b_sema); 1133 1134 trace_xfs_buf_lock_done(bp, _RET_IP_); 1135 } 1136 1137 void 1138 xfs_buf_unlock( 1139 struct xfs_buf *bp) 1140 { 1141 ASSERT(xfs_buf_islocked(bp)); 1142 1143 up(&bp->b_sema); 1144 trace_xfs_buf_unlock(bp, _RET_IP_); 1145 } 1146 1147 STATIC void 1148 xfs_buf_wait_unpin( 1149 struct xfs_buf *bp) 1150 { 1151 DECLARE_WAITQUEUE (wait, current); 1152 1153 if (atomic_read(&bp->b_pin_count) == 0) 1154 return; 1155 1156 add_wait_queue(&bp->b_waiters, &wait); 1157 for (;;) { 1158 set_current_state(TASK_UNINTERRUPTIBLE); 1159 if (atomic_read(&bp->b_pin_count) == 0) 1160 break; 1161 io_schedule(); 1162 } 1163 remove_wait_queue(&bp->b_waiters, &wait); 1164 set_current_state(TASK_RUNNING); 1165 } 1166 1167 static void 1168 xfs_buf_ioerror_alert_ratelimited( 1169 struct xfs_buf *bp) 1170 { 1171 static unsigned long lasttime; 1172 static struct xfs_buftarg *lasttarg; 1173 1174 if (bp->b_target != lasttarg || 1175 time_after(jiffies, (lasttime + 5*HZ))) { 1176 lasttime = jiffies; 1177 xfs_buf_ioerror_alert(bp, __this_address); 1178 } 1179 lasttarg = bp->b_target; 1180 } 1181 1182 /* 1183 * Account for this latest trip around the retry handler, and decide if 1184 * we've failed enough times to constitute a permanent failure. 1185 */ 1186 static bool 1187 xfs_buf_ioerror_permanent( 1188 struct xfs_buf *bp, 1189 struct xfs_error_cfg *cfg) 1190 { 1191 struct xfs_mount *mp = bp->b_mount; 1192 1193 if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && 1194 ++bp->b_retries > cfg->max_retries) 1195 return true; 1196 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && 1197 time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) 1198 return true; 1199 1200 /* At unmount we may treat errors differently */ 1201 if (xfs_is_unmounting(mp) && mp->m_fail_unmount) 1202 return true; 1203 1204 return false; 1205 } 1206 1207 /* 1208 * On a sync write or shutdown we just want to stale the buffer and let the 1209 * caller handle the error in bp->b_error appropriately. 1210 * 1211 * If the write was asynchronous then no one will be looking for the error. If 1212 * this is the first failure of this type, clear the error state and write the 1213 * buffer out again. This means we always retry an async write failure at least 1214 * once, but we also need to set the buffer up to behave correctly now for 1215 * repeated failures. 1216 * 1217 * If we get repeated async write failures, then we take action according to the 1218 * error configuration we have been set up to use. 1219 * 1220 * Returns true if this function took care of error handling and the caller must 1221 * not touch the buffer again. Return false if the caller should proceed with 1222 * normal I/O completion handling. 1223 */ 1224 static bool 1225 xfs_buf_ioend_handle_error( 1226 struct xfs_buf *bp) 1227 { 1228 struct xfs_mount *mp = bp->b_mount; 1229 struct xfs_error_cfg *cfg; 1230 1231 /* 1232 * If we've already shutdown the journal because of I/O errors, there's 1233 * no point in giving this a retry. 1234 */ 1235 if (xlog_is_shutdown(mp->m_log)) 1236 goto out_stale; 1237 1238 xfs_buf_ioerror_alert_ratelimited(bp); 1239 1240 /* 1241 * We're not going to bother about retrying this during recovery. 1242 * One strike! 1243 */ 1244 if (bp->b_flags & _XBF_LOGRECOVERY) { 1245 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1246 return false; 1247 } 1248 1249 /* 1250 * Synchronous writes will have callers process the error. 1251 */ 1252 if (!(bp->b_flags & XBF_ASYNC)) 1253 goto out_stale; 1254 1255 trace_xfs_buf_iodone_async(bp, _RET_IP_); 1256 1257 cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error); 1258 if (bp->b_last_error != bp->b_error || 1259 !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) { 1260 bp->b_last_error = bp->b_error; 1261 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && 1262 !bp->b_first_retry_time) 1263 bp->b_first_retry_time = jiffies; 1264 goto resubmit; 1265 } 1266 1267 /* 1268 * Permanent error - we need to trigger a shutdown if we haven't already 1269 * to indicate that inconsistency will result from this action. 1270 */ 1271 if (xfs_buf_ioerror_permanent(bp, cfg)) { 1272 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 1273 goto out_stale; 1274 } 1275 1276 /* Still considered a transient error. Caller will schedule retries. */ 1277 if (bp->b_flags & _XBF_INODES) 1278 xfs_buf_inode_io_fail(bp); 1279 else if (bp->b_flags & _XBF_DQUOTS) 1280 xfs_buf_dquot_io_fail(bp); 1281 else 1282 ASSERT(list_empty(&bp->b_li_list)); 1283 xfs_buf_ioerror(bp, 0); 1284 xfs_buf_relse(bp); 1285 return true; 1286 1287 resubmit: 1288 xfs_buf_ioerror(bp, 0); 1289 bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL); 1290 xfs_buf_submit(bp); 1291 return true; 1292 out_stale: 1293 xfs_buf_stale(bp); 1294 bp->b_flags |= XBF_DONE; 1295 bp->b_flags &= ~XBF_WRITE; 1296 trace_xfs_buf_error_relse(bp, _RET_IP_); 1297 return false; 1298 } 1299 1300 static void 1301 xfs_buf_ioend( 1302 struct xfs_buf *bp) 1303 { 1304 trace_xfs_buf_iodone(bp, _RET_IP_); 1305 1306 /* 1307 * Pull in IO completion errors now. We are guaranteed to be running 1308 * single threaded, so we don't need the lock to read b_io_error. 1309 */ 1310 if (!bp->b_error && bp->b_io_error) 1311 xfs_buf_ioerror(bp, bp->b_io_error); 1312 1313 if (bp->b_flags & XBF_READ) { 1314 if (!bp->b_error && bp->b_ops) 1315 bp->b_ops->verify_read(bp); 1316 if (!bp->b_error) 1317 bp->b_flags |= XBF_DONE; 1318 } else { 1319 if (!bp->b_error) { 1320 bp->b_flags &= ~XBF_WRITE_FAIL; 1321 bp->b_flags |= XBF_DONE; 1322 } 1323 1324 if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp)) 1325 return; 1326 1327 /* clear the retry state */ 1328 bp->b_last_error = 0; 1329 bp->b_retries = 0; 1330 bp->b_first_retry_time = 0; 1331 1332 /* 1333 * Note that for things like remote attribute buffers, there may 1334 * not be a buffer log item here, so processing the buffer log 1335 * item must remain optional. 1336 */ 1337 if (bp->b_log_item) 1338 xfs_buf_item_done(bp); 1339 1340 if (bp->b_flags & _XBF_INODES) 1341 xfs_buf_inode_iodone(bp); 1342 else if (bp->b_flags & _XBF_DQUOTS) 1343 xfs_buf_dquot_iodone(bp); 1344 1345 } 1346 1347 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | 1348 _XBF_LOGRECOVERY); 1349 1350 if (bp->b_flags & XBF_ASYNC) 1351 xfs_buf_relse(bp); 1352 else 1353 complete(&bp->b_iowait); 1354 } 1355 1356 static void 1357 xfs_buf_ioend_work( 1358 struct work_struct *work) 1359 { 1360 struct xfs_buf *bp = 1361 container_of(work, struct xfs_buf, b_ioend_work); 1362 1363 xfs_buf_ioend(bp); 1364 } 1365 1366 static void 1367 xfs_buf_ioend_async( 1368 struct xfs_buf *bp) 1369 { 1370 INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); 1371 queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work); 1372 } 1373 1374 void 1375 __xfs_buf_ioerror( 1376 struct xfs_buf *bp, 1377 int error, 1378 xfs_failaddr_t failaddr) 1379 { 1380 ASSERT(error <= 0 && error >= -1000); 1381 bp->b_error = error; 1382 trace_xfs_buf_ioerror(bp, error, failaddr); 1383 } 1384 1385 void 1386 xfs_buf_ioerror_alert( 1387 struct xfs_buf *bp, 1388 xfs_failaddr_t func) 1389 { 1390 xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error", 1391 "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", 1392 func, (uint64_t)xfs_buf_daddr(bp), 1393 bp->b_length, -bp->b_error); 1394 } 1395 1396 /* 1397 * To simulate an I/O failure, the buffer must be locked and held with at least 1398 * three references. The LRU reference is dropped by the stale call. The buf 1399 * item reference is dropped via ioend processing. The third reference is owned 1400 * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC. 1401 */ 1402 void 1403 xfs_buf_ioend_fail( 1404 struct xfs_buf *bp) 1405 { 1406 bp->b_flags &= ~XBF_DONE; 1407 xfs_buf_stale(bp); 1408 xfs_buf_ioerror(bp, -EIO); 1409 xfs_buf_ioend(bp); 1410 } 1411 1412 int 1413 xfs_bwrite( 1414 struct xfs_buf *bp) 1415 { 1416 int error; 1417 1418 ASSERT(xfs_buf_islocked(bp)); 1419 1420 bp->b_flags |= XBF_WRITE; 1421 bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | 1422 XBF_DONE); 1423 1424 error = xfs_buf_submit(bp); 1425 if (error) 1426 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); 1427 return error; 1428 } 1429 1430 static void 1431 xfs_buf_bio_end_io( 1432 struct bio *bio) 1433 { 1434 struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private; 1435 1436 if (!bio->bi_status && 1437 (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) && 1438 XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) 1439 bio->bi_status = BLK_STS_IOERR; 1440 1441 /* 1442 * don't overwrite existing errors - otherwise we can lose errors on 1443 * buffers that require multiple bios to complete. 1444 */ 1445 if (bio->bi_status) { 1446 int error = blk_status_to_errno(bio->bi_status); 1447 1448 cmpxchg(&bp->b_io_error, 0, error); 1449 } 1450 1451 if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) 1452 invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); 1453 1454 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) 1455 xfs_buf_ioend_async(bp); 1456 bio_put(bio); 1457 } 1458 1459 static void 1460 xfs_buf_ioapply_map( 1461 struct xfs_buf *bp, 1462 int map, 1463 int *buf_offset, 1464 int *count, 1465 blk_opf_t op) 1466 { 1467 int page_index; 1468 unsigned int total_nr_pages = bp->b_page_count; 1469 int nr_pages; 1470 struct bio *bio; 1471 sector_t sector = bp->b_maps[map].bm_bn; 1472 int size; 1473 int offset; 1474 1475 /* skip the pages in the buffer before the start offset */ 1476 page_index = 0; 1477 offset = *buf_offset; 1478 while (offset >= PAGE_SIZE) { 1479 page_index++; 1480 offset -= PAGE_SIZE; 1481 } 1482 1483 /* 1484 * Limit the IO size to the length of the current vector, and update the 1485 * remaining IO count for the next time around. 1486 */ 1487 size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count); 1488 *count -= size; 1489 *buf_offset += size; 1490 1491 next_chunk: 1492 atomic_inc(&bp->b_io_remaining); 1493 nr_pages = bio_max_segs(total_nr_pages); 1494 1495 bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO); 1496 bio->bi_iter.bi_sector = sector; 1497 bio->bi_end_io = xfs_buf_bio_end_io; 1498 bio->bi_private = bp; 1499 1500 for (; size && nr_pages; nr_pages--, page_index++) { 1501 int rbytes, nbytes = PAGE_SIZE - offset; 1502 1503 if (nbytes > size) 1504 nbytes = size; 1505 1506 rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes, 1507 offset); 1508 if (rbytes < nbytes) 1509 break; 1510 1511 offset = 0; 1512 sector += BTOBB(nbytes); 1513 size -= nbytes; 1514 total_nr_pages--; 1515 } 1516 1517 if (likely(bio->bi_iter.bi_size)) { 1518 if (xfs_buf_is_vmapped(bp)) { 1519 flush_kernel_vmap_range(bp->b_addr, 1520 xfs_buf_vmap_len(bp)); 1521 } 1522 submit_bio(bio); 1523 if (size) 1524 goto next_chunk; 1525 } else { 1526 /* 1527 * This is guaranteed not to be the last io reference count 1528 * because the caller (xfs_buf_submit) holds a count itself. 1529 */ 1530 atomic_dec(&bp->b_io_remaining); 1531 xfs_buf_ioerror(bp, -EIO); 1532 bio_put(bio); 1533 } 1534 1535 } 1536 1537 STATIC void 1538 _xfs_buf_ioapply( 1539 struct xfs_buf *bp) 1540 { 1541 struct blk_plug plug; 1542 blk_opf_t op; 1543 int offset; 1544 int size; 1545 int i; 1546 1547 /* 1548 * Make sure we capture only current IO errors rather than stale errors 1549 * left over from previous use of the buffer (e.g. failed readahead). 1550 */ 1551 bp->b_error = 0; 1552 1553 if (bp->b_flags & XBF_WRITE) { 1554 op = REQ_OP_WRITE; 1555 1556 /* 1557 * Run the write verifier callback function if it exists. If 1558 * this function fails it will mark the buffer with an error and 1559 * the IO should not be dispatched. 1560 */ 1561 if (bp->b_ops) { 1562 bp->b_ops->verify_write(bp); 1563 if (bp->b_error) { 1564 xfs_force_shutdown(bp->b_mount, 1565 SHUTDOWN_CORRUPT_INCORE); 1566 return; 1567 } 1568 } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { 1569 struct xfs_mount *mp = bp->b_mount; 1570 1571 /* 1572 * non-crc filesystems don't attach verifiers during 1573 * log recovery, so don't warn for such filesystems. 1574 */ 1575 if (xfs_has_crc(mp)) { 1576 xfs_warn(mp, 1577 "%s: no buf ops on daddr 0x%llx len %d", 1578 __func__, xfs_buf_daddr(bp), 1579 bp->b_length); 1580 xfs_hex_dump(bp->b_addr, 1581 XFS_CORRUPTION_DUMP_LEN); 1582 dump_stack(); 1583 } 1584 } 1585 } else { 1586 op = REQ_OP_READ; 1587 if (bp->b_flags & XBF_READ_AHEAD) 1588 op |= REQ_RAHEAD; 1589 } 1590 1591 /* we only use the buffer cache for meta-data */ 1592 op |= REQ_META; 1593 1594 /* 1595 * Walk all the vectors issuing IO on them. Set up the initial offset 1596 * into the buffer and the desired IO size before we start - 1597 * _xfs_buf_ioapply_vec() will modify them appropriately for each 1598 * subsequent call. 1599 */ 1600 offset = bp->b_offset; 1601 size = BBTOB(bp->b_length); 1602 blk_start_plug(&plug); 1603 for (i = 0; i < bp->b_map_count; i++) { 1604 xfs_buf_ioapply_map(bp, i, &offset, &size, op); 1605 if (bp->b_error) 1606 break; 1607 if (size <= 0) 1608 break; /* all done */ 1609 } 1610 blk_finish_plug(&plug); 1611 } 1612 1613 /* 1614 * Wait for I/O completion of a sync buffer and return the I/O error code. 1615 */ 1616 static int 1617 xfs_buf_iowait( 1618 struct xfs_buf *bp) 1619 { 1620 ASSERT(!(bp->b_flags & XBF_ASYNC)); 1621 1622 trace_xfs_buf_iowait(bp, _RET_IP_); 1623 wait_for_completion(&bp->b_iowait); 1624 trace_xfs_buf_iowait_done(bp, _RET_IP_); 1625 1626 return bp->b_error; 1627 } 1628 1629 /* 1630 * Buffer I/O submission path, read or write. Asynchronous submission transfers 1631 * the buffer lock ownership and the current reference to the IO. It is not 1632 * safe to reference the buffer after a call to this function unless the caller 1633 * holds an additional reference itself. 1634 */ 1635 static int 1636 __xfs_buf_submit( 1637 struct xfs_buf *bp, 1638 bool wait) 1639 { 1640 int error = 0; 1641 1642 trace_xfs_buf_submit(bp, _RET_IP_); 1643 1644 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 1645 1646 /* 1647 * On log shutdown we stale and complete the buffer immediately. We can 1648 * be called to read the superblock before the log has been set up, so 1649 * be careful checking the log state. 1650 * 1651 * Checking the mount shutdown state here can result in the log tail 1652 * moving inappropriately on disk as the log may not yet be shut down. 1653 * i.e. failing this buffer on mount shutdown can remove it from the AIL 1654 * and move the tail of the log forwards without having written this 1655 * buffer to disk. This corrupts the log tail state in memory, and 1656 * because the log may not be shut down yet, it can then be propagated 1657 * to disk before the log is shutdown. Hence we check log shutdown 1658 * state here rather than mount state to avoid corrupting the log tail 1659 * on shutdown. 1660 */ 1661 if (bp->b_mount->m_log && 1662 xlog_is_shutdown(bp->b_mount->m_log)) { 1663 xfs_buf_ioend_fail(bp); 1664 return -EIO; 1665 } 1666 1667 /* 1668 * Grab a reference so the buffer does not go away underneath us. For 1669 * async buffers, I/O completion drops the callers reference, which 1670 * could occur before submission returns. 1671 */ 1672 xfs_buf_hold(bp); 1673 1674 if (bp->b_flags & XBF_WRITE) 1675 xfs_buf_wait_unpin(bp); 1676 1677 /* clear the internal error state to avoid spurious errors */ 1678 bp->b_io_error = 0; 1679 1680 /* 1681 * Set the count to 1 initially, this will stop an I/O completion 1682 * callout which happens before we have started all the I/O from calling 1683 * xfs_buf_ioend too early. 1684 */ 1685 atomic_set(&bp->b_io_remaining, 1); 1686 if (bp->b_flags & XBF_ASYNC) 1687 xfs_buf_ioacct_inc(bp); 1688 _xfs_buf_ioapply(bp); 1689 1690 /* 1691 * If _xfs_buf_ioapply failed, we can get back here with only the IO 1692 * reference we took above. If we drop it to zero, run completion so 1693 * that we don't return to the caller with completion still pending. 1694 */ 1695 if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { 1696 if (bp->b_error || !(bp->b_flags & XBF_ASYNC)) 1697 xfs_buf_ioend(bp); 1698 else 1699 xfs_buf_ioend_async(bp); 1700 } 1701 1702 if (wait) 1703 error = xfs_buf_iowait(bp); 1704 1705 /* 1706 * Release the hold that keeps the buffer referenced for the entire 1707 * I/O. Note that if the buffer is async, it is not safe to reference 1708 * after this release. 1709 */ 1710 xfs_buf_rele(bp); 1711 return error; 1712 } 1713 1714 void * 1715 xfs_buf_offset( 1716 struct xfs_buf *bp, 1717 size_t offset) 1718 { 1719 struct page *page; 1720 1721 if (bp->b_addr) 1722 return bp->b_addr + offset; 1723 1724 page = bp->b_pages[offset >> PAGE_SHIFT]; 1725 return page_address(page) + (offset & (PAGE_SIZE-1)); 1726 } 1727 1728 void 1729 xfs_buf_zero( 1730 struct xfs_buf *bp, 1731 size_t boff, 1732 size_t bsize) 1733 { 1734 size_t bend; 1735 1736 bend = boff + bsize; 1737 while (boff < bend) { 1738 struct page *page; 1739 int page_index, page_offset, csize; 1740 1741 page_index = (boff + bp->b_offset) >> PAGE_SHIFT; 1742 page_offset = (boff + bp->b_offset) & ~PAGE_MASK; 1743 page = bp->b_pages[page_index]; 1744 csize = min_t(size_t, PAGE_SIZE - page_offset, 1745 BBTOB(bp->b_length) - boff); 1746 1747 ASSERT((csize + page_offset) <= PAGE_SIZE); 1748 1749 memset(page_address(page) + page_offset, 0, csize); 1750 1751 boff += csize; 1752 } 1753 } 1754 1755 /* 1756 * Log a message about and stale a buffer that a caller has decided is corrupt. 1757 * 1758 * This function should be called for the kinds of metadata corruption that 1759 * cannot be detect from a verifier, such as incorrect inter-block relationship 1760 * data. Do /not/ call this function from a verifier function. 1761 * 1762 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will 1763 * be marked stale, but b_error will not be set. The caller is responsible for 1764 * releasing the buffer or fixing it. 1765 */ 1766 void 1767 __xfs_buf_mark_corrupt( 1768 struct xfs_buf *bp, 1769 xfs_failaddr_t fa) 1770 { 1771 ASSERT(bp->b_flags & XBF_DONE); 1772 1773 xfs_buf_corruption_error(bp, fa); 1774 xfs_buf_stale(bp); 1775 } 1776 1777 /* 1778 * Handling of buffer targets (buftargs). 1779 */ 1780 1781 /* 1782 * Wait for any bufs with callbacks that have been submitted but have not yet 1783 * returned. These buffers will have an elevated hold count, so wait on those 1784 * while freeing all the buffers only held by the LRU. 1785 */ 1786 static enum lru_status 1787 xfs_buftarg_drain_rele( 1788 struct list_head *item, 1789 struct list_lru_one *lru, 1790 spinlock_t *lru_lock, 1791 void *arg) 1792 1793 { 1794 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1795 struct list_head *dispose = arg; 1796 1797 if (atomic_read(&bp->b_hold) > 1) { 1798 /* need to wait, so skip it this pass */ 1799 trace_xfs_buf_drain_buftarg(bp, _RET_IP_); 1800 return LRU_SKIP; 1801 } 1802 if (!spin_trylock(&bp->b_lock)) 1803 return LRU_SKIP; 1804 1805 /* 1806 * clear the LRU reference count so the buffer doesn't get 1807 * ignored in xfs_buf_rele(). 1808 */ 1809 atomic_set(&bp->b_lru_ref, 0); 1810 bp->b_state |= XFS_BSTATE_DISPOSE; 1811 list_lru_isolate_move(lru, item, dispose); 1812 spin_unlock(&bp->b_lock); 1813 return LRU_REMOVED; 1814 } 1815 1816 /* 1817 * Wait for outstanding I/O on the buftarg to complete. 1818 */ 1819 void 1820 xfs_buftarg_wait( 1821 struct xfs_buftarg *btp) 1822 { 1823 /* 1824 * First wait on the buftarg I/O count for all in-flight buffers to be 1825 * released. This is critical as new buffers do not make the LRU until 1826 * they are released. 1827 * 1828 * Next, flush the buffer workqueue to ensure all completion processing 1829 * has finished. Just waiting on buffer locks is not sufficient for 1830 * async IO as the reference count held over IO is not released until 1831 * after the buffer lock is dropped. Hence we need to ensure here that 1832 * all reference counts have been dropped before we start walking the 1833 * LRU list. 1834 */ 1835 while (percpu_counter_sum(&btp->bt_io_count)) 1836 delay(100); 1837 flush_workqueue(btp->bt_mount->m_buf_workqueue); 1838 } 1839 1840 void 1841 xfs_buftarg_drain( 1842 struct xfs_buftarg *btp) 1843 { 1844 LIST_HEAD(dispose); 1845 int loop = 0; 1846 bool write_fail = false; 1847 1848 xfs_buftarg_wait(btp); 1849 1850 /* loop until there is nothing left on the lru list. */ 1851 while (list_lru_count(&btp->bt_lru)) { 1852 list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele, 1853 &dispose, LONG_MAX); 1854 1855 while (!list_empty(&dispose)) { 1856 struct xfs_buf *bp; 1857 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1858 list_del_init(&bp->b_lru); 1859 if (bp->b_flags & XBF_WRITE_FAIL) { 1860 write_fail = true; 1861 xfs_buf_alert_ratelimited(bp, 1862 "XFS: Corruption Alert", 1863 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", 1864 (long long)xfs_buf_daddr(bp)); 1865 } 1866 xfs_buf_rele(bp); 1867 } 1868 if (loop++ != 0) 1869 delay(100); 1870 } 1871 1872 /* 1873 * If one or more failed buffers were freed, that means dirty metadata 1874 * was thrown away. This should only ever happen after I/O completion 1875 * handling has elevated I/O error(s) to permanent failures and shuts 1876 * down the journal. 1877 */ 1878 if (write_fail) { 1879 ASSERT(xlog_is_shutdown(btp->bt_mount->m_log)); 1880 xfs_alert(btp->bt_mount, 1881 "Please run xfs_repair to determine the extent of the problem."); 1882 } 1883 } 1884 1885 static enum lru_status 1886 xfs_buftarg_isolate( 1887 struct list_head *item, 1888 struct list_lru_one *lru, 1889 spinlock_t *lru_lock, 1890 void *arg) 1891 { 1892 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); 1893 struct list_head *dispose = arg; 1894 1895 /* 1896 * we are inverting the lru lock/bp->b_lock here, so use a trylock. 1897 * If we fail to get the lock, just skip it. 1898 */ 1899 if (!spin_trylock(&bp->b_lock)) 1900 return LRU_SKIP; 1901 /* 1902 * Decrement the b_lru_ref count unless the value is already 1903 * zero. If the value is already zero, we need to reclaim the 1904 * buffer, otherwise it gets another trip through the LRU. 1905 */ 1906 if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) { 1907 spin_unlock(&bp->b_lock); 1908 return LRU_ROTATE; 1909 } 1910 1911 bp->b_state |= XFS_BSTATE_DISPOSE; 1912 list_lru_isolate_move(lru, item, dispose); 1913 spin_unlock(&bp->b_lock); 1914 return LRU_REMOVED; 1915 } 1916 1917 static unsigned long 1918 xfs_buftarg_shrink_scan( 1919 struct shrinker *shrink, 1920 struct shrink_control *sc) 1921 { 1922 struct xfs_buftarg *btp = shrink->private_data; 1923 LIST_HEAD(dispose); 1924 unsigned long freed; 1925 1926 freed = list_lru_shrink_walk(&btp->bt_lru, sc, 1927 xfs_buftarg_isolate, &dispose); 1928 1929 while (!list_empty(&dispose)) { 1930 struct xfs_buf *bp; 1931 bp = list_first_entry(&dispose, struct xfs_buf, b_lru); 1932 list_del_init(&bp->b_lru); 1933 xfs_buf_rele(bp); 1934 } 1935 1936 return freed; 1937 } 1938 1939 static unsigned long 1940 xfs_buftarg_shrink_count( 1941 struct shrinker *shrink, 1942 struct shrink_control *sc) 1943 { 1944 struct xfs_buftarg *btp = shrink->private_data; 1945 return list_lru_shrink_count(&btp->bt_lru, sc); 1946 } 1947 1948 void 1949 xfs_free_buftarg( 1950 struct xfs_buftarg *btp) 1951 { 1952 shrinker_free(btp->bt_shrinker); 1953 ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); 1954 percpu_counter_destroy(&btp->bt_io_count); 1955 list_lru_destroy(&btp->bt_lru); 1956 1957 fs_put_dax(btp->bt_daxdev, btp->bt_mount); 1958 /* the main block device is closed by kill_block_super */ 1959 if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) 1960 bdev_release(btp->bt_bdev_handle); 1961 1962 kfree(btp); 1963 } 1964 1965 int 1966 xfs_setsize_buftarg( 1967 xfs_buftarg_t *btp, 1968 unsigned int sectorsize) 1969 { 1970 /* Set up metadata sector size info */ 1971 btp->bt_meta_sectorsize = sectorsize; 1972 btp->bt_meta_sectormask = sectorsize - 1; 1973 1974 if (set_blocksize(btp->bt_bdev, sectorsize)) { 1975 xfs_warn(btp->bt_mount, 1976 "Cannot set_blocksize to %u on device %pg", 1977 sectorsize, btp->bt_bdev); 1978 return -EINVAL; 1979 } 1980 1981 /* Set up device logical sector size mask */ 1982 btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev); 1983 btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1; 1984 1985 return 0; 1986 } 1987 1988 /* 1989 * When allocating the initial buffer target we have not yet 1990 * read in the superblock, so don't know what sized sectors 1991 * are being used at this early stage. Play safe. 1992 */ 1993 STATIC int 1994 xfs_setsize_buftarg_early( 1995 xfs_buftarg_t *btp) 1996 { 1997 return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)); 1998 } 1999 2000 struct xfs_buftarg * 2001 xfs_alloc_buftarg( 2002 struct xfs_mount *mp, 2003 struct bdev_handle *bdev_handle) 2004 { 2005 xfs_buftarg_t *btp; 2006 const struct dax_holder_operations *ops = NULL; 2007 2008 #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) 2009 ops = &xfs_dax_holder_operations; 2010 #endif 2011 btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL); 2012 2013 btp->bt_mount = mp; 2014 btp->bt_bdev_handle = bdev_handle; 2015 btp->bt_dev = bdev_handle->bdev->bd_dev; 2016 btp->bt_bdev = bdev_handle->bdev; 2017 btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, 2018 mp, ops); 2019 2020 /* 2021 * Buffer IO error rate limiting. Limit it to no more than 10 messages 2022 * per 30 seconds so as to not spam logs too much on repeated errors. 2023 */ 2024 ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, 2025 DEFAULT_RATELIMIT_BURST); 2026 2027 if (xfs_setsize_buftarg_early(btp)) 2028 goto error_free; 2029 2030 if (list_lru_init(&btp->bt_lru)) 2031 goto error_free; 2032 2033 if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) 2034 goto error_lru; 2035 2036 btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", 2037 mp->m_super->s_id); 2038 if (!btp->bt_shrinker) 2039 goto error_pcpu; 2040 2041 btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; 2042 btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; 2043 btp->bt_shrinker->private_data = btp; 2044 2045 shrinker_register(btp->bt_shrinker); 2046 2047 return btp; 2048 2049 error_pcpu: 2050 percpu_counter_destroy(&btp->bt_io_count); 2051 error_lru: 2052 list_lru_destroy(&btp->bt_lru); 2053 error_free: 2054 kfree(btp); 2055 return NULL; 2056 } 2057 2058 static inline void 2059 xfs_buf_list_del( 2060 struct xfs_buf *bp) 2061 { 2062 list_del_init(&bp->b_list); 2063 wake_up_var(&bp->b_list); 2064 } 2065 2066 /* 2067 * Cancel a delayed write list. 2068 * 2069 * Remove each buffer from the list, clear the delwri queue flag and drop the 2070 * associated buffer reference. 2071 */ 2072 void 2073 xfs_buf_delwri_cancel( 2074 struct list_head *list) 2075 { 2076 struct xfs_buf *bp; 2077 2078 while (!list_empty(list)) { 2079 bp = list_first_entry(list, struct xfs_buf, b_list); 2080 2081 xfs_buf_lock(bp); 2082 bp->b_flags &= ~_XBF_DELWRI_Q; 2083 xfs_buf_list_del(bp); 2084 xfs_buf_relse(bp); 2085 } 2086 } 2087 2088 /* 2089 * Add a buffer to the delayed write list. 2090 * 2091 * This queues a buffer for writeout if it hasn't already been. Note that 2092 * neither this routine nor the buffer list submission functions perform 2093 * any internal synchronization. It is expected that the lists are thread-local 2094 * to the callers. 2095 * 2096 * Returns true if we queued up the buffer, or false if it already had 2097 * been on the buffer list. 2098 */ 2099 bool 2100 xfs_buf_delwri_queue( 2101 struct xfs_buf *bp, 2102 struct list_head *list) 2103 { 2104 ASSERT(xfs_buf_islocked(bp)); 2105 ASSERT(!(bp->b_flags & XBF_READ)); 2106 2107 /* 2108 * If the buffer is already marked delwri it already is queued up 2109 * by someone else for imediate writeout. Just ignore it in that 2110 * case. 2111 */ 2112 if (bp->b_flags & _XBF_DELWRI_Q) { 2113 trace_xfs_buf_delwri_queued(bp, _RET_IP_); 2114 return false; 2115 } 2116 2117 trace_xfs_buf_delwri_queue(bp, _RET_IP_); 2118 2119 /* 2120 * If a buffer gets written out synchronously or marked stale while it 2121 * is on a delwri list we lazily remove it. To do this, the other party 2122 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone. 2123 * It remains referenced and on the list. In a rare corner case it 2124 * might get readded to a delwri list after the synchronous writeout, in 2125 * which case we need just need to re-add the flag here. 2126 */ 2127 bp->b_flags |= _XBF_DELWRI_Q; 2128 if (list_empty(&bp->b_list)) { 2129 atomic_inc(&bp->b_hold); 2130 list_add_tail(&bp->b_list, list); 2131 } 2132 2133 return true; 2134 } 2135 2136 /* 2137 * Queue a buffer to this delwri list as part of a data integrity operation. 2138 * If the buffer is on any other delwri list, we'll wait for that to clear 2139 * so that the caller can submit the buffer for IO and wait for the result. 2140 * Callers must ensure the buffer is not already on the list. 2141 */ 2142 void 2143 xfs_buf_delwri_queue_here( 2144 struct xfs_buf *bp, 2145 struct list_head *buffer_list) 2146 { 2147 /* 2148 * We need this buffer to end up on the /caller's/ delwri list, not any 2149 * old list. This can happen if the buffer is marked stale (which 2150 * clears DELWRI_Q) after the AIL queues the buffer to its list but 2151 * before the AIL has a chance to submit the list. 2152 */ 2153 while (!list_empty(&bp->b_list)) { 2154 xfs_buf_unlock(bp); 2155 wait_var_event(&bp->b_list, list_empty(&bp->b_list)); 2156 xfs_buf_lock(bp); 2157 } 2158 2159 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); 2160 2161 xfs_buf_delwri_queue(bp, buffer_list); 2162 } 2163 2164 /* 2165 * Compare function is more complex than it needs to be because 2166 * the return value is only 32 bits and we are doing comparisons 2167 * on 64 bit values 2168 */ 2169 static int 2170 xfs_buf_cmp( 2171 void *priv, 2172 const struct list_head *a, 2173 const struct list_head *b) 2174 { 2175 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list); 2176 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); 2177 xfs_daddr_t diff; 2178 2179 diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; 2180 if (diff < 0) 2181 return -1; 2182 if (diff > 0) 2183 return 1; 2184 return 0; 2185 } 2186 2187 /* 2188 * Submit buffers for write. If wait_list is specified, the buffers are 2189 * submitted using sync I/O and placed on the wait list such that the caller can 2190 * iowait each buffer. Otherwise async I/O is used and the buffers are released 2191 * at I/O completion time. In either case, buffers remain locked until I/O 2192 * completes and the buffer is released from the queue. 2193 */ 2194 static int 2195 xfs_buf_delwri_submit_buffers( 2196 struct list_head *buffer_list, 2197 struct list_head *wait_list) 2198 { 2199 struct xfs_buf *bp, *n; 2200 int pinned = 0; 2201 struct blk_plug plug; 2202 2203 list_sort(NULL, buffer_list, xfs_buf_cmp); 2204 2205 blk_start_plug(&plug); 2206 list_for_each_entry_safe(bp, n, buffer_list, b_list) { 2207 if (!wait_list) { 2208 if (!xfs_buf_trylock(bp)) 2209 continue; 2210 if (xfs_buf_ispinned(bp)) { 2211 xfs_buf_unlock(bp); 2212 pinned++; 2213 continue; 2214 } 2215 } else { 2216 xfs_buf_lock(bp); 2217 } 2218 2219 /* 2220 * Someone else might have written the buffer synchronously or 2221 * marked it stale in the meantime. In that case only the 2222 * _XBF_DELWRI_Q flag got cleared, and we have to drop the 2223 * reference and remove it from the list here. 2224 */ 2225 if (!(bp->b_flags & _XBF_DELWRI_Q)) { 2226 xfs_buf_list_del(bp); 2227 xfs_buf_relse(bp); 2228 continue; 2229 } 2230 2231 trace_xfs_buf_delwri_split(bp, _RET_IP_); 2232 2233 /* 2234 * If we have a wait list, each buffer (and associated delwri 2235 * queue reference) transfers to it and is submitted 2236 * synchronously. Otherwise, drop the buffer from the delwri 2237 * queue and submit async. 2238 */ 2239 bp->b_flags &= ~_XBF_DELWRI_Q; 2240 bp->b_flags |= XBF_WRITE; 2241 if (wait_list) { 2242 bp->b_flags &= ~XBF_ASYNC; 2243 list_move_tail(&bp->b_list, wait_list); 2244 } else { 2245 bp->b_flags |= XBF_ASYNC; 2246 xfs_buf_list_del(bp); 2247 } 2248 __xfs_buf_submit(bp, false); 2249 } 2250 blk_finish_plug(&plug); 2251 2252 return pinned; 2253 } 2254 2255 /* 2256 * Write out a buffer list asynchronously. 2257 * 2258 * This will take the @buffer_list, write all non-locked and non-pinned buffers 2259 * out and not wait for I/O completion on any of the buffers. This interface 2260 * is only safely useable for callers that can track I/O completion by higher 2261 * level means, e.g. AIL pushing as the @buffer_list is consumed in this 2262 * function. 2263 * 2264 * Note: this function will skip buffers it would block on, and in doing so 2265 * leaves them on @buffer_list so they can be retried on a later pass. As such, 2266 * it is up to the caller to ensure that the buffer list is fully submitted or 2267 * cancelled appropriately when they are finished with the list. Failure to 2268 * cancel or resubmit the list until it is empty will result in leaked buffers 2269 * at unmount time. 2270 */ 2271 int 2272 xfs_buf_delwri_submit_nowait( 2273 struct list_head *buffer_list) 2274 { 2275 return xfs_buf_delwri_submit_buffers(buffer_list, NULL); 2276 } 2277 2278 /* 2279 * Write out a buffer list synchronously. 2280 * 2281 * This will take the @buffer_list, write all buffers out and wait for I/O 2282 * completion on all of the buffers. @buffer_list is consumed by the function, 2283 * so callers must have some other way of tracking buffers if they require such 2284 * functionality. 2285 */ 2286 int 2287 xfs_buf_delwri_submit( 2288 struct list_head *buffer_list) 2289 { 2290 LIST_HEAD (wait_list); 2291 int error = 0, error2; 2292 struct xfs_buf *bp; 2293 2294 xfs_buf_delwri_submit_buffers(buffer_list, &wait_list); 2295 2296 /* Wait for IO to complete. */ 2297 while (!list_empty(&wait_list)) { 2298 bp = list_first_entry(&wait_list, struct xfs_buf, b_list); 2299 2300 xfs_buf_list_del(bp); 2301 2302 /* 2303 * Wait on the locked buffer, check for errors and unlock and 2304 * release the delwri queue reference. 2305 */ 2306 error2 = xfs_buf_iowait(bp); 2307 xfs_buf_relse(bp); 2308 if (!error) 2309 error = error2; 2310 } 2311 2312 return error; 2313 } 2314 2315 /* 2316 * Push a single buffer on a delwri queue. 2317 * 2318 * The purpose of this function is to submit a single buffer of a delwri queue 2319 * and return with the buffer still on the original queue. The waiting delwri 2320 * buffer submission infrastructure guarantees transfer of the delwri queue 2321 * buffer reference to a temporary wait list. We reuse this infrastructure to 2322 * transfer the buffer back to the original queue. 2323 * 2324 * Note the buffer transitions from the queued state, to the submitted and wait 2325 * listed state and back to the queued state during this call. The buffer 2326 * locking and queue management logic between _delwri_pushbuf() and 2327 * _delwri_queue() guarantee that the buffer cannot be queued to another list 2328 * before returning. 2329 */ 2330 int 2331 xfs_buf_delwri_pushbuf( 2332 struct xfs_buf *bp, 2333 struct list_head *buffer_list) 2334 { 2335 LIST_HEAD (submit_list); 2336 int error; 2337 2338 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 2339 2340 trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_); 2341 2342 /* 2343 * Isolate the buffer to a new local list so we can submit it for I/O 2344 * independently from the rest of the original list. 2345 */ 2346 xfs_buf_lock(bp); 2347 list_move(&bp->b_list, &submit_list); 2348 xfs_buf_unlock(bp); 2349 2350 /* 2351 * Delwri submission clears the DELWRI_Q buffer flag and returns with 2352 * the buffer on the wait list with the original reference. Rather than 2353 * bounce the buffer from a local wait list back to the original list 2354 * after I/O completion, reuse the original list as the wait list. 2355 */ 2356 xfs_buf_delwri_submit_buffers(&submit_list, buffer_list); 2357 2358 /* 2359 * The buffer is now locked, under I/O and wait listed on the original 2360 * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and 2361 * return with the buffer unlocked and on the original queue. 2362 */ 2363 error = xfs_buf_iowait(bp); 2364 bp->b_flags |= _XBF_DELWRI_Q; 2365 xfs_buf_unlock(bp); 2366 2367 return error; 2368 } 2369 2370 void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) 2371 { 2372 /* 2373 * Set the lru reference count to 0 based on the error injection tag. 2374 * This allows userspace to disrupt buffer caching for debug/testing 2375 * purposes. 2376 */ 2377 if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF)) 2378 lru_ref = 0; 2379 2380 atomic_set(&bp->b_lru_ref, lru_ref); 2381 } 2382 2383 /* 2384 * Verify an on-disk magic value against the magic value specified in the 2385 * verifier structure. The verifier magic is in disk byte order so the caller is 2386 * expected to pass the value directly from disk. 2387 */ 2388 bool 2389 xfs_verify_magic( 2390 struct xfs_buf *bp, 2391 __be32 dmagic) 2392 { 2393 struct xfs_mount *mp = bp->b_mount; 2394 int idx; 2395 2396 idx = xfs_has_crc(mp); 2397 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) 2398 return false; 2399 return dmagic == bp->b_ops->magic[idx]; 2400 } 2401 /* 2402 * Verify an on-disk magic value against the magic value specified in the 2403 * verifier structure. The verifier magic is in disk byte order so the caller is 2404 * expected to pass the value directly from disk. 2405 */ 2406 bool 2407 xfs_verify_magic16( 2408 struct xfs_buf *bp, 2409 __be16 dmagic) 2410 { 2411 struct xfs_mount *mp = bp->b_mount; 2412 int idx; 2413 2414 idx = xfs_has_crc(mp); 2415 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) 2416 return false; 2417 return dmagic == bp->b_ops->magic16[idx]; 2418 } 2419