1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6 #include "xfs.h"
7 #include <linux/backing-dev.h>
8 #include <linux/dax.h>
9
10 #include "xfs_shared.h"
11 #include "xfs_format.h"
12 #include "xfs_log_format.h"
13 #include "xfs_trans_resv.h"
14 #include "xfs_mount.h"
15 #include "xfs_trace.h"
16 #include "xfs_log.h"
17 #include "xfs_log_recover.h"
18 #include "xfs_log_priv.h"
19 #include "xfs_trans.h"
20 #include "xfs_buf_item.h"
21 #include "xfs_errortag.h"
22 #include "xfs_error.h"
23 #include "xfs_ag.h"
24 #include "xfs_buf_mem.h"
25 #include "xfs_notify_failure.h"
26
27 struct kmem_cache *xfs_buf_cache;
28
29 /*
30 * Locking orders
31 *
32 * xfs_buf_stale:
33 * b_sema (caller holds)
34 * b_lock
35 * lru_lock
36 *
37 * xfs_buf_rele:
38 * b_lock
39 * lru_lock
40 *
41 * xfs_buftarg_drain_rele
42 * lru_lock
43 * b_lock (trylock due to inversion)
44 *
45 * xfs_buftarg_isolate
46 * lru_lock
47 * b_lock (trylock due to inversion)
48 */
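/*
 * As an illustrative sketch only (not verbatim from any function below), the
 * cached release path nests these locks roughly as:
 *
 *	spin_lock(&bp->b_lock);
 *	list_lru_add_obj(&btp->bt_lru, &bp->b_lru);	(takes lru_lock internally)
 *	spin_unlock(&bp->b_lock);
 */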
49
50 static void xfs_buf_submit(struct xfs_buf *bp);
51 static int xfs_buf_iowait(struct xfs_buf *bp);
52
static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
54 {
55 return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
56 }
57
58 static inline int
xfs_buf_is_vmapped(
60 struct xfs_buf *bp)
61 {
	/*
	 * Return true if the buffer is vmapped.
	 *
	 * b_addr is NULL if the buffer is not mapped at all. Single-page
	 * buffers are mapped directly via page_address() rather than
	 * vm_map_ram(), so the check has to cover both b_addr and
	 * bp->b_page_count > 1.
	 */
69 return bp->b_addr && bp->b_page_count > 1;
70 }
71
72 static inline int
xfs_buf_vmap_len(
74 struct xfs_buf *bp)
75 {
76 return (bp->b_page_count * PAGE_SIZE);
77 }
78
79 /*
80 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
81 * b_lru_ref count so that the buffer is freed immediately when the buffer
82 * reference count falls to zero. If the buffer is already on the LRU, we need
83 * to remove the reference that LRU holds on the buffer.
84 *
85 * This prevents build-up of stale buffers on the LRU.
86 */
87 void
xfs_buf_stale(
89 struct xfs_buf *bp)
90 {
91 ASSERT(xfs_buf_islocked(bp));
92
93 bp->b_flags |= XBF_STALE;
94
95 /*
96 * Clear the delwri status so that a delwri queue walker will not
97 * flush this buffer to disk now that it is stale. The delwri queue has
98 * a reference to the buffer, so this is safe to do.
99 */
100 bp->b_flags &= ~_XBF_DELWRI_Q;
101
102 spin_lock(&bp->b_lock);
103 atomic_set(&bp->b_lru_ref, 0);
104 if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
105 (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
106 bp->b_hold--;
107
108 ASSERT(bp->b_hold >= 1);
109 spin_unlock(&bp->b_lock);
110 }
111
112 static int
xfs_buf_get_maps(
114 struct xfs_buf *bp,
115 int map_count)
116 {
117 ASSERT(bp->b_maps == NULL);
118 bp->b_map_count = map_count;
119
120 if (map_count == 1) {
121 bp->b_maps = &bp->__b_map;
122 return 0;
123 }
124
125 bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map),
126 GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
127 if (!bp->b_maps)
128 return -ENOMEM;
129 return 0;
130 }
131
132 static void
xfs_buf_free_maps(
134 struct xfs_buf *bp)
135 {
136 if (bp->b_maps != &bp->__b_map) {
137 kfree(bp->b_maps);
138 bp->b_maps = NULL;
139 }
140 }
141
142 static int
_xfs_buf_alloc(
144 struct xfs_buftarg *target,
145 struct xfs_buf_map *map,
146 int nmaps,
147 xfs_buf_flags_t flags,
148 struct xfs_buf **bpp)
149 {
150 struct xfs_buf *bp;
151 int error;
152 int i;
153
154 *bpp = NULL;
155 bp = kmem_cache_zalloc(xfs_buf_cache,
156 GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
157
158 /*
159 * We don't want certain flags to appear in b_flags unless they are
160 * specifically set by later operations on the buffer.
161 */
162 flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
163
164 /*
165 * A new buffer is held and locked by the owner. This ensures that the
166 * buffer is owned by the caller and racing RCU lookups right after
167 * inserting into the hash table are safe (and will have to wait for
168 * the unlock to do anything non-trivial).
169 */
170 bp->b_hold = 1;
171 sema_init(&bp->b_sema, 0); /* held, no waiters */
172
173 spin_lock_init(&bp->b_lock);
174 atomic_set(&bp->b_lru_ref, 1);
175 init_completion(&bp->b_iowait);
176 INIT_LIST_HEAD(&bp->b_lru);
177 INIT_LIST_HEAD(&bp->b_list);
178 INIT_LIST_HEAD(&bp->b_li_list);
179 bp->b_target = target;
180 bp->b_mount = target->bt_mount;
181 bp->b_flags = flags;
182
183 error = xfs_buf_get_maps(bp, nmaps);
184 if (error) {
185 kmem_cache_free(xfs_buf_cache, bp);
186 return error;
187 }
188
189 bp->b_rhash_key = map[0].bm_bn;
190 bp->b_length = 0;
191 for (i = 0; i < nmaps; i++) {
192 bp->b_maps[i].bm_bn = map[i].bm_bn;
193 bp->b_maps[i].bm_len = map[i].bm_len;
194 bp->b_length += map[i].bm_len;
195 }
196
197 atomic_set(&bp->b_pin_count, 0);
198 init_waitqueue_head(&bp->b_waiters);
199
200 XFS_STATS_INC(bp->b_mount, xb_create);
201 trace_xfs_buf_init(bp, _RET_IP_);
202
203 *bpp = bp;
204 return 0;
205 }
206
207 static void
xfs_buf_free_pages(
209 struct xfs_buf *bp)
210 {
211 uint i;
212
213 ASSERT(bp->b_flags & _XBF_PAGES);
214
215 if (xfs_buf_is_vmapped(bp))
216 vm_unmap_ram(bp->b_addr, bp->b_page_count);
217
218 for (i = 0; i < bp->b_page_count; i++) {
219 if (bp->b_pages[i])
220 __free_page(bp->b_pages[i]);
221 }
222 mm_account_reclaimed_pages(bp->b_page_count);
223
224 if (bp->b_pages != bp->b_page_array)
225 kfree(bp->b_pages);
226 bp->b_pages = NULL;
227 bp->b_flags &= ~_XBF_PAGES;
228 }
229
230 static void
xfs_buf_free_callback(
232 struct callback_head *cb)
233 {
234 struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu);
235
236 xfs_buf_free_maps(bp);
237 kmem_cache_free(xfs_buf_cache, bp);
238 }
239
240 static void
xfs_buf_free(
242 struct xfs_buf *bp)
243 {
244 trace_xfs_buf_free(bp, _RET_IP_);
245
246 ASSERT(list_empty(&bp->b_lru));
247
248 if (xfs_buftarg_is_mem(bp->b_target))
249 xmbuf_unmap_page(bp);
250 else if (bp->b_flags & _XBF_PAGES)
251 xfs_buf_free_pages(bp);
252 else if (bp->b_flags & _XBF_KMEM)
253 kfree(bp->b_addr);
254
255 call_rcu(&bp->b_rcu, xfs_buf_free_callback);
256 }
257
258 static int
xfs_buf_alloc_kmem(
260 struct xfs_buf *bp,
261 xfs_buf_flags_t flags)
262 {
263 gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL;
264 size_t size = BBTOB(bp->b_length);
265
266 /* Assure zeroed buffer for non-read cases. */
267 if (!(flags & XBF_READ))
268 gfp_mask |= __GFP_ZERO;
269
270 bp->b_addr = kmalloc(size, gfp_mask);
271 if (!bp->b_addr)
272 return -ENOMEM;
273
274 if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
275 ((unsigned long)bp->b_addr & PAGE_MASK)) {
276 /* b_addr spans two pages - use alloc_page instead */
277 kfree(bp->b_addr);
278 bp->b_addr = NULL;
279 return -ENOMEM;
280 }
281 bp->b_offset = offset_in_page(bp->b_addr);
282 bp->b_pages = bp->b_page_array;
283 bp->b_pages[0] = kmem_to_page(bp->b_addr);
284 bp->b_page_count = 1;
285 bp->b_flags |= _XBF_KMEM;
286 return 0;
287 }
288
289 static int
xfs_buf_alloc_pages(
291 struct xfs_buf *bp,
292 xfs_buf_flags_t flags)
293 {
294 gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
295 long filled = 0;
296
297 if (flags & XBF_READ_AHEAD)
298 gfp_mask |= __GFP_NORETRY;
299
300 /* Make sure that we have a page list */
301 bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
302 if (bp->b_page_count <= XB_PAGES) {
303 bp->b_pages = bp->b_page_array;
304 } else {
305 bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
306 gfp_mask);
307 if (!bp->b_pages)
308 return -ENOMEM;
309 }
310 bp->b_flags |= _XBF_PAGES;
311
312 /* Assure zeroed buffer for non-read cases. */
313 if (!(flags & XBF_READ))
314 gfp_mask |= __GFP_ZERO;
315
	/*
	 * Bulk filling of pages can take multiple calls. Not filling the entire
	 * array is not an allocation failure, so don't back off as long as at
	 * least one new page was allocated in the last pass.
	 */
321 for (;;) {
322 long last = filled;
323
324 filled = alloc_pages_bulk(gfp_mask, bp->b_page_count,
325 bp->b_pages);
326 if (filled == bp->b_page_count) {
327 XFS_STATS_INC(bp->b_mount, xb_page_found);
328 break;
329 }
330
331 if (filled != last)
332 continue;
333
334 if (flags & XBF_READ_AHEAD) {
335 xfs_buf_free_pages(bp);
336 return -ENOMEM;
337 }
338
339 XFS_STATS_INC(bp->b_mount, xb_page_retries);
340 memalloc_retry_wait(gfp_mask);
341 }
342 return 0;
343 }
344
345 /*
346 * Map buffer into kernel address-space if necessary.
347 */
348 STATIC int
_xfs_buf_map_pages(
350 struct xfs_buf *bp,
351 xfs_buf_flags_t flags)
352 {
353 ASSERT(bp->b_flags & _XBF_PAGES);
354 if (bp->b_page_count == 1) {
355 /* A single page buffer is always mappable */
356 bp->b_addr = page_address(bp->b_pages[0]);
357 } else if (flags & XBF_UNMAPPED) {
358 bp->b_addr = NULL;
359 } else {
360 int retried = 0;
361 unsigned nofs_flag;
362
		/*
		 * vm_map_ram() will allocate auxiliary structures (e.g.
		 * pagetables) with GFP_KERNEL, yet we are often under a scoped
		 * nofs context here. Mixing GFP_KERNEL with GFP_NOFS
		 * allocations from the same call site that can be run from both
		 * above and below memory reclaim causes lockdep false
		 * positives. Hence we always need to force this allocation to
		 * nofs context because we can't pass __GFP_NOLOCKDEP down to
		 * auxiliary structures to prevent false positive lockdep
		 * reports.
		 *
		 * XXX(dgc): I think dquot reclaim is the only place we can get
		 * to this function from memory reclaim context now. If we fix
		 * that like we've fixed inode reclaim to avoid writeback from
		 * reclaim, this nofs wrapping can go away.
		 */
378 nofs_flag = memalloc_nofs_save();
379 do {
380 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
381 -1);
382 if (bp->b_addr)
383 break;
384 vm_unmap_aliases();
385 } while (retried++ <= 1);
386 memalloc_nofs_restore(nofs_flag);
387
388 if (!bp->b_addr)
389 return -ENOMEM;
390 }
391
392 return 0;
393 }
394
395 /*
396 * Finding and Reading Buffers
397 */
398 static int
_xfs_buf_obj_cmp(
400 struct rhashtable_compare_arg *arg,
401 const void *obj)
402 {
403 const struct xfs_buf_map *map = arg->key;
404 const struct xfs_buf *bp = obj;
405
406 /*
407 * The key hashing in the lookup path depends on the key being the
408 * first element of the compare_arg, make sure to assert this.
409 */
410 BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
411
412 if (bp->b_rhash_key != map->bm_bn)
413 return 1;
414
415 if (unlikely(bp->b_length != map->bm_len)) {
416 /*
417 * found a block number match. If the range doesn't
418 * match, the only way this is allowed is if the buffer
419 * in the cache is stale and the transaction that made
420 * it stale has not yet committed. i.e. we are
421 * reallocating a busy extent. Skip this buffer and
422 * continue searching for an exact match.
423 *
424 * Note: If we're scanning for incore buffers to stale, don't
425 * complain if we find non-stale buffers.
426 */
427 if (!(map->bm_flags & XBM_LIVESCAN))
428 ASSERT(bp->b_flags & XBF_STALE);
429 return 1;
430 }
431 return 0;
432 }
433
434 static const struct rhashtable_params xfs_buf_hash_params = {
435 .min_size = 32, /* empty AGs have minimal footprint */
436 .nelem_hint = 16,
437 .key_len = sizeof(xfs_daddr_t),
438 .key_offset = offsetof(struct xfs_buf, b_rhash_key),
439 .head_offset = offsetof(struct xfs_buf, b_rhash_head),
440 .automatic_shrinking = true,
441 .obj_cmpfn = _xfs_buf_obj_cmp,
442 };
443
444 int
xfs_buf_cache_init(
446 struct xfs_buf_cache *bch)
447 {
448 return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
449 }
450
451 void
xfs_buf_cache_destroy(
453 struct xfs_buf_cache *bch)
454 {
455 rhashtable_destroy(&bch->bc_hash);
456 }
457
458 static int
xfs_buf_map_verify(
460 struct xfs_buftarg *btp,
461 struct xfs_buf_map *map)
462 {
463 xfs_daddr_t eofs;
464
465 /* Check for IOs smaller than the sector size / not sector aligned */
466 ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
467 ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
468
469 /*
470 * Corrupted block numbers can get through to here, unfortunately, so we
471 * have to check that the buffer falls within the filesystem bounds.
472 */
473 eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
474 if (map->bm_bn < 0 || map->bm_bn >= eofs) {
475 xfs_alert(btp->bt_mount,
476 "%s: daddr 0x%llx out of range, EOFS 0x%llx",
477 __func__, map->bm_bn, eofs);
478 WARN_ON(1);
479 return -EFSCORRUPTED;
480 }
481 return 0;
482 }
483
484 static int
xfs_buf_find_lock(
486 struct xfs_buf *bp,
487 xfs_buf_flags_t flags)
488 {
489 if (flags & XBF_TRYLOCK) {
490 if (!xfs_buf_trylock(bp)) {
491 XFS_STATS_INC(bp->b_mount, xb_busy_locked);
492 return -EAGAIN;
493 }
494 } else {
495 xfs_buf_lock(bp);
496 XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
497 }
498
499 /*
500 * if the buffer is stale, clear all the external state associated with
501 * it. We need to keep flags such as how we allocated the buffer memory
502 * intact here.
503 */
504 if (bp->b_flags & XBF_STALE) {
505 if (flags & XBF_LIVESCAN) {
506 xfs_buf_unlock(bp);
507 return -ENOENT;
508 }
509 ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
510 bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
511 bp->b_ops = NULL;
512 }
513 return 0;
514 }
515
516 static bool
xfs_buf_try_hold(
518 struct xfs_buf *bp)
519 {
520 spin_lock(&bp->b_lock);
521 if (bp->b_hold == 0) {
522 spin_unlock(&bp->b_lock);
523 return false;
524 }
525 bp->b_hold++;
526 spin_unlock(&bp->b_lock);
527 return true;
528 }
529
530 static inline int
xfs_buf_lookup(
532 struct xfs_buf_cache *bch,
533 struct xfs_buf_map *map,
534 xfs_buf_flags_t flags,
535 struct xfs_buf **bpp)
536 {
537 struct xfs_buf *bp;
538 int error;
539
540 rcu_read_lock();
541 bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
542 if (!bp || !xfs_buf_try_hold(bp)) {
543 rcu_read_unlock();
544 return -ENOENT;
545 }
546 rcu_read_unlock();
547
548 error = xfs_buf_find_lock(bp, flags);
549 if (error) {
550 xfs_buf_rele(bp);
551 return error;
552 }
553
554 trace_xfs_buf_find(bp, flags, _RET_IP_);
555 *bpp = bp;
556 return 0;
557 }
558
559 /*
560 * Insert the new_bp into the hash table. This consumes the perag reference
561 * taken for the lookup regardless of the result of the insert.
562 */
563 static int
xfs_buf_find_insert(
565 struct xfs_buftarg *btp,
566 struct xfs_buf_cache *bch,
567 struct xfs_perag *pag,
568 struct xfs_buf_map *cmap,
569 struct xfs_buf_map *map,
570 int nmaps,
571 xfs_buf_flags_t flags,
572 struct xfs_buf **bpp)
573 {
574 struct xfs_buf *new_bp;
575 struct xfs_buf *bp;
576 int error;
577
578 error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
579 if (error)
580 goto out_drop_pag;
581
582 if (xfs_buftarg_is_mem(new_bp->b_target)) {
583 error = xmbuf_map_page(new_bp);
584 } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
585 xfs_buf_alloc_kmem(new_bp, flags) < 0) {
586 /*
587 * For buffers that fit entirely within a single page, first
588 * attempt to allocate the memory from the heap to minimise
589 * memory usage. If we can't get heap memory for these small
590 * buffers, we fall back to using the page allocator.
591 */
592 error = xfs_buf_alloc_pages(new_bp, flags);
593 }
594 if (error)
595 goto out_free_buf;
596
597 /* The new buffer keeps the perag reference until it is freed. */
598 new_bp->b_pag = pag;
599
600 rcu_read_lock();
601 bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
602 &new_bp->b_rhash_head, xfs_buf_hash_params);
603 if (IS_ERR(bp)) {
604 rcu_read_unlock();
605 error = PTR_ERR(bp);
606 goto out_free_buf;
607 }
608 if (bp && xfs_buf_try_hold(bp)) {
609 /* found an existing buffer */
610 rcu_read_unlock();
611 error = xfs_buf_find_lock(bp, flags);
612 if (error)
613 xfs_buf_rele(bp);
614 else
615 *bpp = bp;
616 goto out_free_buf;
617 }
618 rcu_read_unlock();
619
620 *bpp = new_bp;
621 return 0;
622
623 out_free_buf:
624 xfs_buf_free(new_bp);
625 out_drop_pag:
626 if (pag)
627 xfs_perag_put(pag);
628 return error;
629 }
630
631 static inline struct xfs_perag *
xfs_buftarg_get_pag(
633 struct xfs_buftarg *btp,
634 const struct xfs_buf_map *map)
635 {
636 struct xfs_mount *mp = btp->bt_mount;
637
638 if (xfs_buftarg_is_mem(btp))
639 return NULL;
640 return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
641 }
642
643 static inline struct xfs_buf_cache *
xfs_buftarg_buf_cache(
645 struct xfs_buftarg *btp,
646 struct xfs_perag *pag)
647 {
648 if (pag)
649 return &pag->pag_bcache;
650 return btp->bt_cache;
651 }
652
653 /*
654 * Assembles a buffer covering the specified range. The code is optimised for
655 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
656 * more hits than misses.
657 */
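/*
 * A minimal single-extent usage sketch (illustrative only; blkno and numblks
 * are placeholder values, not taken from any real caller):
 *
 *	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
 *	struct xfs_buf	*bp;
 *	int		error;
 *
 *	error = xfs_buf_get_map(btp, &map, 1, 0, &bp);
 *	if (!error) {
 *		... access bp->b_addr ...
 *		xfs_buf_relse(bp);
 *	}
 */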
658 int
xfs_buf_get_map(
660 struct xfs_buftarg *btp,
661 struct xfs_buf_map *map,
662 int nmaps,
663 xfs_buf_flags_t flags,
664 struct xfs_buf **bpp)
665 {
666 struct xfs_buf_cache *bch;
667 struct xfs_perag *pag;
668 struct xfs_buf *bp = NULL;
669 struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
670 int error;
671 int i;
672
673 if (flags & XBF_LIVESCAN)
674 cmap.bm_flags |= XBM_LIVESCAN;
675 for (i = 0; i < nmaps; i++)
676 cmap.bm_len += map[i].bm_len;
677
678 error = xfs_buf_map_verify(btp, &cmap);
679 if (error)
680 return error;
681
682 pag = xfs_buftarg_get_pag(btp, &cmap);
683 bch = xfs_buftarg_buf_cache(btp, pag);
684
685 error = xfs_buf_lookup(bch, &cmap, flags, &bp);
686 if (error && error != -ENOENT)
687 goto out_put_perag;
688
689 /* cache hits always outnumber misses by at least 10:1 */
690 if (unlikely(!bp)) {
691 XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
692
693 if (flags & XBF_INCORE)
694 goto out_put_perag;
695
696 /* xfs_buf_find_insert() consumes the perag reference. */
697 error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
698 flags, &bp);
699 if (error)
700 return error;
701 } else {
702 XFS_STATS_INC(btp->bt_mount, xb_get_locked);
703 if (pag)
704 xfs_perag_put(pag);
705 }
706
707 /* We do not hold a perag reference anymore. */
708 if (!bp->b_addr) {
709 error = _xfs_buf_map_pages(bp, flags);
710 if (unlikely(error)) {
711 xfs_warn_ratelimited(btp->bt_mount,
712 "%s: failed to map %u pages", __func__,
713 bp->b_page_count);
714 xfs_buf_relse(bp);
715 return error;
716 }
717 }
718
719 /*
720 * Clear b_error if this is a lookup from a caller that doesn't expect
721 * valid data to be found in the buffer.
722 */
723 if (!(flags & XBF_READ))
724 xfs_buf_ioerror(bp, 0);
725
726 XFS_STATS_INC(btp->bt_mount, xb_get);
727 trace_xfs_buf_get(bp, flags, _RET_IP_);
728 *bpp = bp;
729 return 0;
730
731 out_put_perag:
732 if (pag)
733 xfs_perag_put(pag);
734 return error;
735 }
736
737 int
_xfs_buf_read(
739 struct xfs_buf *bp)
740 {
741 ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
742
743 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
744 bp->b_flags |= XBF_READ;
745 xfs_buf_submit(bp);
746 return xfs_buf_iowait(bp);
747 }
748
749 /*
750 * Reverify a buffer found in cache without an attached ->b_ops.
751 *
752 * If the caller passed an ops structure and the buffer doesn't have ops
753 * assigned, set the ops and use it to verify the contents. If verification
754 * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
755 * already in XBF_DONE state on entry.
756 *
757 * Under normal operations, every in-core buffer is verified on read I/O
758 * completion. There are two scenarios that can lead to in-core buffers without
759 * an assigned ->b_ops. The first is during log recovery of buffers on a V4
760 * filesystem, though these buffers are purged at the end of recovery. The
761 * other is online repair, which intentionally reads with a NULL buffer ops to
762 * run several verifiers across an in-core buffer in order to establish buffer
763 * type. If repair can't establish that, the buffer will be left in memory
764 * with NULL buffer ops.
765 */
766 int
xfs_buf_reverify(
768 struct xfs_buf *bp,
769 const struct xfs_buf_ops *ops)
770 {
771 ASSERT(bp->b_flags & XBF_DONE);
772 ASSERT(bp->b_error == 0);
773
774 if (!ops || bp->b_ops)
775 return 0;
776
777 bp->b_ops = ops;
778 bp->b_ops->verify_read(bp);
779 if (bp->b_error)
780 bp->b_flags &= ~XBF_DONE;
781 return bp->b_error;
782 }
783
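/*
 * Read a (possibly discontiguous) buffer into the cache, attaching @ops as the
 * verifier if one is not already set. A rough usage sketch, assuming a single
 * extent map and caller-supplied verifier ops:
 *
 *	error = xfs_buf_read_map(btp, &map, 1, 0, &bp, ops, __this_address);
 */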
784 int
xfs_buf_read_map(
786 struct xfs_buftarg *target,
787 struct xfs_buf_map *map,
788 int nmaps,
789 xfs_buf_flags_t flags,
790 struct xfs_buf **bpp,
791 const struct xfs_buf_ops *ops,
792 xfs_failaddr_t fa)
793 {
794 struct xfs_buf *bp;
795 int error;
796
797 ASSERT(!(flags & (XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD)));
798
799 flags |= XBF_READ;
800 *bpp = NULL;
801
802 error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
803 if (error)
804 return error;
805
806 trace_xfs_buf_read(bp, flags, _RET_IP_);
807
808 if (!(bp->b_flags & XBF_DONE)) {
809 /* Initiate the buffer read and wait. */
810 XFS_STATS_INC(target->bt_mount, xb_get_read);
811 bp->b_ops = ops;
812 error = _xfs_buf_read(bp);
813 } else {
814 /* Buffer already read; all we need to do is check it. */
815 error = xfs_buf_reverify(bp, ops);
816
817 /* We do not want read in the flags */
818 bp->b_flags &= ~XBF_READ;
819 ASSERT(bp->b_ops != NULL || ops == NULL);
820 }
821
822 /*
823 * If we've had a read error, then the contents of the buffer are
824 * invalid and should not be used. To ensure that a followup read tries
825 * to pull the buffer from disk again, we clear the XBF_DONE flag and
826 * mark the buffer stale. This ensures that anyone who has a current
	 * reference to the buffer will interpret its contents correctly and
828 * future cache lookups will also treat it as an empty, uninitialised
829 * buffer.
830 */
831 if (error) {
832 /*
833 * Check against log shutdown for error reporting because
834 * metadata writeback may require a read first and we need to
835 * report errors in metadata writeback until the log is shut
836 * down. High level transaction read functions already check
837 * against mount shutdown, anyway, so we only need to be
838 * concerned about low level IO interactions here.
839 */
840 if (!xlog_is_shutdown(target->bt_mount->m_log))
841 xfs_buf_ioerror_alert(bp, fa);
842
843 bp->b_flags &= ~XBF_DONE;
844 xfs_buf_stale(bp);
845 xfs_buf_relse(bp);
846
847 /* bad CRC means corrupted metadata */
848 if (error == -EFSBADCRC)
849 error = -EFSCORRUPTED;
850 return error;
851 }
852
853 *bpp = bp;
854 return 0;
855 }
856
857 /*
858 * If we are not low on memory then do the readahead in a deadlock
859 * safe manner.
860 */
861 void
xfs_buf_readahead_map(
863 struct xfs_buftarg *target,
864 struct xfs_buf_map *map,
865 int nmaps,
866 const struct xfs_buf_ops *ops)
867 {
868 const xfs_buf_flags_t flags = XBF_READ | XBF_ASYNC | XBF_READ_AHEAD;
869 struct xfs_buf *bp;
870
871 /*
872 * Currently we don't have a good means or justification for performing
873 * xmbuf_map_page asynchronously, so we don't do readahead.
874 */
875 if (xfs_buftarg_is_mem(target))
876 return;
877
878 if (xfs_buf_get_map(target, map, nmaps, flags | XBF_TRYLOCK, &bp))
879 return;
880 trace_xfs_buf_readahead(bp, 0, _RET_IP_);
881
882 if (bp->b_flags & XBF_DONE) {
883 xfs_buf_reverify(bp, ops);
884 xfs_buf_relse(bp);
885 return;
886 }
887 XFS_STATS_INC(target->bt_mount, xb_get_read);
888 bp->b_ops = ops;
889 bp->b_flags &= ~(XBF_WRITE | XBF_DONE);
890 bp->b_flags |= flags;
891 percpu_counter_inc(&target->bt_readahead_count);
892 xfs_buf_submit(bp);
893 }
894
895 /*
896 * Read an uncached buffer from disk. Allocates and returns a locked
897 * buffer containing the disk contents or nothing. Uncached buffers always have
898 * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer
899 * is cached or uncached during fault diagnosis.
900 */
901 int
xfs_buf_read_uncached(
903 struct xfs_buftarg *target,
904 xfs_daddr_t daddr,
905 size_t numblks,
906 xfs_buf_flags_t flags,
907 struct xfs_buf **bpp,
908 const struct xfs_buf_ops *ops)
909 {
910 struct xfs_buf *bp;
911 int error;
912
913 *bpp = NULL;
914
915 error = xfs_buf_get_uncached(target, numblks, flags, &bp);
916 if (error)
917 return error;
918
919 /* set up the buffer for a read IO */
920 ASSERT(bp->b_map_count == 1);
921 bp->b_rhash_key = XFS_BUF_DADDR_NULL;
922 bp->b_maps[0].bm_bn = daddr;
923 bp->b_flags |= XBF_READ;
924 bp->b_ops = ops;
925
926 xfs_buf_submit(bp);
927 error = xfs_buf_iowait(bp);
928 if (error) {
929 xfs_buf_relse(bp);
930 return error;
931 }
932
933 *bpp = bp;
934 return 0;
935 }
936
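/*
 * Allocate a locked, uncached buffer of @numblks blocks. The buffer is not
 * inserted into the buffer cache and is identified by a rhash key of
 * XFS_BUF_DADDR_NULL; it is typically released with xfs_buf_relse() when the
 * caller is done with it.
 */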
937 int
xfs_buf_get_uncached(
939 struct xfs_buftarg *target,
940 size_t numblks,
941 xfs_buf_flags_t flags,
942 struct xfs_buf **bpp)
943 {
944 int error;
945 struct xfs_buf *bp;
946 DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
947
948 /* there are currently no valid flags for xfs_buf_get_uncached */
949 ASSERT(flags == 0);
950
951 *bpp = NULL;
952
953 error = _xfs_buf_alloc(target, &map, 1, flags, &bp);
954 if (error)
955 return error;
956
957 if (xfs_buftarg_is_mem(bp->b_target))
958 error = xmbuf_map_page(bp);
959 else
960 error = xfs_buf_alloc_pages(bp, flags);
961 if (error)
962 goto fail_free_buf;
963
964 error = _xfs_buf_map_pages(bp, 0);
965 if (unlikely(error)) {
966 xfs_warn(target->bt_mount,
967 "%s: failed to map pages", __func__);
968 goto fail_free_buf;
969 }
970
971 trace_xfs_buf_get_uncached(bp, _RET_IP_);
972 *bpp = bp;
973 return 0;
974
975 fail_free_buf:
976 xfs_buf_free(bp);
977 return error;
978 }
979
980 /*
981 * Increment reference count on buffer, to hold the buffer concurrently
982 * with another thread which may release (free) the buffer asynchronously.
983 * Must hold the buffer already to call this function.
984 */
985 void
xfs_buf_hold(
987 struct xfs_buf *bp)
988 {
989 trace_xfs_buf_hold(bp, _RET_IP_);
990
991 spin_lock(&bp->b_lock);
992 bp->b_hold++;
993 spin_unlock(&bp->b_lock);
994 }
995
996 static void
xfs_buf_rele_uncached(
998 struct xfs_buf *bp)
999 {
1000 ASSERT(list_empty(&bp->b_lru));
1001
1002 spin_lock(&bp->b_lock);
1003 if (--bp->b_hold) {
1004 spin_unlock(&bp->b_lock);
1005 return;
1006 }
1007 spin_unlock(&bp->b_lock);
1008 xfs_buf_free(bp);
1009 }
1010
1011 static void
xfs_buf_rele_cached(
1013 struct xfs_buf *bp)
1014 {
1015 struct xfs_buftarg *btp = bp->b_target;
1016 struct xfs_perag *pag = bp->b_pag;
1017 struct xfs_buf_cache *bch = xfs_buftarg_buf_cache(btp, pag);
1018 bool freebuf = false;
1019
1020 trace_xfs_buf_rele(bp, _RET_IP_);
1021
1022 spin_lock(&bp->b_lock);
1023 ASSERT(bp->b_hold >= 1);
1024 if (bp->b_hold > 1) {
1025 bp->b_hold--;
1026 goto out_unlock;
1027 }
1028
1029 /* we are asked to drop the last reference */
1030 if (atomic_read(&bp->b_lru_ref)) {
1031 /*
1032 * If the buffer is added to the LRU, keep the reference to the
1033 * buffer for the LRU and clear the (now stale) dispose list
1034 * state flag, else drop the reference.
1035 */
1036 if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
1037 bp->b_state &= ~XFS_BSTATE_DISPOSE;
1038 else
1039 bp->b_hold--;
1040 } else {
1041 bp->b_hold--;
		/*
		 * Most of the time buffers will already be removed from the
		 * LRU, so optimise that case by checking for the
		 * XFS_BSTATE_DISPOSE flag, which indicates that the last list
		 * the buffer was on was the disposal list.
		 */
1048 if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
1049 list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
1050 } else {
1051 ASSERT(list_empty(&bp->b_lru));
1052 }
1053
1054 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1055 rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
1056 xfs_buf_hash_params);
1057 if (pag)
1058 xfs_perag_put(pag);
1059 freebuf = true;
1060 }
1061
1062 out_unlock:
1063 spin_unlock(&bp->b_lock);
1064
1065 if (freebuf)
1066 xfs_buf_free(bp);
1067 }
1068
1069 /*
1070 * Release a hold on the specified buffer.
1071 */
1072 void
xfs_buf_rele(
1074 struct xfs_buf *bp)
1075 {
1076 trace_xfs_buf_rele(bp, _RET_IP_);
1077 if (xfs_buf_is_uncached(bp))
1078 xfs_buf_rele_uncached(bp);
1079 else
1080 xfs_buf_rele_cached(bp);
1081 }
1082
1083 /*
1084 * Lock a buffer object, if it is not already locked.
1085 *
1086 * If we come across a stale, pinned, locked buffer, we know that we are
1087 * being asked to lock a buffer that has been reallocated. Because it is
1088 * pinned, we know that the log has not been pushed to disk and hence it
1089 * will still be locked. Rather than continuing to have trylock attempts
1090 * fail until someone else pushes the log, push it ourselves before
1091 * returning. This means that the xfsaild will not get stuck trying
1092 * to push on stale inode buffers.
1093 */
1094 int
xfs_buf_trylock(
1096 struct xfs_buf *bp)
1097 {
1098 int locked;
1099
1100 locked = down_trylock(&bp->b_sema) == 0;
1101 if (locked)
1102 trace_xfs_buf_trylock(bp, _RET_IP_);
1103 else
1104 trace_xfs_buf_trylock_fail(bp, _RET_IP_);
1105 return locked;
1106 }
1107
1108 /*
1109 * Lock a buffer object.
1110 *
1111 * If we come across a stale, pinned, locked buffer, we know that we
1112 * are being asked to lock a buffer that has been reallocated. Because
1113 * it is pinned, we know that the log has not been pushed to disk and
1114 * hence it will still be locked. Rather than sleeping until someone
1115 * else pushes the log, push it ourselves before trying to get the lock.
1116 */
1117 void
xfs_buf_lock(
1119 struct xfs_buf *bp)
1120 {
1121 trace_xfs_buf_lock(bp, _RET_IP_);
1122
1123 if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
1124 xfs_log_force(bp->b_mount, 0);
1125 down(&bp->b_sema);
1126
1127 trace_xfs_buf_lock_done(bp, _RET_IP_);
1128 }
1129
1130 void
xfs_buf_unlock(
1132 struct xfs_buf *bp)
1133 {
1134 ASSERT(xfs_buf_islocked(bp));
1135
1136 up(&bp->b_sema);
1137 trace_xfs_buf_unlock(bp, _RET_IP_);
1138 }
1139
1140 STATIC void
xfs_buf_wait_unpin(
1142 struct xfs_buf *bp)
1143 {
1144 DECLARE_WAITQUEUE (wait, current);
1145
1146 if (atomic_read(&bp->b_pin_count) == 0)
1147 return;
1148
1149 add_wait_queue(&bp->b_waiters, &wait);
1150 for (;;) {
1151 set_current_state(TASK_UNINTERRUPTIBLE);
1152 if (atomic_read(&bp->b_pin_count) == 0)
1153 break;
1154 io_schedule();
1155 }
1156 remove_wait_queue(&bp->b_waiters, &wait);
1157 set_current_state(TASK_RUNNING);
1158 }
1159
1160 static void
xfs_buf_ioerror_alert_ratelimited(
1162 struct xfs_buf *bp)
1163 {
1164 static unsigned long lasttime;
1165 static struct xfs_buftarg *lasttarg;
1166
1167 if (bp->b_target != lasttarg ||
1168 time_after(jiffies, (lasttime + 5*HZ))) {
1169 lasttime = jiffies;
1170 xfs_buf_ioerror_alert(bp, __this_address);
1171 }
1172 lasttarg = bp->b_target;
1173 }
1174
1175 /*
1176 * Account for this latest trip around the retry handler, and decide if
1177 * we've failed enough times to constitute a permanent failure.
1178 */
1179 static bool
xfs_buf_ioerror_permanent(
1181 struct xfs_buf *bp,
1182 struct xfs_error_cfg *cfg)
1183 {
1184 struct xfs_mount *mp = bp->b_mount;
1185
1186 if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
1187 ++bp->b_retries > cfg->max_retries)
1188 return true;
1189 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
1190 time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
1191 return true;
1192
1193 /* At unmount we may treat errors differently */
1194 if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
1195 return true;
1196
1197 return false;
1198 }
1199
1200 /*
1201 * On a sync write or shutdown we just want to stale the buffer and let the
1202 * caller handle the error in bp->b_error appropriately.
1203 *
1204 * If the write was asynchronous then no one will be looking for the error. If
1205 * this is the first failure of this type, clear the error state and write the
1206 * buffer out again. This means we always retry an async write failure at least
1207 * once, but we also need to set the buffer up to behave correctly now for
1208 * repeated failures.
1209 *
1210 * If we get repeated async write failures, then we take action according to the
1211 * error configuration we have been set up to use.
1212 *
 * Returns true if this function took care of error handling and the caller
 * must not touch the buffer again. Returns false if the caller should proceed
 * with normal I/O completion handling.
1216 */
1217 static bool
xfs_buf_ioend_handle_error(
1219 struct xfs_buf *bp)
1220 {
1221 struct xfs_mount *mp = bp->b_mount;
1222 struct xfs_error_cfg *cfg;
1223 struct xfs_log_item *lip;
1224
1225 /*
1226 * If we've already shutdown the journal because of I/O errors, there's
1227 * no point in giving this a retry.
1228 */
1229 if (xlog_is_shutdown(mp->m_log))
1230 goto out_stale;
1231
1232 xfs_buf_ioerror_alert_ratelimited(bp);
1233
1234 /*
1235 * We're not going to bother about retrying this during recovery.
1236 * One strike!
1237 */
1238 if (bp->b_flags & _XBF_LOGRECOVERY) {
1239 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1240 return false;
1241 }
1242
1243 /*
1244 * Synchronous writes will have callers process the error.
1245 */
1246 if (!(bp->b_flags & XBF_ASYNC))
1247 goto out_stale;
1248
1249 trace_xfs_buf_iodone_async(bp, _RET_IP_);
1250
1251 cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
1252 if (bp->b_last_error != bp->b_error ||
1253 !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
1254 bp->b_last_error = bp->b_error;
1255 if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
1256 !bp->b_first_retry_time)
1257 bp->b_first_retry_time = jiffies;
1258 goto resubmit;
1259 }
1260
1261 /*
1262 * Permanent error - we need to trigger a shutdown if we haven't already
1263 * to indicate that inconsistency will result from this action.
1264 */
1265 if (xfs_buf_ioerror_permanent(bp, cfg)) {
1266 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1267 goto out_stale;
1268 }
1269
1270 /* Still considered a transient error. Caller will schedule retries. */
1271 list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
1272 set_bit(XFS_LI_FAILED, &lip->li_flags);
1273 clear_bit(XFS_LI_FLUSHING, &lip->li_flags);
1274 }
1275
1276 xfs_buf_ioerror(bp, 0);
1277 xfs_buf_relse(bp);
1278 return true;
1279
1280 resubmit:
1281 xfs_buf_ioerror(bp, 0);
1282 bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
1283 reinit_completion(&bp->b_iowait);
1284 xfs_buf_submit(bp);
1285 return true;
1286 out_stale:
1287 xfs_buf_stale(bp);
1288 bp->b_flags |= XBF_DONE;
1289 bp->b_flags &= ~XBF_WRITE;
1290 trace_xfs_buf_error_relse(bp, _RET_IP_);
1291 return false;
1292 }
1293
1294 /* returns false if the caller needs to resubmit the I/O, else true */
1295 static bool
__xfs_buf_ioend(
1297 struct xfs_buf *bp)
1298 {
1299 trace_xfs_buf_iodone(bp, _RET_IP_);
1300
1301 if (bp->b_flags & XBF_READ) {
1302 if (!bp->b_error && xfs_buf_is_vmapped(bp))
1303 invalidate_kernel_vmap_range(bp->b_addr,
1304 xfs_buf_vmap_len(bp));
1305 if (!bp->b_error && bp->b_ops)
1306 bp->b_ops->verify_read(bp);
1307 if (!bp->b_error)
1308 bp->b_flags |= XBF_DONE;
1309 if (bp->b_flags & XBF_READ_AHEAD)
1310 percpu_counter_dec(&bp->b_target->bt_readahead_count);
1311 } else {
1312 if (!bp->b_error) {
1313 bp->b_flags &= ~XBF_WRITE_FAIL;
1314 bp->b_flags |= XBF_DONE;
1315 }
1316
1317 if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
1318 return false;
1319
1320 /* clear the retry state */
1321 bp->b_last_error = 0;
1322 bp->b_retries = 0;
1323 bp->b_first_retry_time = 0;
1324
1325 /*
1326 * Note that for things like remote attribute buffers, there may
1327 * not be a buffer log item here, so processing the buffer log
1328 * item must remain optional.
1329 */
1330 if (bp->b_log_item)
1331 xfs_buf_item_done(bp);
1332
1333 if (bp->b_iodone)
1334 bp->b_iodone(bp);
1335 }
1336
1337 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
1338 _XBF_LOGRECOVERY);
1339 return true;
1340 }
1341
1342 static void
xfs_buf_ioend(
1344 struct xfs_buf *bp)
1345 {
1346 if (!__xfs_buf_ioend(bp))
1347 return;
1348 if (bp->b_flags & XBF_ASYNC)
1349 xfs_buf_relse(bp);
1350 else
1351 complete(&bp->b_iowait);
1352 }
1353
1354 static void
xfs_buf_ioend_work(
1356 struct work_struct *work)
1357 {
1358 struct xfs_buf *bp =
1359 container_of(work, struct xfs_buf, b_ioend_work);
1360
1361 if (__xfs_buf_ioend(bp))
1362 xfs_buf_relse(bp);
1363 }
1364
1365 void
__xfs_buf_ioerror(
1367 struct xfs_buf *bp,
1368 int error,
1369 xfs_failaddr_t failaddr)
1370 {
1371 ASSERT(error <= 0 && error >= -1000);
1372 bp->b_error = error;
1373 trace_xfs_buf_ioerror(bp, error, failaddr);
1374 }
1375
1376 void
xfs_buf_ioerror_alert(
1378 struct xfs_buf *bp,
1379 xfs_failaddr_t func)
1380 {
1381 xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
1382 "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
1383 func, (uint64_t)xfs_buf_daddr(bp),
1384 bp->b_length, -bp->b_error);
1385 }
1386
1387 /*
1388 * To simulate an I/O failure, the buffer must be locked and held with at least
1389 * three references. The LRU reference is dropped by the stale call. The buf
1390 * item reference is dropped via ioend processing. The third reference is owned
1391 * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
1392 */
1393 void
xfs_buf_ioend_fail(
1395 struct xfs_buf *bp)
1396 {
1397 bp->b_flags &= ~XBF_DONE;
1398 xfs_buf_stale(bp);
1399 xfs_buf_ioerror(bp, -EIO);
1400 xfs_buf_ioend(bp);
1401 }
1402
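/*
 * Synchronously write a locked buffer and wait for completion. A write error
 * shuts down the filesystem. The buffer stays locked and held on return, so
 * the caller is responsible for releasing it.
 */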
1403 int
xfs_bwrite(
1405 struct xfs_buf *bp)
1406 {
1407 int error;
1408
1409 ASSERT(xfs_buf_islocked(bp));
1410
1411 bp->b_flags |= XBF_WRITE;
1412 bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
1413 XBF_DONE);
1414
1415 xfs_buf_submit(bp);
1416 error = xfs_buf_iowait(bp);
1417 if (error)
1418 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
1419 return error;
1420 }
1421
1422 static void
xfs_buf_bio_end_io(
1424 struct bio *bio)
1425 {
1426 struct xfs_buf *bp = bio->bi_private;
1427
1428 if (bio->bi_status)
1429 xfs_buf_ioerror(bp, blk_status_to_errno(bio->bi_status));
1430 else if ((bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
1431 XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
1432 xfs_buf_ioerror(bp, -EIO);
1433
1434 if (bp->b_flags & XBF_ASYNC) {
1435 INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
1436 queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
1437 } else {
1438 complete(&bp->b_iowait);
1439 }
1440
1441 bio_put(bio);
1442 }
1443
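/* Translate the buffer flags into the block layer operation for the bio. */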
1444 static inline blk_opf_t
xfs_buf_bio_op(
1446 struct xfs_buf *bp)
1447 {
1448 blk_opf_t op;
1449
1450 if (bp->b_flags & XBF_WRITE) {
1451 op = REQ_OP_WRITE;
1452 } else {
1453 op = REQ_OP_READ;
1454 if (bp->b_flags & XBF_READ_AHEAD)
1455 op |= REQ_RAHEAD;
1456 }
1457
1458 return op | REQ_META;
1459 }
1460
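/*
 * Build the bio (or a chain of bios for multi-extent buffers) backing this
 * buffer and submit it to the block layer.
 */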
1461 static void
xfs_buf_submit_bio(
1463 struct xfs_buf *bp)
1464 {
1465 unsigned int size = BBTOB(bp->b_length);
1466 unsigned int map = 0, p;
1467 struct blk_plug plug;
1468 struct bio *bio;
1469
1470 bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count,
1471 xfs_buf_bio_op(bp), GFP_NOIO);
1472 bio->bi_private = bp;
1473 bio->bi_end_io = xfs_buf_bio_end_io;
1474
1475 if (bp->b_flags & _XBF_KMEM) {
1476 __bio_add_page(bio, virt_to_page(bp->b_addr), size,
1477 bp->b_offset);
1478 } else {
1479 for (p = 0; p < bp->b_page_count; p++)
1480 __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0);
1481 bio->bi_iter.bi_size = size; /* limit to the actual size used */
1482
1483 if (xfs_buf_is_vmapped(bp))
1484 flush_kernel_vmap_range(bp->b_addr,
1485 xfs_buf_vmap_len(bp));
1486 }
1487
1488 /*
1489 * If there is more than one map segment, split out a new bio for each
	 * map except for the last one. The last map is handled by the
1491 * remainder of the original bio outside the loop.
1492 */
1493 blk_start_plug(&plug);
1494 for (map = 0; map < bp->b_map_count - 1; map++) {
1495 struct bio *split;
1496
1497 split = bio_split(bio, bp->b_maps[map].bm_len, GFP_NOFS,
1498 &fs_bio_set);
1499 split->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
1500 bio_chain(split, bio);
1501 submit_bio(split);
1502 }
1503 bio->bi_iter.bi_sector = bp->b_maps[map].bm_bn;
1504 submit_bio(bio);
1505 blk_finish_plug(&plug);
1506 }
1507
1508 /*
1509 * Wait for I/O completion of a sync buffer and return the I/O error code.
1510 */
1511 static int
xfs_buf_iowait(
1513 struct xfs_buf *bp)
1514 {
1515 ASSERT(!(bp->b_flags & XBF_ASYNC));
1516
1517 do {
1518 trace_xfs_buf_iowait(bp, _RET_IP_);
1519 wait_for_completion(&bp->b_iowait);
1520 trace_xfs_buf_iowait_done(bp, _RET_IP_);
1521 } while (!__xfs_buf_ioend(bp));
1522
1523 return bp->b_error;
1524 }
1525
1526 /*
1527 * Run the write verifier callback function if it exists. If this fails, mark
1528 * the buffer with an error and do not dispatch the I/O.
1529 */
1530 static bool
xfs_buf_verify_write(
1532 struct xfs_buf *bp)
1533 {
1534 if (bp->b_ops) {
1535 bp->b_ops->verify_write(bp);
1536 if (bp->b_error)
1537 return false;
1538 } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
1539 /*
1540 * Non-crc filesystems don't attach verifiers during log
1541 * recovery, so don't warn for such filesystems.
1542 */
1543 if (xfs_has_crc(bp->b_mount)) {
1544 xfs_warn(bp->b_mount,
1545 "%s: no buf ops on daddr 0x%llx len %d",
1546 __func__, xfs_buf_daddr(bp),
1547 bp->b_length);
1548 xfs_hex_dump(bp->b_addr, XFS_CORRUPTION_DUMP_LEN);
1549 dump_stack();
1550 }
1551 }
1552
1553 return true;
1554 }
1555
1556 /*
1557 * Buffer I/O submission path, read or write. Asynchronous submission transfers
1558 * the buffer lock ownership and the current reference to the IO. It is not
1559 * safe to reference the buffer after a call to this function unless the caller
1560 * holds an additional reference itself.
1561 */
1562 static void
xfs_buf_submit(
1564 struct xfs_buf *bp)
1565 {
1566 trace_xfs_buf_submit(bp, _RET_IP_);
1567
1568 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1569
1570 /*
1571 * On log shutdown we stale and complete the buffer immediately. We can
1572 * be called to read the superblock before the log has been set up, so
1573 * be careful checking the log state.
1574 *
1575 * Checking the mount shutdown state here can result in the log tail
1576 * moving inappropriately on disk as the log may not yet be shut down.
1577 * i.e. failing this buffer on mount shutdown can remove it from the AIL
1578 * and move the tail of the log forwards without having written this
1579 * buffer to disk. This corrupts the log tail state in memory, and
1580 * because the log may not be shut down yet, it can then be propagated
1581 * to disk before the log is shutdown. Hence we check log shutdown
1582 * state here rather than mount state to avoid corrupting the log tail
1583 * on shutdown.
1584 */
1585 if (bp->b_mount->m_log && xlog_is_shutdown(bp->b_mount->m_log)) {
1586 xfs_buf_ioend_fail(bp);
1587 return;
1588 }
1589
1590 if (bp->b_flags & XBF_WRITE)
1591 xfs_buf_wait_unpin(bp);
1592
1593 /*
1594 * Make sure we capture only current IO errors rather than stale errors
1595 * left over from previous use of the buffer (e.g. failed readahead).
1596 */
1597 bp->b_error = 0;
1598
1599 if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) {
1600 xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE);
1601 xfs_buf_ioend(bp);
1602 return;
1603 }
1604
1605 /* In-memory targets are directly mapped, no I/O required. */
1606 if (xfs_buftarg_is_mem(bp->b_target)) {
1607 xfs_buf_ioend(bp);
1608 return;
1609 }
1610
1611 xfs_buf_submit_bio(bp);
1612 }
1613
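/*
 * Return a kernel pointer to the data at byte @offset into the buffer,
 * handling both directly addressable (kmem or vmapped) buffers and unmapped
 * page-array backed buffers.
 */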
1614 void *
xfs_buf_offset(
1616 struct xfs_buf *bp,
1617 size_t offset)
1618 {
1619 struct page *page;
1620
1621 if (bp->b_addr)
1622 return bp->b_addr + offset;
1623
1624 page = bp->b_pages[offset >> PAGE_SHIFT];
1625 return page_address(page) + (offset & (PAGE_SIZE-1));
1626 }
1627
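/* Zero @bsize bytes of the buffer, starting at byte offset @boff. */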
1628 void
xfs_buf_zero(
1630 struct xfs_buf *bp,
1631 size_t boff,
1632 size_t bsize)
1633 {
1634 size_t bend;
1635
1636 bend = boff + bsize;
1637 while (boff < bend) {
1638 struct page *page;
1639 int page_index, page_offset, csize;
1640
1641 page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
1642 page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
1643 page = bp->b_pages[page_index];
1644 csize = min_t(size_t, PAGE_SIZE - page_offset,
1645 BBTOB(bp->b_length) - boff);
1646
1647 ASSERT((csize + page_offset) <= PAGE_SIZE);
1648
1649 memset(page_address(page) + page_offset, 0, csize);
1650
1651 boff += csize;
1652 }
1653 }
1654
1655 /*
1656 * Log a message about and stale a buffer that a caller has decided is corrupt.
1657 *
1658 * This function should be called for the kinds of metadata corruption that
 * cannot be detected by a verifier, such as incorrect inter-block relationship
1660 * data. Do /not/ call this function from a verifier function.
1661 *
1662 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
1663 * be marked stale, but b_error will not be set. The caller is responsible for
1664 * releasing the buffer or fixing it.
1665 */
1666 void
__xfs_buf_mark_corrupt(
1668 struct xfs_buf *bp,
1669 xfs_failaddr_t fa)
1670 {
1671 ASSERT(bp->b_flags & XBF_DONE);
1672
1673 xfs_buf_corruption_error(bp, fa);
1674 xfs_buf_stale(bp);
1675 }
1676
1677 /*
1678 * Handling of buffer targets (buftargs).
1679 */
1680
1681 /*
1682 * Wait for any bufs with callbacks that have been submitted but have not yet
1683 * returned. These buffers will have an elevated hold count, so wait on those
1684 * while freeing all the buffers only held by the LRU.
1685 */
1686 static enum lru_status
xfs_buftarg_drain_rele(
1688 struct list_head *item,
1689 struct list_lru_one *lru,
1690 void *arg)
1691
1692 {
1693 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
1694 struct list_head *dispose = arg;
1695
1696 if (!spin_trylock(&bp->b_lock))
1697 return LRU_SKIP;
1698 if (bp->b_hold > 1) {
1699 /* need to wait, so skip it this pass */
1700 spin_unlock(&bp->b_lock);
1701 trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
1702 return LRU_SKIP;
1703 }
1704
1705 /*
1706 * clear the LRU reference count so the buffer doesn't get
1707 * ignored in xfs_buf_rele().
1708 */
1709 atomic_set(&bp->b_lru_ref, 0);
1710 bp->b_state |= XFS_BSTATE_DISPOSE;
1711 list_lru_isolate_move(lru, item, dispose);
1712 spin_unlock(&bp->b_lock);
1713 return LRU_REMOVED;
1714 }
1715
1716 /*
1717 * Wait for outstanding I/O on the buftarg to complete.
1718 */
1719 void
xfs_buftarg_wait(
1721 struct xfs_buftarg *btp)
1722 {
1723 /*
1724 * First wait for all in-flight readahead buffers to be released. This is
1725 * critical as new buffers do not make the LRU until they are released.
1726 *
1727 * Next, flush the buffer workqueue to ensure all completion processing
1728 * has finished. Just waiting on buffer locks is not sufficient for
1729 * async IO as the reference count held over IO is not released until
1730 * after the buffer lock is dropped. Hence we need to ensure here that
1731 * all reference counts have been dropped before we start walking the
1732 * LRU list.
1733 */
1734 while (percpu_counter_sum(&btp->bt_readahead_count))
1735 delay(100);
1736 flush_workqueue(btp->bt_mount->m_buf_workqueue);
1737 }
1738
1739 void
xfs_buftarg_drain(
1741 struct xfs_buftarg *btp)
1742 {
1743 LIST_HEAD(dispose);
1744 int loop = 0;
1745 bool write_fail = false;
1746
1747 xfs_buftarg_wait(btp);
1748
1749 /* loop until there is nothing left on the lru list. */
1750 while (list_lru_count(&btp->bt_lru)) {
1751 list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
1752 &dispose, LONG_MAX);
1753
1754 while (!list_empty(&dispose)) {
1755 struct xfs_buf *bp;
1756 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1757 list_del_init(&bp->b_lru);
1758 if (bp->b_flags & XBF_WRITE_FAIL) {
1759 write_fail = true;
1760 xfs_buf_alert_ratelimited(bp,
1761 "XFS: Corruption Alert",
1762 "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
1763 (long long)xfs_buf_daddr(bp));
1764 }
1765 xfs_buf_rele(bp);
1766 }
1767 if (loop++ != 0)
1768 delay(100);
1769 }
1770
1771 /*
1772 * If one or more failed buffers were freed, that means dirty metadata
1773 * was thrown away. This should only ever happen after I/O completion
	 * handling has elevated I/O error(s) to permanent failures and shut
	 * down the journal.
1776 */
1777 if (write_fail) {
1778 ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
1779 xfs_alert(btp->bt_mount,
1780 "Please run xfs_repair to determine the extent of the problem.");
1781 }
1782 }
1783
1784 static enum lru_status
xfs_buftarg_isolate(
1786 struct list_head *item,
1787 struct list_lru_one *lru,
1788 void *arg)
1789 {
1790 struct xfs_buf *bp = container_of(item, struct xfs_buf, b_lru);
1791 struct list_head *dispose = arg;
1792
1793 /*
1794 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
1795 * If we fail to get the lock, just skip it.
1796 */
1797 if (!spin_trylock(&bp->b_lock))
1798 return LRU_SKIP;
1799 /*
1800 * Decrement the b_lru_ref count unless the value is already
1801 * zero. If the value is already zero, we need to reclaim the
1802 * buffer, otherwise it gets another trip through the LRU.
1803 */
1804 if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1805 spin_unlock(&bp->b_lock);
1806 return LRU_ROTATE;
1807 }
1808
1809 bp->b_state |= XFS_BSTATE_DISPOSE;
1810 list_lru_isolate_move(lru, item, dispose);
1811 spin_unlock(&bp->b_lock);
1812 return LRU_REMOVED;
1813 }
1814
1815 static unsigned long
xfs_buftarg_shrink_scan(
1817 struct shrinker *shrink,
1818 struct shrink_control *sc)
1819 {
1820 struct xfs_buftarg *btp = shrink->private_data;
1821 LIST_HEAD(dispose);
1822 unsigned long freed;
1823
1824 freed = list_lru_shrink_walk(&btp->bt_lru, sc,
1825 xfs_buftarg_isolate, &dispose);
1826
1827 while (!list_empty(&dispose)) {
1828 struct xfs_buf *bp;
1829 bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1830 list_del_init(&bp->b_lru);
1831 xfs_buf_rele(bp);
1832 }
1833
1834 return freed;
1835 }
1836
1837 static unsigned long
xfs_buftarg_shrink_count(
1839 struct shrinker *shrink,
1840 struct shrink_control *sc)
1841 {
1842 struct xfs_buftarg *btp = shrink->private_data;
1843 return list_lru_shrink_count(&btp->bt_lru, sc);
1844 }
1845
1846 void
xfs_destroy_buftarg(
1848 struct xfs_buftarg *btp)
1849 {
1850 shrinker_free(btp->bt_shrinker);
1851 ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0);
1852 percpu_counter_destroy(&btp->bt_readahead_count);
1853 list_lru_destroy(&btp->bt_lru);
1854 }
1855
1856 void
xfs_free_buftarg(
1858 struct xfs_buftarg *btp)
1859 {
1860 xfs_destroy_buftarg(btp);
1861 fs_put_dax(btp->bt_daxdev, btp->bt_mount);
1862 /* the main block device is closed by kill_block_super */
1863 if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
1864 bdev_fput(btp->bt_bdev_file);
1865 kfree(btp);
1866 }
1867
1868 int
xfs_setsize_buftarg(
1870 struct xfs_buftarg *btp,
1871 unsigned int sectorsize)
1872 {
1873 /* Set up metadata sector size info */
1874 btp->bt_meta_sectorsize = sectorsize;
1875 btp->bt_meta_sectormask = sectorsize - 1;
1876
1877 if (set_blocksize(btp->bt_bdev_file, sectorsize)) {
1878 xfs_warn(btp->bt_mount,
1879 "Cannot set_blocksize to %u on device %pg",
1880 sectorsize, btp->bt_bdev);
1881 return -EINVAL;
1882 }
1883
1884 return 0;
1885 }
1886
1887 int
xfs_init_buftarg(
1889 struct xfs_buftarg *btp,
1890 size_t logical_sectorsize,
1891 const char *descr)
1892 {
1893 /* Set up device logical sector size mask */
1894 btp->bt_logical_sectorsize = logical_sectorsize;
1895 btp->bt_logical_sectormask = logical_sectorsize - 1;
1896
1897 /*
1898 * Buffer IO error rate limiting. Limit it to no more than 10 messages
1899 * per 30 seconds so as to not spam logs too much on repeated errors.
1900 */
1901 ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
1902 DEFAULT_RATELIMIT_BURST);
1903
1904 if (list_lru_init(&btp->bt_lru))
1905 return -ENOMEM;
1906 if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL))
1907 goto out_destroy_lru;
1908
1909 btp->bt_shrinker =
1910 shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr);
1911 if (!btp->bt_shrinker)
1912 goto out_destroy_io_count;
1913 btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
1914 btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
1915 btp->bt_shrinker->private_data = btp;
1916 shrinker_register(btp->bt_shrinker);
1917 return 0;
1918
1919 out_destroy_io_count:
1920 percpu_counter_destroy(&btp->bt_readahead_count);
1921 out_destroy_lru:
1922 list_lru_destroy(&btp->bt_lru);
1923 return -ENOMEM;
1924 }
1925
1926 struct xfs_buftarg *
xfs_alloc_buftarg(
1928 struct xfs_mount *mp,
1929 struct file *bdev_file)
1930 {
1931 struct xfs_buftarg *btp;
1932 const struct dax_holder_operations *ops = NULL;
1933
1934 #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
1935 ops = &xfs_dax_holder_operations;
1936 #endif
1937 btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL);
1938
1939 btp->bt_mount = mp;
1940 btp->bt_bdev_file = bdev_file;
1941 btp->bt_bdev = file_bdev(bdev_file);
1942 btp->bt_dev = btp->bt_bdev->bd_dev;
1943 btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
1944 mp, ops);
1945
1946 if (bdev_can_atomic_write(btp->bt_bdev)) {
1947 btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes(
1948 btp->bt_bdev);
1949 btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes(
1950 btp->bt_bdev);
1951 }
1952
1953 /*
1954 * When allocating the buftargs we have not yet read the super block and
1955 * thus don't know the file system sector size yet.
1956 */
1957 if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
1958 goto error_free;
1959 if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
1960 mp->m_super->s_id))
1961 goto error_free;
1962
1963 return btp;
1964
1965 error_free:
1966 kfree(btp);
1967 return NULL;
1968 }
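
/*
 * Illustrative sketch (not part of the original code, helper name made up):
 * the buftarg is sized with the device's logical block size above, so once
 * the superblock has been read the mount path switches to the filesystem
 * sector size with a second xfs_setsize_buftarg() call.
 */
static inline int
xfs_hypothetical_resize_data_buftarg(
	struct xfs_mount	*mp)
{
	/* Re-set the metadata sector size now that sb_sectsize is known. */
	return xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize);
}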
1969
1970 static inline void
1971 xfs_buf_list_del(
1972 struct xfs_buf *bp)
1973 {
1974 list_del_init(&bp->b_list);
1975 wake_up_var(&bp->b_list);
1976 }
1977
1978 /*
1979 * Cancel a delayed write list.
1980 *
1981 * Remove each buffer from the list, clear the delwri queue flag and drop the
1982 * associated buffer reference.
1983 */
1984 void
1985 xfs_buf_delwri_cancel(
1986 struct list_head *list)
1987 {
1988 struct xfs_buf *bp;
1989
1990 while (!list_empty(list)) {
1991 bp = list_first_entry(list, struct xfs_buf, b_list);
1992
1993 xfs_buf_lock(bp);
1994 bp->b_flags &= ~_XBF_DELWRI_Q;
1995 xfs_buf_list_del(bp);
1996 xfs_buf_relse(bp);
1997 }
1998 }
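
/*
 * Illustrative sketch (not part of the original code, helper name made up):
 * an error path that has already queued buffers onto a private delwri list.
 * If the caller bails out before submitting, it must cancel the list so the
 * delwri references taken at queue time are not leaked.
 */
static inline int
xfs_hypothetical_queue_or_bail(
	struct xfs_buf		*bp,
	int			error)
{
	LIST_HEAD(buffer_list);

	xfs_buf_lock(bp);
	xfs_buf_delwri_queue(bp, &buffer_list);
	xfs_buf_unlock(bp);

	if (error) {
		/* Clears _XBF_DELWRI_Q and drops the queue's reference. */
		xfs_buf_delwri_cancel(&buffer_list);
		return error;
	}
	return xfs_buf_delwri_submit(&buffer_list);
}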
1999
2000 /*
2001 * Add a buffer to the delayed write list.
2002 *
2003 * This queues a buffer for writeout if it hasn't been queued yet. Note that
2004 * neither this routine nor the buffer list submission functions perform
2005 * any internal synchronization. It is expected that the lists are thread-local
2006 * to the callers.
2007 *
2008 * Returns true if we queued up the buffer, or false if it was already
2009 * on the buffer list.
2010 */
2011 bool
2012 xfs_buf_delwri_queue(
2013 struct xfs_buf *bp,
2014 struct list_head *list)
2015 {
2016 ASSERT(xfs_buf_islocked(bp));
2017 ASSERT(!(bp->b_flags & XBF_READ));
2018
2019 /*
2020 * If the buffer is already marked delwri it has already been queued up
2021 * by someone else for immediate writeout.  Just ignore it in that
2022 * case.
2023 */
2024 if (bp->b_flags & _XBF_DELWRI_Q) {
2025 trace_xfs_buf_delwri_queued(bp, _RET_IP_);
2026 return false;
2027 }
2028
2029 trace_xfs_buf_delwri_queue(bp, _RET_IP_);
2030
2031 /*
2032 * If a buffer gets written out synchronously or marked stale while it
2033 * is on a delwri list we lazily remove it. To do this, the other party
2034 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
2035 * It remains referenced and on the list. In a rare corner case it
2036 * might get re-added to a delwri list after the synchronous writeout, in
2037 * which case we just need to re-add the flag here.
2038 */
2039 bp->b_flags |= _XBF_DELWRI_Q;
2040 if (list_empty(&bp->b_list)) {
2041 xfs_buf_hold(bp);
2042 list_add_tail(&bp->b_list, list);
2043 }
2044
2045 return true;
2046 }
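
/*
 * Illustrative sketch (not part of the original code, helper name made up):
 * queueing a single buffer onto a caller-private (e.g. on-stack) delwri
 * list.  The buffer must be locked across the call; a false return only
 * means it was already queued by an earlier pass.
 */
static inline void
xfs_hypothetical_queue_one(
	struct xfs_buf		*bp,
	struct list_head	*buffer_list)
{
	xfs_buf_lock(bp);
	/* Takes a hold on the buffer and sets _XBF_DELWRI_Q if not queued. */
	xfs_buf_delwri_queue(bp, buffer_list);
	xfs_buf_unlock(bp);
}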
2047
2048 /*
2049 * Queue a buffer to this delwri list as part of a data integrity operation.
2050 * If the buffer is on any other delwri list, we'll wait for that to clear
2051 * so that the caller can submit the buffer for IO and wait for the result.
2052 * Callers must ensure the buffer is not already on the list.
2053 */
2054 void
2055 xfs_buf_delwri_queue_here(
2056 struct xfs_buf *bp,
2057 struct list_head *buffer_list)
2058 {
2059 /*
2060 * We need this buffer to end up on the /caller's/ delwri list, not any
2061 * old list. This can happen if the buffer is marked stale (which
2062 * clears DELWRI_Q) after the AIL queues the buffer to its list but
2063 * before the AIL has a chance to submit the list.
2064 */
2065 while (!list_empty(&bp->b_list)) {
2066 xfs_buf_unlock(bp);
2067 wait_var_event(&bp->b_list, list_empty(&bp->b_list));
2068 xfs_buf_lock(bp);
2069 }
2070
2071 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
2072
2073 xfs_buf_delwri_queue(bp, buffer_list);
2074 }
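
/*
 * Illustrative sketch (not part of the original code, helper name made up):
 * a data integrity writer (such as online scrub) that needs the buffer on
 * its own private list before doing a synchronous flush, even if another
 * thread still has the buffer queued elsewhere.
 */
static inline void
xfs_hypothetical_queue_for_integrity(
	struct xfs_buf		*bp,
	struct list_head	*buffer_list)
{
	ASSERT(xfs_buf_islocked(bp));

	/* May drop and retake the buffer lock while other lists drain. */
	xfs_buf_delwri_queue_here(bp, buffer_list);
}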
2075
2076 /*
2077 * The compare function is more complex than it needs to be because the
2078 * return value is only 32 bits and we are doing comparisons on 64 bit
2079 * values.
2080 */
2081 static int
2082 xfs_buf_cmp(
2083 void *priv,
2084 const struct list_head *a,
2085 const struct list_head *b)
2086 {
2087 struct xfs_buf *ap = container_of(a, struct xfs_buf, b_list);
2088 struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list);
2089 xfs_daddr_t diff;
2090
2091 diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
2092 if (diff < 0)
2093 return -1;
2094 if (diff > 0)
2095 return 1;
2096 return 0;
2097 }
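
/*
 * Worked example (illustrative, not part of the original code): if
 * ap->b_maps[0].bm_bn == 0x100000001 and bp->b_maps[0].bm_bn == 0x1, the
 * 64-bit difference is 0x100000000.  Returning that value directly through
 * the 32-bit int return type would truncate it to 0 and the two buffers
 * would incorrectly sort as equal, which is why the difference is clamped
 * to -1/0/1 above.
 */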
2098
2099 static bool
2100 xfs_buf_delwri_submit_prep(
2101 struct xfs_buf *bp)
2102 {
2103 /*
2104 * Someone else might have written the buffer synchronously or marked it
2105 * stale in the meantime. In that case only the _XBF_DELWRI_Q flag got
2106 * cleared, and we have to drop the reference and remove it from the
2107 * list here.
2108 */
2109 if (!(bp->b_flags & _XBF_DELWRI_Q)) {
2110 xfs_buf_list_del(bp);
2111 xfs_buf_relse(bp);
2112 return false;
2113 }
2114
2115 trace_xfs_buf_delwri_split(bp, _RET_IP_);
2116 bp->b_flags &= ~_XBF_DELWRI_Q;
2117 bp->b_flags |= XBF_WRITE;
2118 return true;
2119 }
2120
2121 /*
2122 * Write out a buffer list asynchronously.
2123 *
2124 * This will take the @buffer_list, write all non-locked and non-pinned buffers
2125 * out and not wait for I/O completion on any of the buffers. This interface
2126 * is only safely usable for callers that can track I/O completion by higher
2127 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
2128 * function.
2129 *
2130 * Note: this function will skip buffers it would block on, and in doing so
2131 * leaves them on @buffer_list so they can be retried on a later pass. As such,
2132 * it is up to the caller to ensure that the buffer list is fully submitted or
2133 * cancelled appropriately when they are finished with the list. Failure to
2134 * cancel or resubmit the list until it is empty will result in leaked buffers
2135 * at unmount time.
2136 */
2137 int
2138 xfs_buf_delwri_submit_nowait(
2139 struct list_head *buffer_list)
2140 {
2141 struct xfs_buf *bp, *n;
2142 int pinned = 0;
2143 struct blk_plug plug;
2144
2145 list_sort(NULL, buffer_list, xfs_buf_cmp);
2146
2147 blk_start_plug(&plug);
2148 list_for_each_entry_safe(bp, n, buffer_list, b_list) {
2149 if (!xfs_buf_trylock(bp))
2150 continue;
2151 if (xfs_buf_ispinned(bp)) {
2152 xfs_buf_unlock(bp);
2153 pinned++;
2154 continue;
2155 }
2156 if (!xfs_buf_delwri_submit_prep(bp))
2157 continue;
2158 bp->b_flags |= XBF_ASYNC;
2159 xfs_buf_list_del(bp);
2160 xfs_buf_submit(bp);
2161 }
2162 blk_finish_plug(&plug);
2163
2164 return pinned;
2165 }
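
/*
 * Illustrative sketch (not part of the original code, helper name made up):
 * a background writeback loop using the nowait interface.  Skipped and
 * pinned buffers remain on @buffer_list, so the caller must either keep
 * resubmitting the list or cancel it before giving up, otherwise the
 * queued buffers are leaked at unmount.
 */
static inline void
xfs_hypothetical_push_until_empty(
	struct list_head	*buffer_list,
	bool			stopping)
{
	while (!list_empty(buffer_list)) {
		if (stopping) {
			/* Drop the remaining buffers instead of leaking them. */
			xfs_buf_delwri_cancel(buffer_list);
			break;
		}
		/* Returns the number of pinned buffers left on the list. */
		xfs_buf_delwri_submit_nowait(buffer_list);
		/* A real caller would back off here, as the AIL push does. */
	}
}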
2166
2167 /*
2168 * Write out a buffer list synchronously.
2169 *
2170 * This will take the @buffer_list, write all buffers out and wait for I/O
2171 * completion on all of the buffers. @buffer_list is consumed by the function,
2172 * so callers must have some other way of tracking buffers if they require such
2173 * functionality.
2174 */
2175 int
2176 xfs_buf_delwri_submit(
2177 struct list_head *buffer_list)
2178 {
2179 LIST_HEAD(wait_list);
2180 int error = 0, error2;
2181 struct xfs_buf *bp, *n;
2182 struct blk_plug plug;
2183
2184 list_sort(NULL, buffer_list, xfs_buf_cmp);
2185
2186 blk_start_plug(&plug);
2187 list_for_each_entry_safe(bp, n, buffer_list, b_list) {
2188 xfs_buf_lock(bp);
2189 if (!xfs_buf_delwri_submit_prep(bp))
2190 continue;
2191 bp->b_flags &= ~XBF_ASYNC;
2192 list_move_tail(&bp->b_list, &wait_list);
2193 xfs_buf_submit(bp);
2194 }
2195 blk_finish_plug(&plug);
2196
2197 /* Wait for IO to complete. */
2198 while (!list_empty(&wait_list)) {
2199 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
2200
2201 xfs_buf_list_del(bp);
2202
2203 /*
2204 * Wait on the locked buffer, check for errors and unlock and
2205 * release the delwri queue reference.
2206 */
2207 error2 = xfs_buf_iowait(bp);
2208 xfs_buf_relse(bp);
2209 if (!error)
2210 error = error2;
2211 }
2212
2213 return error;
2214 }
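
/*
 * Illustrative sketch (not part of the original code, helper name made up):
 * a synchronous flush of two buffers.  Each buffer is queued while locked,
 * then a single xfs_buf_delwri_submit() call writes them out in sorted
 * order and returns the first I/O error encountered.
 */
static inline int
xfs_hypothetical_flush_pair(
	struct xfs_buf		*bp1,
	struct xfs_buf		*bp2)
{
	LIST_HEAD(buffer_list);

	xfs_buf_lock(bp1);
	xfs_buf_delwri_queue(bp1, &buffer_list);
	xfs_buf_unlock(bp1);

	xfs_buf_lock(bp2);
	xfs_buf_delwri_queue(bp2, &buffer_list);
	xfs_buf_unlock(bp2);

	/* Consumes the list and waits for both I/Os to complete. */
	return xfs_buf_delwri_submit(&buffer_list);
}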
2215
2216 /*
2217 * Push a single buffer on a delwri queue.
2218 *
2219 * The purpose of this function is to submit a single buffer of a delwri queue
2220 * and return with the buffer still on the original queue.
2221 *
2222 * The buffer locking and queue management logic between _delwri_pushbuf() and
2223 * _delwri_queue() guarantee that the buffer cannot be queued to another list
2224 * before returning.
2225 */
2226 int
2227 xfs_buf_delwri_pushbuf(
2228 struct xfs_buf *bp,
2229 struct list_head *buffer_list)
2230 {
2231 int error;
2232
2233 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
2234
2235 trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
2236
2237 xfs_buf_lock(bp);
2238 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
2239 bp->b_flags |= XBF_WRITE;
2240 xfs_buf_submit(bp);
2241
2242 /*
2243 * The buffer is now locked, under I/O but still on the original delwri
2244 * queue. Wait for I/O completion, restore the DELWRI_Q flag and
2245 * return with the buffer unlocked and still on the original queue.
2246 */
2247 error = xfs_buf_iowait(bp);
2248 bp->b_flags |= _XBF_DELWRI_Q;
2249 xfs_buf_unlock(bp);
2250
2251 return error;
2252 }
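
/*
 * Illustrative sketch (not part of the original code, helper name made up):
 * a caller that needs one specific buffer from its delwri queue written
 * immediately, without disturbing the rest of the queue.
 */
static inline int
xfs_hypothetical_push_one(
	struct xfs_buf		*bp,
	struct list_head	*buffer_list)
{
	/* @bp must already be on @buffer_list with _XBF_DELWRI_Q set. */
	return xfs_buf_delwri_pushbuf(bp, buffer_list);
}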
2253
2254 void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
2255 {
2256 /*
2257 * Set the lru reference count to 0 based on the error injection tag.
2258 * This allows userspace to disrupt buffer caching for debug/testing
2259 * purposes.
2260 */
2261 if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
2262 lru_ref = 0;
2263
2264 atomic_set(&bp->b_lru_ref, lru_ref);
2265 }
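
/*
 * Illustrative sketch (not part of the original code): a caller raising the
 * LRU reference count of a buffer it expects to reuse frequently, so the
 * shrinker must see it several times before reclaiming it.  The helper name
 * and the value 4 are made up; XFS normally uses per-buffer-type constants.
 */
static inline void
xfs_hypothetical_mark_hot(
	struct xfs_buf		*bp)
{
	/* Error injection inside xfs_buf_set_ref() may force this to 0. */
	xfs_buf_set_ref(bp, 4);
}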
2266
2267 /*
2268 * Verify an on-disk magic value against the magic value specified in the
2269 * verifier structure. The verifier magic is in disk byte order so the caller is
2270 * expected to pass the value directly from disk.
2271 */
2272 bool
2273 xfs_verify_magic(
2274 struct xfs_buf *bp,
2275 __be32 dmagic)
2276 {
2277 struct xfs_mount *mp = bp->b_mount;
2278 int idx;
2279
2280 idx = xfs_has_crc(mp);
2281 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
2282 return false;
2283 return dmagic == bp->b_ops->magic[idx];
2284 }
2285 /*
2286 * Verify an on-disk magic value against the magic value specified in the
2287 * verifier structure. The verifier magic is in disk byte order so the caller is
2288 * expected to pass the value directly from disk.
2289 */
2290 bool
2291 xfs_verify_magic16(
2292 struct xfs_buf *bp,
2293 __be16 dmagic)
2294 {
2295 struct xfs_mount *mp = bp->b_mount;
2296 int idx;
2297
2298 idx = xfs_has_crc(mp);
2299 if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
2300 return false;
2301 return dmagic == bp->b_ops->magic16[idx];
2302 }
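
/*
 * Illustrative sketch (not part of the original code): how a read verifier
 * might use xfs_verify_magic().  The on-disk structure, the magic numbers
 * and the ops name are made up for the example; xfs_verify_magic(),
 * xfs_verifier_error() and the xfs_buf_ops fields are existing interfaces.
 */
struct xfs_hypothetical_block {
	__be32			hb_magic;	/* on-disk magic, big endian */
};

static void
xfs_hypothetical_read_verify(
	struct xfs_buf		*bp)
{
	struct xfs_hypothetical_block	*hb = bp->b_addr;

	/* Pass the raw on-disk value; magic[] is stored in disk byte order. */
	if (!xfs_verify_magic(bp, hb->hb_magic))
		xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
}

static const struct xfs_buf_ops xfs_hypothetical_buf_ops = {
	.name		= "xfs_hypothetical",
	.magic		= { cpu_to_be32(0x58465000), cpu_to_be32(0x58465001) },
	.verify_read	= xfs_hypothetical_read_verify,
};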
2303