// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_bmap.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_remote.h"
#include "xfs_defer.h"
#include "xfs_metafile.h"
#include "xfs_rtgroup.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_extfree_item.h"
#include "xfs_rmap_item.h"
#include "xfs_refcount_item.h"
#include "xfs_buf_item.h"
#include "xfs_bmap_item.h"
#include "xfs_bmap_btree.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/fsb_bitmap.h"
#include "scrub/rtb_bitmap.h"
#include "scrub/reap.h"

/*
 * Disposal of Blocks from Old Metadata
 *
 * Now that we've constructed a new btree to replace the damaged one, we want
 * to dispose of the blocks that (we think) the old btree was using.
 * Previously, we used the rmapbt to collect the extents (bitmap) with the
 * rmap owner corresponding to the tree we rebuilt, collected extents for any
 * blocks with the same rmap owner that are owned by another data structure
 * (sublist), and subtracted sublist from bitmap. In theory the extents
 * remaining in bitmap are the old btree's blocks.
 *
 * Unfortunately, it's possible that the btree was crosslinked with other
 * blocks on disk. The rmap data can tell us if there are multiple owners, so
 * if the rmapbt says there is an owner of this block other than @oinfo, then
 * the block is crosslinked. Remove the reverse mapping and continue.
 *
 * If there is one rmap record, we can free the block, which removes the
 * reverse mapping but doesn't add the block to the free space. Our repair
 * strategy is to hope the other metadata objects crosslinked on this block
 * will be rebuilt (atop different blocks), thereby removing all the cross
 * links.
 *
 * If there are no rmap records at all, we also free the block. If the btree
 * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
 * supposed to be a rmap record and everything is ok. For other btrees there
 * had to have been an rmap entry for the block to have ended up on @bitmap,
 * so if it's gone now there's something wrong and the fs will shut down.
 *
 * Note: If there are multiple rmap records with only the same rmap owner as
 * the btree we're trying to rebuild and the block is indeed owned by another
 * data structure with the same rmap owner, then the block will be in sublist
 * and therefore doesn't need disposal. If there are multiple rmap records
 * with only the same rmap owner but the block is not owned by something with
 * the same rmap owner, the block will be freed.
 *
 * The caller is responsible for locking the AG headers/inode for the entire
 * rebuild operation so that nothing else can sneak in and change the incore
 * state while we're not looking. We must also invalidate any buffers
 * associated with @bitmap.
 */
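
/*
 * A worked example of the decision above, with hypothetical block numbers:
 * suppose the old inobt occupied agbnos 10-12, so bitmap = [10, 12] after
 * subtracting sublist. If the rmapbt holds OWN_INOBT records for 10-12 plus
 * an OWN_REFC record for 12, then block 12 is crosslinked and we only remove
 * its OWN_INOBT rmap record; blocks 10-11 have a single owner and are freed
 * outright.
 */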

/* Information about reaping extents after a repair. */
struct xreap_state {
	struct xfs_scrub		*sc;

	union {
		struct {
			/*
			 * For AG blocks, this is reverse mapping owner and
			 * metadata reservation type.
			 */
			const struct xfs_owner_info	*oinfo;
			enum xfs_ag_resv_type		resv;
		};
		struct {
			/* For file blocks, this is the inode and fork. */
			struct xfs_inode	*ip;
			int			whichfork;
		};
	};

	/* Number of invalidated buffers logged to the current transaction. */
	unsigned int			nr_binval;

	/* Maximum number of buffers we can invalidate in a single tx. */
	unsigned int			max_binval;

	/* Number of deferred reaps attached to the current transaction. */
	unsigned int			nr_deferred;

	/* Maximum number of intents we can reap in a single transaction. */
	unsigned int			max_deferred;
};

/* Put a block back on the AGFL. */
STATIC int
xreap_put_freelist(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno)
{
	struct xfs_buf		*agfl_bp;
	int			error;

	/* Make sure there's space on the freelist. */
	error = xrep_fix_freelist(sc, 0);
	if (error)
		return error;

	/*
	 * Since we're "freeing" a lost block onto the AGFL, we have to
	 * create an rmap for the block prior to merging it or else other
	 * parts will break.
	 */
	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
			&XFS_RMAP_OINFO_AG);
	if (error)
		return error;

	/* Put the block on the AGFL. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		return error;

	error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
			agfl_bp, agbno, 0);
	if (error)
		return error;
	xfs_extent_busy_insert(sc->tp, pag_group(sc->sa.pag), agbno, 1,
			XFS_EXTENT_BUSY_SKIP_DISCARD);

	return 0;
}

/* Are there any uncommitted reap operations? */
static inline bool xreap_is_dirty(const struct xreap_state *rs)
{
	return rs->nr_binval > 0 || rs->nr_deferred > 0;
}

/*
 * Decide if we need to roll the transaction to clear out the log
 * reservation that we allocated to buffer invalidations.
 */
static inline bool xreap_want_binval_roll(const struct xreap_state *rs)
{
	return rs->nr_binval >= rs->max_binval;
}

/* Reset the buffer invalidation count after rolling. */
static inline void xreap_binval_reset(struct xreap_state *rs)
{
	rs->nr_binval = 0;
}

/*
 * Bump the number of invalidated buffers, and return true if we can continue,
 * or false if we need to roll the transaction.
 */
static inline bool xreap_inc_binval(struct xreap_state *rs)
{
	rs->nr_binval++;
	return rs->nr_binval < rs->max_binval;
}

/*
 * Decide if we want to finish the deferred ops that are attached to the scrub
 * transaction. We don't want to queue huge chains of deferred ops because
 * that can consume a lot of log space and kernel memory. Hence we trigger an
 * xfs_defer_finish if there are too many deferred reap operations or we've
 * run out of space for invalidations.
 */
static inline bool xreap_want_defer_finish(const struct xreap_state *rs)
{
	return rs->nr_deferred >= rs->max_deferred;
}

/*
 * Reset the defer chain length and buffer invalidation count after finishing
 * items.
 */
static inline void xreap_defer_finish_reset(struct xreap_state *rs)
{
	rs->nr_deferred = 0;
	rs->nr_binval = 0;
}

/*
 * Bump the number of deferred extent reaps.
 */
static inline void xreap_inc_defer(struct xreap_state *rs)
{
	rs->nr_deferred++;
}

/* Force the caller to finish a deferred item chain. */
static inline void xreap_force_defer_finish(struct xreap_state *rs)
{
	rs->nr_deferred = rs->max_deferred;
}

/* Maximum number of fsblocks that we might find in a buffer to invalidate. */
static inline unsigned int
xrep_binval_max_fsblocks(
	struct xfs_mount	*mp)
{
	/* Remote xattr values are the largest buffers that we support. */
	return xfs_attr3_max_rmt_blocks(mp);
}

/*
 * Compute the maximum length of a buffer cache scan (in units of sectors),
 * given a quantity of fs blocks.
 */
xfs_daddr_t
xrep_bufscan_max_sectors(
	struct xfs_mount	*mp,
	xfs_extlen_t		fsblocks)
{
	return XFS_FSB_TO_BB(mp, min_t(xfs_extlen_t, fsblocks,
			xrep_binval_max_fsblocks(mp)));
}

/*
 * Return an incore buffer from a sector scan, or NULL if there are no buffers
 * left to return.
 */
struct xfs_buf *
xrep_bufscan_advance(
	struct xfs_mount	*mp,
	struct xrep_bufscan	*scan)
{
	scan->__sector_count += scan->daddr_step;
	while (scan->__sector_count <= scan->max_sectors) {
		struct xfs_buf	*bp = NULL;
		int		error;

		error = xfs_buf_incore(mp->m_ddev_targp, scan->daddr,
				scan->__sector_count, XBF_LIVESCAN, &bp);
		if (!error)
			return bp;

		scan->__sector_count += scan->daddr_step;
	}

	return NULL;
}
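
/*
 * A minimal usage sketch of the scan helpers above, mirroring the loops
 * later in this file. Here @pag, @bno, @nr_fsblocks, and @tp stand in for
 * the caller's own state:
 *
 *	struct xrep_bufscan	scan = {
 *		.daddr		= xfs_agbno_to_daddr(pag, bno),
 *		.max_sectors	= xrep_bufscan_max_sectors(mp, nr_fsblocks),
 *		.daddr_step	= XFS_FSB_TO_BB(mp, 1),
 *	};
 *	struct xfs_buf		*bp;
 *
 *	while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
 *		xfs_trans_bjoin(tp, bp);
 *		xfs_trans_binval(tp, bp);
 *	}
 *
 * The scan tries every plausible buffer length at @daddr, which is how we
 * catch aliased buffers that the cache itself cannot detect.
 */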

/* Try to invalidate the incore buffers for an extent that we're freeing. */
STATIC void
xreap_agextent_binval(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		*aglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_mount	*mp = sc->mp;
	xfs_agblock_t		agbno_next = agbno + *aglenp;
	xfs_agblock_t		bno = agbno;

	/*
	 * Avoid invalidating AG headers and post-EOFS blocks because we never
	 * own those.
	 */
	if (!xfs_verify_agbno(pag, agbno) ||
	    !xfs_verify_agbno(pag, agbno_next - 1))
		return;

	/*
	 * If there are incore buffers for these blocks, invalidate them. We
	 * assume that the lack of any other known owners means that the buffer
	 * can be locked without risk of deadlocking. The buffer cache cannot
	 * detect aliasing, so employ nested loops to scan for incore buffers
	 * of any plausible size.
	 */
	while (bno < agbno_next) {
		struct xrep_bufscan	scan = {
			.daddr		= xfs_agbno_to_daddr(pag, bno),
			.max_sectors	= xrep_bufscan_max_sectors(mp,
							agbno_next - bno),
			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
		};
		struct xfs_buf	*bp;

		while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
			xfs_trans_bjoin(sc->tp, bp);
			xfs_trans_binval(sc->tp, bp);

			/*
			 * Stop invalidating if we've hit the limit; we should
			 * still have enough reservation left to free however
			 * far we've gotten.
			 */
			if (!xreap_inc_binval(rs)) {
				*aglenp -= agbno_next - bno;
				goto out;
			}
		}

		bno++;
	}

out:
	trace_xreap_agextent_binval(pag_group(sc->sa.pag), agbno, *aglenp);
}

/*
 * Figure out the longest run of blocks that we can dispose of with a single
 * call. Cross-linked blocks should have their reverse mappings removed, but
 * single-owner extents can be freed. AGFL blocks can only be put back one at
 * a time.
 */
STATIC int
xreap_agextent_select(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_agblock_t		agbno_next,
	bool			*crosslinked,
	xfs_extlen_t		*aglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_btree_cur	*cur;
	xfs_agblock_t		bno = agbno + 1;
	xfs_extlen_t		len = 1;
	int			error;

	/*
	 * Determine if there are any other rmap records covering the first
	 * block of this extent. If so, the block is crosslinked.
	 */
	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xfs_rmap_has_other_keys(cur, agbno, 1, rs->oinfo,
			crosslinked);
	if (error)
		goto out_cur;

	/* AGFL blocks can only be dealt with one at a time. */
	if (rs->resv == XFS_AG_RESV_AGFL)
		goto out_found;

	/*
	 * Figure out how many of the subsequent blocks have the same crosslink
	 * status.
	 */
	while (bno < agbno_next) {
		bool		also_crosslinked;

		error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		if (*crosslinked != also_crosslinked)
			break;

		len++;
		bno++;
	}

out_found:
	*aglenp = len;
	trace_xreap_agextent_select(pag_group(sc->sa.pag), agbno, len,
			*crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}
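
/*
 * For example (hypothetical numbers): if agbnos 10-12 are owned solely by
 * @oinfo but agbno 13 is also claimed by another owner, the first call
 * returns *aglenp == 3 with *crosslinked == false, and the next call
 * starting at agbno 13 returns *crosslinked == true. Each sub-extent then
 * gets the appropriate disposal treatment below.
 */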

/*
 * Dispose of as much of the beginning of this AG extent as possible. The
 * number of blocks disposed of will be returned in @aglenp.
 */
STATIC int
xreap_agextent_iter(
	struct xreap_state	*rs,
	xfs_agblock_t		agbno,
	xfs_extlen_t		*aglenp,
	bool			crosslinked)
{
	struct xfs_scrub	*sc = rs->sc;
	xfs_fsblock_t		fsbno;
	int			error = 0;

	ASSERT(rs->resv != XFS_AG_RESV_METAFILE);

	fsbno = xfs_agbno_to_fsb(sc->sa.pag, agbno);

	/*
	 * If there are other rmappings, this block is cross linked and must
	 * not be freed. Remove the reverse mapping and move on. Otherwise,
	 * we were the only owner of the block, so free the extent, which will
	 * also remove the rmap.
	 *
	 * XXX: XFS doesn't support detecting the case where a single block
	 * metadata structure is crosslinked with a multi-block structure
	 * because the buffer cache doesn't detect aliasing problems, so we
	 * can't fix 100% of crosslinking problems (yet). The verifiers will
	 * blow on writeout, the filesystem will shut down, and the admin gets
	 * to run xfs_repair.
	 */
	if (crosslinked) {
		trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag), agbno,
				*aglenp);

		if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
			/*
			 * t0: Unmapping CoW staging extents, remove the
			 * records from the refcountbt, which will remove the
			 * rmap record as well.
			 */
			xfs_refcount_free_cow_extent(sc->tp, false, fsbno,
					*aglenp);
			xreap_inc_defer(rs);
			return 0;
		}

		/* t1: unmap crosslinked metadata blocks */
		xfs_rmap_free_extent(sc->tp, false, fsbno, *aglenp,
				rs->oinfo->oi_owner);
		xreap_inc_defer(rs);
		return 0;
	}

	trace_xreap_dispose_free_extent(pag_group(sc->sa.pag), agbno, *aglenp);

	/*
	 * Invalidate as many buffers as we can, starting at agbno. If this
	 * function sets *aglenp to zero, the transaction is full of logged
	 * buffer invalidations, so we need to return early so that we can
	 * roll and retry.
	 */
	xreap_agextent_binval(rs, agbno, aglenp);
	if (*aglenp == 0) {
		ASSERT(xreap_want_binval_roll(rs));
		return 0;
	}

	/*
	 * t2: To get rid of CoW staging extents, use deferred work items
	 * to remove the refcountbt records (which removes the rmap records)
	 * and free the extent. We're not worried about the system going down
	 * here because log recovery walks the refcount btree to clean out the
	 * CoW staging extents.
	 */
	if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
		ASSERT(rs->resv == XFS_AG_RESV_NONE);

		xfs_refcount_free_cow_extent(sc->tp, false, fsbno, *aglenp);
		error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
				rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
		if (error)
			return error;

		xreap_inc_defer(rs);
		return 0;
	}

	/* t3: Put blocks back on the AGFL one at a time. */
	if (rs->resv == XFS_AG_RESV_AGFL) {
		ASSERT(*aglenp == 1);
		error = xreap_put_freelist(sc, agbno);
		if (error)
			return error;

		xreap_force_defer_finish(rs);
		return 0;
	}

	/*
	 * t4: Use deferred frees to get rid of the old btree blocks to try to
	 * minimize the window in which we could crash and lose the old blocks.
	 * Add a defer ops barrier every other extent to avoid stressing the
	 * system with large EFIs.
	 */
	error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
			rs->resv, XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	xreap_inc_defer(rs);
	if (rs->nr_deferred % 2 == 0)
		xfs_defer_add_barrier(sc->tp);
	return 0;
}

/* Configure the deferral and invalidation limits */
static inline void
xreap_configure_limits(
	struct xreap_state	*rs,
	unsigned int		fixed_overhead,
	unsigned int		variable_overhead,
	unsigned int		per_intent,
	unsigned int		per_binval)
{
	struct xfs_scrub	*sc = rs->sc;
	unsigned int		res = sc->tp->t_log_res - fixed_overhead;

	/* Don't underflow the reservation */
	if (sc->tp->t_log_res < (fixed_overhead + variable_overhead)) {
		ASSERT(sc->tp->t_log_res >=
				(fixed_overhead + variable_overhead));
		xfs_force_shutdown(sc->mp, SHUTDOWN_CORRUPT_INCORE);
		return;
	}

	rs->max_deferred = per_intent ? res / variable_overhead : 0;
	res -= rs->max_deferred * per_intent;
	rs->max_binval = per_binval ? res / per_binval : 0;
}
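
/*
 * A worked example of the budget split above, with made-up numbers: if
 * t_log_res is 100,000 bytes, fixed_overhead is 20,000, variable_overhead
 * is 800, per_intent is 500, and per_binval is 300, then res starts at
 * 80,000, max_deferred = 80,000 / 800 = 100, the deferred intents consume
 * 100 * 500 = 50,000 bytes, and the remaining 30,000 bytes allow
 * max_binval = 100 buffer invalidations.
 */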

/*
 * Compute the maximum number of intent items that reaping can attach to the
 * scrub transaction given the worst case log overhead of the intent items
 * needed to reap a single per-AG space extent. This is not for freeing CoW
 * staging extents.
 */
STATIC void
xreap_configure_agextent_limits(
	struct xreap_state	*rs)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_mount	*mp = sc->mp;

	/*
	 * In the worst case, relogging an intent item causes both an intent
	 * item and a done item to be attached to a transaction for each extent
	 * that we'd like to process.
	 */
	const unsigned int	efi = xfs_efi_log_space(1) +
				      xfs_efd_log_space(1);
	const unsigned int	rui = xfs_rui_log_space(1) +
				      xfs_rud_log_space();

	/*
	 * Various things can happen when reaping non-CoW metadata blocks:
	 *
	 * t1: Unmapping crosslinked metadata blocks: deferred removal of rmap
	 * record.
	 *
	 * t3: Freeing to AGFL: roll and finish deferred items for every block.
	 * Limits here do not matter.
	 *
	 * t4: Freeing metadata blocks: deferred freeing of the space, which
	 * also removes the rmap record.
	 *
	 * For simplicity, we'll use the worst-case intents size to determine
	 * the maximum number of deferred extents before we have to finish the
	 * whole chain. If we're trying to reap a btree larger than this size,
	 * a crash midway through reaping can result in leaked blocks.
	 */
	const unsigned int	t1 = rui;
	const unsigned int	t4 = rui + efi;
	const unsigned int	per_intent = max(t1, t4);

	/*
	 * For each transaction in a reap chain, we must be able to take one
	 * step in the defer item chain, which should only consist of EFI or
	 * RUI items.
	 */
	const unsigned int	f1 = xfs_calc_finish_efi_reservation(mp, 1);
	const unsigned int	f2 = xfs_calc_finish_rui_reservation(mp, 1);
	const unsigned int	step_size = max(f1, f2);

	/* Largest buffer size (in fsblocks) that can be invalidated. */
	const unsigned int	max_binval = xrep_binval_max_fsblocks(mp);

	/* Maximum overhead of invalidating one buffer. */
	const unsigned int	per_binval =
		xfs_buf_inval_log_space(1, XFS_FSB_TO_B(mp, max_binval));

	/*
	 * For each transaction in a reap chain, we can delete some number of
	 * extents and invalidate some number of blocks. We assume that btree
	 * blocks aren't usually contiguous; and that scrub likely pulled all
	 * the buffers into memory. From these assumptions, set the maximum
	 * number of deferrals we can queue before flushing the defer chain,
	 * and the number of invalidations we can queue before rolling to a
	 * clean transaction (and possibly relogging some of the deferrals) to
	 * the same quantity.
	 */
	const unsigned int	variable_overhead = per_intent + per_binval;

	xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
			per_binval);

	trace_xreap_agextent_limits(sc->tp, per_binval, rs->max_binval,
			step_size, per_intent, rs->max_deferred);
}

/*
 * Compute the maximum number of intent items that reaping can attach to the
 * scrub transaction given the worst case log overhead of the intent items
 * needed to reap a single CoW staging extent. This is not for freeing
 * metadata blocks.
 */
STATIC void
xreap_configure_agcow_limits(
	struct xreap_state	*rs)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_mount	*mp = sc->mp;

	/*
	 * In the worst case, relogging an intent item causes both an intent
	 * item and a done item to be attached to a transaction for each extent
	 * that we'd like to process.
	 */
	const unsigned int	efi = xfs_efi_log_space(1) +
				      xfs_efd_log_space(1);
	const unsigned int	rui = xfs_rui_log_space(1) +
				      xfs_rud_log_space();
	const unsigned int	cui = xfs_cui_log_space(1) +
				      xfs_cud_log_space();

	/*
	 * Various things can happen when reaping CoW staging extents:
	 *
	 * t0: Unmapping crosslinked CoW blocks: deferred removal of refcount
	 * record, which defers removal of rmap record
	 *
	 * t2: Freeing CoW blocks: deferred removal of refcount record, which
	 * defers removal of rmap record; and deferred removal of the space
	 *
	 * For simplicity, we'll use the worst-case intents size to determine
	 * the maximum number of deferred extents before we have to finish the
	 * whole chain. If we're trying to reap a btree larger than this size,
	 * a crash midway through reaping can result in leaked blocks.
	 */
	const unsigned int	t0 = cui + rui;
	const unsigned int	t2 = cui + rui + efi;
	const unsigned int	per_intent = max(t0, t2);

	/*
	 * For each transaction in a reap chain, we must be able to take one
	 * step in the defer item chain, which should only consist of CUI, EFI,
	 * or RUI items.
	 */
	const unsigned int	f1 = xfs_calc_finish_efi_reservation(mp, 1);
	const unsigned int	f2 = xfs_calc_finish_rui_reservation(mp, 1);
	const unsigned int	f3 = xfs_calc_finish_cui_reservation(mp, 1);
	const unsigned int	step_size = max3(f1, f2, f3);

	/* Largest buffer size (in fsblocks) that can be invalidated. */
	const unsigned int	max_binval = xrep_binval_max_fsblocks(mp);

	/* Overhead of invalidating one buffer */
	const unsigned int	per_binval =
		xfs_buf_inval_log_space(1, XFS_FSB_TO_B(mp, max_binval));

	/*
	 * For each transaction in a reap chain, we can delete some number of
	 * extents and invalidate some number of blocks. We assume that CoW
	 * staging extents are usually more than 1 fsblock, and that there
	 * shouldn't be any buffers for those blocks. From these assumptions,
	 * set the number of deferrals to use as much of the reservation as
	 * it can, but leave space to invalidate 1/8th that number of buffers.
	 */
	const unsigned int	variable_overhead = per_intent +
						    (per_binval / 8);

	xreap_configure_limits(rs, step_size, variable_overhead, per_intent,
			per_binval);

	trace_xreap_agcow_limits(sc->tp, per_binval, rs->max_binval, step_size,
			per_intent, rs->max_deferred);
}

/*
 * Break an AG metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately.
 */
STATIC int
xreap_agmeta_extent(
	uint32_t		agbno,
	uint32_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_agblock_t		agbno_next = agbno + len;
	int			error = 0;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(sc->ip == NULL);

	while (agbno < agbno_next) {
		xfs_extlen_t	aglen;
		bool		crosslinked;

		error = xreap_agextent_select(rs, agbno, agbno_next,
				&crosslinked, &aglen);
		if (error)
			return error;

		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
		if (error)
			return error;

		if (xreap_want_defer_finish(rs)) {
			error = xrep_defer_finish(sc);
			if (error)
				return error;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_binval_roll(rs)) {
			error = xrep_roll_ag_trans(sc);
			if (error)
				return error;
			xreap_binval_reset(rs);
		}

		agbno += aglen;
	}

	return 0;
}

/* Dispose of every block of every AG metadata extent in the bitmap. */
int
xrep_reap_agblocks(
	struct xfs_scrub	*sc,
	struct xagb_bitmap	*bitmap,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type	type)
{
	struct xreap_state	rs = {
		.sc		= sc,
		.oinfo		= oinfo,
		.resv		= type,
	};
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip == NULL);

	xreap_configure_agextent_limits(&rs);
	error = xagb_bitmap_walk(bitmap, xreap_agmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_is_dirty(&rs))
		return xrep_defer_finish(sc);

	return 0;
}
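
/*
 * A hypothetical caller sketch: a btree repair that has collected the old
 * tree's blocks in an xagb_bitmap might reap them like so, with the AG
 * headers already attached to @sc, where @agbno and @len describe one of
 * the old tree's extents:
 *
 *	struct xagb_bitmap	old_blocks;
 *	int			error;
 *
 *	xagb_bitmap_init(&old_blocks);
 *	error = xagb_bitmap_set(&old_blocks, agbno, len);
 *	if (!error)
 *		error = xrep_reap_agblocks(sc, &old_blocks,
 *				&XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE);
 *	xagb_bitmap_destroy(&old_blocks);
 */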

/*
 * Break a file metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately. The extent must
 * not cross an AG boundary.
 */
STATIC int
xreap_fsmeta_extent(
	uint64_t		fsbno,
	uint64_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_agnumber_t		agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
	xfs_agblock_t		agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
	xfs_agblock_t		agbno_next = agbno + len;
	int			error = 0;

	ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
	ASSERT(sc->ip != NULL);
	ASSERT(!sc->sa.pag);

	/*
	 * We're reaping blocks after repairing file metadata, which means that
	 * we have to init the xchk_ag structure ourselves.
	 */
	sc->sa.pag = xfs_perag_get(sc->mp, agno);
	if (!sc->sa.pag)
		return -EFSCORRUPTED;

	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
	if (error)
		goto out_pag;

	while (agbno < agbno_next) {
		xfs_extlen_t	aglen;
		bool		crosslinked;

		error = xreap_agextent_select(rs, agbno, agbno_next,
				&crosslinked, &aglen);
		if (error)
			goto out_agf;

		error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
		if (error)
			goto out_agf;

		if (xreap_want_defer_finish(rs)) {
			/*
			 * Holds the AGF buffer across the deferred chain
			 * processing.
			 */
			error = xrep_defer_finish(sc);
			if (error)
				goto out_agf;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_binval_roll(rs)) {
			/*
			 * Hold the AGF buffer across the transaction roll so
			 * that we don't have to reattach it to the scrub
			 * context.
			 */
			xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
			xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
			if (error)
				goto out_agf;
			xreap_binval_reset(rs);
		}

		agbno += aglen;
	}

out_agf:
	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
	sc->sa.agf_bp = NULL;
out_pag:
	xfs_perag_put(sc->sa.pag);
	sc->sa.pag = NULL;
	return error;
}

/*
 * Dispose of every block of every fs metadata extent in the bitmap.
 * Do not use this to dispose of the mappings in an ondisk inode fork.
 */
int
xrep_reap_fsblocks(
	struct xfs_scrub	*sc,
	struct xfsb_bitmap	*bitmap,
	const struct xfs_owner_info	*oinfo)
{
	struct xreap_state	rs = {
		.sc		= sc,
		.oinfo		= oinfo,
		.resv		= XFS_AG_RESV_NONE,
	};
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip != NULL);

	if (oinfo == &XFS_RMAP_OINFO_COW)
		xreap_configure_agcow_limits(&rs);
	else
		xreap_configure_agextent_limits(&rs);
	error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_is_dirty(&rs))
		return xrep_defer_finish(sc);

	return 0;
}

#ifdef CONFIG_XFS_RT
/*
 * Figure out the longest run of blocks that we can dispose of with a single
 * call. Cross-linked blocks should have their reverse mappings removed, but
 * single-owner extents can be freed. Units are rt blocks, not rt extents.
 */
STATIC int
xreap_rgextent_select(
	struct xreap_state	*rs,
	xfs_rgblock_t		rgbno,
	xfs_rgblock_t		rgbno_next,
	bool			*crosslinked,
	xfs_extlen_t		*rglenp)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_btree_cur	*cur;
	xfs_rgblock_t		bno = rgbno + 1;
	xfs_extlen_t		len = 1;
	int			error;

	/*
	 * Determine if there are any other rmap records covering the first
	 * block of this extent. If so, the block is crosslinked.
	 */
	cur = xfs_rtrmapbt_init_cursor(sc->tp, sc->sr.rtg);
	error = xfs_rmap_has_other_keys(cur, rgbno, 1, rs->oinfo,
			crosslinked);
	if (error)
		goto out_cur;

	/*
	 * Figure out how many of the subsequent blocks have the same crosslink
	 * status.
	 */
	while (bno < rgbno_next) {
		bool		also_crosslinked;

		error = xfs_rmap_has_other_keys(cur, bno, 1, rs->oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		if (*crosslinked != also_crosslinked)
			break;

		len++;
		bno++;
	}

	*rglenp = len;
	trace_xreap_agextent_select(rtg_group(sc->sr.rtg), rgbno, len,
			*crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Dispose of as much of the beginning of this rtgroup extent as possible.
 * The number of blocks disposed of will be returned in @rglenp.
 */
STATIC int
xreap_rgextent_iter(
	struct xreap_state	*rs,
	xfs_rgblock_t		rgbno,
	xfs_extlen_t		*rglenp,
	bool			crosslinked)
{
	struct xfs_scrub	*sc = rs->sc;
	xfs_rtblock_t		rtbno;
	int			error;

	/*
	 * The only caller so far is CoW fork repair, so we only know how to
	 * unlink or free CoW staging extents. Here we don't have to worry
	 * about invalidating buffers!
	 */
	if (rs->oinfo != &XFS_RMAP_OINFO_COW) {
		ASSERT(rs->oinfo == &XFS_RMAP_OINFO_COW);
		return -EFSCORRUPTED;
	}
	ASSERT(rs->resv == XFS_AG_RESV_NONE);

	rtbno = xfs_rgbno_to_rtb(sc->sr.rtg, rgbno);

	/*
	 * t1: There are other rmappings; this block is cross linked and must
	 * not be freed. Remove the forward and reverse mapping and move on.
	 */
	if (crosslinked) {
		trace_xreap_dispose_unmap_extent(rtg_group(sc->sr.rtg), rgbno,
				*rglenp);

		xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
		xreap_inc_defer(rs);
		return 0;
	}

	trace_xreap_dispose_free_extent(rtg_group(sc->sr.rtg), rgbno, *rglenp);

	/*
	 * t2: The CoW staging extent is not crosslinked. Use deferred work
	 * to remove the refcountbt records (which removes the rmap records)
	 * and free the extent. We're not worried about the system going down
	 * here because log recovery walks the refcount btree to clean out the
	 * CoW staging extents.
	 */
	xfs_refcount_free_cow_extent(sc->tp, true, rtbno, *rglenp);
	error = xfs_free_extent_later(sc->tp, rtbno, *rglenp, NULL,
			rs->resv,
			XFS_FREE_EXTENT_REALTIME |
			XFS_FREE_EXTENT_SKIP_DISCARD);
	if (error)
		return error;

	xreap_inc_defer(rs);
	return 0;
}

/*
 * Compute the maximum number of intent items that reaping can attach to the
 * scrub transaction given the worst case log overhead of the intent items
 * needed to reap a single CoW staging extent. This is not for freeing
 * metadata blocks.
 */
STATIC void
xreap_configure_rgcow_limits(
	struct xreap_state	*rs)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_mount	*mp = sc->mp;

	/*
	 * In the worst case, relogging an intent item causes both an intent
	 * item and a done item to be attached to a transaction for each extent
	 * that we'd like to process.
	 */
	const unsigned int	efi = xfs_efi_log_space(1) +
				      xfs_efd_log_space(1);
	const unsigned int	rui = xfs_rui_log_space(1) +
				      xfs_rud_log_space();
	const unsigned int	cui = xfs_cui_log_space(1) +
				      xfs_cud_log_space();

	/*
	 * Various things can happen when reaping CoW staging extents:
	 *
	 * t1: Unmapping crosslinked CoW blocks: deferred removal of refcount
	 * record, which defers removal of rmap record
	 *
	 * t2: Freeing CoW blocks: deferred removal of refcount record, which
	 * defers removal of rmap record; and deferred removal of the space
	 *
	 * For simplicity, we'll use the worst-case intents size to determine
	 * the maximum number of deferred extents before we have to finish the
	 * whole chain. If we're trying to reap a btree larger than this size,
	 * a crash midway through reaping can result in leaked blocks.
	 */
	const unsigned int	t1 = cui + rui;
	const unsigned int	t2 = cui + rui + efi;
	const unsigned int	per_intent = max(t1, t2);

	/*
	 * For each transaction in a reap chain, we must be able to take one
	 * step in the defer item chain, which should only consist of CUI, EFI,
	 * or RUI items.
	 */
	const unsigned int	f1 = xfs_calc_finish_rt_efi_reservation(mp, 1);
	const unsigned int	f2 = xfs_calc_finish_rt_rui_reservation(mp, 1);
	const unsigned int	f3 = xfs_calc_finish_rt_cui_reservation(mp, 1);
	const unsigned int	step_size = max3(f1, f2, f3);

	/*
	 * The only buffer for the rt device is the rtgroup super, so we don't
	 * need to save space for buffer invalidations.
	 */
	xreap_configure_limits(rs, step_size, per_intent, per_intent, 0);

	trace_xreap_rgcow_limits(sc->tp, 0, 0, step_size, per_intent,
			rs->max_deferred);
}

#define XREAP_RTGLOCK_ALL	(XFS_RTGLOCK_BITMAP | \
				 XFS_RTGLOCK_RMAP | \
				 XFS_RTGLOCK_REFCOUNT)

/*
 * Break a rt file metadata extent into sub-extents by fate (crosslinked, not
 * crosslinked), and dispose of each sub-extent separately. The extent must
 * be aligned to a realtime extent.
 */
STATIC int
xreap_rtmeta_extent(
	uint64_t		rtbno,
	uint64_t		len,
	void			*priv)
{
	struct xreap_state	*rs = priv;
	struct xfs_scrub	*sc = rs->sc;
	xfs_rgblock_t		rgbno = xfs_rtb_to_rgbno(sc->mp, rtbno);
	xfs_rgblock_t		rgbno_next = rgbno + len;
	int			error = 0;

	ASSERT(sc->ip != NULL);
	ASSERT(!sc->sr.rtg);

	/*
	 * We're reaping blocks after repairing file metadata, which means that
	 * we have to init the xchk_ag structure ourselves.
	 */
	sc->sr.rtg = xfs_rtgroup_get(sc->mp, xfs_rtb_to_rgno(sc->mp, rtbno));
	if (!sc->sr.rtg)
		return -EFSCORRUPTED;

	xfs_rtgroup_lock(sc->sr.rtg, XREAP_RTGLOCK_ALL);

	while (rgbno < rgbno_next) {
		xfs_extlen_t	rglen;
		bool		crosslinked;

		error = xreap_rgextent_select(rs, rgbno, rgbno_next,
				&crosslinked, &rglen);
		if (error)
			goto out_unlock;

		error = xreap_rgextent_iter(rs, rgbno, &rglen, crosslinked);
		if (error)
			goto out_unlock;

		if (xreap_want_defer_finish(rs)) {
			error = xfs_defer_finish(&sc->tp);
			if (error)
				goto out_unlock;
			xreap_defer_finish_reset(rs);
		} else if (xreap_want_binval_roll(rs)) {
			error = xfs_trans_roll_inode(&sc->tp, sc->ip);
			if (error)
				goto out_unlock;
			xreap_binval_reset(rs);
		}

		rgbno += rglen;
	}

out_unlock:
	xfs_rtgroup_unlock(sc->sr.rtg, XREAP_RTGLOCK_ALL);
	xfs_rtgroup_put(sc->sr.rtg);
	sc->sr.rtg = NULL;
	return error;
}

/*
 * Dispose of every block of every rt metadata extent in the bitmap.
 * Do not use this to dispose of the mappings in an ondisk inode fork.
 */
int
xrep_reap_rtblocks(
	struct xfs_scrub	*sc,
	struct xrtb_bitmap	*bitmap,
	const struct xfs_owner_info	*oinfo)
{
	struct xreap_state	rs = {
		.sc		= sc,
		.oinfo		= oinfo,
		.resv		= XFS_AG_RESV_NONE,
	};
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip != NULL);
	ASSERT(oinfo == &XFS_RMAP_OINFO_COW);

	xreap_configure_rgcow_limits(&rs);
	error = xrtb_bitmap_walk(bitmap, xreap_rtmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_is_dirty(&rs))
		return xrep_defer_finish(sc);

	return 0;
}
#endif /* CONFIG_XFS_RT */

/*
 * Dispose of every block of an old metadata btree that used to be rooted in a
 * metadata directory file.
 */
int
xrep_reap_metadir_fsblocks(
	struct xfs_scrub	*sc,
	struct xfsb_bitmap	*bitmap)
{
	/*
	 * Reap old metadir btree blocks with XFS_AG_RESV_NONE because the old
	 * blocks are no longer mapped by the inode, and inode metadata space
	 * reservations can only account freed space to i_nblocks.
	 */
	struct xfs_owner_info	oinfo;
	struct xreap_state	rs = {
		.sc		= sc,
		.oinfo		= &oinfo,
		.resv		= XFS_AG_RESV_NONE,
	};
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(sc->ip != NULL);
	ASSERT(xfs_is_metadir_inode(sc->ip));

	xreap_configure_agextent_limits(&rs);
	xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, XFS_DATA_FORK);
	error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
	if (error)
		return error;

	if (xreap_is_dirty(&rs)) {
		error = xrep_defer_finish(sc);
		if (error)
			return error;
	}

	return xrep_reset_metafile_resv(sc);
}

/*
 * Metadata files are not supposed to share blocks with anything else.
 * If blocks are shared, we remove the reverse mapping (thus reducing the
 * crosslink factor); if blocks are not shared, we also need to free them.
 *
 * This first step determines the longest subset of the passed-in imap
 * (starting at its beginning) that is either crosslinked or not crosslinked.
 * The blockcount will be adjusted down as needed.
 */
STATIC int
xreap_bmapi_select(
	struct xreap_state	*rs,
	struct xfs_bmbt_irec	*imap,
	bool			*crosslinked)
{
	struct xfs_owner_info	oinfo;
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_btree_cur	*cur;
	xfs_filblks_t		len = 1;
	xfs_agblock_t		bno;
	xfs_agblock_t		agbno;
	xfs_agblock_t		agbno_next;
	int			error;

	agbno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
	agbno_next = agbno + imap->br_blockcount;

	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);

	xfs_rmap_ino_owner(&oinfo, rs->ip->i_ino, rs->whichfork,
			imap->br_startoff);
	error = xfs_rmap_has_other_keys(cur, agbno, 1, &oinfo, crosslinked);
	if (error)
		goto out_cur;

	bno = agbno + 1;
	while (bno < agbno_next) {
		bool		also_crosslinked;

		oinfo.oi_offset++;
		error = xfs_rmap_has_other_keys(cur, bno, 1, &oinfo,
				&also_crosslinked);
		if (error)
			goto out_cur;

		if (also_crosslinked != *crosslinked)
			break;

		len++;
		bno++;
	}

	imap->br_blockcount = len;
	trace_xreap_bmapi_select(pag_group(sc->sa.pag), agbno, len,
			*crosslinked);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}

/*
 * Decide if this buffer can be joined to a transaction. This is true for most
 * buffers, but there are two cases that we want to catch: large remote xattr
 * value buffers are not logged and can overflow the buffer log item dirty
 * bitmap size; and oversized cached buffers if things have really gone
 * haywire.
 */
static inline bool
xreap_buf_loggable(
	const struct xfs_buf	*bp)
{
	int			i;

	for (i = 0; i < bp->b_map_count; i++) {
		int		chunks;
		int		map_size;

		chunks = DIV_ROUND_UP(BBTOB(bp->b_maps[i].bm_len),
				XFS_BLF_CHUNK);
		map_size = DIV_ROUND_UP(chunks, NBWORD);
		if (map_size > XFS_BLF_DATAMAP_SIZE)
			return false;
	}

	return true;
}
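
/*
 * Back-of-the-envelope example of the check above (constants assumed from
 * xfs_buf_item.h): with 128-byte XFS_BLF_CHUNKs and 32-bit map words, one
 * dirty bitmap word covers 4k of buffer, so a remote xattr value buffer
 * slightly over 64k needs more than 16 map words, which can exceed
 * XFS_BLF_DATAMAP_SIZE. Such buffers must be staled by hand instead of
 * being logged.
 */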

/*
 * Invalidate any buffers for this file mapping. The @imap blockcount may be
 * adjusted downward if we need to roll the transaction.
 */
STATIC int
xreap_bmapi_binval(
	struct xreap_state	*rs,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_perag	*pag = sc->sa.pag;
	int			bmap_flags = xfs_bmapi_aflag(rs->whichfork);
	xfs_fileoff_t		off;
	xfs_fileoff_t		max_off;
	xfs_extlen_t		scan_blocks;
	xfs_agblock_t		bno;
	xfs_agblock_t		agbno;
	xfs_agblock_t		agbno_next;
	int			error;

	/*
	 * Avoid invalidating AG headers and post-EOFS blocks because we never
	 * own those.
	 */
	agbno = bno = XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock);
	agbno_next = agbno + imap->br_blockcount;
	if (!xfs_verify_agbno(pag, agbno) ||
	    !xfs_verify_agbno(pag, agbno_next - 1))
		return 0;

	/*
	 * Buffers for file blocks can span multiple contiguous mappings. This
	 * means that for each block in the mapping, there could exist an
	 * xfs_buf indexed by that block with any length up to the maximum
	 * buffer size (remote xattr values) or to the next hole in the fork.
	 * To set up our binval scan, first we need to figure out the location
	 * of the next hole.
	 */
	off = imap->br_startoff + imap->br_blockcount;
	max_off = off + xfs_attr3_max_rmt_blocks(mp);
	while (off < max_off) {
		struct xfs_bmbt_irec	hmap;
		int			nhmaps = 1;

		error = xfs_bmapi_read(rs->ip, off, max_off - off, &hmap,
				&nhmaps, bmap_flags);
		if (error)
			return error;
		if (nhmaps != 1 || hmap.br_startblock == DELAYSTARTBLOCK) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		if (!xfs_bmap_is_real_extent(&hmap))
			break;

		off = hmap.br_startoff + hmap.br_blockcount;
	}
	scan_blocks = off - imap->br_startoff;

	trace_xreap_bmapi_binval_scan(sc, imap, scan_blocks);

	/*
	 * If there are incore buffers for these blocks, invalidate them. If
	 * we can't (try)lock the buffer we assume it's owned by someone else
	 * and leave it alone. The buffer cache cannot detect aliasing, so
	 * employ nested loops to detect incore buffers of any plausible size.
	 */
	while (bno < agbno_next) {
		struct xrep_bufscan	scan = {
			.daddr		= xfs_agbno_to_daddr(pag, bno),
			.max_sectors	= xrep_bufscan_max_sectors(mp,
							scan_blocks),
			.daddr_step	= XFS_FSB_TO_BB(mp, 1),
		};
		struct xfs_buf	*bp;

		while ((bp = xrep_bufscan_advance(mp, &scan)) != NULL) {
			if (xreap_buf_loggable(bp)) {
				xfs_trans_bjoin(sc->tp, bp);
				xfs_trans_binval(sc->tp, bp);
			} else {
				xfs_buf_stale(bp);
				xfs_buf_relse(bp);
			}

			/*
			 * Stop invalidating if we've hit the limit; we should
			 * still have enough reservation left to free however
			 * far we've gotten.
			 */
			if (!xreap_inc_binval(rs)) {
				imap->br_blockcount = agbno_next - bno;
				goto out;
			}
		}

		bno++;
		scan_blocks--;
	}

out:
	trace_xreap_bmapi_binval(pag_group(sc->sa.pag), agbno,
			imap->br_blockcount);
	return 0;
}

/*
 * Dispose of as much of the beginning of this file fork mapping as possible.
 * The number of blocks disposed of is returned in @imap->br_blockcount.
 */
STATIC int
xrep_reap_bmapi_iter(
	struct xreap_state	*rs,
	struct xfs_bmbt_irec	*imap,
	bool			crosslinked)
{
	struct xfs_scrub	*sc = rs->sc;
	int			error;

	if (crosslinked) {
		/*
		 * If there are other rmappings, this block is cross linked and
		 * must not be freed. Remove the reverse mapping, leave the
		 * buffer cache in its possibly confused state, and move on.
		 * We don't want to risk discarding valid data buffers from
		 * anybody else who thinks they own the block, even though that
		 * runs the risk of stale buffer warnings in the future.
		 */
		trace_xreap_dispose_unmap_extent(pag_group(sc->sa.pag),
				XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
				imap->br_blockcount);

		/*
		 * t0: Schedule removal of the mapping from the fork. We use
		 * deferred log intents in this function to control the exact
		 * sequence of metadata updates.
		 */
		xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
		xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT,
				-(int64_t)imap->br_blockcount);
		xfs_rmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
		return 0;
	}

	/*
	 * If the block is not crosslinked, we can invalidate all the incore
	 * buffers for the extent, and then free the extent. This is a bit of
	 * a mess since we don't detect discontiguous buffers that are indexed
	 * by a block starting before the first block of the extent but overlap
	 * anyway.
	 */
	trace_xreap_dispose_free_extent(pag_group(sc->sa.pag),
			XFS_FSB_TO_AGBNO(sc->mp, imap->br_startblock),
			imap->br_blockcount);

	/*
	 * Invalidate as many buffers as we can, starting at the beginning of
	 * this mapping. If this function sets blockcount to zero, the
	 * transaction is full of logged buffer invalidations, so we need to
	 * return early so that we can roll and retry.
	 */
	error = xreap_bmapi_binval(rs, imap);
	if (error || imap->br_blockcount == 0)
		return error;

	/*
	 * t1: Schedule removal of the mapping from the fork. We use deferred
	 * work in this function to control the exact sequence of metadata
	 * updates.
	 */
	xfs_bmap_unmap_extent(sc->tp, rs->ip, rs->whichfork, imap);
	xfs_trans_mod_dquot_byino(sc->tp, rs->ip, XFS_TRANS_DQ_BCOUNT,
			-(int64_t)imap->br_blockcount);
	return xfs_free_extent_later(sc->tp, imap->br_startblock,
			imap->br_blockcount, NULL, XFS_AG_RESV_NONE,
			XFS_FREE_EXTENT_SKIP_DISCARD);
}

/* Compute the maximum mapcount of a file buffer. */
static unsigned int
xreap_bmapi_binval_mapcount(
	struct xfs_scrub	*sc)
{
	/* directory blocks can span multiple fsblocks and be discontiguous */
	if (sc->sm->sm_type == XFS_SCRUB_TYPE_DIR)
		return sc->mp->m_dir_geo->fsbcount;

	/* all other file xattr/symlink blocks must be contiguous */
	return 1;
}

/* Compute the maximum block size of a file buffer. */
static unsigned int
xreap_bmapi_binval_blocksize(
	struct xfs_scrub	*sc)
{
	switch (sc->sm->sm_type) {
	case XFS_SCRUB_TYPE_DIR:
		return sc->mp->m_dir_geo->blksize;
	case XFS_SCRUB_TYPE_XATTR:
	case XFS_SCRUB_TYPE_PARENT:
		/*
		 * The xattr structure itself consists of single fsblocks, but
		 * there could be remote xattr blocks to invalidate.
		 */
		return XFS_XATTR_SIZE_MAX;
	}

	/* everything else is a single block */
	return sc->mp->m_sb.sb_blocksize;
}

/*
 * Compute the maximum number of buffer invalidations that we can do while
 * reaping a single extent from a file fork.
 */
STATIC void
xreap_configure_bmapi_limits(
	struct xreap_state	*rs)
{
	struct xfs_scrub	*sc = rs->sc;
	struct xfs_mount	*mp = sc->mp;

	/* overhead of invalidating a buffer */
	const unsigned int	per_binval =
		xfs_buf_inval_log_space(xreap_bmapi_binval_mapcount(sc),
					xreap_bmapi_binval_blocksize(sc));

	/*
	 * In the worst case, relogging an intent item causes both an intent
	 * item and a done item to be attached to a transaction for each extent
	 * that we'd like to process.
	 */
	const unsigned int	efi = xfs_efi_log_space(1) +
				      xfs_efd_log_space(1);
	const unsigned int	rui = xfs_rui_log_space(1) +
				      xfs_rud_log_space();
	const unsigned int	bui = xfs_bui_log_space(1) +
				      xfs_bud_log_space();

	/*
	 * t1: Unmapping crosslinked file data blocks: one bmap deletion,
	 * possibly an EFI for underfilled bmbt blocks, and an rmap deletion.
	 *
	 * t2: Freeing file data blocks: one bmap deletion, possibly an EFI
	 * for underfilled bmbt blocks, and another EFI for the space itself.
	 */
	const unsigned int	t1 = (bui + efi) + rui;
	const unsigned int	t2 = (bui + efi) + efi;
	const unsigned int	per_intent = max(t1, t2);

	/*
	 * For each transaction in a reap chain, we must be able to take one
	 * step in the defer item chain, which should only consist of BUI, EFI,
	 * or RUI items.
	 */
	const unsigned int	f1 = xfs_calc_finish_efi_reservation(mp, 1);
	const unsigned int	f2 = xfs_calc_finish_rui_reservation(mp, 1);
	const unsigned int	f3 = xfs_calc_finish_bui_reservation(mp, 1);
	const unsigned int	step_size = max3(f1, f2, f3);

	/*
	 * Each call to xreap_ifork_extent starts with a clean transaction and
	 * operates on a single mapping by creating a chain of log intent items
	 * for that mapping. We need to leave enough reservation in the
	 * transaction to log btree buffer and inode updates for each step in
	 * the chain, and to relog the log intents.
	 */
	const unsigned int	per_extent_res = per_intent + step_size;

	xreap_configure_limits(rs, per_extent_res, per_binval, 0, per_binval);

	trace_xreap_bmapi_limits(sc->tp, per_binval, rs->max_binval,
			step_size, per_intent, 1);
}

/*
 * Dispose of as much of this file extent as we can. Upon successful return,
 * the imap will reflect the mapping that was removed from the fork.
 */
STATIC int
xreap_ifork_extent(
	struct xreap_state	*rs,
	struct xfs_bmbt_irec	*imap)
{
	struct xfs_scrub	*sc = rs->sc;
	xfs_agnumber_t		agno;
	bool			crosslinked;
	int			error;

	ASSERT(sc->sa.pag == NULL);

	trace_xreap_ifork_extent(sc, rs->ip, rs->whichfork, imap);

	agno = XFS_FSB_TO_AGNO(sc->mp, imap->br_startblock);
	sc->sa.pag = xfs_perag_get(sc->mp, agno);
	if (!sc->sa.pag)
		return -EFSCORRUPTED;

	error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
	if (error)
		goto out_pag;

	/*
	 * Decide the fate of the blocks at the beginning of the mapping, then
	 * update the mapping to use it with the unmap calls.
	 */
	error = xreap_bmapi_select(rs, imap, &crosslinked);
	if (error)
		goto out_agf;

	error = xrep_reap_bmapi_iter(rs, imap, crosslinked);
	if (error)
		goto out_agf;

out_agf:
	xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
	sc->sa.agf_bp = NULL;
out_pag:
	xfs_perag_put(sc->sa.pag);
	sc->sa.pag = NULL;
	return error;
}

/*
 * Dispose of each block mapped to the given fork of the given file. Callers
 * must hold ILOCK_EXCL, and ip can only be sc->ip or sc->tempip. The fork
 * must not have any delalloc reservations.
 */
int
xrep_reap_ifork(
	struct xfs_scrub	*sc,
	struct xfs_inode	*ip,
	int			whichfork)
{
	struct xreap_state	rs = {
		.sc		= sc,
		.ip		= ip,
		.whichfork	= whichfork,
	};
	xfs_fileoff_t		off = 0;
	int			bmap_flags = xfs_bmapi_aflag(whichfork);
	int			error;

	ASSERT(xfs_has_rmapbt(sc->mp));
	ASSERT(ip == sc->ip || ip == sc->tempip);
	ASSERT(whichfork == XFS_ATTR_FORK || !XFS_IS_REALTIME_INODE(ip));

	xreap_configure_bmapi_limits(&rs);
	while (off < XFS_MAX_FILEOFF) {
		struct xfs_bmbt_irec	imap;
		int			nimaps = 1;

		/* Read the next extent, skip past holes and delalloc. */
		error = xfs_bmapi_read(ip, off, XFS_MAX_FILEOFF - off, &imap,
				&nimaps, bmap_flags);
		if (error)
			return error;
		if (nimaps != 1 || imap.br_startblock == DELAYSTARTBLOCK) {
			ASSERT(0);
			return -EFSCORRUPTED;
		}

		/*
		 * If this is a real space mapping, reap as much of it as we
		 * can in a single transaction.
		 */
		if (xfs_bmap_is_real_extent(&imap)) {
			error = xreap_ifork_extent(&rs, &imap);
			if (error)
				return error;

			error = xfs_defer_finish(&sc->tp);
			if (error)
				return error;
			xreap_defer_finish_reset(&rs);
		}

		off = imap.br_startoff + imap.br_blockcount;
	}

	return 0;
}