1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_btree_staging.h"
15 #include "xfs_buf_mem.h"
16 #include "xfs_btree_mem.h"
17 #include "xfs_bit.h"
18 #include "xfs_log_format.h"
19 #include "xfs_trans.h"
20 #include "xfs_sb.h"
21 #include "xfs_alloc.h"
22 #include "xfs_alloc_btree.h"
23 #include "xfs_ialloc.h"
24 #include "xfs_ialloc_btree.h"
25 #include "xfs_rmap.h"
26 #include "xfs_rmap_btree.h"
27 #include "xfs_inode.h"
28 #include "xfs_icache.h"
29 #include "xfs_bmap.h"
30 #include "xfs_bmap_btree.h"
31 #include "xfs_refcount.h"
32 #include "xfs_refcount_btree.h"
33 #include "xfs_ag.h"
34 #include "xfs_rtrmap_btree.h"
35 #include "xfs_rtgroup.h"
36 #include "xfs_rtrefcount_btree.h"
37 #include "scrub/xfs_scrub.h"
38 #include "scrub/scrub.h"
39 #include "scrub/common.h"
40 #include "scrub/btree.h"
41 #include "scrub/trace.h"
42 #include "scrub/repair.h"
43 #include "scrub/bitmap.h"
44 #include "scrub/agb_bitmap.h"
45 #include "scrub/xfile.h"
46 #include "scrub/xfarray.h"
47 #include "scrub/iscan.h"
48 #include "scrub/newbt.h"
49 #include "scrub/reap.h"
50
51 /*
52 * Reverse Mapping Btree Repair
53 * ============================
54 *
55 * This is the most involved of all the AG space btree rebuilds. Everywhere
56 * else in XFS we lock inodes and then AG data structures, but generating the
57 * list of rmap records requires that we be able to scan both block mapping
58 * btrees of every inode in the filesystem to see if it owns any extents in
59 * this AG. We can't tolerate any inode updates while we do this, so we
60 * freeze the filesystem to lock everyone else out, and grant ourselves
61 * special privileges to run transactions with regular background reclamation
62 * turned off.
63 *
64 * We also have to be very careful not to allow inode reclaim to start a
65 * transaction because all transactions (other than our own) will block.
66 * Deferred inode inactivation helps us out there.
67 *
68 * I) Reverse mappings for all non-space metadata and file data are collected
69 * according to the following algorithm:
70 *
71 * 1. For each fork of each inode:
72 * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary.
73 * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate
74 * bmaps into rmap records (see 1.1.4). Set bits in BMBIT for each btree
75 * block.
76 * 1.3. If the incore extent map is loaded but the fork is in btree format,
77 * just visit the bmbt blocks to set the corresponding BMBIT areas.
78 * 1.4. From the incore extent map, accumulate each bmap that falls into our
79 * target AG. Remember, multiple bmap records can map to a single rmap
80 * record, so we cannot simply emit rmap records 1:1.
81 * 1.5. Emit rmap records for each extent in BMBIT and free it.
82 * 2. Create bitmaps INOBIT and ICHUNKBIT.
83 * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT,
84 * and set bits in INOBIT for each btree block. If the inobt has no records
85 * at all, we must be careful to record its root in INOBIT.
86 * 4. For each block in the finobt, set the corresponding INOBIT area.
87 * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them.
88 * 6. Create bitmaps REFCBIT and COWBIT.
89 * 7. For each CoW staging extent in the refcountbt, set the corresponding
90 * areas in COWBIT.
91 * 8. For each block in the refcountbt, set the corresponding REFCBIT area.
92 * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them.
93 * A. Emit rmap for the AG headers.
94 * B. Emit rmap for the log, if there is one.
95 *
96 * II) The rmapbt shape and space metadata rmaps are computed as follows:
97 *
98 * 1. Count the rmaps collected in the previous step. (= NR)
99 * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB)
100 * 3. Reserve RMB blocks through the newbt using the allocator in normap mode.
101 * 4. Create bitmap AGBIT.
102 * 5. For each reservation in the newbt, set the corresponding areas in AGBIT.
103 * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT.
104 * 7. Count the extents in AGBIT. (= AGNR)
105 * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB')
106 * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB',
107 * and clear AGBIT. Go to step 5.
108 * A. Emit rmaps for each extent in AGBIT.
109 *
110 * III) The rmapbt is constructed and set in place as follows:
111 *
112 * 1. Sort the rmap records.
113 * 2. Bulk load the rmaps.
114 *
115 * IV) Reap the old btree blocks.
116 *
117 * 1. Create a bitmap OLDRMBIT.
118 * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT.
119 * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT.
120 * 4. Reap the extents corresponding to the set areas in OLDRMBIT. These are
121 * the parts of the AG that the rmap didn't find during its scan of the
122 * primary metadata and aren't known to be in the free space, which implies
123 * that they were the old rmapbt blocks.
124 * 5. Commit.
125 *
126 * We use the 'xrep_rmap' prefix for all the rmap functions.
127 */
128
129 /* Context for collecting rmaps */
130 struct xrep_rmap {
131 /* new rmapbt information */
132 struct xrep_newbt new_btree;
133
134 /* lock for the xfbtree and xfile */
135 struct mutex lock;
136
137 /* rmap records generated from primary metadata */
138 struct xfbtree rmap_btree;
139
140 struct xfs_scrub *sc;
141
142 /* in-memory btree cursor for the xfs_btree_bload iteration */
143 struct xfs_btree_cur *mcur;
144
145 /* Hooks into rmap update code. */
146 struct xfs_rmap_hook rhook;
147
148 /* inode scan cursor */
149 struct xchk_iscan iscan;
150
151 /* Number of non-freespace records found. */
152 unsigned long long nr_records;
153
154 /* bnobt/cntbt contribution to btreeblks */
155 xfs_agblock_t freesp_btblocks;
156
157 /* old agf_rmap_blocks counter */
158 unsigned int old_rmapbt_fsbcount;
159 };
160
161 /* Set us up to repair reverse mapping btrees. */
162 int
xrep_setup_ag_rmapbt(struct xfs_scrub * sc)163 xrep_setup_ag_rmapbt(
164 struct xfs_scrub *sc)
165 {
166 struct xrep_rmap *rr;
167 char *descr;
168 int error;
169
170 xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
171
172 descr = xchk_xfile_ag_descr(sc, "reverse mapping records");
173 error = xrep_setup_xfbtree(sc, descr);
174 kfree(descr);
175 if (error)
176 return error;
177
178 rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS);
179 if (!rr)
180 return -ENOMEM;
181
182 rr->sc = sc;
183 sc->buf = rr;
184 return 0;
185 }
186
187 /* Make sure there's nothing funny about this mapping. */
188 STATIC int
xrep_rmap_check_mapping(struct xfs_scrub * sc,const struct xfs_rmap_irec * rec)189 xrep_rmap_check_mapping(
190 struct xfs_scrub *sc,
191 const struct xfs_rmap_irec *rec)
192 {
193 enum xbtree_recpacking outcome;
194 int error;
195
196 if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL)
197 return -EFSCORRUPTED;
198
199 /* Make sure this isn't free space. */
200 error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
201 rec->rm_blockcount, &outcome);
202 if (error)
203 return error;
204 if (outcome != XBTREE_RECPACKING_EMPTY)
205 return -EFSCORRUPTED;
206
207 return 0;
208 }
209
210 /* Store a reverse-mapping record. */
211 static inline int
xrep_rmap_stash(struct xrep_rmap * rr,xfs_agblock_t startblock,xfs_extlen_t blockcount,uint64_t owner,uint64_t offset,unsigned int flags)212 xrep_rmap_stash(
213 struct xrep_rmap *rr,
214 xfs_agblock_t startblock,
215 xfs_extlen_t blockcount,
216 uint64_t owner,
217 uint64_t offset,
218 unsigned int flags)
219 {
220 struct xfs_rmap_irec rmap = {
221 .rm_startblock = startblock,
222 .rm_blockcount = blockcount,
223 .rm_owner = owner,
224 .rm_offset = offset,
225 .rm_flags = flags,
226 };
227 struct xfs_scrub *sc = rr->sc;
228 struct xfs_btree_cur *mcur;
229 int error = 0;
230
231 if (xchk_should_terminate(sc, &error))
232 return error;
233
234 if (xchk_iscan_aborted(&rr->iscan))
235 return -EFSCORRUPTED;
236
237 trace_xrep_rmap_found(sc->sa.pag, &rmap);
238
239 mutex_lock(&rr->lock);
240 mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
241 error = xfs_rmap_map_raw(mcur, &rmap);
242 xfs_btree_del_cursor(mcur, error);
243 if (error)
244 goto out_cancel;
245
246 error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
247 if (error)
248 goto out_abort;
249
250 mutex_unlock(&rr->lock);
251 return 0;
252
253 out_cancel:
254 xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
255 out_abort:
256 xchk_iscan_abort(&rr->iscan);
257 mutex_unlock(&rr->lock);
258 return error;
259 }
260
261 struct xrep_rmap_stash_run {
262 struct xrep_rmap *rr;
263 uint64_t owner;
264 unsigned int rmap_flags;
265 };
266
267 static int
xrep_rmap_stash_run(uint32_t start,uint32_t len,void * priv)268 xrep_rmap_stash_run(
269 uint32_t start,
270 uint32_t len,
271 void *priv)
272 {
273 struct xrep_rmap_stash_run *rsr = priv;
274 struct xrep_rmap *rr = rsr->rr;
275
276 return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
277 }
278
279 /*
280 * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure
281 * that the ranges are in units of FS blocks.
282 */
283 STATIC int
xrep_rmap_stash_bitmap(struct xrep_rmap * rr,struct xagb_bitmap * bitmap,const struct xfs_owner_info * oinfo)284 xrep_rmap_stash_bitmap(
285 struct xrep_rmap *rr,
286 struct xagb_bitmap *bitmap,
287 const struct xfs_owner_info *oinfo)
288 {
289 struct xrep_rmap_stash_run rsr = {
290 .rr = rr,
291 .owner = oinfo->oi_owner,
292 .rmap_flags = 0,
293 };
294
295 if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
296 rsr.rmap_flags |= XFS_RMAP_ATTR_FORK;
297 if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
298 rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK;
299
300 return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr);
301 }
302
303 /* Section (I): Finding all file and bmbt extents. */
304
305 /* Context for accumulating rmaps for an inode fork. */
306 struct xrep_rmap_ifork {
307 /*
308 * Accumulate rmap data here to turn multiple adjacent bmaps into a
309 * single rmap.
310 */
311 struct xfs_rmap_irec accum;
312
313 /* Bitmap of bmbt blocks in this AG. */
314 struct xagb_bitmap bmbt_blocks;
315
316 struct xrep_rmap *rr;
317
318 /* Which inode fork? */
319 int whichfork;
320 };
321
322 /* Stash an rmap that we accumulated while walking an inode fork. */
323 STATIC int
xrep_rmap_stash_accumulated(struct xrep_rmap_ifork * rf)324 xrep_rmap_stash_accumulated(
325 struct xrep_rmap_ifork *rf)
326 {
327 if (rf->accum.rm_blockcount == 0)
328 return 0;
329
330 return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock,
331 rf->accum.rm_blockcount, rf->accum.rm_owner,
332 rf->accum.rm_offset, rf->accum.rm_flags);
333 }
334
335 /* Accumulate a bmbt record. */
336 STATIC int
xrep_rmap_visit_bmbt(struct xfs_btree_cur * cur,struct xfs_bmbt_irec * rec,void * priv)337 xrep_rmap_visit_bmbt(
338 struct xfs_btree_cur *cur,
339 struct xfs_bmbt_irec *rec,
340 void *priv)
341 {
342 struct xrep_rmap_ifork *rf = priv;
343 struct xfs_mount *mp = rf->rr->sc->mp;
344 struct xfs_rmap_irec *accum = &rf->accum;
345 xfs_agblock_t agbno;
346 unsigned int rmap_flags = 0;
347 int error;
348
349 if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
350 pag_agno(rf->rr->sc->sa.pag))
351 return 0;
352
353 agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
354 if (rf->whichfork == XFS_ATTR_FORK)
355 rmap_flags |= XFS_RMAP_ATTR_FORK;
356 if (rec->br_state == XFS_EXT_UNWRITTEN)
357 rmap_flags |= XFS_RMAP_UNWRITTEN;
358
359 /* If this bmap is adjacent to the previous one, just add it. */
360 if (accum->rm_blockcount > 0 &&
361 rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
362 agbno == accum->rm_startblock + accum->rm_blockcount &&
363 rmap_flags == accum->rm_flags) {
364 accum->rm_blockcount += rec->br_blockcount;
365 return 0;
366 }
367
368 /* Otherwise stash the old rmap and start accumulating a new one. */
369 error = xrep_rmap_stash_accumulated(rf);
370 if (error)
371 return error;
372
373 accum->rm_startblock = agbno;
374 accum->rm_blockcount = rec->br_blockcount;
375 accum->rm_offset = rec->br_startoff;
376 accum->rm_flags = rmap_flags;
377 return 0;
378 }
379
380 /* Add a btree block to the bitmap. */
381 STATIC int
xrep_rmap_visit_iroot_btree_block(struct xfs_btree_cur * cur,int level,void * priv)382 xrep_rmap_visit_iroot_btree_block(
383 struct xfs_btree_cur *cur,
384 int level,
385 void *priv)
386 {
387 struct xrep_rmap_ifork *rf = priv;
388 struct xfs_buf *bp;
389 xfs_fsblock_t fsbno;
390 xfs_agblock_t agbno;
391
392 xfs_btree_get_block(cur, level, &bp);
393 if (!bp)
394 return 0;
395
396 fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
397 if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != pag_agno(rf->rr->sc->sa.pag))
398 return 0;
399
400 agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
401 return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
402 }
403
404 /*
405 * Iterate a metadata btree rooted in an inode to collect rmap records for
406 * anything in this fork that matches the AG.
407 */
408 STATIC int
xrep_rmap_scan_iroot_btree(struct xrep_rmap_ifork * rf,struct xfs_btree_cur * cur)409 xrep_rmap_scan_iroot_btree(
410 struct xrep_rmap_ifork *rf,
411 struct xfs_btree_cur *cur)
412 {
413 struct xfs_owner_info oinfo;
414 struct xrep_rmap *rr = rf->rr;
415 int error;
416
417 xagb_bitmap_init(&rf->bmbt_blocks);
418
419 /* Record all the blocks in the btree itself. */
420 error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
421 XFS_BTREE_VISIT_ALL, rf);
422 if (error)
423 goto out;
424
425 /* Emit rmaps for the btree blocks. */
426 xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
427 error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
428 if (error)
429 goto out;
430
431 /* Stash any remaining accumulated rmaps. */
432 error = xrep_rmap_stash_accumulated(rf);
433 out:
434 xagb_bitmap_destroy(&rf->bmbt_blocks);
435 return error;
436 }
437
438 /*
439 * Iterate the block mapping btree to collect rmap records for anything in this
440 * fork that matches the AG. Sets @mappings_done to true if we've scanned the
441 * block mappings in this fork.
442 */
443 STATIC int
xrep_rmap_scan_bmbt(struct xrep_rmap_ifork * rf,struct xfs_inode * ip,bool * mappings_done)444 xrep_rmap_scan_bmbt(
445 struct xrep_rmap_ifork *rf,
446 struct xfs_inode *ip,
447 bool *mappings_done)
448 {
449 struct xrep_rmap *rr = rf->rr;
450 struct xfs_btree_cur *cur;
451 struct xfs_ifork *ifp;
452 int error;
453
454 *mappings_done = false;
455 ifp = xfs_ifork_ptr(ip, rf->whichfork);
456 cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);
457
458 if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
459 xfs_need_iread_extents(ifp)) {
460 /*
461 * If the incore extent cache isn't loaded, scan the bmbt for
462 * mapping records. This avoids loading the incore extent
463 * tree, which will increase memory pressure at a time when
464 * we're trying to run as quickly as we possibly can. Ignore
465 * realtime extents.
466 */
467 error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
468 if (error)
469 goto out_cur;
470
471 *mappings_done = true;
472 }
473
474 /* Scan for the bmbt blocks, which always live on the data device. */
475 error = xrep_rmap_scan_iroot_btree(rf, cur);
476 out_cur:
477 xfs_btree_del_cursor(cur, error);
478 return error;
479 }
480
481 /*
482 * Iterate the in-core extent cache to collect rmap records for anything in
483 * this fork that matches the AG.
484 */
485 STATIC int
xrep_rmap_scan_iext(struct xrep_rmap_ifork * rf,struct xfs_ifork * ifp)486 xrep_rmap_scan_iext(
487 struct xrep_rmap_ifork *rf,
488 struct xfs_ifork *ifp)
489 {
490 struct xfs_bmbt_irec rec;
491 struct xfs_iext_cursor icur;
492 int error;
493
494 for_each_xfs_iext(ifp, &icur, &rec) {
495 if (isnullstartblock(rec.br_startblock))
496 continue;
497 error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
498 if (error)
499 return error;
500 }
501
502 return xrep_rmap_stash_accumulated(rf);
503 }
504
505 static int
xrep_rmap_scan_meta_btree(struct xrep_rmap_ifork * rf,struct xfs_inode * ip)506 xrep_rmap_scan_meta_btree(
507 struct xrep_rmap_ifork *rf,
508 struct xfs_inode *ip)
509 {
510 struct xfs_scrub *sc = rf->rr->sc;
511 struct xfs_rtgroup *rtg = NULL;
512 struct xfs_btree_cur *cur = NULL;
513 enum xfs_rtg_inodes type;
514 int error;
515
516 if (rf->whichfork != XFS_DATA_FORK)
517 return -EFSCORRUPTED;
518
519 switch (ip->i_metatype) {
520 case XFS_METAFILE_RTRMAP:
521 type = XFS_RTGI_RMAP;
522 break;
523 case XFS_METAFILE_RTREFCOUNT:
524 type = XFS_RTGI_REFCOUNT;
525 break;
526 default:
527 ASSERT(0);
528 return -EFSCORRUPTED;
529 }
530
531 while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) {
532 if (ip == rtg->rtg_inodes[type])
533 goto found;
534 }
535
536 /*
537 * We should never find an rt metadata btree inode that isn't
538 * associated with an rtgroup yet has ondisk blocks allocated to it.
539 */
540 if (ip->i_nblocks) {
541 ASSERT(0);
542 return -EFSCORRUPTED;
543 }
544
545 return 0;
546
547 found:
548 switch (ip->i_metatype) {
549 case XFS_METAFILE_RTRMAP:
550 cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg);
551 break;
552 case XFS_METAFILE_RTREFCOUNT:
553 cur = xfs_rtrefcountbt_init_cursor(sc->tp, rtg);
554 break;
555 default:
556 ASSERT(0);
557 error = -EFSCORRUPTED;
558 goto out_rtg;
559 }
560
561 error = xrep_rmap_scan_iroot_btree(rf, cur);
562 xfs_btree_del_cursor(cur, error);
563 out_rtg:
564 xfs_rtgroup_rele(rtg);
565 return error;
566 }
567
568 /* Find all the extents from a given AG in an inode fork. */
569 STATIC int
xrep_rmap_scan_ifork(struct xrep_rmap * rr,struct xfs_inode * ip,int whichfork)570 xrep_rmap_scan_ifork(
571 struct xrep_rmap *rr,
572 struct xfs_inode *ip,
573 int whichfork)
574 {
575 struct xrep_rmap_ifork rf = {
576 .accum = { .rm_owner = ip->i_ino, },
577 .rr = rr,
578 .whichfork = whichfork,
579 };
580 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
581 bool mappings_done;
582 int error = 0;
583
584 if (!ifp)
585 return 0;
586
587 switch (ifp->if_format) {
588 case XFS_DINODE_FMT_BTREE:
589 /*
590 * Scan the bmap btree for data device mappings. This includes
591 * the btree blocks themselves, even if this is a realtime
592 * file.
593 */
594 error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
595 if (error || mappings_done)
596 return error;
597 fallthrough;
598 case XFS_DINODE_FMT_EXTENTS:
599 /* Scan incore extent cache if this isn't a realtime file. */
600 if (xfs_ifork_is_realtime(ip, whichfork))
601 return 0;
602
603 return xrep_rmap_scan_iext(&rf, ifp);
604 case XFS_DINODE_FMT_META_BTREE:
605 return xrep_rmap_scan_meta_btree(&rf, ip);
606 }
607
608 return 0;
609 }
610
611 /*
612 * Take ILOCK on a file that we want to scan.
613 *
614 * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
615 * attr bmbt. Otherwise, take ILOCK_SHARED.
616 */
617 static inline unsigned int
xrep_rmap_scan_ilock(struct xfs_inode * ip)618 xrep_rmap_scan_ilock(
619 struct xfs_inode *ip)
620 {
621 uint lock_mode = XFS_ILOCK_SHARED;
622
623 if (xfs_need_iread_extents(&ip->i_df)) {
624 lock_mode = XFS_ILOCK_EXCL;
625 goto lock;
626 }
627
628 if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
629 lock_mode = XFS_ILOCK_EXCL;
630
631 lock:
632 xfs_ilock(ip, lock_mode);
633 return lock_mode;
634 }
635
636 /* Record reverse mappings for a file. */
637 STATIC int
xrep_rmap_scan_inode(struct xrep_rmap * rr,struct xfs_inode * ip)638 xrep_rmap_scan_inode(
639 struct xrep_rmap *rr,
640 struct xfs_inode *ip)
641 {
642 unsigned int lock_mode = xrep_rmap_scan_ilock(ip);
643 int error;
644
645 /* Check the data fork. */
646 error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
647 if (error)
648 goto out_unlock;
649
650 /* Check the attr fork. */
651 error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
652 if (error)
653 goto out_unlock;
654
655 /* COW fork extents are "owned" by the refcount btree. */
656
657 xchk_iscan_mark_visited(&rr->iscan, ip);
658 out_unlock:
659 xfs_iunlock(ip, lock_mode);
660 return error;
661 }
662
663 /* Section (I): Find all AG metadata extents except for free space metadata. */
664
665 struct xrep_rmap_inodes {
666 struct xrep_rmap *rr;
667 struct xagb_bitmap inobt_blocks; /* INOBIT */
668 struct xagb_bitmap ichunk_blocks; /* ICHUNKBIT */
669 };
670
671 /* Record inode btree rmaps. */
672 STATIC int
xrep_rmap_walk_inobt(struct xfs_btree_cur * cur,const union xfs_btree_rec * rec,void * priv)673 xrep_rmap_walk_inobt(
674 struct xfs_btree_cur *cur,
675 const union xfs_btree_rec *rec,
676 void *priv)
677 {
678 struct xfs_inobt_rec_incore irec;
679 struct xrep_rmap_inodes *ri = priv;
680 struct xfs_mount *mp = cur->bc_mp;
681 xfs_agblock_t agbno;
682 xfs_extlen_t aglen;
683 xfs_agino_t agino;
684 xfs_agino_t iperhole;
685 unsigned int i;
686 int error;
687
688 /* Record the inobt blocks. */
689 error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
690 if (error)
691 return error;
692
693 xfs_inobt_btrec_to_irec(mp, rec, &irec);
694 if (xfs_inobt_check_irec(to_perag(cur->bc_group), &irec) != NULL)
695 return -EFSCORRUPTED;
696
697 agino = irec.ir_startino;
698
699 /* Record a non-sparse inode chunk. */
700 if (!xfs_inobt_issparse(irec.ir_holemask)) {
701 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
702 aglen = max_t(xfs_extlen_t, 1,
703 XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);
704
705 return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
706 }
707
708 /* Iterate each chunk. */
709 iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
710 XFS_INODES_PER_HOLEMASK_BIT);
711 aglen = iperhole / mp->m_sb.sb_inopblock;
712 for (i = 0, agino = irec.ir_startino;
713 i < XFS_INOBT_HOLEMASK_BITS;
714 i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
715 /* Skip holes. */
716 if (irec.ir_holemask & (1 << i))
717 continue;
718
719 /* Record the inode chunk otherwise. */
720 agbno = XFS_AGINO_TO_AGBNO(mp, agino);
721 error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
722 if (error)
723 return error;
724 }
725
726 return 0;
727 }
728
729 /* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
730 STATIC int
xrep_rmap_find_inode_rmaps(struct xrep_rmap * rr)731 xrep_rmap_find_inode_rmaps(
732 struct xrep_rmap *rr)
733 {
734 struct xrep_rmap_inodes ri = {
735 .rr = rr,
736 };
737 struct xfs_scrub *sc = rr->sc;
738 int error;
739
740 xagb_bitmap_init(&ri.inobt_blocks);
741 xagb_bitmap_init(&ri.ichunk_blocks);
742
743 /*
744 * Iterate every record in the inobt so we can capture all the inode
745 * chunks and the blocks in the inobt itself.
746 */
747 error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
748 if (error)
749 goto out_bitmap;
750
751 /*
752 * Note that if there are zero records in the inobt then query_all does
753 * nothing and we have to account the empty inobt root manually.
754 */
755 if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
756 struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
757
758 error = xagb_bitmap_set(&ri.inobt_blocks,
759 be32_to_cpu(agi->agi_root), 1);
760 if (error)
761 goto out_bitmap;
762 }
763
764 /* Scan the finobt too. */
765 if (xfs_has_finobt(sc->mp)) {
766 error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
767 sc->sa.fino_cur);
768 if (error)
769 goto out_bitmap;
770 }
771
772 /* Generate rmaps for everything. */
773 error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
774 &XFS_RMAP_OINFO_INOBT);
775 if (error)
776 goto out_bitmap;
777 error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
778 &XFS_RMAP_OINFO_INODES);
779
780 out_bitmap:
781 xagb_bitmap_destroy(&ri.inobt_blocks);
782 xagb_bitmap_destroy(&ri.ichunk_blocks);
783 return error;
784 }
785
786 /* Record a CoW staging extent. */
787 STATIC int
xrep_rmap_walk_cowblocks(struct xfs_btree_cur * cur,const struct xfs_refcount_irec * irec,void * priv)788 xrep_rmap_walk_cowblocks(
789 struct xfs_btree_cur *cur,
790 const struct xfs_refcount_irec *irec,
791 void *priv)
792 {
793 struct xagb_bitmap *bitmap = priv;
794
795 if (!xfs_refcount_check_domain(irec) ||
796 irec->rc_domain != XFS_REFC_DOMAIN_COW)
797 return -EFSCORRUPTED;
798
799 return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount);
800 }
801
802 /*
803 * Collect rmaps for the blocks containing the refcount btree, and all CoW
804 * staging extents.
805 */
806 STATIC int
xrep_rmap_find_refcount_rmaps(struct xrep_rmap * rr)807 xrep_rmap_find_refcount_rmaps(
808 struct xrep_rmap *rr)
809 {
810 struct xagb_bitmap refcountbt_blocks; /* REFCBIT */
811 struct xagb_bitmap cow_blocks; /* COWBIT */
812 struct xfs_refcount_irec low = {
813 .rc_startblock = 0,
814 .rc_domain = XFS_REFC_DOMAIN_COW,
815 };
816 struct xfs_refcount_irec high = {
817 .rc_startblock = -1U,
818 .rc_domain = XFS_REFC_DOMAIN_COW,
819 };
820 struct xfs_scrub *sc = rr->sc;
821 int error;
822
823 if (!xfs_has_reflink(sc->mp))
824 return 0;
825
826 xagb_bitmap_init(&refcountbt_blocks);
827 xagb_bitmap_init(&cow_blocks);
828
829 /* refcountbt */
830 error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
831 if (error)
832 goto out_bitmap;
833
834 /* Collect rmaps for CoW staging extents. */
835 error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
836 xrep_rmap_walk_cowblocks, &cow_blocks);
837 if (error)
838 goto out_bitmap;
839
840 /* Generate rmaps for everything. */
841 error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
842 if (error)
843 goto out_bitmap;
844 error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
845 &XFS_RMAP_OINFO_REFC);
846
847 out_bitmap:
848 xagb_bitmap_destroy(&cow_blocks);
849 xagb_bitmap_destroy(&refcountbt_blocks);
850 return error;
851 }
852
853 /* Generate rmaps for the AG headers (AGI/AGF/AGFL) */
854 STATIC int
xrep_rmap_find_agheader_rmaps(struct xrep_rmap * rr)855 xrep_rmap_find_agheader_rmaps(
856 struct xrep_rmap *rr)
857 {
858 struct xfs_scrub *sc = rr->sc;
859
860 /* Create a record for the AG sb->agfl. */
861 return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp),
862 XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1,
863 XFS_RMAP_OWN_FS, 0, 0);
864 }
865
866 /* Generate rmaps for the log, if it's in this AG. */
867 STATIC int
xrep_rmap_find_log_rmaps(struct xrep_rmap * rr)868 xrep_rmap_find_log_rmaps(
869 struct xrep_rmap *rr)
870 {
871 struct xfs_scrub *sc = rr->sc;
872
873 if (!xfs_ag_contains_log(sc->mp, pag_agno(sc->sa.pag)))
874 return 0;
875
876 return xrep_rmap_stash(rr,
877 XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart),
878 sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
879 }
880
881 /* Check and count all the records that we gathered. */
882 STATIC int
xrep_rmap_check_record(struct xfs_btree_cur * cur,const struct xfs_rmap_irec * rec,void * priv)883 xrep_rmap_check_record(
884 struct xfs_btree_cur *cur,
885 const struct xfs_rmap_irec *rec,
886 void *priv)
887 {
888 struct xrep_rmap *rr = priv;
889 int error;
890
891 error = xrep_rmap_check_mapping(rr->sc, rec);
892 if (error)
893 return error;
894
895 rr->nr_records++;
896 return 0;
897 }
898
899 /*
900 * Generate all the reverse-mappings for this AG, a list of the old rmapbt
901 * blocks, and the new btreeblks count. Figure out if we have enough free
902 * space to reconstruct the inode btrees. The caller must clean up the lists
903 * if anything goes wrong. This implements section (I) above.
904 */
905 STATIC int
xrep_rmap_find_rmaps(struct xrep_rmap * rr)906 xrep_rmap_find_rmaps(
907 struct xrep_rmap *rr)
908 {
909 struct xfs_scrub *sc = rr->sc;
910 struct xchk_ag *sa = &sc->sa;
911 struct xfs_inode *ip;
912 struct xfs_btree_cur *mcur;
913 int error;
914
915 /* Find all the per-AG metadata. */
916 xrep_ag_btcur_init(sc, &sc->sa);
917
918 error = xrep_rmap_find_inode_rmaps(rr);
919 if (error)
920 goto end_agscan;
921
922 error = xrep_rmap_find_refcount_rmaps(rr);
923 if (error)
924 goto end_agscan;
925
926 error = xrep_rmap_find_agheader_rmaps(rr);
927 if (error)
928 goto end_agscan;
929
930 error = xrep_rmap_find_log_rmaps(rr);
931 end_agscan:
932 xchk_ag_btcur_free(&sc->sa);
933 if (error)
934 return error;
935
936 /*
937 * Set up for a potentially lengthy filesystem scan by reducing our
938 * transaction resource usage for the duration. Specifically:
939 *
940 * Unlock the AG header buffers and cancel the transaction to release
941 * the log grant space while we scan the filesystem.
942 *
943 * Create a new empty transaction to eliminate the possibility of the
944 * inode scan deadlocking on cyclical metadata.
945 *
946 * We pass the empty transaction to the file scanning function to avoid
947 * repeatedly cycling empty transactions. This can be done even though
948 * we take the IOLOCK to quiesce the file because empty transactions
949 * do not take sb_internal.
950 */
951 sa->agf_bp = NULL;
952 sa->agi_bp = NULL;
953 xchk_trans_cancel(sc);
954 xchk_trans_alloc_empty(sc);
955
956 /* Iterate all AGs for inodes rmaps. */
957 while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
958 error = xrep_rmap_scan_inode(rr, ip);
959 xchk_irele(sc, ip);
960 if (error)
961 break;
962
963 if (xchk_should_terminate(sc, &error))
964 break;
965 }
966 xchk_iscan_iter_finish(&rr->iscan);
967 if (error)
968 return error;
969
970 /*
971 * Switch out for a real transaction and lock the AG headers in
972 * preparation for building a new tree.
973 */
974 xchk_trans_cancel(sc);
975 error = xchk_setup_fs(sc);
976 if (error)
977 return error;
978 error = xchk_perag_drain_and_lock(sc);
979 if (error)
980 return error;
981
982 /*
983 * If a hook failed to update the in-memory btree, we lack the data to
984 * continue the repair.
985 */
986 if (xchk_iscan_aborted(&rr->iscan))
987 return -EFSCORRUPTED;
988
989 /*
990 * Now that we have everything locked again, we need to count the
991 * number of rmap records stashed in the btree. This should reflect
992 * all actively-owned space in the filesystem. At the same time, check
993 * all our records before we start building a new btree, which requires
994 * a bnobt cursor.
995 */
996 mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
997 sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
998 sc->sa.pag);
999
1000 rr->nr_records = 0;
1001 error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);
1002
1003 xfs_btree_del_cursor(sc->sa.bno_cur, error);
1004 sc->sa.bno_cur = NULL;
1005 xfs_btree_del_cursor(mcur, error);
1006
1007 return error;
1008 }
1009
1010 /* Section (II): Reserving space for new rmapbt and setting free space bitmap */
1011
1012 struct xrep_rmap_agfl {
1013 struct xagb_bitmap *bitmap;
1014 xfs_agnumber_t agno;
1015 };
1016
1017 /* Add an AGFL block to the rmap list. */
1018 STATIC int
xrep_rmap_walk_agfl(struct xfs_mount * mp,xfs_agblock_t agbno,void * priv)1019 xrep_rmap_walk_agfl(
1020 struct xfs_mount *mp,
1021 xfs_agblock_t agbno,
1022 void *priv)
1023 {
1024 struct xrep_rmap_agfl *ra = priv;
1025
1026 return xagb_bitmap_set(ra->bitmap, agbno, 1);
1027 }
1028
1029 /*
1030 * Run one round of reserving space for the new rmapbt and recomputing the
1031 * number of blocks needed to store the previously observed rmapbt records and
1032 * the ones we'll create for the free space metadata. When we don't need more
1033 * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
1034 * true.
1035 */
1036 STATIC int
xrep_rmap_try_reserve(struct xrep_rmap * rr,struct xfs_btree_cur * rmap_cur,struct xagb_bitmap * freesp_blocks,uint64_t * blocks_reserved,bool * done)1037 xrep_rmap_try_reserve(
1038 struct xrep_rmap *rr,
1039 struct xfs_btree_cur *rmap_cur,
1040 struct xagb_bitmap *freesp_blocks,
1041 uint64_t *blocks_reserved,
1042 bool *done)
1043 {
1044 struct xrep_rmap_agfl ra = {
1045 .bitmap = freesp_blocks,
1046 .agno = pag_agno(rr->sc->sa.pag),
1047 };
1048 struct xfs_scrub *sc = rr->sc;
1049 struct xrep_newbt_resv *resv, *n;
1050 struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
1051 struct xfs_buf *agfl_bp;
1052 uint64_t nr_blocks; /* RMB */
1053 uint64_t freesp_records;
1054 int error;
1055
1056 /*
1057 * We're going to recompute new_btree.bload.nr_blocks at the end of
1058 * this function to reflect however many btree blocks we need to store
1059 * all the rmap records (including the ones that reflect the changes we
1060 * made to support the new rmapbt blocks), so we save the old value
1061 * here so we can decide if we've reserved enough blocks.
1062 */
1063 nr_blocks = rr->new_btree.bload.nr_blocks;
1064
1065 /*
1066 * Make sure we've reserved enough space for the new btree. This can
1067 * change the shape of the free space btrees, which can cause secondary
1068 * interactions with the rmap records because all three space btrees
1069 * have the same rmap owner. We'll account for all that below.
1070 */
1071 error = xrep_newbt_alloc_blocks(&rr->new_btree,
1072 nr_blocks - *blocks_reserved);
1073 if (error)
1074 return error;
1075
1076 *blocks_reserved = rr->new_btree.bload.nr_blocks;
1077
1078 /* Clear everything in the bitmap. */
1079 xagb_bitmap_destroy(freesp_blocks);
1080
1081 /* Set all the bnobt blocks in the bitmap. */
1082 sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1083 sc->sa.pag);
1084 error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur);
1085 xfs_btree_del_cursor(sc->sa.bno_cur, error);
1086 sc->sa.bno_cur = NULL;
1087 if (error)
1088 return error;
1089
1090 /* Set all the cntbt blocks in the bitmap. */
1091 sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1092 sc->sa.pag);
1093 error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur);
1094 xfs_btree_del_cursor(sc->sa.cnt_cur, error);
1095 sc->sa.cnt_cur = NULL;
1096 if (error)
1097 return error;
1098
1099 /* Record our new btreeblks value. */
1100 rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2;
1101
1102 /* Set all the new rmapbt blocks in the bitmap. */
1103 list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) {
1104 error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len);
1105 if (error)
1106 return error;
1107 }
1108
1109 /* Set all the AGFL blocks in the bitmap. */
1110 error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
1111 if (error)
1112 return error;
1113
1114 error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra);
1115 if (error)
1116 return error;
1117
1118 /* Count the extents in the bitmap. */
1119 freesp_records = xagb_bitmap_count_set_regions(freesp_blocks);
1120
1121 /* Compute how many blocks we'll need for all the rmaps. */
1122 error = xfs_btree_bload_compute_geometry(rmap_cur,
1123 &rr->new_btree.bload, rr->nr_records + freesp_records);
1124 if (error)
1125 return error;
1126
1127 /* We're done when we don't need more blocks. */
1128 *done = nr_blocks >= rr->new_btree.bload.nr_blocks;
1129 return 0;
1130 }
1131
1132 /*
1133 * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for
1134 * the free space metadata. This implements section (II) above.
1135 */
1136 STATIC int
xrep_rmap_reserve_space(struct xrep_rmap * rr,struct xfs_btree_cur * rmap_cur)1137 xrep_rmap_reserve_space(
1138 struct xrep_rmap *rr,
1139 struct xfs_btree_cur *rmap_cur)
1140 {
1141 struct xagb_bitmap freesp_blocks; /* AGBIT */
1142 uint64_t blocks_reserved = 0;
1143 bool done = false;
1144 int error;
1145
1146 /* Compute how many blocks we'll need for the rmaps collected so far. */
1147 error = xfs_btree_bload_compute_geometry(rmap_cur,
1148 &rr->new_btree.bload, rr->nr_records);
1149 if (error)
1150 return error;
1151
1152 /* Last chance to abort before we start committing fixes. */
1153 if (xchk_should_terminate(rr->sc, &error))
1154 return error;
1155
1156 xagb_bitmap_init(&freesp_blocks);
1157
1158 /*
1159 * Iteratively reserve space for the new rmapbt and recompute the
1160 * number of blocks needed to store the previously observed rmapbt
1161 * records and the ones we'll create for the free space metadata.
1162 * Finish when we don't need more blocks.
1163 */
1164 do {
1165 error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks,
1166 &blocks_reserved, &done);
1167 if (error)
1168 goto out_bitmap;
1169 } while (!done);
1170
1171 /* Emit rmaps for everything in the free space bitmap. */
1172 xrep_ag_btcur_init(rr->sc, &rr->sc->sa);
1173 error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG);
1174 xchk_ag_btcur_free(&rr->sc->sa);
1175
1176 out_bitmap:
1177 xagb_bitmap_destroy(&freesp_blocks);
1178 return error;
1179 }
1180
1181 /* Section (III): Building the new rmap btree. */
1182
1183 /* Update the AGF counters. */
1184 STATIC int
xrep_rmap_reset_counters(struct xrep_rmap * rr)1185 xrep_rmap_reset_counters(
1186 struct xrep_rmap *rr)
1187 {
1188 struct xfs_scrub *sc = rr->sc;
1189 struct xfs_perag *pag = sc->sa.pag;
1190 struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
1191 xfs_agblock_t rmap_btblocks;
1192
1193 /*
1194 * The AGF header contains extra information related to the reverse
1195 * mapping btree, so we must update those fields here.
1196 */
1197 rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
1198 agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
1199 xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);
1200
1201 /*
1202 * After we commit the new btree to disk, it is possible that the
1203 * process to reap the old btree blocks will race with the AIL trying
1204 * to checkpoint the old btree blocks into the filesystem. If the new
1205 * tree is shorter than the old one, the rmapbt write verifier will
1206 * fail and the AIL will shut down the filesystem.
1207 *
1208 * To avoid this, save the old incore btree height values as the alt
1209 * height values before re-initializing the perag info from the updated
1210 * AGF to capture all the new values.
1211 */
1212 pag->pagf_repair_rmap_level = pag->pagf_rmap_level;
1213
1214 /* Reinitialize with the values we just logged. */
1215 return xrep_reinit_pagf(sc);
1216 }
1217
1218 /* Retrieve rmapbt data for bulk load. */
1219 STATIC int
xrep_rmap_get_records(struct xfs_btree_cur * cur,unsigned int idx,struct xfs_btree_block * block,unsigned int nr_wanted,void * priv)1220 xrep_rmap_get_records(
1221 struct xfs_btree_cur *cur,
1222 unsigned int idx,
1223 struct xfs_btree_block *block,
1224 unsigned int nr_wanted,
1225 void *priv)
1226 {
1227 struct xrep_rmap *rr = priv;
1228 union xfs_btree_rec *block_rec;
1229 unsigned int loaded;
1230 int error;
1231
1232 for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
1233 int stat = 0;
1234
1235 error = xfs_btree_increment(rr->mcur, 0, &stat);
1236 if (error)
1237 return error;
1238 if (!stat)
1239 return -EFSCORRUPTED;
1240
1241 error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
1242 if (error)
1243 return error;
1244 if (!stat)
1245 return -EFSCORRUPTED;
1246
1247 block_rec = xfs_btree_rec_addr(cur, idx, block);
1248 cur->bc_ops->init_rec_from_cur(cur, block_rec);
1249 }
1250
1251 return loaded;
1252 }
1253
1254 /* Feed one of the new btree blocks to the bulk loader. */
1255 STATIC int
xrep_rmap_claim_block(struct xfs_btree_cur * cur,union xfs_btree_ptr * ptr,void * priv)1256 xrep_rmap_claim_block(
1257 struct xfs_btree_cur *cur,
1258 union xfs_btree_ptr *ptr,
1259 void *priv)
1260 {
1261 struct xrep_rmap *rr = priv;
1262
1263 return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
1264 }
1265
1266 /* Custom allocation function for new rmap btrees. */
1267 STATIC int
xrep_rmap_alloc_vextent(struct xfs_scrub * sc,struct xfs_alloc_arg * args,xfs_fsblock_t alloc_hint)1268 xrep_rmap_alloc_vextent(
1269 struct xfs_scrub *sc,
1270 struct xfs_alloc_arg *args,
1271 xfs_fsblock_t alloc_hint)
1272 {
1273 int error;
1274
1275 /*
1276 * We don't want an rmap update on the allocation, since we iteratively
1277 * compute the OWN_AG records /after/ allocating blocks for the records
1278 * that we already know we need to store. Therefore, fix the freelist
1279 * with the NORMAP flag set so that we don't also try to create an rmap
1280 * for new AGFL blocks.
1281 */
1282 error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
1283 if (error)
1284 return error;
1285
1286 /*
1287 * If xrep_fix_freelist fixed the freelist by moving blocks from the
1288 * free space btrees or by removing blocks from the AGFL and queueing
1289 * an EFI to free the block, the transaction will be dirty. This
1290 * second case is of interest to us.
1291 *
1292 * Later on, we will need to compare gaps in the new recordset against
1293 * the block usage of all OWN_AG owners in order to free the old
1294 * btree's blocks, which means that we can't have EFIs for former AGFL
1295 * blocks attached to the repair transaction when we commit the new
1296 * btree.
1297 *
1298 * xrep_newbt_alloc_blocks guarantees this for us by calling
1299 * xrep_defer_finish to commit anything that fix_freelist may have
1300 * added to the transaction.
1301 */
1302 return xfs_alloc_vextent_near_bno(args, alloc_hint);
1303 }
1304
1305
1306 /* Count the records in this btree. */
1307 STATIC int
xrep_rmap_count_records(struct xfs_btree_cur * cur,unsigned long long * nr)1308 xrep_rmap_count_records(
1309 struct xfs_btree_cur *cur,
1310 unsigned long long *nr)
1311 {
1312 int running = 1;
1313 int error;
1314
1315 *nr = 0;
1316
1317 error = xfs_btree_goto_left_edge(cur);
1318 if (error)
1319 return error;
1320
1321 while (running && !(error = xfs_btree_increment(cur, 0, &running))) {
1322 if (running)
1323 (*nr)++;
1324 }
1325
1326 return error;
1327 }
1328 /*
1329 * Use the collected rmap information to stage a new rmap btree. If this is
1330 * successful we'll return with the new btree root information logged to the
1331 * repair transaction but not yet committed. This implements section (III)
1332 * above.
1333 */
1334 STATIC int
xrep_rmap_build_new_tree(struct xrep_rmap * rr)1335 xrep_rmap_build_new_tree(
1336 struct xrep_rmap *rr)
1337 {
1338 struct xfs_scrub *sc = rr->sc;
1339 struct xfs_perag *pag = sc->sa.pag;
1340 struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
1341 struct xfs_btree_cur *rmap_cur;
1342 int error;
1343
1344 /*
1345 * Preserve the old rmapbt block count so that we can adjust the
1346 * per-AG rmapbt reservation after we commit the new btree root and
1347 * want to dispose of the old btree blocks.
1348 */
1349 rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks);
1350
1351 /*
1352 * Prepare to construct the new btree by reserving disk space for the
1353 * new btree and setting up all the accounting information we'll need
1354 * to root the new btree while it's under construction and before we
1355 * attach it to the AG header. The new blocks are accounted to the
1356 * rmapbt per-AG reservation, which we will adjust further after
1357 * committing the new btree.
1358 */
1359 xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
1360 xfs_agbno_to_fsb(pag, XFS_RMAP_BLOCK(sc->mp)),
1361 XFS_AG_RESV_RMAPBT);
1362 rr->new_btree.bload.get_records = xrep_rmap_get_records;
1363 rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
1364 rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
1365 rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag);
1366 xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake);
1367
1368 /*
1369 * Initialize @rr->new_btree, reserve space for the new rmapbt,
1370 * and compute OWN_AG rmaps.
1371 */
1372 error = xrep_rmap_reserve_space(rr, rmap_cur);
1373 if (error)
1374 goto err_cur;
1375
1376 /*
1377 * Count the rmapbt records again, because the space reservation
1378 * for the rmapbt itself probably added more records to the btree.
1379 */
1380 rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL,
1381 &rr->rmap_btree);
1382
1383 error = xrep_rmap_count_records(rr->mcur, &rr->nr_records);
1384 if (error)
1385 goto err_mcur;
1386
1387 /*
1388 * Due to btree slack factors, it's possible for a new btree to be one
1389 * level taller than the old btree. Update the incore btree height so
1390 * that we don't trip the verifiers when writing the new btree blocks
1391 * to disk.
1392 */
1393 pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height;
1394
1395 /*
1396 * Move the cursor to the left edge of the tree so that the first
1397 * increment in ->get_records positions us at the first record.
1398 */
1399 error = xfs_btree_goto_left_edge(rr->mcur);
1400 if (error)
1401 goto err_level;
1402
1403 /* Add all observed rmap records. */
1404 error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
1405 if (error)
1406 goto err_level;
1407
1408 /*
1409 * Install the new btree in the AG header. After this point the old
1410 * btree is no longer accessible and the new tree is live.
1411 */
1412 xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp);
1413 xfs_btree_del_cursor(rmap_cur, 0);
1414 xfs_btree_del_cursor(rr->mcur, 0);
1415 rr->mcur = NULL;
1416
1417 /*
1418 * Now that we've written the new btree to disk, we don't need to keep
1419 * updating the in-memory btree. Abort the scan to stop live updates.
1420 */
1421 xchk_iscan_abort(&rr->iscan);
1422
1423 /*
1424 * The newly committed rmap recordset includes mappings for the blocks
1425 * that we reserved to build the new btree. If there is excess space
1426 * reservation to be freed, the corresponding rmap records must also be
1427 * removed.
1428 */
1429 rr->new_btree.oinfo = XFS_RMAP_OINFO_AG;
1430
1431 /* Reset the AGF counters now that we've changed the btree shape. */
1432 error = xrep_rmap_reset_counters(rr);
1433 if (error)
1434 goto err_newbt;
1435
1436 /* Dispose of any unused blocks and the accounting information. */
1437 error = xrep_newbt_commit(&rr->new_btree);
1438 if (error)
1439 return error;
1440
1441 return xrep_roll_ag_trans(sc);
1442
1443 err_level:
1444 pag->pagf_repair_rmap_level = 0;
1445 err_mcur:
1446 xfs_btree_del_cursor(rr->mcur, error);
1447 err_cur:
1448 xfs_btree_del_cursor(rmap_cur, error);
1449 err_newbt:
1450 xrep_newbt_cancel(&rr->new_btree);
1451 return error;
1452 }
1453
1454 /* Section (IV): Reaping the old btree. */
1455
1456 struct xrep_rmap_find_gaps {
1457 struct xagb_bitmap rmap_gaps;
1458 xfs_agblock_t next_agbno;
1459 };
1460
1461 /* Subtract each free extent in the bnobt from the rmap gaps. */
1462 STATIC int
xrep_rmap_find_freesp(struct xfs_btree_cur * cur,const struct xfs_alloc_rec_incore * rec,void * priv)1463 xrep_rmap_find_freesp(
1464 struct xfs_btree_cur *cur,
1465 const struct xfs_alloc_rec_incore *rec,
1466 void *priv)
1467 {
1468 struct xrep_rmap_find_gaps *rfg = priv;
1469
1470 return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock,
1471 rec->ar_blockcount);
1472 }
1473
1474 /* Record the free space we find, as part of cleaning out the btree. */
1475 STATIC int
xrep_rmap_find_gaps(struct xfs_btree_cur * cur,const struct xfs_rmap_irec * rec,void * priv)1476 xrep_rmap_find_gaps(
1477 struct xfs_btree_cur *cur,
1478 const struct xfs_rmap_irec *rec,
1479 void *priv)
1480 {
1481 struct xrep_rmap_find_gaps *rfg = priv;
1482 int error;
1483
1484 if (rec->rm_startblock > rfg->next_agbno) {
1485 error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno,
1486 rec->rm_startblock - rfg->next_agbno);
1487 if (error)
1488 return error;
1489 }
1490
1491 rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno,
1492 rec->rm_startblock + rec->rm_blockcount);
1493 return 0;
1494 }
1495
1496 /*
1497 * Reap the old rmapbt blocks. Now that the rmapbt is fully rebuilt, we make
1498 * a list of gaps in the rmap records and a list of the extents mentioned in
1499 * the bnobt. Any block that's in the new rmapbt gap list but not mentioned
1500 * in the bnobt is a block from the old rmapbt and can be removed.
1501 */
1502 STATIC int
xrep_rmap_remove_old_tree(struct xrep_rmap * rr)1503 xrep_rmap_remove_old_tree(
1504 struct xrep_rmap *rr)
1505 {
1506 struct xrep_rmap_find_gaps rfg = {
1507 .next_agbno = 0,
1508 };
1509 struct xfs_scrub *sc = rr->sc;
1510 struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
1511 struct xfs_perag *pag = sc->sa.pag;
1512 struct xfs_btree_cur *mcur;
1513 xfs_agblock_t agend;
1514 int error;
1515
1516 xagb_bitmap_init(&rfg.rmap_gaps);
1517
1518 /* Compute free space from the new rmapbt. */
1519 mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
1520
1521 error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg);
1522 xfs_btree_del_cursor(mcur, error);
1523 if (error)
1524 goto out_bitmap;
1525
1526 /* Insert a record for space between the last rmap and EOAG. */
1527 agend = be32_to_cpu(agf->agf_length);
1528 if (rfg.next_agbno < agend) {
1529 error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno,
1530 agend - rfg.next_agbno);
1531 if (error)
1532 goto out_bitmap;
1533 }
1534
1535 /* Compute free space from the existing bnobt. */
1536 sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
1537 sc->sa.pag);
1538 error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp,
1539 &rfg);
1540 xfs_btree_del_cursor(sc->sa.bno_cur, error);
1541 sc->sa.bno_cur = NULL;
1542 if (error)
1543 goto out_bitmap;
1544
1545 /*
1546 * Free the "free" blocks that the new rmapbt knows about but the bnobt
1547 * doesn't--these are the old rmapbt blocks. Credit the old rmapbt
1548 * block usage count back to the per-AG rmapbt reservation (and not
1549 * fdblocks, since the rmap btree lives in free space) to keep the
1550 * reservation and free space accounting correct.
1551 */
1552 error = xrep_reap_agblocks(sc, &rfg.rmap_gaps,
1553 &XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT);
1554 if (error)
1555 goto out_bitmap;
1556
1557 /*
1558 * Now that we've zapped all the old rmapbt blocks we can turn off
1559 * the alternate height mechanism and reset the per-AG space
1560 * reservation.
1561 */
1562 pag->pagf_repair_rmap_level = 0;
1563 sc->flags |= XREP_RESET_PERAG_RESV;
1564 out_bitmap:
1565 xagb_bitmap_destroy(&rfg.rmap_gaps);
1566 return error;
1567 }
1568
1569 static inline bool
xrep_rmapbt_want_live_update(struct xchk_iscan * iscan,const struct xfs_owner_info * oi)1570 xrep_rmapbt_want_live_update(
1571 struct xchk_iscan *iscan,
1572 const struct xfs_owner_info *oi)
1573 {
1574 if (xchk_iscan_aborted(iscan))
1575 return false;
1576
1577 /*
1578 * Before unlocking the AG header to perform the inode scan, we
1579 * recorded reverse mappings for all AG metadata except for the OWN_AG
1580 * metadata. IOWs, the in-memory btree knows about the AG headers, the
1581 * two inode btrees, the CoW staging extents, and the refcount btrees.
1582 * For these types of metadata, we need to record the live updates in
1583 * the in-memory rmap btree.
1584 *
1585 * However, we do not scan the free space btrees or the AGFL until we
1586 * have re-locked the AGF and are ready to reserve space for the new
1587 * rmap btree, so we do not want live updates for OWN_AG metadata.
1588 */
1589 if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
1590 return oi->oi_owner != XFS_RMAP_OWN_AG;
1591
1592 /* Ignore updates to files that the scanner hasn't visited yet. */
1593 return xchk_iscan_want_live_update(iscan, oi->oi_owner);
1594 }
1595
1596 /*
1597 * Apply a rmapbt update from the regular filesystem into our shadow btree.
1598 * We're running from the thread that owns the AGF buffer and is generating
1599 * the update, so we must be careful about which parts of the struct xrep_rmap
1600 * that we change.
1601 */
1602 static int
xrep_rmapbt_live_update(struct notifier_block * nb,unsigned long action,void * data)1603 xrep_rmapbt_live_update(
1604 struct notifier_block *nb,
1605 unsigned long action,
1606 void *data)
1607 {
1608 struct xfs_rmap_update_params *p = data;
1609 struct xrep_rmap *rr;
1610 struct xfs_mount *mp;
1611 struct xfs_btree_cur *mcur;
1612 struct xfs_trans *tp;
1613 int error;
1614
1615 rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
1616 mp = rr->sc->mp;
1617
1618 if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
1619 goto out_unlock;
1620
1621 trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p);
1622
1623 tp = xfs_trans_alloc_empty(mp);
1624
1625 mutex_lock(&rr->lock);
1626 mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
1627 error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
1628 p->blockcount, &p->oinfo, p->unwritten);
1629 xfs_btree_del_cursor(mcur, error);
1630 if (error)
1631 goto out_cancel;
1632
1633 error = xfbtree_trans_commit(&rr->rmap_btree, tp);
1634 if (error)
1635 goto out_cancel;
1636
1637 xfs_trans_cancel(tp);
1638 mutex_unlock(&rr->lock);
1639 return NOTIFY_DONE;
1640
1641 out_cancel:
1642 xfbtree_trans_cancel(&rr->rmap_btree, tp);
1643 xfs_trans_cancel(tp);
1644 mutex_unlock(&rr->lock);
1645 xchk_iscan_abort(&rr->iscan);
1646 out_unlock:
1647 return NOTIFY_DONE;
1648 }
1649
1650 /* Set up the filesystem scan components. */
1651 STATIC int
xrep_rmap_setup_scan(struct xrep_rmap * rr)1652 xrep_rmap_setup_scan(
1653 struct xrep_rmap *rr)
1654 {
1655 struct xfs_scrub *sc = rr->sc;
1656 int error;
1657
1658 mutex_init(&rr->lock);
1659
1660 /* Set up in-memory rmap btree */
1661 error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
1662 pag_agno(sc->sa.pag));
1663 if (error)
1664 goto out_mutex;
1665
1666 /* Retry iget every tenth of a second for up to 30 seconds. */
1667 xchk_iscan_start(sc, 30000, 100, &rr->iscan);
1668
1669 /*
1670 * Hook into live rmap operations so that we can update our in-memory
1671 * btree to reflect live changes on the filesystem. Since we drop the
1672 * AGF buffer to scan all the inodes, we need this piece to avoid
1673 * installing a stale btree.
1674 */
1675 ASSERT(sc->flags & XCHK_FSGATES_RMAP);
1676 xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update);
1677 error = xfs_rmap_hook_add(pag_group(sc->sa.pag), &rr->rhook);
1678 if (error)
1679 goto out_iscan;
1680 return 0;
1681
1682 out_iscan:
1683 xchk_iscan_teardown(&rr->iscan);
1684 xfbtree_destroy(&rr->rmap_btree);
1685 out_mutex:
1686 mutex_destroy(&rr->lock);
1687 return error;
1688 }
1689
1690 /* Tear down scan components. */
1691 STATIC void
xrep_rmap_teardown(struct xrep_rmap * rr)1692 xrep_rmap_teardown(
1693 struct xrep_rmap *rr)
1694 {
1695 struct xfs_scrub *sc = rr->sc;
1696
1697 xchk_iscan_abort(&rr->iscan);
1698 xfs_rmap_hook_del(pag_group(sc->sa.pag), &rr->rhook);
1699 xchk_iscan_teardown(&rr->iscan);
1700 xfbtree_destroy(&rr->rmap_btree);
1701 mutex_destroy(&rr->lock);
1702 }
1703
1704 /* Repair the rmap btree for some AG. */
1705 int
xrep_rmapbt(struct xfs_scrub * sc)1706 xrep_rmapbt(
1707 struct xfs_scrub *sc)
1708 {
1709 struct xrep_rmap *rr = sc->buf;
1710 int error;
1711
1712 error = xrep_rmap_setup_scan(rr);
1713 if (error)
1714 return error;
1715
1716 /*
1717 * Collect rmaps for everything in this AG that isn't space metadata.
1718 * These rmaps won't change even as we try to allocate blocks.
1719 */
1720 error = xrep_rmap_find_rmaps(rr);
1721 if (error)
1722 goto out_records;
1723
1724 /* Rebuild the rmap information. */
1725 error = xrep_rmap_build_new_tree(rr);
1726 if (error)
1727 goto out_records;
1728
1729 /* Kill the old tree. */
1730 error = xrep_rmap_remove_old_tree(rr);
1731 if (error)
1732 goto out_records;
1733
1734 out_records:
1735 xrep_rmap_teardown(rr);
1736 return error;
1737 }
1738