1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (c) 2018-2024 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs_platform.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_btree_staging.h"
15 #include "xfs_buf_mem.h"
16 #include "xfs_btree_mem.h"
17 #include "xfs_bit.h"
18 #include "xfs_log_format.h"
19 #include "xfs_trans.h"
20 #include "xfs_sb.h"
21 #include "xfs_alloc.h"
22 #include "xfs_alloc_btree.h"
23 #include "xfs_ialloc.h"
24 #include "xfs_ialloc_btree.h"
25 #include "xfs_rmap.h"
26 #include "xfs_rmap_btree.h"
27 #include "xfs_inode.h"
28 #include "xfs_icache.h"
29 #include "xfs_bmap.h"
30 #include "xfs_bmap_btree.h"
31 #include "xfs_refcount.h"
32 #include "xfs_refcount_btree.h"
33 #include "xfs_ag.h"
34 #include "xfs_rtrmap_btree.h"
35 #include "xfs_rtgroup.h"
36 #include "xfs_rtrefcount_btree.h"
37 #include "scrub/xfs_scrub.h"
38 #include "scrub/scrub.h"
39 #include "scrub/common.h"
40 #include "scrub/btree.h"
41 #include "scrub/trace.h"
42 #include "scrub/repair.h"
43 #include "scrub/bitmap.h"
44 #include "scrub/agb_bitmap.h"
45 #include "scrub/xfile.h"
46 #include "scrub/xfarray.h"
47 #include "scrub/iscan.h"
48 #include "scrub/newbt.h"
49 #include "scrub/reap.h"
50
51 /*
52 * Reverse Mapping Btree Repair
53 * ============================
54 *
55 * This is the most involved of all the AG space btree rebuilds. Everywhere
56 * else in XFS we lock inodes and then AG data structures, but generating the
57 * list of rmap records requires that we be able to scan both block mapping
58 * btrees of every inode in the filesystem to see if it owns any extents in
59 * this AG. We can't tolerate any inode updates while we do this, so we
60 * freeze the filesystem to lock everyone else out, and grant ourselves
61 * special privileges to run transactions with regular background reclamation
62 * turned off.
63 *
64 * We also have to be very careful not to allow inode reclaim to start a
65 * transaction because all transactions (other than our own) will block.
66 * Deferred inode inactivation helps us out there.
67 *
68 * I) Reverse mappings for all non-space metadata and file data are collected
69 * according to the following algorithm:
70 *
71 * 1. For each fork of each inode:
72 * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary.
73 * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate
74 * bmaps into rmap records (see 1.1.4). Set bits in BMBIT for each btree
75 * block.
76 * 1.3. If the incore extent map is loaded but the fork is in btree format,
77 * just visit the bmbt blocks to set the corresponding BMBIT areas.
78 * 1.4. From the incore extent map, accumulate each bmap that falls into our
79 * target AG. Remember, multiple bmap records can map to a single rmap
80 * record, so we cannot simply emit rmap records 1:1.
81 * 1.5. Emit rmap records for each extent in BMBIT and free it.
82 * 2. Create bitmaps INOBIT and ICHUNKBIT.
83 * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT,
84 * and set bits in INOBIT for each btree block. If the inobt has no records
85 * at all, we must be careful to record its root in INOBIT.
86 * 4. For each block in the finobt, set the corresponding INOBIT area.
87 * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them.
88 * 6. Create bitmaps REFCBIT and COWBIT.
89 * 7. For each CoW staging extent in the refcountbt, set the corresponding
90 * areas in COWBIT.
91 * 8. For each block in the refcountbt, set the corresponding REFCBIT area.
92 * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them.
93 * A. Emit rmap for the AG headers.
94 * B. Emit rmap for the log, if there is one.
95 *
96 * II) The rmapbt shape and space metadata rmaps are computed as follows:
97 *
98 * 1. Count the rmaps collected in the previous step. (= NR)
99 * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB)
100 * 3. Reserve RMB blocks through the newbt using the allocator in normap mode.
101 * 4. Create bitmap AGBIT.
102 * 5. For each reservation in the newbt, set the corresponding areas in AGBIT.
103 * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT.
104 * 7. Count the extents in AGBIT. (= AGNR)
105 * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB')
106 * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB',
107 * and clear AGBIT. Go to step 5.
108 * A. Emit rmaps for each extent in AGBIT.
109 *
110 * III) The rmapbt is constructed and set in place as follows:
111 *
112 * 1. Sort the rmap records.
113 * 2. Bulk load the rmaps.
114 *
115 * IV) Reap the old btree blocks.
116 *
117 * 1. Create a bitmap OLDRMBIT.
118 * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT.
119 * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT.
120 * 4. Reap the extents corresponding to the set areas in OLDRMBIT. These are
121 * the parts of the AG that the rmap didn't find during its scan of the
122 * primary metadata and aren't known to be in the free space, which implies
123 * that they were the old rmapbt blocks.
124 * 5. Commit.
125 *
126 * We use the 'xrep_rmap' prefix for all the rmap functions.
127 */
128
/* Context for collecting rmaps */
struct xrep_rmap {
	/* new rmapbt information */
	struct xrep_newbt	new_btree;

	/* lock for the xfbtree and xfile */
	struct mutex		lock;

	/* rmap records generated from primary metadata, staged in memory */
	struct xfbtree		rmap_btree;

	/* scrub context that owns this repair */
	struct xfs_scrub	*sc;

	/* in-memory btree cursor for the xfs_btree_bload iteration */
	struct xfs_btree_cur	*mcur;

	/* Hooks into rmap update code. */
	struct xfs_rmap_hook	rhook;

	/* inode scan cursor */
	struct xchk_iscan	iscan;

	/* Number of non-freespace records found. */
	unsigned long long	nr_records;

	/* bnobt/cntbt contribution to btreeblks */
	xfs_agblock_t		freesp_btblocks;

	/* old agf_rmap_blocks counter */
	unsigned int		old_rmapbt_fsbcount;
};
160
161 /* Set us up to repair reverse mapping btrees. */
162 int
xrep_setup_ag_rmapbt(struct xfs_scrub * sc)163 xrep_setup_ag_rmapbt(
164 struct xfs_scrub *sc)
165 {
166 struct xrep_rmap *rr;
167 int error;
168
169 xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
170
171 error = xrep_setup_xfbtree(sc, "reverse mapping records");
172 if (error)
173 return error;
174
175 rr = kzalloc_obj(struct xrep_rmap, XCHK_GFP_FLAGS);
176 if (!rr)
177 return -ENOMEM;
178
179 rr->sc = sc;
180 sc->buf = rr;
181 return 0;
182 }
183
184 /* Make sure there's nothing funny about this mapping. */
185 STATIC int
xrep_rmap_check_mapping(struct xfs_scrub * sc,const struct xfs_rmap_irec * rec)186 xrep_rmap_check_mapping(
187 struct xfs_scrub *sc,
188 const struct xfs_rmap_irec *rec)
189 {
190 enum xbtree_recpacking outcome;
191 int error;
192
193 if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL)
194 return -EFSCORRUPTED;
195
196 /* Make sure this isn't free space. */
197 error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
198 rec->rm_blockcount, &outcome);
199 if (error)
200 return error;
201 if (outcome != XBTREE_RECPACKING_EMPTY)
202 return -EFSCORRUPTED;
203
204 return 0;
205 }
206
/*
 * Store a reverse-mapping record.
 *
 * Records are inserted into the in-memory rmap btree under rr->lock so that
 * this scan thread and the live update hooks cannot collide.  Any failure to
 * store a record aborts the inode scan, because the collected record set can
 * no longer be trusted.
 */
static inline int
xrep_rmap_stash(
	struct xrep_rmap	*rr,
	xfs_agblock_t		startblock,
	xfs_extlen_t		blockcount,
	uint64_t		owner,
	uint64_t		offset,
	unsigned int		flags)
{
	struct xfs_rmap_irec	rmap = {
		.rm_startblock	= startblock,
		.rm_blockcount	= blockcount,
		.rm_owner	= owner,
		.rm_offset	= offset,
		.rm_flags	= flags,
	};
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_btree_cur	*mcur;
	int			error = 0;

	if (xchk_should_terminate(sc, &error))
		return error;

	/* If a hook already aborted the scan, the record set is unusable. */
	if (xchk_iscan_aborted(&rr->iscan))
		return -EFSCORRUPTED;

	trace_xrep_rmap_found(sc->sa.pag, &rmap);

	mutex_lock(&rr->lock);
	mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
	error = xfs_rmap_map_raw(mcur, &rmap);
	xfs_btree_del_cursor(mcur, error);
	if (error)
		goto out_cancel;

	/* Commit the in-memory btree changes back to the backing xfile. */
	error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
	if (error)
		goto out_abort;

	mutex_unlock(&rr->lock);
	return 0;

out_cancel:
	xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
out_abort:
	xchk_iscan_abort(&rr->iscan);
	mutex_unlock(&rr->lock);
	return error;
}
257
/* Context for turning each run of a bitmap walk into one stashed record. */
struct xrep_rmap_stash_run {
	struct xrep_rmap	*rr;
	uint64_t		owner;		/* rm_owner for every record */
	unsigned int		rmap_flags;	/* rm_flags for every record */
};
263
264 static int
xrep_rmap_stash_run(uint32_t start,uint32_t len,void * priv)265 xrep_rmap_stash_run(
266 uint32_t start,
267 uint32_t len,
268 void *priv)
269 {
270 struct xrep_rmap_stash_run *rsr = priv;
271 struct xrep_rmap *rr = rsr->rr;
272
273 return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
274 }
275
276 /*
277 * Emit rmaps for every extent of bits set in the bitmap. Caller must ensure
278 * that the ranges are in units of FS blocks.
279 */
280 STATIC int
xrep_rmap_stash_bitmap(struct xrep_rmap * rr,struct xagb_bitmap * bitmap,const struct xfs_owner_info * oinfo)281 xrep_rmap_stash_bitmap(
282 struct xrep_rmap *rr,
283 struct xagb_bitmap *bitmap,
284 const struct xfs_owner_info *oinfo)
285 {
286 struct xrep_rmap_stash_run rsr = {
287 .rr = rr,
288 .owner = oinfo->oi_owner,
289 .rmap_flags = 0,
290 };
291
292 if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
293 rsr.rmap_flags |= XFS_RMAP_ATTR_FORK;
294 if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
295 rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK;
296
297 return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr);
298 }
299
300 /* Section (I): Finding all file and bmbt extents. */
301
/* Context for accumulating rmaps for an inode fork. */
struct xrep_rmap_ifork {
	/*
	 * Accumulate rmap data here to turn multiple adjacent bmaps into a
	 * single rmap.  rm_blockcount == 0 means nothing is accumulated.
	 */
	struct xfs_rmap_irec	accum;

	/* Bitmap of bmbt blocks in this AG. */
	struct xagb_bitmap	bmbt_blocks;

	/* Repair context that receives the stashed records. */
	struct xrep_rmap	*rr;

	/* Which inode fork? */
	int			whichfork;
};
318
319 /* Stash an rmap that we accumulated while walking an inode fork. */
320 STATIC int
xrep_rmap_stash_accumulated(struct xrep_rmap_ifork * rf)321 xrep_rmap_stash_accumulated(
322 struct xrep_rmap_ifork *rf)
323 {
324 if (rf->accum.rm_blockcount == 0)
325 return 0;
326
327 return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock,
328 rf->accum.rm_blockcount, rf->accum.rm_owner,
329 rf->accum.rm_offset, rf->accum.rm_flags);
330 }
331
/*
 * Accumulate a bmbt record.  Adjacent mappings with identical flags are
 * merged into a single rmap record; any other mapping flushes the previous
 * accumulation and starts a new one.
 */
STATIC int
xrep_rmap_visit_bmbt(
	struct xfs_btree_cur	*cur,
	struct xfs_bmbt_irec	*rec,
	void			*priv)
{
	struct xrep_rmap_ifork	*rf = priv;
	struct xfs_mount	*mp = rf->rr->sc->mp;
	struct xfs_rmap_irec	*accum = &rf->accum;
	xfs_agblock_t		agbno;
	unsigned int		rmap_flags = 0;
	int			error;

	/* Skip mappings that belong to a different AG. */
	if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
			pag_agno(rf->rr->sc->sa.pag))
		return 0;

	agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
	if (rf->whichfork == XFS_ATTR_FORK)
		rmap_flags |= XFS_RMAP_ATTR_FORK;
	if (rec->br_state == XFS_EXT_UNWRITTEN)
		rmap_flags |= XFS_RMAP_UNWRITTEN;

	/* If this bmap is adjacent to the previous one, just add it. */
	if (accum->rm_blockcount > 0 &&
	    rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
	    agbno == accum->rm_startblock + accum->rm_blockcount &&
	    rmap_flags == accum->rm_flags) {
		accum->rm_blockcount += rec->br_blockcount;
		return 0;
	}

	/* Otherwise stash the old rmap and start accumulating a new one. */
	error = xrep_rmap_stash_accumulated(rf);
	if (error)
		return error;

	accum->rm_startblock = agbno;
	accum->rm_blockcount = rec->br_blockcount;
	accum->rm_offset = rec->br_startoff;
	accum->rm_flags = rmap_flags;
	return 0;
}
376
377 /* Add a btree block to the bitmap. */
378 STATIC int
xrep_rmap_visit_iroot_btree_block(struct xfs_btree_cur * cur,int level,void * priv)379 xrep_rmap_visit_iroot_btree_block(
380 struct xfs_btree_cur *cur,
381 int level,
382 void *priv)
383 {
384 struct xrep_rmap_ifork *rf = priv;
385 struct xfs_buf *bp;
386 xfs_fsblock_t fsbno;
387 xfs_agblock_t agbno;
388
389 xfs_btree_get_block(cur, level, &bp);
390 if (!bp)
391 return 0;
392
393 fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
394 if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != pag_agno(rf->rr->sc->sa.pag))
395 return 0;
396
397 agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
398 return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
399 }
400
/*
 * Iterate a metadata btree rooted in an inode to collect rmap records for
 * anything in this fork that matches the AG.  Also flushes whatever rmap the
 * caller had been accumulating for the fork mappings.
 */
STATIC int
xrep_rmap_scan_iroot_btree(
	struct xrep_rmap_ifork	*rf,
	struct xfs_btree_cur	*cur)
{
	struct xfs_owner_info	oinfo;
	struct xrep_rmap	*rr = rf->rr;
	int			error;

	xagb_bitmap_init(&rf->bmbt_blocks);

	/* Record all the blocks in the btree itself. */
	error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
			XFS_BTREE_VISIT_ALL, rf);
	if (error)
		goto out;

	/* Emit rmaps for the btree blocks. */
	xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
	error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
	if (error)
		goto out;

	/* Stash any remaining accumulated rmaps. */
	error = xrep_rmap_stash_accumulated(rf);
out:
	xagb_bitmap_destroy(&rf->bmbt_blocks);
	return error;
}
434
/*
 * Iterate the block mapping btree to collect rmap records for anything in this
 * fork that matches the AG.  Sets @mappings_done to true if we've scanned the
 * block mappings in this fork.
 */
STATIC int
xrep_rmap_scan_bmbt(
	struct xrep_rmap_ifork	*rf,
	struct xfs_inode	*ip,
	bool			*mappings_done)
{
	struct xrep_rmap	*rr = rf->rr;
	struct xfs_btree_cur	*cur;
	struct xfs_ifork	*ifp;
	int			error;

	*mappings_done = false;
	ifp = xfs_ifork_ptr(ip, rf->whichfork);
	cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);

	if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
	    xfs_need_iread_extents(ifp)) {
		/*
		 * If the incore extent cache isn't loaded, scan the bmbt for
		 * mapping records.  This avoids loading the incore extent
		 * tree, which will increase memory pressure at a time when
		 * we're trying to run as quickly as we possibly can.  Ignore
		 * realtime extents.
		 */
		error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
		if (error)
			goto out_cur;

		*mappings_done = true;
	}

	/* Scan for the bmbt blocks, which always live on the data device. */
	error = xrep_rmap_scan_iroot_btree(rf, cur);
out_cur:
	xfs_btree_del_cursor(cur, error);
	return error;
}
477
478 /*
479 * Iterate the in-core extent cache to collect rmap records for anything in
480 * this fork that matches the AG.
481 */
482 STATIC int
xrep_rmap_scan_iext(struct xrep_rmap_ifork * rf,struct xfs_ifork * ifp)483 xrep_rmap_scan_iext(
484 struct xrep_rmap_ifork *rf,
485 struct xfs_ifork *ifp)
486 {
487 struct xfs_bmbt_irec rec;
488 struct xfs_iext_cursor icur;
489 int error;
490
491 for_each_xfs_iext(ifp, &icur, &rec) {
492 if (isnullstartblock(rec.br_startblock))
493 continue;
494 error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
495 if (error)
496 return error;
497 }
498
499 return xrep_rmap_stash_accumulated(rf);
500 }
501
/*
 * Collect rmaps for the data-device blocks of a realtime metadata btree
 * inode (rtrmap or rtrefcount) by walking the btree rooted in its data fork.
 */
static int
xrep_rmap_scan_meta_btree(
	struct xrep_rmap_ifork	*rf,
	struct xfs_inode	*ip)
{
	struct xfs_scrub	*sc = rf->rr->sc;
	struct xfs_rtgroup	*rtg = NULL;
	struct xfs_btree_cur	*cur = NULL;
	enum xfs_rtg_inodes	type;
	int			error;

	/* Metadata btrees are only ever rooted in the data fork. */
	if (rf->whichfork != XFS_DATA_FORK)
		return -EFSCORRUPTED;

	switch (ip->i_metatype) {
	case XFS_METAFILE_RTRMAP:
		type = XFS_RTGI_RMAP;
		break;
	case XFS_METAFILE_RTREFCOUNT:
		type = XFS_RTGI_REFCOUNT;
		break;
	default:
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	/* Find the rtgroup that claims this inode, taking a reference. */
	while ((rtg = xfs_rtgroup_next(sc->mp, rtg))) {
		if (ip == rtg->rtg_inodes[type])
			goto found;
	}

	/*
	 * We should never find an rt metadata btree inode that isn't
	 * associated with an rtgroup yet has ondisk blocks allocated to it.
	 */
	if (ip->i_nblocks) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	return 0;

found:
	switch (ip->i_metatype) {
	case XFS_METAFILE_RTRMAP:
		cur = xfs_rtrmapbt_init_cursor(sc->tp, rtg);
		break;
	case XFS_METAFILE_RTREFCOUNT:
		cur = xfs_rtrefcountbt_init_cursor(sc->tp, rtg);
		break;
	default:
		ASSERT(0);
		error = -EFSCORRUPTED;
		goto out_rtg;
	}

	error = xrep_rmap_scan_iroot_btree(rf, cur);
	xfs_btree_del_cursor(cur, error);
out_rtg:
	/* Drop the rtgroup reference taken by xfs_rtgroup_next. */
	xfs_rtgroup_rele(rtg);
	return error;
}
564
/* Find all the extents from a given AG in an inode fork. */
STATIC int
xrep_rmap_scan_ifork(
	struct xrep_rmap	*rr,
	struct xfs_inode	*ip,
	int			whichfork)
{
	struct xrep_rmap_ifork	rf = {
		.accum		= { .rm_owner = ip->i_ino, },
		.rr		= rr,
		.whichfork	= whichfork,
	};
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	bool			mappings_done;
	int			error = 0;

	/* No fork at all (e.g. no attr fork) means nothing to scan. */
	if (!ifp)
		return 0;

	switch (ifp->if_format) {
	case XFS_DINODE_FMT_BTREE:
		/*
		 * Scan the bmap btree for data device mappings.  This
		 * includes the btree blocks themselves, even if this is a
		 * realtime file.
		 */
		error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
		if (error || mappings_done)
			return error;
		/*
		 * The bmbt scan only covered the btree blocks; fall through
		 * to pick up the mappings from the now-loaded extent cache.
		 */
		fallthrough;
	case XFS_DINODE_FMT_EXTENTS:
		/* Scan incore extent cache if this isn't a realtime file. */
		if (xfs_ifork_is_realtime(ip, whichfork))
			return 0;

		return xrep_rmap_scan_iext(&rf, ifp);
	case XFS_DINODE_FMT_META_BTREE:
		return xrep_rmap_scan_meta_btree(&rf, ip);
	}

	return 0;
}
607
608 /*
609 * Take ILOCK on a file that we want to scan.
610 *
611 * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
612 * attr bmbt. Otherwise, take ILOCK_SHARED.
613 */
614 static inline unsigned int
xrep_rmap_scan_ilock(struct xfs_inode * ip)615 xrep_rmap_scan_ilock(
616 struct xfs_inode *ip)
617 {
618 uint lock_mode = XFS_ILOCK_SHARED;
619
620 if (xfs_need_iread_extents(&ip->i_df)) {
621 lock_mode = XFS_ILOCK_EXCL;
622 goto lock;
623 }
624
625 if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
626 lock_mode = XFS_ILOCK_EXCL;
627
628 lock:
629 xfs_ilock(ip, lock_mode);
630 return lock_mode;
631 }
632
633 /* Record reverse mappings for a file. */
634 STATIC int
xrep_rmap_scan_inode(struct xrep_rmap * rr,struct xfs_inode * ip)635 xrep_rmap_scan_inode(
636 struct xrep_rmap *rr,
637 struct xfs_inode *ip)
638 {
639 unsigned int lock_mode = xrep_rmap_scan_ilock(ip);
640 int error;
641
642 /* Check the data fork. */
643 error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
644 if (error)
645 goto out_unlock;
646
647 /* Check the attr fork. */
648 error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
649 if (error)
650 goto out_unlock;
651
652 /* COW fork extents are "owned" by the refcount btree. */
653
654 xchk_iscan_mark_visited(&rr->iscan, ip);
655 out_unlock:
656 xfs_iunlock(ip, lock_mode);
657 return error;
658 }
659
660 /* Section (I): Find all AG metadata extents except for free space metadata. */
661
/* Context for collecting inode btree and inode chunk rmaps. */
struct xrep_rmap_inodes {
	struct xrep_rmap	*rr;
	struct xagb_bitmap	inobt_blocks;	/* INOBIT */
	struct xagb_bitmap	ichunk_blocks;	/* ICHUNKBIT */
};
667
/* Record inode btree rmaps. */
STATIC int
xrep_rmap_walk_inobt(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_inobt_rec_incore	irec;
	struct xrep_rmap_inodes		*ri = priv;
	struct xfs_mount		*mp = cur->bc_mp;
	xfs_agblock_t			agbno;
	xfs_extlen_t			aglen;
	xfs_agino_t			agino;
	xfs_agino_t			iperhole;
	unsigned int			i;
	int				error;

	/* Record the inobt blocks. */
	error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
	if (error)
		return error;

	xfs_inobt_btrec_to_irec(mp, rec, &irec);
	if (xfs_inobt_check_irec(to_perag(cur->bc_group), &irec) != NULL)
		return -EFSCORRUPTED;

	agino = irec.ir_startino;

	/* Record a non-sparse inode chunk. */
	if (!xfs_inobt_issparse(irec.ir_holemask)) {
		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
		/* A chunk always covers at least one whole block. */
		aglen = max_t(xfs_extlen_t, 1,
				XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);

		return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
	}

	/*
	 * Iterate each chunk.  Step the loop by whole holemask units so that
	 * sparse holes are skipped in units of @iperhole inodes.
	 */
	iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
			XFS_INODES_PER_HOLEMASK_BIT);
	aglen = iperhole / mp->m_sb.sb_inopblock;
	for (i = 0, agino = irec.ir_startino;
	     i < XFS_INOBT_HOLEMASK_BITS;
	     i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
		/* Skip holes. */
		if (irec.ir_holemask & (1 << i))
			continue;

		/* Record the inode chunk otherwise. */
		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
		error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
		if (error)
			return error;
	}

	return 0;
}
725
/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
STATIC int
xrep_rmap_find_inode_rmaps(
	struct xrep_rmap	*rr)
{
	struct xrep_rmap_inodes	ri = {
		.rr		= rr,
	};
	struct xfs_scrub	*sc = rr->sc;
	int			error;

	xagb_bitmap_init(&ri.inobt_blocks);
	xagb_bitmap_init(&ri.ichunk_blocks);

	/*
	 * Iterate every record in the inobt so we can capture all the inode
	 * chunks and the blocks in the inobt itself.
	 */
	error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
	if (error)
		goto out_bitmap;

	/*
	 * Note that if there are zero records in the inobt then query_all does
	 * nothing and we have to account the empty inobt root manually.
	 */
	if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
		struct xfs_agi	*agi = sc->sa.agi_bp->b_addr;

		error = xagb_bitmap_set(&ri.inobt_blocks,
				be32_to_cpu(agi->agi_root), 1);
		if (error)
			goto out_bitmap;
	}

	/* Scan the finobt too; its blocks share the inobt rmap owner. */
	if (xfs_has_finobt(sc->mp)) {
		error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
				sc->sa.fino_cur);
		if (error)
			goto out_bitmap;
	}

	/* Generate rmaps for everything. */
	error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
			&XFS_RMAP_OINFO_INOBT);
	if (error)
		goto out_bitmap;
	error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
			&XFS_RMAP_OINFO_INODES);

out_bitmap:
	xagb_bitmap_destroy(&ri.inobt_blocks);
	xagb_bitmap_destroy(&ri.ichunk_blocks);
	return error;
}
782
783 /* Record a CoW staging extent. */
784 STATIC int
xrep_rmap_walk_cowblocks(struct xfs_btree_cur * cur,const struct xfs_refcount_irec * irec,void * priv)785 xrep_rmap_walk_cowblocks(
786 struct xfs_btree_cur *cur,
787 const struct xfs_refcount_irec *irec,
788 void *priv)
789 {
790 struct xagb_bitmap *bitmap = priv;
791
792 if (!xfs_refcount_check_domain(irec) ||
793 irec->rc_domain != XFS_REFC_DOMAIN_COW)
794 return -EFSCORRUPTED;
795
796 return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount);
797 }
798
/*
 * Collect rmaps for the blocks containing the refcount btree, and all CoW
 * staging extents.
 */
STATIC int
xrep_rmap_find_refcount_rmaps(
	struct xrep_rmap	*rr)
{
	struct xagb_bitmap	refcountbt_blocks;	/* REFCBIT */
	struct xagb_bitmap	cow_blocks;		/* COWBIT */
	struct xfs_refcount_irec low = {
		.rc_startblock	= 0,
		.rc_domain	= XFS_REFC_DOMAIN_COW,
	};
	struct xfs_refcount_irec high = {
		.rc_startblock	= -1U,
		.rc_domain	= XFS_REFC_DOMAIN_COW,
	};
	struct xfs_scrub	*sc = rr->sc;
	int			error;

	/* No reflink means no refcountbt and no CoW staging extents. */
	if (!xfs_has_reflink(sc->mp))
		return 0;

	xagb_bitmap_init(&refcountbt_blocks);
	xagb_bitmap_init(&cow_blocks);

	/* refcountbt */
	error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
	if (error)
		goto out_bitmap;

	/* Collect rmaps for CoW staging extents. */
	error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
			xrep_rmap_walk_cowblocks, &cow_blocks);
	if (error)
		goto out_bitmap;

	/* Generate rmaps for everything. */
	error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
	if (error)
		goto out_bitmap;
	error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
			&XFS_RMAP_OINFO_REFC);

out_bitmap:
	xagb_bitmap_destroy(&cow_blocks);
	xagb_bitmap_destroy(&refcountbt_blocks);
	return error;
}
849
850 /* Generate rmaps for the AG headers (AGI/AGF/AGFL) */
851 STATIC int
xrep_rmap_find_agheader_rmaps(struct xrep_rmap * rr)852 xrep_rmap_find_agheader_rmaps(
853 struct xrep_rmap *rr)
854 {
855 struct xfs_scrub *sc = rr->sc;
856
857 /* Create a record for the AG sb->agfl. */
858 return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp),
859 XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1,
860 XFS_RMAP_OWN_FS, 0, 0);
861 }
862
863 /* Generate rmaps for the log, if it's in this AG. */
864 STATIC int
xrep_rmap_find_log_rmaps(struct xrep_rmap * rr)865 xrep_rmap_find_log_rmaps(
866 struct xrep_rmap *rr)
867 {
868 struct xfs_scrub *sc = rr->sc;
869
870 if (!xfs_ag_contains_log(sc->mp, pag_agno(sc->sa.pag)))
871 return 0;
872
873 return xrep_rmap_stash(rr,
874 XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart),
875 sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
876 }
877
878 /* Check and count all the records that we gathered. */
879 STATIC int
xrep_rmap_check_record(struct xfs_btree_cur * cur,const struct xfs_rmap_irec * rec,void * priv)880 xrep_rmap_check_record(
881 struct xfs_btree_cur *cur,
882 const struct xfs_rmap_irec *rec,
883 void *priv)
884 {
885 struct xrep_rmap *rr = priv;
886 int error;
887
888 error = xrep_rmap_check_mapping(rr->sc, rec);
889 if (error)
890 return error;
891
892 rr->nr_records++;
893 return 0;
894 }
895
/*
 * Generate all the reverse-mappings for this AG, a list of the old rmapbt
 * blocks, and the new btreeblks count.  Figure out if we have enough free
 * space to reconstruct the inode btrees.  The caller must clean up the lists
 * if anything goes wrong.  This implements section (I) above.
 */
STATIC int
xrep_rmap_find_rmaps(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xchk_ag		*sa = &sc->sa;
	struct xfs_inode	*ip;
	struct xfs_btree_cur	*mcur;
	int			error;

	/* Find all the per-AG metadata. */
	xrep_ag_btcur_init(sc, &sc->sa);

	error = xrep_rmap_find_inode_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_refcount_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_agheader_rmaps(rr);
	if (error)
		goto end_agscan;

	error = xrep_rmap_find_log_rmaps(rr);
end_agscan:
	xchk_ag_btcur_free(&sc->sa);
	if (error)
		return error;

	/*
	 * Set up for a potentially lengthy filesystem scan by reducing our
	 * transaction resource usage for the duration.  Specifically:
	 *
	 * Unlock the AG header buffers and cancel the transaction to release
	 * the log grant space while we scan the filesystem.
	 *
	 * Create a new empty transaction to eliminate the possibility of the
	 * inode scan deadlocking on cyclical metadata.
	 *
	 * We pass the empty transaction to the file scanning function to avoid
	 * repeatedly cycling empty transactions.  This can be done even though
	 * we take the IOLOCK to quiesce the file because empty transactions
	 * do not take sb_internal.
	 */
	sa->agf_bp = NULL;
	sa->agi_bp = NULL;
	xchk_trans_cancel(sc);
	xchk_trans_alloc_empty(sc);

	/* Scan every file in the filesystem for per-AG block rmaps. */
	while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
		error = xrep_rmap_scan_inode(rr, ip);
		xchk_irele(sc, ip);
		if (error)
			break;

		if (xchk_should_terminate(sc, &error))
			break;
	}
	xchk_iscan_iter_finish(&rr->iscan);
	if (error)
		return error;

	/*
	 * Switch out for a real transaction and lock the AG headers in
	 * preparation for building a new tree.
	 */
	xchk_trans_cancel(sc);
	error = xchk_setup_fs(sc);
	if (error)
		return error;
	error = xchk_perag_drain_and_lock(sc);
	if (error)
		return error;

	/*
	 * If a hook failed to update the in-memory btree, we lack the data to
	 * continue the repair.
	 */
	if (xchk_iscan_aborted(&rr->iscan))
		return -EFSCORRUPTED;

	/*
	 * Now that we have everything locked again, we need to count the
	 * number of rmap records stashed in the btree.  This should reflect
	 * all actively-owned space in the filesystem.  At the same time, check
	 * all our records before we start building a new btree, which requires
	 * a bnobt cursor.
	 */
	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);

	rr->nr_records = 0;
	error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);

	xfs_btree_del_cursor(sc->sa.bno_cur, error);
	sc->sa.bno_cur = NULL;
	xfs_btree_del_cursor(mcur, error);

	return error;
}
1006
1007 /* Section (II): Reserving space for new rmapbt and setting free space bitmap */
1008
/* Context for adding AGFL blocks to the free-space metadata bitmap. */
struct xrep_rmap_agfl {
	struct xagb_bitmap	*bitmap;	/* accumulates OWN_AG blocks */
	xfs_agnumber_t		agno;		/* AG being repaired */
};
1013
1014 /* Add an AGFL block to the rmap list. */
1015 STATIC int
xrep_rmap_walk_agfl(struct xfs_mount * mp,xfs_agblock_t agbno,void * priv)1016 xrep_rmap_walk_agfl(
1017 struct xfs_mount *mp,
1018 xfs_agblock_t agbno,
1019 void *priv)
1020 {
1021 struct xrep_rmap_agfl *ra = priv;
1022
1023 return xagb_bitmap_set(ra->bitmap, agbno, 1);
1024 }
1025
/*
 * Run one round of reserving space for the new rmapbt and recomputing the
 * number of blocks needed to store the previously observed rmapbt records and
 * the ones we'll create for the free space metadata.  When we don't need more
 * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
 * true.
 *
 * @rr: rmap repair context
 * @rmap_cur: staged cursor used only for geometry computation
 * @freesp_blocks: on success, bitmap of all OWN_AG blocks found this round
 * @blocks_reserved: running count of blocks already reserved for the new tree
 * @done: set to true when no further reservation rounds are needed
 */
STATIC int
xrep_rmap_try_reserve(
	struct xrep_rmap	*rr,
	struct xfs_btree_cur	*rmap_cur,
	struct xagb_bitmap	*freesp_blocks,
	uint64_t		*blocks_reserved,
	bool			*done)
{
	struct xrep_rmap_agfl	ra = {
		.bitmap		= freesp_blocks,
		.agno		= pag_agno(rr->sc->sa.pag),
	};
	struct xfs_scrub	*sc = rr->sc;
	struct xrep_newbt_resv	*resv, *n;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	struct xfs_buf		*agfl_bp;
	uint64_t		nr_blocks;	/* RMB */
	uint64_t		freesp_records;
	int			error;

	/*
	 * We're going to recompute new_btree.bload.nr_blocks at the end of
	 * this function to reflect however many btree blocks we need to store
	 * all the rmap records (including the ones that reflect the changes we
	 * made to support the new rmapbt blocks), so we save the old value
	 * here so we can decide if we've reserved enough blocks.
	 */
	nr_blocks = rr->new_btree.bload.nr_blocks;

	/*
	 * Make sure we've reserved enough space for the new btree.  This can
	 * change the shape of the free space btrees, which can cause secondary
	 * interactions with the rmap records because all three space btrees
	 * have the same rmap owner.  We'll account for all that below.
	 */
	error = xrep_newbt_alloc_blocks(&rr->new_btree,
			nr_blocks - *blocks_reserved);
	if (error)
		return error;

	*blocks_reserved = rr->new_btree.bload.nr_blocks;

	/* Clear everything in the bitmap. */
	xagb_bitmap_destroy(freesp_blocks);

	/* Set all the bnobt blocks in the bitmap. */
	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur);
	xfs_btree_del_cursor(sc->sa.bno_cur, error);
	sc->sa.bno_cur = NULL;
	if (error)
		return error;

	/* Set all the cntbt blocks in the bitmap. */
	sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur);
	xfs_btree_del_cursor(sc->sa.cnt_cur, error);
	sc->sa.cnt_cur = NULL;
	if (error)
		return error;

	/*
	 * Record our new btreeblks value.  Subtract 2 because btreeblks does
	 * not include the two free space btree roots.
	 */
	rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2;

	/* Set all the new rmapbt blocks in the bitmap. */
	list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) {
		error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len);
		if (error)
			return error;
	}

	/* Set all the AGFL blocks in the bitmap. */
	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
	if (error)
		return error;

	error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra);
	if (error)
		return error;

	/* Count the extents in the bitmap; each one becomes an OWN_AG rmap. */
	freesp_records = xagb_bitmap_count_set_regions(freesp_blocks);

	/* Compute how many blocks we'll need for all the rmaps. */
	error = xfs_btree_bload_compute_geometry(rmap_cur,
			&rr->new_btree.bload, rr->nr_records + freesp_records);
	if (error)
		return error;

	/* We're done when we don't need more blocks. */
	*done = nr_blocks >= rr->new_btree.bload.nr_blocks;
	return 0;
}
1128
1129 /*
1130 * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for
1131 * the free space metadata. This implements section (II) above.
1132 */
1133 STATIC int
xrep_rmap_reserve_space(struct xrep_rmap * rr,struct xfs_btree_cur * rmap_cur)1134 xrep_rmap_reserve_space(
1135 struct xrep_rmap *rr,
1136 struct xfs_btree_cur *rmap_cur)
1137 {
1138 struct xagb_bitmap freesp_blocks; /* AGBIT */
1139 uint64_t blocks_reserved = 0;
1140 bool done = false;
1141 int error;
1142
1143 /* Compute how many blocks we'll need for the rmaps collected so far. */
1144 error = xfs_btree_bload_compute_geometry(rmap_cur,
1145 &rr->new_btree.bload, rr->nr_records);
1146 if (error)
1147 return error;
1148
1149 /* Last chance to abort before we start committing fixes. */
1150 if (xchk_should_terminate(rr->sc, &error))
1151 return error;
1152
1153 xagb_bitmap_init(&freesp_blocks);
1154
1155 /*
1156 * Iteratively reserve space for the new rmapbt and recompute the
1157 * number of blocks needed to store the previously observed rmapbt
1158 * records and the ones we'll create for the free space metadata.
1159 * Finish when we don't need more blocks.
1160 */
1161 do {
1162 error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks,
1163 &blocks_reserved, &done);
1164 if (error)
1165 goto out_bitmap;
1166 } while (!done);
1167
1168 /* Emit rmaps for everything in the free space bitmap. */
1169 xrep_ag_btcur_init(rr->sc, &rr->sc->sa);
1170 error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG);
1171 xchk_ag_btcur_free(&rr->sc->sa);
1172
1173 out_bitmap:
1174 xagb_bitmap_destroy(&freesp_blocks);
1175 return error;
1176 }
1177
1178 /* Section (III): Building the new rmap btree. */
1179
/*
 * Update the AGF counters to reflect the new rmap btree, then reinitialize
 * the incore perag state from the on-disk AGF.  Returns 0 or a negative
 * errno from xrep_reinit_pagf.
 */
STATIC int
xrep_rmap_reset_counters(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	xfs_agblock_t		rmap_btblocks;

	/*
	 * The AGF header contains extra information related to the reverse
	 * mapping btree, so we must update those fields here.  btreeblks does
	 * not count the rmapbt root block, hence the subtraction.
	 */
	rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
	agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);

	/*
	 * After we commit the new btree to disk, it is possible that the
	 * process to reap the old btree blocks will race with the AIL trying
	 * to checkpoint the old btree blocks into the filesystem.  If the new
	 * tree is shorter than the old one, the rmapbt write verifier will
	 * fail and the AIL will shut down the filesystem.
	 *
	 * To avoid this, save the old incore btree height values as the alt
	 * height values before re-initializing the perag info from the updated
	 * AGF to capture all the new values.
	 */
	pag->pagf_repair_rmap_level = pag->pagf_rmap_level;

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagf(sc);
}
1214
1215 /* Retrieve rmapbt data for bulk load. */
1216 STATIC int
xrep_rmap_get_records(struct xfs_btree_cur * cur,unsigned int idx,struct xfs_btree_block * block,unsigned int nr_wanted,void * priv)1217 xrep_rmap_get_records(
1218 struct xfs_btree_cur *cur,
1219 unsigned int idx,
1220 struct xfs_btree_block *block,
1221 unsigned int nr_wanted,
1222 void *priv)
1223 {
1224 struct xrep_rmap *rr = priv;
1225 union xfs_btree_rec *block_rec;
1226 unsigned int loaded;
1227 int error;
1228
1229 for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
1230 int stat = 0;
1231
1232 error = xfs_btree_increment(rr->mcur, 0, &stat);
1233 if (error)
1234 return error;
1235 if (!stat)
1236 return -EFSCORRUPTED;
1237
1238 error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
1239 if (error)
1240 return error;
1241 if (!stat)
1242 return -EFSCORRUPTED;
1243
1244 block_rec = xfs_btree_rec_addr(cur, idx, block);
1245 cur->bc_ops->init_rec_from_cur(cur, block_rec);
1246 }
1247
1248 return loaded;
1249 }
1250
1251 /* Feed one of the new btree blocks to the bulk loader. */
1252 STATIC int
xrep_rmap_claim_block(struct xfs_btree_cur * cur,union xfs_btree_ptr * ptr,void * priv)1253 xrep_rmap_claim_block(
1254 struct xfs_btree_cur *cur,
1255 union xfs_btree_ptr *ptr,
1256 void *priv)
1257 {
1258 struct xrep_rmap *rr = priv;
1259
1260 return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
1261 }
1262
/*
 * Custom allocation function for new rmap btrees.
 *
 * @sc: scrub context
 * @args: allocation arguments, filled out by the newbt machinery
 * @alloc_hint: fsblock near which to allocate
 *
 * Fixes the AGFL without creating rmaps, then allocates near @alloc_hint.
 * Returns 0 or a negative errno.
 */
STATIC int
xrep_rmap_alloc_vextent(
	struct xfs_scrub	*sc,
	struct xfs_alloc_arg	*args,
	xfs_fsblock_t		alloc_hint)
{
	int			error;

	/*
	 * We don't want an rmap update on the allocation, since we iteratively
	 * compute the OWN_AG records /after/ allocating blocks for the records
	 * that we already know we need to store.  Therefore, fix the freelist
	 * with the NORMAP flag set so that we don't also try to create an rmap
	 * for new AGFL blocks.
	 */
	error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
	if (error)
		return error;

	/*
	 * If xrep_fix_freelist fixed the freelist by moving blocks from the
	 * free space btrees or by removing blocks from the AGFL and queueing
	 * an EFI to free the block, the transaction will be dirty.  This
	 * second case is of interest to us.
	 *
	 * Later on, we will need to compare gaps in the new recordset against
	 * the block usage of all OWN_AG owners in order to free the old
	 * btree's blocks, which means that we can't have EFIs for former AGFL
	 * blocks attached to the repair transaction when we commit the new
	 * btree.
	 *
	 * xrep_newbt_alloc_blocks guarantees this for us by calling
	 * xrep_defer_finish to commit anything that fix_freelist may have
	 * added to the transaction.
	 */
	return xfs_alloc_vextent_near_bno(args, alloc_hint);
}
1301
1302
1303 /* Count the records in this btree. */
1304 STATIC int
xrep_rmap_count_records(struct xfs_btree_cur * cur,unsigned long long * nr)1305 xrep_rmap_count_records(
1306 struct xfs_btree_cur *cur,
1307 unsigned long long *nr)
1308 {
1309 int running = 1;
1310 int error;
1311
1312 *nr = 0;
1313
1314 error = xfs_btree_goto_left_edge(cur);
1315 if (error)
1316 return error;
1317
1318 while (running && !(error = xfs_btree_increment(cur, 0, &running))) {
1319 if (running)
1320 (*nr)++;
1321 }
1322
1323 return error;
1324 }
/*
 * Use the collected rmap information to stage a new rmap btree.  If this is
 * successful we'll return with the new btree root information logged to the
 * repair transaction but not yet committed.  This implements section (III)
 * above.
 */
STATIC int
xrep_rmap_build_new_tree(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	struct xfs_btree_cur	*rmap_cur;
	int			error;

	/*
	 * Preserve the old rmapbt block count so that we can adjust the
	 * per-AG rmapbt reservation after we commit the new btree root and
	 * want to dispose of the old btree blocks.
	 */
	rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks);

	/*
	 * Prepare to construct the new btree by reserving disk space for the
	 * new btree and setting up all the accounting information we'll need
	 * to root the new btree while it's under construction and before we
	 * attach it to the AG header.  The new blocks are accounted to the
	 * rmapbt per-AG reservation, which we will adjust further after
	 * committing the new btree.
	 */
	xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
			xfs_agbno_to_fsb(pag, XFS_RMAP_BLOCK(sc->mp)),
			XFS_AG_RESV_RMAPBT);
	rr->new_btree.bload.get_records = xrep_rmap_get_records;
	rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
	rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
	/* Staged cursor: the new tree is not yet attached to the AGF. */
	rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag);
	xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake);

	/*
	 * Initialize @rr->new_btree, reserve space for the new rmapbt,
	 * and compute OWN_AG rmaps.
	 */
	error = xrep_rmap_reserve_space(rr, rmap_cur);
	if (error)
		goto err_cur;

	/*
	 * Count the rmapbt records again, because the space reservation
	 * for the rmapbt itself probably added more records to the btree.
	 */
	rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL,
			&rr->rmap_btree);

	error = xrep_rmap_count_records(rr->mcur, &rr->nr_records);
	if (error)
		goto err_mcur;

	/*
	 * Due to btree slack factors, it's possible for a new btree to be one
	 * level taller than the old btree.  Update the incore btree height so
	 * that we don't trip the verifiers when writing the new btree blocks
	 * to disk.
	 */
	pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height;

	/*
	 * Move the cursor to the left edge of the tree so that the first
	 * increment in ->get_records positions us at the first record.
	 */
	error = xfs_btree_goto_left_edge(rr->mcur);
	if (error)
		goto err_level;

	/* Add all observed rmap records. */
	error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
	if (error)
		goto err_level;

	/*
	 * Install the new btree in the AG header.  After this point the old
	 * btree is no longer accessible and the new tree is live.
	 */
	xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp);
	xfs_btree_del_cursor(rmap_cur, 0);
	xfs_btree_del_cursor(rr->mcur, 0);
	rr->mcur = NULL;

	/*
	 * Now that we've written the new btree to disk, we don't need to keep
	 * updating the in-memory btree.  Abort the scan to stop live updates.
	 */
	xchk_iscan_abort(&rr->iscan);

	/*
	 * The newly committed rmap recordset includes mappings for the blocks
	 * that we reserved to build the new btree.  If there is excess space
	 * reservation to be freed, the corresponding rmap records must also be
	 * removed.
	 */
	rr->new_btree.oinfo = XFS_RMAP_OINFO_AG;

	/* Reset the AGF counters now that we've changed the btree shape. */
	error = xrep_rmap_reset_counters(rr);
	if (error)
		goto err_newbt;

	/* Dispose of any unused blocks and the accounting information. */
	error = xrep_newbt_commit(&rr->new_btree);
	if (error)
		return error;

	return xrep_roll_ag_trans(sc);

err_level:
	/* Undo the incore height bump before tearing down the cursors. */
	pag->pagf_repair_rmap_level = 0;
err_mcur:
	xfs_btree_del_cursor(rr->mcur, error);
err_cur:
	xfs_btree_del_cursor(rmap_cur, error);
err_newbt:
	xrep_newbt_cancel(&rr->new_btree);
	return error;
}
1450
1451 /* Section (IV): Reaping the old btree. */
1452
/* Context for finding blocks that the new rmapbt thinks are unowned. */
struct xrep_rmap_find_gaps {
	/* Bitmap of agblocks not covered by any rmap record. */
	struct xagb_bitmap	rmap_gaps;

	/* First agblock after the highest rmap record seen so far. */
	xfs_agblock_t		next_agbno;
};
1457
1458 /* Subtract each free extent in the bnobt from the rmap gaps. */
1459 STATIC int
xrep_rmap_find_freesp(struct xfs_btree_cur * cur,const struct xfs_alloc_rec_incore * rec,void * priv)1460 xrep_rmap_find_freesp(
1461 struct xfs_btree_cur *cur,
1462 const struct xfs_alloc_rec_incore *rec,
1463 void *priv)
1464 {
1465 struct xrep_rmap_find_gaps *rfg = priv;
1466
1467 return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock,
1468 rec->ar_blockcount);
1469 }
1470
1471 /* Record the free space we find, as part of cleaning out the btree. */
1472 STATIC int
xrep_rmap_find_gaps(struct xfs_btree_cur * cur,const struct xfs_rmap_irec * rec,void * priv)1473 xrep_rmap_find_gaps(
1474 struct xfs_btree_cur *cur,
1475 const struct xfs_rmap_irec *rec,
1476 void *priv)
1477 {
1478 struct xrep_rmap_find_gaps *rfg = priv;
1479 int error;
1480
1481 if (rec->rm_startblock > rfg->next_agbno) {
1482 error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno,
1483 rec->rm_startblock - rfg->next_agbno);
1484 if (error)
1485 return error;
1486 }
1487
1488 rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno,
1489 rec->rm_startblock + rec->rm_blockcount);
1490 return 0;
1491 }
1492
/*
 * Reap the old rmapbt blocks.  Now that the rmapbt is fully rebuilt, we make
 * a list of gaps in the rmap records and a list of the extents mentioned in
 * the bnobt.  Any block that's in the new rmapbt gap list but not mentioned
 * in the bnobt is a block from the old rmapbt and can be removed.
 */
STATIC int
xrep_rmap_remove_old_tree(
	struct xrep_rmap	*rr)
{
	struct xrep_rmap_find_gaps rfg = {
		.next_agbno	= 0,
	};
	struct xfs_scrub	*sc = rr->sc;
	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
	struct xfs_perag	*pag = sc->sa.pag;
	struct xfs_btree_cur	*mcur;
	xfs_agblock_t		agend;
	int			error;

	xagb_bitmap_init(&rfg.rmap_gaps);

	/* Compute free space from the new rmapbt. */
	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);

	error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg);
	xfs_btree_del_cursor(mcur, error);
	if (error)
		goto out_bitmap;

	/* Insert a record for space between the last rmap and EOAG. */
	agend = be32_to_cpu(agf->agf_length);
	if (rfg.next_agbno < agend) {
		error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno,
				agend - rfg.next_agbno);
		if (error)
			goto out_bitmap;
	}

	/* Compute free space from the existing bnobt. */
	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
			sc->sa.pag);
	error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp,
			&rfg);
	xfs_btree_del_cursor(sc->sa.bno_cur, error);
	sc->sa.bno_cur = NULL;
	if (error)
		goto out_bitmap;

	/*
	 * Free the "free" blocks that the new rmapbt knows about but the bnobt
	 * doesn't--these are the old rmapbt blocks.  Credit the old rmapbt
	 * block usage count back to the per-AG rmapbt reservation (and not
	 * fdblocks, since the rmap btree lives in free space) to keep the
	 * reservation and free space accounting correct.
	 */
	error = xrep_reap_agblocks(sc, &rfg.rmap_gaps,
			&XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT);
	if (error)
		goto out_bitmap;

	/*
	 * Now that we've zapped all the old rmapbt blocks we can turn off
	 * the alternate height mechanism and reset the per-AG space
	 * reservation.
	 */
	pag->pagf_repair_rmap_level = 0;
	sc->flags |= XREP_RESET_PERAG_RESV;
out_bitmap:
	xagb_bitmap_destroy(&rfg.rmap_gaps);
	return error;
}
1565
1566 static inline bool
xrep_rmapbt_want_live_update(struct xchk_iscan * iscan,const struct xfs_owner_info * oi)1567 xrep_rmapbt_want_live_update(
1568 struct xchk_iscan *iscan,
1569 const struct xfs_owner_info *oi)
1570 {
1571 if (xchk_iscan_aborted(iscan))
1572 return false;
1573
1574 /*
1575 * Before unlocking the AG header to perform the inode scan, we
1576 * recorded reverse mappings for all AG metadata except for the OWN_AG
1577 * metadata. IOWs, the in-memory btree knows about the AG headers, the
1578 * two inode btrees, the CoW staging extents, and the refcount btrees.
1579 * For these types of metadata, we need to record the live updates in
1580 * the in-memory rmap btree.
1581 *
1582 * However, we do not scan the free space btrees or the AGFL until we
1583 * have re-locked the AGF and are ready to reserve space for the new
1584 * rmap btree, so we do not want live updates for OWN_AG metadata.
1585 */
1586 if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
1587 return oi->oi_owner != XFS_RMAP_OWN_AG;
1588
1589 /* Ignore updates to files that the scanner hasn't visited yet. */
1590 return xchk_iscan_want_live_update(iscan, oi->oi_owner);
1591 }
1592
/*
 * Apply a rmapbt update from the regular filesystem into our shadow btree.
 * We're running from the thread that owns the AGF buffer and is generating
 * the update, so we must be careful about which parts of the struct xrep_rmap
 * that we change.
 *
 * Always returns NOTIFY_DONE; on failure we abort the iscan so that the
 * stale shadow btree is never committed.
 */
static int
xrep_rmapbt_live_update(
	struct notifier_block		*nb,
	unsigned long			action,
	void				*data)
{
	struct xfs_rmap_update_params	*p = data;
	struct xrep_rmap		*rr;
	struct xfs_mount		*mp;
	struct xfs_btree_cur		*mcur;
	struct xfs_trans		*tp;
	int				error;

	rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
	mp = rr->sc->mp;

	/* Filter out updates the shadow btree doesn't need to track. */
	if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
		goto out_unlock;

	trace_xrep_rmap_live_update(pag_group(rr->sc->sa.pag), action, p);

	/* Empty transaction: the xfbtree lives in memory, not on disk. */
	tp = xfs_trans_alloc_empty(mp);

	mutex_lock(&rr->lock);
	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
	error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
			p->blockcount, &p->oinfo, p->unwritten);
	xfs_btree_del_cursor(mcur, error);
	if (error)
		goto out_cancel;

	/* Push the shadow btree changes into the in-memory buffer cache. */
	error = xfbtree_trans_commit(&rr->rmap_btree, tp);
	if (error)
		goto out_cancel;

	xfs_trans_cancel(tp);
	mutex_unlock(&rr->lock);
	return NOTIFY_DONE;

out_cancel:
	xfbtree_trans_cancel(&rr->rmap_btree, tp);
	xfs_trans_cancel(tp);
	mutex_unlock(&rr->lock);
	/* The shadow btree is now inconsistent; give up on the repair. */
	xchk_iscan_abort(&rr->iscan);
out_unlock:
	return NOTIFY_DONE;
}
1646
/*
 * Set up the filesystem scan components: the shadow in-memory rmap btree,
 * the inode scan cursor, and the live-update hook.  On failure everything
 * set up so far is torn down.  Returns 0 or a negative errno.
 */
STATIC int
xrep_rmap_setup_scan(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;
	int			error;

	/* Serializes the live-update hook against the main scan thread. */
	mutex_init(&rr->lock);

	/* Set up in-memory rmap btree */
	error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
			pag_agno(sc->sa.pag));
	if (error)
		goto out_mutex;

	/* Retry iget every tenth of a second for up to 30 seconds. */
	xchk_iscan_start(sc, 30000, 100, &rr->iscan);

	/*
	 * Hook into live rmap operations so that we can update our in-memory
	 * btree to reflect live changes on the filesystem.  Since we drop the
	 * AGF buffer to scan all the inodes, we need this piece to avoid
	 * installing a stale btree.
	 */
	ASSERT(sc->flags & XCHK_FSGATES_RMAP);
	xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update);
	error = xfs_rmap_hook_add(pag_group(sc->sa.pag), &rr->rhook);
	if (error)
		goto out_iscan;
	return 0;

out_iscan:
	xchk_iscan_teardown(&rr->iscan);
	xfbtree_destroy(&rr->rmap_btree);
out_mutex:
	mutex_destroy(&rr->lock);
	return error;
}
1686
/*
 * Tear down scan components.  Aborts the iscan first so the live-update
 * hook stops touching the shadow btree, then removes the hook before
 * destroying the structures it references.
 */
STATIC void
xrep_rmap_teardown(
	struct xrep_rmap	*rr)
{
	struct xfs_scrub	*sc = rr->sc;

	xchk_iscan_abort(&rr->iscan);
	xfs_rmap_hook_del(pag_group(sc->sa.pag), &rr->rhook);
	xchk_iscan_teardown(&rr->iscan);
	xfbtree_destroy(&rr->rmap_btree);
	mutex_destroy(&rr->lock);
}
1700
1701 /* Repair the rmap btree for some AG. */
1702 int
xrep_rmapbt(struct xfs_scrub * sc)1703 xrep_rmapbt(
1704 struct xfs_scrub *sc)
1705 {
1706 struct xrep_rmap *rr = sc->buf;
1707 int error;
1708
1709 error = xrep_rmap_setup_scan(rr);
1710 if (error)
1711 return error;
1712
1713 /*
1714 * Collect rmaps for everything in this AG that isn't space metadata.
1715 * These rmaps won't change even as we try to allocate blocks.
1716 */
1717 error = xrep_rmap_find_rmaps(rr);
1718 if (error)
1719 goto out_records;
1720
1721 /* Rebuild the rmap information. */
1722 error = xrep_rmap_build_new_tree(rr);
1723 if (error)
1724 goto out_records;
1725
1726 /* Kill the old tree. */
1727 error = xrep_rmap_remove_old_tree(rr);
1728 if (error)
1729 goto out_records;
1730
1731 out_records:
1732 xrep_rmap_teardown(rr);
1733 return error;
1734 }
1735