// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
#include "xfs_error.h"
#include "xfs_health.h"
#include "xfs_ag.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"

/*
 * Inode Btree Repair
 * ==================
 *
 * A quick refresher of inode btrees on a v5 filesystem:
 *
 * - Inode records are read into memory in units of 'inode clusters'.  The
 *   number of inodes that fit in a cluster buffer is the smallest number of
 *   inodes that can be allocated or freed.  Clusters are never smaller than
 *   one fs block, though they can span multiple fs blocks.  The size (in fs
 *   blocks) is computed with xfs_icluster_size_fsb().  The fs block
 *   alignment of a cluster is computed with xfs_ialloc_cluster_alignment().
 *
 * - Each inode btree record can describe a single 'inode chunk'.  The chunk
 *   size is defined to be 64 inodes.  If sparse inodes are enabled, every
 *   inobt record must be aligned to the chunk size; if not, every record
 *   must be aligned to the start of a cluster.  It is possible to construct
 *   an XFS geometry where one inobt record maps to multiple inode clusters;
 *   it is also possible to construct a geometry where multiple inobt records
 *   map to different parts of one inode cluster.
 *
 * - If sparse inodes are not enabled, the smallest unit of allocation for
 *   inode records is enough to contain one inode chunk's worth of inodes.
 *
 * - If sparse inodes are enabled, the holemask field will be active.  Each
 *   bit of the holemask represents 4 potential inodes; if set, the
 *   corresponding space does *not* contain inodes and must be left alone.
 *   Clusters cannot be smaller than 4 inodes.  The smallest unit of
 *   allocation of inode records is one inode cluster.
 *
 * So what's the rebuild algorithm?
 *
 * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT
 * records.  The OWN_INOBT records are the old inode btree blocks and will be
 * cleared out after we've rebuilt the tree.  Each possible inode cluster
 * within an OWN_INODES record will be read in; for each possible inobt
 * record associated with that cluster, compute the freemask from the i_mode
 * data in the inode chunk.  For sparse inodes the holemask will be
 * calculated by creating the properly aligned inobt record and punching out
 * any chunk that's missing.  Inode allocations and frees grab the AGI first,
 * so repair protects itself from concurrent access by locking the AGI.
 *
 * Once we've reconstructed all the inode records, we can create new inode
 * btree roots and reload the btrees.  We rebuild both inode trees at the
 * same time because they have the same rmap owner and it would be more
 * complex to figure out whether the other tree needs a rebuild and which
 * OWN_INOBT blocks belong to it.  We have all the data we need to build
 * both, so dump everything and start over.
 *
 * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once.
 */
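
/*
 * A sketch of one sparse record, with illustrative numbers only: for a chunk
 * at agino 128 where only the first cluster of 16 inodes has real disk
 * space, repair would produce ir_startino == 128, ir_count == 16, and
 * ir_holemask == 0xfff0 (each set bit marks a 4-inode hole).  ir_free starts
 * at XFS_INOBT_ALL_FREE and a bit is cleared for each inode found to be in
 * use, so the bits covering the holes always remain set.
 */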

struct xrep_ibt {
	/* Record under construction. */
	struct xfs_inobt_rec_incore	rie;

	/* new inobt information */
	struct xrep_newbt	new_inobt;

	/* new finobt information */
	struct xrep_newbt	new_finobt;

	/* Old inode btree blocks we found in the rmap. */
	struct xagb_bitmap	old_iallocbt_blocks;

	/* Reconstructed inode records. */
	struct xfarray		*inode_records;

	struct xfs_scrub	*sc;

	/* Number of inodes assigned disk space. */
	unsigned int		icount;

	/* Number of inodes in use. */
	unsigned int		iused;

	/* Number of finobt records needed. */
	unsigned int		finobt_recs;

	/* get_records()'s position in the inode record array. */
	xfarray_idx_t		array_cur;
};

/*
 * Is this inode in use?  If the inode is in memory we can tell from i_mode,
 * otherwise we have to check di_mode in the on-disk buffer.  We only care
 * that the high (i.e. non-permission) bits of _mode are zero.  This should
 * be safe because repair keeps all AG headers locked until the end, and any
 * process trying to perform an inode allocation/free must lock the AGI.
 *
 * @cluster_ag_base is the inode offset of the cluster within the AG.
 * @cluster_bp is the cluster buffer.
 * @cluster_index is the inode offset within the inode cluster.
 */
STATIC int
xrep_ibt_check_ifree(
	struct xrep_ibt		*ri,
	xfs_agino_t		cluster_ag_base,
	struct xfs_buf		*cluster_bp,
	unsigned int		cluster_index,
	bool			*inuse)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_dinode	*dip;
	xfs_ino_t		fsino;
	xfs_agino_t		agino;
	xfs_agnumber_t		agno = ri->sc->sa.pag->pag_agno;
	unsigned int		cluster_buf_base;
	unsigned int		offset;
	int			error;

	agino = cluster_ag_base + cluster_index;
	fsino = XFS_AGINO_TO_INO(mp, agno, agino);

	/* Inode uncached or half assembled, read disk buffer */
	cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base);
	offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize;
	if (offset >= BBTOB(cluster_bp->b_length))
		return -EFSCORRUPTED;
	dip = xfs_buf_offset(cluster_bp, offset);
	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
		return -EFSCORRUPTED;

	if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)
		return -EFSCORRUPTED;

	/* Will the in-core inode tell us if it's in use? */
	error = xchk_inode_is_allocated(sc, agino, inuse);
	if (!error)
		return 0;

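	/*
	 * Not in core, so fall back to the on-disk mode we just read: any
	 * nonzero di_mode means the inode is allocated.
	 */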
	*inuse = dip->di_mode != 0;
	return 0;
}

/* Stash the accumulated inobt record for rebuilding. */
STATIC int
xrep_ibt_stash(
	struct xrep_ibt		*ri)
{
	int			error = 0;

	if (xchk_should_terminate(ri->sc, &error))
		return error;

	ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie);
	if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL)
		return -EFSCORRUPTED;

	if (ri->rie.ir_freecount > 0)
		ri->finobt_recs++;

	trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie);

	error = xfarray_append(ri->inode_records, &ri->rie);
	if (error)
		return error;

	ri->rie.ir_startino = NULLAGINO;
	return 0;
}

/*
 * Given an extent of inodes and an inode cluster buffer, calculate the
 * location of the corresponding inobt record (creating it if necessary),
 * then update the parts of the holemask and freemask of that record that
 * correspond to the inode extent we were given.
 *
 * @cluster_ir_startino is the AG inode number of an inobt record that we're
 * proposing to create for this inode cluster.  If sparse inodes are enabled,
 * we must round down to a chunk boundary to find the actual sparse record.
 * @cluster_bp is the buffer of the inode cluster.
 * @nr_inodes is the number of inodes to check from the cluster.
 */
STATIC int
xrep_ibt_cluster_record(
	struct xrep_ibt		*ri,
	xfs_agino_t		cluster_ir_startino,
	struct xfs_buf		*cluster_bp,
	unsigned int		nr_inodes)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_mount	*mp = sc->mp;
	xfs_agino_t		ir_startino;
	unsigned int		cluster_base;
	unsigned int		cluster_index;
	int			error = 0;

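	/*
	 * On a sparse inode fs, multiple clusters can share one inobt record,
	 * so round the cluster's startino down to the 64-inode chunk boundary
	 * to find the startino of the record that covers it.
	 */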
	ir_startino = cluster_ir_startino;
	if (xfs_has_sparseinodes(mp))
		ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK);
	cluster_base = cluster_ir_startino - ir_startino;

	/*
	 * If the accumulated inobt record doesn't map this cluster, add it to
	 * the list and reset it.
	 */
	if (ri->rie.ir_startino != NULLAGINO &&
	    ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) {
		error = xrep_ibt_stash(ri);
		if (error)
			return error;
	}

	if (ri->rie.ir_startino == NULLAGINO) {
		ri->rie.ir_startino = ir_startino;
		ri->rie.ir_free = XFS_INOBT_ALL_FREE;
		ri->rie.ir_holemask = 0xFFFF;
		ri->rie.ir_count = 0;
	}

	/* Record the whole cluster. */
	ri->icount += nr_inodes;
	ri->rie.ir_count += nr_inodes;
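
	/*
	 * Each holemask bit covers XFS_INODES_PER_HOLEMASK_BIT (i.e. 4)
	 * inodes, so clear the nr_inodes/4 bits starting at cluster_base/4
	 * to mark this cluster's inodes as present.
	 */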
	ri->rie.ir_holemask &= ~xfs_inobt_maskn(
				cluster_base / XFS_INODES_PER_HOLEMASK_BIT,
				nr_inodes / XFS_INODES_PER_HOLEMASK_BIT);

	/* Which inodes within this cluster are free? */
	for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
		bool	inuse = false;

		error = xrep_ibt_check_ifree(ri, cluster_ir_startino,
				cluster_bp, cluster_index, &inuse);
		if (error)
			return error;
		if (!inuse)
			continue;
		ri->iused++;
		ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base +
						   cluster_index);
	}
	return 0;
}

/*
 * For each inode cluster covering the physical extent recorded by the
 * rmapbt, we must calculate the properly aligned startino of that cluster,
 * then iterate each cluster to fill in used and filled masks appropriately.
 * We then use the (startino, used, filled) information to construct the
 * appropriate inode records.
 */
STATIC int
xrep_ibt_process_cluster(
	struct xrep_ibt		*ri,
	xfs_agblock_t		cluster_bno)
{
	struct xfs_imap		imap;
	struct xfs_buf		*cluster_bp;
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	xfs_agino_t		cluster_ag_base;
	xfs_agino_t		irec_index;
	unsigned int		nr_inodes;
	int			error;

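	/*
	 * An inobt record maps at most one 64-inode chunk, so a single record
	 * can never cover more than XFS_INODES_PER_CHUNK of this cluster's
	 * inodes, no matter how large the cluster is.
	 */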
	nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster,
			XFS_INODES_PER_CHUNK);

	/*
	 * Grab the inode cluster buffer.  This is safe to do with a broken
	 * inobt because imap_to_bp directly maps the buffer without touching
	 * either inode btree.
	 */
	imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno);
	imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
	imap.im_boffset = 0;
	error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp);
	if (error)
		return error;

	/*
	 * Record the contents of each possible inobt record mapping this
	 * cluster.
	 */
	cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno);
	for (irec_index = 0;
	     irec_index < igeo->inodes_per_cluster;
	     irec_index += XFS_INODES_PER_CHUNK) {
		error = xrep_ibt_cluster_record(ri,
				cluster_ag_base + irec_index, cluster_bp,
				nr_inodes);
		if (error)
			break;
	}

	xfs_trans_brelse(sc->tp, cluster_bp);
	return error;
}

/* Check for any obvious conflicts in the inode chunk extent. */
STATIC int
xrep_ibt_check_inode_ext(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	xfs_agino_t		agino;
	enum xbtree_recpacking	outcome;
	int			error;

	/* Inode records must be within the AG. */
	if (!xfs_verify_agbext(sc->sa.pag, agbno, len))
		return -EFSCORRUPTED;

	/* The entire record must align to the inode cluster size. */
	if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) ||
	    !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster))
		return -EFSCORRUPTED;

	/*
	 * The entire record must also adhere to the inode cluster alignment
	 * size if sparse inodes are not enabled.
	 */
	if (!xfs_has_sparseinodes(mp) &&
	    (!IS_ALIGNED(agbno, igeo->cluster_align) ||
	     !IS_ALIGNED(agbno + len, igeo->cluster_align)))
		return -EFSCORRUPTED;

	/*
	 * On a sparse inode fs, this cluster could be part of a sparse chunk.
	 * Sparse clusters must be aligned to sparse chunk alignment.
	 */
	if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align &&
	    (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
	     !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align)))
		return -EFSCORRUPTED;

	/* Make sure the entire range of blocks are valid AG inodes. */
	agino = XFS_AGB_TO_AGINO(mp, agbno);
	if (!xfs_verify_agino(sc->sa.pag, agino))
		return -EFSCORRUPTED;

	agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1;
	if (!xfs_verify_agino(sc->sa.pag, agino))
		return -EFSCORRUPTED;

	/* Make sure this isn't free space. */
	error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	return 0;
}

/* Found a fragment of the old inode btrees; dispose of them later. */
STATIC int
xrep_ibt_record_old_btree_blocks(
	struct xrep_ibt			*ri,
	const struct xfs_rmap_irec	*rec)
{
	if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock,
				rec->rm_blockcount))
		return -EFSCORRUPTED;

	return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock,
			rec->rm_blockcount);
}

/* Record extents that belong to inode cluster blocks. */
STATIC int
xrep_ibt_record_inode_blocks(
	struct xrep_ibt			*ri,
	const struct xfs_rmap_irec	*rec)
{
	struct xfs_mount		*mp = ri->sc->mp;
	struct xfs_ino_geometry		*igeo = M_IGEO(mp);
	xfs_agblock_t			cluster_base;
	int				error;

	error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock,
			rec->rm_blockcount);
	if (error)
		return error;

	trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno,
			rec->rm_startblock, rec->rm_blockcount, rec->rm_owner,
			rec->rm_offset, rec->rm_flags);

	/*
	 * Record the free/hole masks for each inode cluster that could be
	 * mapped by this rmap record.
	 */
	for (cluster_base = 0;
	     cluster_base < rec->rm_blockcount;
	     cluster_base += igeo->blocks_per_cluster) {
		error = xrep_ibt_process_cluster(ri,
				rec->rm_startblock + cluster_base);
		if (error)
			return error;
	}

	return 0;
}

STATIC int
xrep_ibt_walk_rmap(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xrep_ibt			*ri = priv;
	int				error = 0;

	if (xchk_should_terminate(ri->sc, &error))
		return error;

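	/*
	 * The only rmap owners of interest here are the inode clusters
	 * themselves (OWN_INODES) and the old inode btree blocks (OWN_INOBT);
	 * mappings with any other owner are ignored.
	 */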
	switch (rec->rm_owner) {
	case XFS_RMAP_OWN_INOBT:
		return xrep_ibt_record_old_btree_blocks(ri, rec);
	case XFS_RMAP_OWN_INODES:
		return xrep_ibt_record_inode_blocks(ri, rec);
	}
	return 0;
}

/*
 * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode
 * btrees (OWN_INOBT).  Figure out if we have enough free space to
 * reconstruct the inode btrees.  The caller must clean up the lists if
 * anything goes wrong.
 */
STATIC int
xrep_ibt_find_inodes(
	struct xrep_ibt		*ri)
{
	struct xfs_scrub	*sc = ri->sc;
	int			error;

	ri->rie.ir_startino = NULLAGINO;

	/* Collect all reverse mappings for inode blocks. */
	xrep_ag_btcur_init(sc, &sc->sa);
	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri);
	xchk_ag_btcur_free(&sc->sa);
	if (error)
		return error;

	/* If we have a record ready to go, add it to the array. */
	if (ri->rie.ir_startino != NULLAGINO)
		return xrep_ibt_stash(ri);

	return 0;
}

/* Update the AGI counters. */
STATIC int
xrep_ibt_reset_counters(
	struct xrep_ibt		*ri)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_agi		*agi = sc->sa.agi_bp->b_addr;
	unsigned int		freecount = ri->icount - ri->iused;

	/* Trigger inode count recalculation */
	xfs_force_summary_recalc(sc->mp);

	/*
	 * The AGI header contains extra information related to the inode
	 * btrees, so we must update those fields here.
	 */
	agi->agi_count = cpu_to_be32(ri->icount);
	agi->agi_freecount = cpu_to_be32(freecount);
	xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp,
			   XFS_AGI_COUNT | XFS_AGI_FREECOUNT);

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagi(sc);
}

/* Retrieve finobt data for bulk load. */
STATIC int
xrep_fibt_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct xfs_inobt_rec_incore	*irec = &cur->bc_rec.i;
	struct xrep_ibt			*ri = priv;
	union xfs_btree_rec		*block_rec;
	unsigned int			loaded;
	int				error;

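	/*
	 * The finobt only indexes chunks that still contain free inodes, so
	 * the inner loop below skips over any stashed record whose freecount
	 * is zero.
	 */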
	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		do {
			error = xfarray_load(ri->inode_records,
					ri->array_cur++, irec);
		} while (error == 0 && xfs_inobt_rec_freecount(irec) == 0);
		if (error)
			return error;

		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

/* Retrieve inobt data for bulk load. */
STATIC int
xrep_ibt_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct xfs_inobt_rec_incore	*irec = &cur->bc_rec.i;
	struct xrep_ibt			*ri = priv;
	union xfs_btree_rec		*block_rec;
	unsigned int			loaded;
	int				error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		error = xfarray_load(ri->inode_records, ri->array_cur++, irec);
		if (error)
			return error;

		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

/* Feed one of the new inobt blocks to the bulk loader. */
STATIC int
xrep_ibt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_ibt		*ri = priv;

	return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr);
}

/* Feed one of the new finobt blocks to the bulk loader. */
STATIC int
xrep_fibt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_ibt		*ri = priv;

	return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr);
}

/* Make sure the records do not overlap in inumber address space. */
STATIC int
xrep_ibt_check_overlap(
	struct xrep_ibt		*ri)
{
	struct xfs_inobt_rec_incore	irec;
	xfarray_idx_t			cur;
	xfs_agino_t			next_agino = 0;
	int				error = 0;

	foreach_xfarray_idx(ri->inode_records, cur) {
		if (xchk_should_terminate(ri->sc, &error))
			return error;

		error = xfarray_load(ri->inode_records, cur, &irec);
		if (error)
			return error;

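		/*
		 * The records were stashed in increasing agino order, so
		 * each one must start at or beyond the end of the previous
		 * 64-inode chunk.
		 */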
		if (irec.ir_startino < next_agino)
			return -EFSCORRUPTED;

		next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK;
	}

	return error;
}

/* Build new inode btrees and dispose of the old one. */
STATIC int
xrep_ibt_build_new_trees(
	struct xrep_ibt		*ri)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_btree_cur	*ino_cur;
	struct xfs_btree_cur	*fino_cur = NULL;
	xfs_fsblock_t		fsbno;
	bool			need_finobt;
	int			error;

	need_finobt = xfs_has_finobt(sc->mp);

	/*
	 * Create new btrees for staging all the inobt records we collected
	 * earlier.  The records were collected in order of increasing agino,
	 * so we do not have to sort them.  Ensure there are no overlapping
	 * records.
	 */
	error = xrep_ibt_check_overlap(ri);
	if (error)
		return error;

	/*
	 * The new inode btrees will not be rooted in the AGI until we've
	 * successfully rebuilt the tree.
	 *
	 * Start by setting up the inobt staging cursor.
	 */
	fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
			XFS_IBT_BLOCK(sc->mp));
	xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno,
			XFS_AG_RESV_NONE);
	ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
	ri->new_inobt.bload.get_records = xrep_ibt_get_records;

	ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL);
	xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake);
	error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
			xfarray_length(ri->inode_records));
	if (error)
		goto err_inocur;

	/* Set up finobt staging cursor. */
	if (need_finobt) {
		enum xfs_ag_resv_type	resv = XFS_AG_RESV_METADATA;

		if (sc->mp->m_finobt_nores)
			resv = XFS_AG_RESV_NONE;

		fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
				XFS_FIBT_BLOCK(sc->mp));
		xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT,
				fsbno, resv);
		ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
		ri->new_finobt.bload.get_records = xrep_fibt_get_records;

		fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL);
		xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake);
		error = xfs_btree_bload_compute_geometry(fino_cur,
				&ri->new_finobt.bload, ri->finobt_recs);
		if (error)
			goto err_finocur;
	}

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto err_finocur;

	/* Reserve all the space we need to build the new btrees. */
	error = xrep_newbt_alloc_blocks(&ri->new_inobt,
			ri->new_inobt.bload.nr_blocks);
	if (error)
		goto err_finocur;

	if (need_finobt) {
		error = xrep_newbt_alloc_blocks(&ri->new_finobt,
				ri->new_finobt.bload.nr_blocks);
		if (error)
			goto err_finocur;
	}

	/* Add all inobt records. */
	ri->array_cur = XFARRAY_CURSOR_INIT;
	error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri);
	if (error)
		goto err_finocur;

	/* Add all finobt records. */
	if (need_finobt) {
		ri->array_cur = XFARRAY_CURSOR_INIT;
		error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri);
		if (error)
			goto err_finocur;
	}

	/*
	 * Install the new btrees in the AG header.  After this point the old
	 * btrees are no longer accessible and the new trees are live.
	 */
	xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp);
	xfs_btree_del_cursor(ino_cur, 0);

	if (fino_cur) {
		xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp);
		xfs_btree_del_cursor(fino_cur, 0);
	}

	/* Reset the AGI counters now that we've changed the inode roots. */
	error = xrep_ibt_reset_counters(ri);
	if (error)
		goto err_finobt;

	/* Free unused blocks and bitmap. */
	if (need_finobt) {
		error = xrep_newbt_commit(&ri->new_finobt);
		if (error)
			goto err_inobt;
	}
	error = xrep_newbt_commit(&ri->new_inobt);
	if (error)
		return error;

	return xrep_roll_ag_trans(sc);

err_finocur:
	if (need_finobt)
		xfs_btree_del_cursor(fino_cur, error);
err_inocur:
	xfs_btree_del_cursor(ino_cur, error);
err_finobt:
	if (need_finobt)
		xrep_newbt_cancel(&ri->new_finobt);
err_inobt:
	xrep_newbt_cancel(&ri->new_inobt);
	return error;
}

/*
 * Now that we've logged the roots of the new btrees, invalidate all of the
 * old blocks and free them.
 */
STATIC int
xrep_ibt_remove_old_trees(
	struct xrep_ibt		*ri)
{
	struct xfs_scrub	*sc = ri->sc;
	int			error;

	/*
	 * Free the old inode btree blocks if they're not in use.  It's ok to
	 * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG
	 * reservation because we reset the reservation before releasing the
	 * AGI and AGF header buffer locks.
	 */
	error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks,
			&XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE);
	if (error)
		return error;

	/*
	 * If the finobt is enabled and has a per-AG reservation, make sure we
	 * reinitialize the per-AG reservations.
	 */
	if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores)
		sc->flags |= XREP_RESET_PERAG_RESV;

	return 0;
}

/* Repair both inode btrees. */
int
xrep_iallocbt(
	struct xfs_scrub	*sc)
{
	struct xrep_ibt		*ri;
	struct xfs_mount	*mp = sc->mp;
	char			*descr;
	xfs_agino_t		first_agino, last_agino;
	int			error = 0;

	/* We require the rmapbt to rebuild anything. */
	if (!xfs_has_rmapbt(mp))
		return -EOPNOTSUPP;

	ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS);
	if (!ri)
		return -ENOMEM;
	ri->sc = sc;

	/* We rebuild both inode btrees. */
	sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT;

	/* Set up enough storage to handle an AG with nothing but inodes. */
	xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino);
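	/*
	 * Each inobt record spans a 64-inode chunk, so dividing the highest
	 * possible agino by XFS_INODES_PER_CHUNK bounds the number of
	 * records the xfarray might have to hold.
	 */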
	last_agino /= XFS_INODES_PER_CHUNK;
	descr = xchk_xfile_ag_descr(sc, "inode index records");
	error = xfarray_create(descr, last_agino,
			sizeof(struct xfs_inobt_rec_incore),
			&ri->inode_records);
	kfree(descr);
	if (error)
		goto out_ri;

	/* Collect the inode data and find the old btree blocks. */
	xagb_bitmap_init(&ri->old_iallocbt_blocks);
	error = xrep_ibt_find_inodes(ri);
	if (error)
		goto out_bitmap;

	/* Rebuild the inode indexes. */
	error = xrep_ibt_build_new_trees(ri);
	if (error)
		goto out_bitmap;

	/* Kill the old tree. */
	error = xrep_ibt_remove_old_trees(ri);
	if (error)
		goto out_bitmap;

out_bitmap:
	xagb_bitmap_destroy(&ri->old_iallocbt_blocks);
	xfarray_destroy(ri->inode_records);
out_ri:
	kfree(ri);
	return error;
}

/* Make sure both btrees are ok after we've rebuilt them. */
int
xrep_revalidate_iallocbt(
	struct xfs_scrub	*sc)
{
	__u32			old_type = sc->sm->sm_type;
	int			error;

	/*
	 * We must update sm_type temporarily so that the tree-to-tree cross
	 * reference checks will work in the correct direction, and also so
	 * that tracing will report correctly if there are more errors.
	 */
	sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT;
	error = xchk_iallocbt(sc);
	if (error)
		goto out;

	if (xfs_has_finobt(sc->mp)) {
		sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT;
		error = xchk_iallocbt(sc);
	}

out:
	sc->sm->sm_type = old_type;
	return error;
}