// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_btree_staging.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "xfs_log.h"
#include "xfs_trans_priv.h"
#include "xfs_error.h"
#include "xfs_health.h"
#include "xfs_ag.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
#include "scrub/agb_bitmap.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
#include "scrub/newbt.h"
#include "scrub/reap.h"

/*
 * Inode Btree Repair
 * ==================
 *
 * A quick refresher of inode btrees on a v5 filesystem:
 *
 * - Inode records are read into memory in units of 'inode clusters'. The
 *   number of inodes that fit in a cluster buffer is the smallest number of
 *   inodes that can be allocated or freed at once. Clusters are never smaller
 *   than one fs block though they can span multiple blocks. The size (in fs
 *   blocks) is computed with xfs_icluster_size_fsb(). The fs block alignment
 *   of a cluster is computed with xfs_ialloc_cluster_alignment().
 *
 * - Each inode btree record can describe a single 'inode chunk'. The chunk
 *   size is defined to be 64 inodes. If sparse inodes are enabled, every
 *   inobt record must be aligned to the chunk size; if not, every record must
 *   be aligned to the start of a cluster. It is possible to construct an XFS
 *   geometry where one inobt record maps to multiple inode clusters; it is
 *   also possible to construct a geometry where multiple inobt records map to
 *   different parts of one inode cluster.
 *
 * - If sparse inodes are not enabled, the smallest unit of allocation for
 *   inode records is enough to contain one inode chunk's worth of inodes.
 *
 * - If sparse inodes are enabled, the holemask field will be active. Each
 *   bit of the holemask represents 4 potential inodes; if set, the
 *   corresponding space does *not* contain inodes and must be left alone.
 *   Clusters cannot be smaller than 4 inodes. The smallest unit of allocation
 *   of inode records is one inode cluster.
 *
 * So what's the rebuild algorithm?
 *
 * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT
 * records. The OWN_INOBT records are the old inode btree blocks and will be
 * cleared out after we've rebuilt the tree. Each possible inode cluster
 * within an OWN_INODES record will be read in; for each possible inobt record
 * associated with that cluster, compute the freemask calculated from the
 * i_mode data in the inode chunk. For sparse inodes the holemask will be
 * calculated by creating the properly aligned inobt record and punching out
 * any chunk that's missing. Inode allocations and frees grab the AGI first,
 * so repair protects itself from concurrent access by locking the AGI.
 *
 * Once we've reconstructed all the inode records, we can create new inode
 * btree roots and reload the btrees. We rebuild both inode trees at the same
 * time because they have the same rmap owner and it would be more complex to
 * figure out whether the other tree actually needs a rebuild and which
 * OWN_INOBT blocks it owns. We have all the data we need to build both, so
 * dump everything and start over.
 *
 * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once.
 */
95
96 struct xrep_ibt {
97 /* Record under construction. */
98 struct xfs_inobt_rec_incore rie;
99
100 /* new inobt information */
101 struct xrep_newbt new_inobt;
102
103 /* new finobt information */
104 struct xrep_newbt new_finobt;
105
106 /* Old inode btree blocks we found in the rmap. */
107 struct xagb_bitmap old_iallocbt_blocks;
108
109 /* Reconstructed inode records. */
110 struct xfarray *inode_records;
111
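	/* Scrub context for this repair. */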
	struct xfs_scrub	*sc;

	/* Number of inodes assigned disk space. */
	unsigned int		icount;

	/* Number of inodes in use. */
	unsigned int		iused;

	/* Number of finobt records needed. */
	unsigned int		finobt_recs;

	/* get_records()'s position in the inode record array. */
	xfarray_idx_t		array_cur;
};

/*
 * Is this inode in use? If the inode is in memory we can tell from i_mode,
 * otherwise we have to check di_mode in the on-disk buffer. We only care
 * that the high (i.e. non-permission) bits of _mode are zero. This should be
 * safe because repair keeps all AG headers locked until the end, and any
 * process trying to perform an inode allocation/free must lock the AGI.
 *
 * @cluster_ag_base is the inode offset of the cluster within the AG.
 * @cluster_bp is the cluster buffer.
 * @cluster_index is the inode offset within the inode cluster.
 */
STATIC int
xrep_ibt_check_ifree(
	struct xrep_ibt		*ri,
	xfs_agino_t		cluster_ag_base,
	struct xfs_buf		*cluster_bp,
	unsigned int		cluster_index,
	bool			*inuse)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_dinode	*dip;
	xfs_agino_t		agino;
	unsigned int		cluster_buf_base;
	unsigned int		offset;
	int			error;

	agino = cluster_ag_base + cluster_index;

	/* Inode uncached or half assembled, read disk buffer */
	cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base);
	offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize;
	if (offset >= BBTOB(cluster_bp->b_length))
		return -EFSCORRUPTED;
	dip = xfs_buf_offset(cluster_bp, offset);
	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
		return -EFSCORRUPTED;

	if (dip->di_version >= 3 &&
	    be64_to_cpu(dip->di_ino) != xfs_agino_to_ino(ri->sc->sa.pag, agino))
		return -EFSCORRUPTED;

	/* Will the in-core inode tell us if it's in use? */
	error = xchk_inode_is_allocated(sc, agino, inuse);
	if (!error)
		return 0;

	*inuse = dip->di_mode != 0;
	return 0;
}

/* Stash the accumulated inobt record for rebuilding. */
STATIC int
xrep_ibt_stash(
	struct xrep_ibt		*ri)
{
	int			error = 0;

	if (xchk_should_terminate(ri->sc, &error))
		return error;

	ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie);
	if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL)
		return -EFSCORRUPTED;

	if (ri->rie.ir_freecount > 0)
		ri->finobt_recs++;

	trace_xrep_ibt_found(ri->sc->sa.pag, &ri->rie);

	error = xfarray_append(ri->inode_records, &ri->rie);
	if (error)
		return error;

	ri->rie.ir_startino = NULLAGINO;
	return 0;
}

/*
 * Given an extent of inodes and an inode cluster buffer, calculate the
 * location of the corresponding inobt record (creating it if necessary),
 * then update the parts of the holemask and freemask of that record that
 * correspond to the inode extent we were given.
 *
 * @cluster_ir_startino is the AG inode number of an inobt record that we're
 * proposing to create for this inode cluster. If sparse inodes are enabled,
 * we must round down to a chunk boundary to find the actual sparse record.
 * @cluster_bp is the buffer of the inode cluster.
 * @nr_inodes is the number of inodes to check from the cluster.
 */
STATIC int
xrep_ibt_cluster_record(
	struct xrep_ibt		*ri,
	xfs_agino_t		cluster_ir_startino,
	struct xfs_buf		*cluster_bp,
	unsigned int		nr_inodes)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_mount	*mp = sc->mp;
	xfs_agino_t		ir_startino;
	unsigned int		cluster_base;
	unsigned int		cluster_index;
	int			error = 0;

	ir_startino = cluster_ir_startino;
	if (xfs_has_sparseinodes(mp))
		ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK);
	cluster_base = cluster_ir_startino - ir_startino;

	/*
	 * If the accumulated inobt record doesn't map this cluster, add it to
	 * the list and reset it.
	 */
	if (ri->rie.ir_startino != NULLAGINO &&
	    ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) {
		error = xrep_ibt_stash(ri);
		if (error)
			return error;
	}

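	/*
	 * Start a new record: mark everything as a hole and free until the
	 * cluster scan below proves otherwise.
	 */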
	if (ri->rie.ir_startino == NULLAGINO) {
		ri->rie.ir_startino = ir_startino;
		ri->rie.ir_free = XFS_INOBT_ALL_FREE;
		ri->rie.ir_holemask = 0xFFFF;
		ri->rie.ir_count = 0;
	}

	/* Record the whole cluster. */
	ri->icount += nr_inodes;
	ri->rie.ir_count += nr_inodes;
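	/* Clear the holemask bits covering this cluster's inodes. */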
	ri->rie.ir_holemask &= ~xfs_inobt_maskn(
				cluster_base / XFS_INODES_PER_HOLEMASK_BIT,
				nr_inodes / XFS_INODES_PER_HOLEMASK_BIT);

	/* Which inodes within this cluster are free? */
	for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
		bool		inuse = false;

		error = xrep_ibt_check_ifree(ri, cluster_ir_startino,
				cluster_bp, cluster_index, &inuse);
		if (error)
			return error;
		if (!inuse)
			continue;
		ri->iused++;
		ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base +
						   cluster_index);
	}
	return 0;
}

/*
 * For each inode cluster covering the physical extent recorded by the rmapbt,
 * we must calculate the properly aligned startino of that cluster, then
 * iterate each cluster to fill in used and filled masks appropriately. We
 * then use the (startino, used, filled) information to construct the
 * appropriate inode records.
 */
STATIC int
xrep_ibt_process_cluster(
	struct xrep_ibt		*ri,
	xfs_agblock_t		cluster_bno)
{
	struct xfs_imap		imap;
	struct xfs_buf		*cluster_bp;
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_mount	*mp = sc->mp;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	xfs_agino_t		cluster_ag_base;
	xfs_agino_t		irec_index;
	unsigned int		nr_inodes;
	int			error;

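	/*
	 * A single inobt record covers at most XFS_INODES_PER_CHUNK inodes,
	 * even if the cluster buffer holds more than that.
	 */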
	nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster,
			XFS_INODES_PER_CHUNK);

	/*
	 * Grab the inode cluster buffer. This is safe to do with a broken
	 * inobt because imap_to_bp directly maps the buffer without touching
	 * either inode btree.
	 */
	imap.im_blkno = xfs_agbno_to_daddr(sc->sa.pag, cluster_bno);
	imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
	imap.im_boffset = 0;
	error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp);
	if (error)
		return error;

	/*
	 * Record the contents of each possible inobt record mapping this
	 * cluster.
	 */
	cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno);
	for (irec_index = 0;
	     irec_index < igeo->inodes_per_cluster;
	     irec_index += XFS_INODES_PER_CHUNK) {
		error = xrep_ibt_cluster_record(ri,
				cluster_ag_base + irec_index, cluster_bp,
				nr_inodes);
		if (error)
			break;
	}

	xfs_trans_brelse(sc->tp, cluster_bp);
	return error;
}

/* Check for any obvious conflicts in the inode chunk extent. */
STATIC int
xrep_ibt_check_inode_ext(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno,
	xfs_extlen_t		len)
{
	struct xfs_mount	*mp = sc->mp;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	xfs_agino_t		agino;
	enum xbtree_recpacking	outcome;
	int			error;

	/* Inode records must be within the AG. */
	if (!xfs_verify_agbext(sc->sa.pag, agbno, len))
		return -EFSCORRUPTED;

	/* The entire record must align to the inode cluster size. */
	if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) ||
	    !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster))
		return -EFSCORRUPTED;

	/*
	 * The entire record must also adhere to the inode cluster alignment
	 * size if sparse inodes are not enabled.
	 */
	if (!xfs_has_sparseinodes(mp) &&
	    (!IS_ALIGNED(agbno, igeo->cluster_align) ||
	     !IS_ALIGNED(agbno + len, igeo->cluster_align)))
		return -EFSCORRUPTED;

	/*
	 * On a sparse inode fs, this cluster could be part of a sparse chunk.
	 * Sparse clusters must be aligned to sparse chunk alignment.
	 */
	if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align &&
	    (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
	     !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align)))
		return -EFSCORRUPTED;

	/* Make sure the entire range of blocks maps to valid AG inodes. */
	agino = XFS_AGB_TO_AGINO(mp, agbno);
	if (!xfs_verify_agino(sc->sa.pag, agino))
		return -EFSCORRUPTED;

	agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1;
	if (!xfs_verify_agino(sc->sa.pag, agino))
		return -EFSCORRUPTED;

	/* Make sure this isn't free space. */
	error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
	if (error)
		return error;
	if (outcome != XBTREE_RECPACKING_EMPTY)
		return -EFSCORRUPTED;

	return 0;
}

/* Found a fragment of the old inode btrees; dispose of them later. */
STATIC int
xrep_ibt_record_old_btree_blocks(
	struct xrep_ibt			*ri,
	const struct xfs_rmap_irec	*rec)
{
	if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock,
				rec->rm_blockcount))
		return -EFSCORRUPTED;

	return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock,
			rec->rm_blockcount);
}

/* Record extents that belong to inode cluster blocks. */
STATIC int
xrep_ibt_record_inode_blocks(
	struct xrep_ibt			*ri,
	const struct xfs_rmap_irec	*rec)
{
	struct xfs_mount	*mp = ri->sc->mp;
	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
	xfs_agblock_t		cluster_base;
	int			error;

	error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock,
			rec->rm_blockcount);
	if (error)
		return error;

	trace_xrep_ibt_walk_rmap(ri->sc->sa.pag, rec);

	/*
	 * Record the free/hole masks for each inode cluster that could be
	 * mapped by this rmap record.
	 */
	for (cluster_base = 0;
	     cluster_base < rec->rm_blockcount;
	     cluster_base += igeo->blocks_per_cluster) {
		error = xrep_ibt_process_cluster(ri,
				rec->rm_startblock + cluster_base);
		if (error)
			return error;
	}

	return 0;
}

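/* Dispatch an rmap record: scan inode clusters or remember old inobt blocks. */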
STATIC int
xrep_ibt_walk_rmap(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*rec,
	void				*priv)
{
	struct xrep_ibt		*ri = priv;
	int			error = 0;

	if (xchk_should_terminate(ri->sc, &error))
		return error;

	switch (rec->rm_owner) {
	case XFS_RMAP_OWN_INOBT:
		return xrep_ibt_record_old_btree_blocks(ri, rec);
	case XFS_RMAP_OWN_INODES:
		return xrep_ibt_record_inode_blocks(ri, rec);
	}
	return 0;
}

/*
 * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode
 * btrees (OWN_INOBT). Figure out if we have enough free space to reconstruct
 * the inode btrees. The caller must clean up the lists if anything goes
 * wrong.
 */
STATIC int
xrep_ibt_find_inodes(
	struct xrep_ibt		*ri)
{
	struct xfs_scrub	*sc = ri->sc;
	int			error;

	ri->rie.ir_startino = NULLAGINO;

	/* Collect all reverse mappings for inode blocks. */
	xrep_ag_btcur_init(sc, &sc->sa);
	error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri);
	xchk_ag_btcur_free(&sc->sa);
	if (error)
		return error;

	/* If we have a record ready to go, add it to the array. */
	if (ri->rie.ir_startino != NULLAGINO)
		return xrep_ibt_stash(ri);

	return 0;
}

/* Update the AGI counters. */
STATIC int
xrep_ibt_reset_counters(
	struct xrep_ibt		*ri)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_agi		*agi = sc->sa.agi_bp->b_addr;
	unsigned int		freecount = ri->icount - ri->iused;

	/* Trigger inode count recalculation */
	xfs_force_summary_recalc(sc->mp);

	/*
	 * The AGI header contains extra information related to the inode
	 * btrees, so we must update those fields here.
	 */
	agi->agi_count = cpu_to_be32(ri->icount);
	agi->agi_freecount = cpu_to_be32(freecount);
	xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp,
			   XFS_AGI_COUNT | XFS_AGI_FREECOUNT);

	/* Reinitialize with the values we just logged. */
	return xrep_reinit_pagi(sc);
}

/* Retrieve finobt data for bulk load. */
STATIC int
xrep_fibt_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct xfs_inobt_rec_incore	*irec = &cur->bc_rec.i;
	struct xrep_ibt			*ri = priv;
	union xfs_btree_rec		*block_rec;
	unsigned int			loaded;
	int				error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
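		/*
		 * The finobt only indexes chunks with at least one free
		 * inode, so skip stashed records whose freecount is zero.
		 */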
		do {
			error = xfarray_load(ri->inode_records,
					ri->array_cur++, irec);
		} while (error == 0 && xfs_inobt_rec_freecount(irec) == 0);
		if (error)
			return error;

		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

/* Retrieve inobt data for bulk load. */
STATIC int
xrep_ibt_get_records(
	struct xfs_btree_cur	*cur,
	unsigned int		idx,
	struct xfs_btree_block	*block,
	unsigned int		nr_wanted,
	void			*priv)
{
	struct xfs_inobt_rec_incore	*irec = &cur->bc_rec.i;
	struct xrep_ibt			*ri = priv;
	union xfs_btree_rec		*block_rec;
	unsigned int			loaded;
	int				error;

	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
		error = xfarray_load(ri->inode_records, ri->array_cur++, irec);
		if (error)
			return error;

		block_rec = xfs_btree_rec_addr(cur, idx, block);
		cur->bc_ops->init_rec_from_cur(cur, block_rec);
	}

	return loaded;
}

/* Feed one of the new inobt blocks to the bulk loader. */
STATIC int
xrep_ibt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_ibt		*ri = priv;

	return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr);
}

/* Feed one of the new finobt blocks to the bulk loader. */
STATIC int
xrep_fibt_claim_block(
	struct xfs_btree_cur	*cur,
	union xfs_btree_ptr	*ptr,
	void			*priv)
{
	struct xrep_ibt		*ri = priv;

	return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr);
}

/* Make sure the records do not overlap in inumber address space. */
STATIC int
xrep_ibt_check_overlap(
	struct xrep_ibt		*ri)
{
	struct xfs_inobt_rec_incore	irec;
	xfarray_idx_t			cur;
	xfs_agino_t			next_agino = 0;
	int				error = 0;

	foreach_xfarray_idx(ri->inode_records, cur) {
		if (xchk_should_terminate(ri->sc, &error))
			return error;

		error = xfarray_load(ri->inode_records, cur, &irec);
		if (error)
			return error;

		if (irec.ir_startino < next_agino)
			return -EFSCORRUPTED;

		next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK;
	}

	return error;
}

/* Build new inode btrees and dispose of the old ones. */
STATIC int
xrep_ibt_build_new_trees(
	struct xrep_ibt		*ri)
{
	struct xfs_scrub	*sc = ri->sc;
	struct xfs_btree_cur	*ino_cur;
	struct xfs_btree_cur	*fino_cur = NULL;
	bool			need_finobt;
	int			error;

	need_finobt = xfs_has_finobt(sc->mp);

	/*
	 * Create new btrees for staging all the inobt records we collected
	 * earlier. The records were collected in order of increasing agino,
	 * so we do not have to sort them. Ensure there are no overlapping
	 * records.
	 */
	error = xrep_ibt_check_overlap(ri);
	if (error)
		return error;

	/*
	 * The new inode btrees will not be rooted in the AGI until we've
	 * successfully rebuilt the tree.
	 *
	 * Start by setting up the inobt staging cursor.
	 */
	xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT,
			xfs_agbno_to_fsb(sc->sa.pag, XFS_IBT_BLOCK(sc->mp)),
			XFS_AG_RESV_NONE);
	ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
	ri->new_inobt.bload.get_records = xrep_ibt_get_records;

	ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL);
	xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake);
	error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
			xfarray_length(ri->inode_records));
	if (error)
		goto err_inocur;

	/* Set up finobt staging cursor. */
	if (need_finobt) {
		enum xfs_ag_resv_type	resv = XFS_AG_RESV_METADATA;

		if (sc->mp->m_finobt_nores)
			resv = XFS_AG_RESV_NONE;

		xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT,
				xfs_agbno_to_fsb(sc->sa.pag, XFS_FIBT_BLOCK(sc->mp)),
				resv);
		ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
		ri->new_finobt.bload.get_records = xrep_fibt_get_records;

		fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL);
		xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake);
		error = xfs_btree_bload_compute_geometry(fino_cur,
				&ri->new_finobt.bload, ri->finobt_recs);
		if (error)
			goto err_finocur;
	}

	/* Last chance to abort before we start committing fixes. */
	if (xchk_should_terminate(sc, &error))
		goto err_finocur;

	/* Reserve all the space we need to build the new btrees. */
	error = xrep_newbt_alloc_blocks(&ri->new_inobt,
			ri->new_inobt.bload.nr_blocks);
	if (error)
		goto err_finocur;

	if (need_finobt) {
		error = xrep_newbt_alloc_blocks(&ri->new_finobt,
				ri->new_finobt.bload.nr_blocks);
		if (error)
			goto err_finocur;
	}

	/* Add all inobt records. */
	ri->array_cur = XFARRAY_CURSOR_INIT;
	error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri);
	if (error)
		goto err_finocur;

	/* Add all finobt records. */
	if (need_finobt) {
		ri->array_cur = XFARRAY_CURSOR_INIT;
		error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri);
		if (error)
			goto err_finocur;
	}

	/*
	 * Install the new btrees in the AG header. After this point the old
	 * btrees are no longer accessible and the new trees are live.
	 */
	xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp);
	xfs_btree_del_cursor(ino_cur, 0);

	if (fino_cur) {
		xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp);
		xfs_btree_del_cursor(fino_cur, 0);
	}

	/* Reset the AGI counters now that we've changed the inode roots. */
	error = xrep_ibt_reset_counters(ri);
	if (error)
		goto err_finobt;

	/* Free unused blocks and bitmap. */
	if (need_finobt) {
		error = xrep_newbt_commit(&ri->new_finobt);
		if (error)
			goto err_inobt;
	}
	error = xrep_newbt_commit(&ri->new_inobt);
	if (error)
		return error;

	return xrep_roll_ag_trans(sc);

err_finocur:
	if (need_finobt)
		xfs_btree_del_cursor(fino_cur, error);
err_inocur:
	xfs_btree_del_cursor(ino_cur, error);
err_finobt:
	if (need_finobt)
		xrep_newbt_cancel(&ri->new_finobt);
err_inobt:
	xrep_newbt_cancel(&ri->new_inobt);
	return error;
}

/*
 * Now that we've logged the roots of the new btrees, invalidate all of the
 * old blocks and free them.
 */
STATIC int
xrep_ibt_remove_old_trees(
	struct xrep_ibt		*ri)
{
	struct xfs_scrub	*sc = ri->sc;
	int			error;

	/*
	 * Free the old inode btree blocks if they're not in use. It's ok to
	 * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG
	 * reservation because we reset the reservation before releasing the
	 * AGI and AGF header buffer locks.
	 */
	error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks,
			&XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE);
	if (error)
		return error;

	/*
	 * If the finobt is enabled and has a per-AG reservation, make sure we
	 * reinitialize the per-AG reservations.
	 */
	if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores)
		sc->flags |= XREP_RESET_PERAG_RESV;

	return 0;
}

/* Repair both inode btrees. */
int
xrep_iallocbt(
	struct xfs_scrub	*sc)
{
	struct xrep_ibt		*ri;
	struct xfs_mount	*mp = sc->mp;
	char			*descr;
	xfs_agino_t		first_agino, last_agino;
	int			error = 0;

	/* We require the rmapbt to rebuild anything. */
	if (!xfs_has_rmapbt(mp))
		return -EOPNOTSUPP;

	ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS);
	if (!ri)
		return -ENOMEM;
	ri->sc = sc;

	/* We rebuild both inode btrees. */
	sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT;

	/* Set up enough storage to handle an AG with nothing but inodes. */
	xfs_agino_range(mp, pag_agno(sc->sa.pag), &first_agino, &last_agino);
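	/* The record array needs at most one entry per chunk of inodes. */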
	last_agino /= XFS_INODES_PER_CHUNK;
	descr = xchk_xfile_ag_descr(sc, "inode index records");
	error = xfarray_create(descr, last_agino,
			sizeof(struct xfs_inobt_rec_incore),
			&ri->inode_records);
	kfree(descr);
	if (error)
		goto out_ri;

	/* Collect the inode data and find the old btree blocks. */
	xagb_bitmap_init(&ri->old_iallocbt_blocks);
	error = xrep_ibt_find_inodes(ri);
	if (error)
		goto out_bitmap;

	/* Rebuild the inode indexes. */
	error = xrep_ibt_build_new_trees(ri);
	if (error)
		goto out_bitmap;

	/* Kill the old tree. */
	error = xrep_ibt_remove_old_trees(ri);
	if (error)
		goto out_bitmap;

out_bitmap:
	xagb_bitmap_destroy(&ri->old_iallocbt_blocks);
	xfarray_destroy(ri->inode_records);
out_ri:
	kfree(ri);
	return error;
}

/* Make sure both btrees are ok after we've rebuilt them. */
int
xrep_revalidate_iallocbt(
	struct xfs_scrub	*sc)
{
	__u32			old_type = sc->sm->sm_type;
	int			error;

	/*
	 * We must update sm_type temporarily so that the tree-to-tree cross
	 * reference checks will work in the correct direction, and also so
	 * that tracing will report correctly if there are more errors.
	 */
	sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT;
	error = xchk_iallocbt(sc);
	if (error)
		goto out;

	if (xfs_has_finobt(sc->mp)) {
		sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT;
		error = xchk_iallocbt(sc);
	}

out:
	sc->sm->sm_type = old_type;
	return error;
}