1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (c) 2020-2024 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
5 */
6 #include "xfs_platform.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_bit.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans.h"
16 #include "xfs_sb.h"
17 #include "xfs_inode.h"
18 #include "xfs_icache.h"
19 #include "xfs_da_format.h"
20 #include "xfs_da_btree.h"
21 #include "xfs_dir2.h"
22 #include "xfs_dir2_priv.h"
23 #include "xfs_bmap.h"
24 #include "xfs_quota.h"
25 #include "xfs_bmap_btree.h"
26 #include "xfs_trans_space.h"
27 #include "xfs_bmap_util.h"
28 #include "xfs_exchmaps.h"
29 #include "xfs_exchrange.h"
30 #include "xfs_ag.h"
31 #include "xfs_parent.h"
32 #include "scrub/xfs_scrub.h"
33 #include "scrub/scrub.h"
34 #include "scrub/common.h"
35 #include "scrub/trace.h"
36 #include "scrub/repair.h"
37 #include "scrub/tempfile.h"
38 #include "scrub/tempexch.h"
39 #include "scrub/xfile.h"
40 #include "scrub/xfarray.h"
41 #include "scrub/xfblob.h"
42 #include "scrub/iscan.h"
43 #include "scrub/readdir.h"
44 #include "scrub/reap.h"
45 #include "scrub/findparent.h"
46 #include "scrub/orphanage.h"
47 #include "scrub/listxattr.h"
48
49 /*
50 * Directory Repair
51 * ================
52 *
53 * We repair directories by reading the directory data blocks looking for
54 * directory entries that look salvageable (name passes verifiers, entry points
55 * to a valid allocated inode, etc). Each entry worth salvaging is stashed in
56 * memory, and the stashed entries are periodically replayed into a temporary
57 * directory to constrain memory use. Batching the construction of the
58 * temporary directory in this fashion reduces lock cycling of the directory
59 * being repaired and the temporary directory, and will later become important
60 * for parent pointer scanning.
61 *
62 * If parent pointers are enabled on this filesystem, we instead reconstruct
63 * the directory by visiting each parent pointer of each file in the filesystem
64 * and translating the relevant parent pointer records into dirents. In this
65 * case, it is advantageous to stash all directory entries created from parent
66 * pointers for a single child file before replaying them into the temporary
67 * directory. To save memory, the live filesystem scan reuses the findparent
68 * fields. Directory repair chooses either parent pointer scanning or
69 * directory entry salvaging, but not both.
70 *
71 * Directory entries added to the temporary directory do not elevate the link
72 * counts of the inodes found. When salvaging completes, the remaining stashed
73 * entries are replayed to the temporary directory. An atomic mapping exchange
74 * is used to commit the new directory blocks to the directory being repaired.
75 * This will disrupt readdir cursors.
76 *
77 * Locking Issues
78 * --------------
79 *
80 * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on
81 * /a/b for a "mv /a/b /c/" operation. This means that only b's ILOCK protects
82 * b's dotdot update. This is in contrast to every other dotdot update (link,
83 * remove, mkdir). If the repair code drops the ILOCK, it must either
84 * revalidate the dotdot entry or use dirent hooks to capture updates from
85 * other threads.
86 */
87
/* Stashed action: add this dirent to the temporary directory. */
#define XREP_DIRENT_ADD		(1)

/* Stashed action: delete this dirent from the temporary directory. */
#define XREP_DIRENT_REMOVE	(2)
93
94 /* Directory entry to be restored in the new directory. */
95 struct xrep_dirent {
96 /* Cookie for retrieval of the dirent name. */
97 xfblob_cookie name_cookie;
98
99 /* Target inode number. */
100 xfs_ino_t ino;
101
102 /* Length of the dirent name. */
103 uint8_t namelen;
104
105 /* File type of the dirent. */
106 uint8_t ftype;
107
108 /* XREP_DIRENT_{ADD,REMOVE} */
109 uint8_t action;
110 };
111
/*
 * Ceiling on the recovered dirent data (dir_entries plus dir_names) that may
 * sit in memory before it is written to the temp dir: eight pages.
 */
#define XREP_DIR_MAX_STASH_BYTES	(PAGE_SIZE * 8)
117
118 struct xrep_dir {
119 struct xfs_scrub *sc;
120
121 /* Fixed-size array of xrep_dirent structures. */
122 struct xfarray *dir_entries;
123
124 /* Blobs containing directory entry names. */
125 struct xfblob *dir_names;
126
127 /* Information for exchanging data forks at the end. */
128 struct xrep_tempexch tx;
129
130 /* Preallocated args struct for performing dir operations */
131 struct xfs_da_args args;
132
133 /*
134 * Information used to scan the filesystem to find the inumber of the
135 * dotdot entry for this directory. For directory salvaging when
136 * parent pointers are not enabled, we use the findparent_* functions
137 * on this object and access only the parent_ino field directly.
138 *
139 * When parent pointers are enabled, however, the pptr scanner uses the
140 * iscan, hooks, lock, and parent_ino fields of this object directly.
141 * @pscan.lock coordinates access to dir_entries, dir_names,
142 * parent_ino, subdirs, dirents, and args. This reduces the memory
143 * requirements of this structure.
144 */
145 struct xrep_parent_scan_info pscan;
146
147 /*
148 * Context information for attaching this directory to the lost+found
149 * if this directory does not have a parent.
150 */
151 struct xrep_adoption adoption;
152
153 /* How many subdirectories did we find? */
154 uint64_t subdirs;
155
156 /* How many dirents did we find? */
157 unsigned int dirents;
158
159 /* Should we move this directory to the orphanage? */
160 bool needs_adoption;
161
162 /* Directory entry name, plus the trailing null. */
163 struct xfs_name xname;
164 unsigned char namebuf[MAXNAMELEN];
165 };
166
167 /* Tear down all the incore stuff we created. */
168 static void
xrep_dir_teardown(struct xfs_scrub * sc)169 xrep_dir_teardown(
170 struct xfs_scrub *sc)
171 {
172 struct xrep_dir *rd = sc->buf;
173
174 xrep_findparent_scan_teardown(&rd->pscan);
175 if (rd->dir_names)
176 xfblob_destroy(rd->dir_names);
177 rd->dir_names = NULL;
178 if (rd->dir_entries)
179 xfarray_destroy(rd->dir_entries);
180 rd->dir_entries = NULL;
181 }
182
183 /* Set up for a directory repair. */
184 int
xrep_setup_directory(struct xfs_scrub * sc)185 xrep_setup_directory(
186 struct xfs_scrub *sc)
187 {
188 struct xrep_dir *rd;
189 int error;
190
191 xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
192
193 error = xrep_orphanage_try_create(sc);
194 if (error)
195 return error;
196
197 error = xrep_tempfile_create(sc, S_IFDIR);
198 if (error)
199 return error;
200
201 rd = kvzalloc_obj(struct xrep_dir, XCHK_GFP_FLAGS);
202 if (!rd)
203 return -ENOMEM;
204 rd->sc = sc;
205 rd->xname.name = rd->namebuf;
206 sc->buf = rd;
207
208 return 0;
209 }
210
211 /*
212 * Look up the dotdot entry and confirm that it's really the parent.
213 * Returns NULLFSINO if we don't know what to do.
214 */
215 static inline xfs_ino_t
xrep_dir_lookup_parent(struct xrep_dir * rd)216 xrep_dir_lookup_parent(
217 struct xrep_dir *rd)
218 {
219 struct xfs_scrub *sc = rd->sc;
220 xfs_ino_t ino;
221 int error;
222
223 error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL);
224 if (error)
225 return NULLFSINO;
226 if (!xfs_verify_dir_ino(sc->mp, ino))
227 return NULLFSINO;
228
229 error = xrep_findparent_confirm(sc, &ino);
230 if (error)
231 return NULLFSINO;
232
233 return ino;
234 }
235
236 /*
237 * Look up '..' in the dentry cache and confirm that it's really the parent.
238 * Returns NULLFSINO if the dcache misses or if the hit is implausible.
239 */
240 static inline xfs_ino_t
xrep_dir_dcache_parent(struct xrep_dir * rd)241 xrep_dir_dcache_parent(
242 struct xrep_dir *rd)
243 {
244 struct xfs_scrub *sc = rd->sc;
245 xfs_ino_t parent_ino;
246 int error;
247
248 parent_ino = xrep_findparent_from_dcache(sc);
249 if (parent_ino == NULLFSINO)
250 return parent_ino;
251
252 error = xrep_findparent_confirm(sc, &parent_ino);
253 if (error)
254 return NULLFSINO;
255
256 return parent_ino;
257 }
258
259 /* Try to find the parent of the directory being repaired. */
260 STATIC int
xrep_dir_find_parent(struct xrep_dir * rd)261 xrep_dir_find_parent(
262 struct xrep_dir *rd)
263 {
264 xfs_ino_t ino;
265
266 ino = xrep_findparent_self_reference(rd->sc);
267 if (ino != NULLFSINO) {
268 xrep_findparent_scan_finish_early(&rd->pscan, ino);
269 return 0;
270 }
271
272 ino = xrep_dir_dcache_parent(rd);
273 if (ino != NULLFSINO) {
274 xrep_findparent_scan_finish_early(&rd->pscan, ino);
275 return 0;
276 }
277
278 ino = xrep_dir_lookup_parent(rd);
279 if (ino != NULLFSINO) {
280 xrep_findparent_scan_finish_early(&rd->pscan, ino);
281 return 0;
282 }
283
284 /*
285 * A full filesystem scan is the last resort. On a busy filesystem,
286 * the scan can fail with -EBUSY if we cannot grab IOLOCKs. That means
287 * that we don't know what who the parent is, so we should return to
288 * userspace.
289 */
290 return xrep_findparent_scan(&rd->pscan);
291 }
292
293 /*
294 * Decide if we want to salvage this entry. We don't bother with oversized
295 * names or the dot entry.
296 */
297 STATIC int
xrep_dir_want_salvage(struct xrep_dir * rd,const char * name,int namelen,xfs_ino_t ino)298 xrep_dir_want_salvage(
299 struct xrep_dir *rd,
300 const char *name,
301 int namelen,
302 xfs_ino_t ino)
303 {
304 struct xfs_mount *mp = rd->sc->mp;
305
306 /* No pointers to ourselves or to garbage. */
307 if (ino == rd->sc->ip->i_ino)
308 return false;
309 if (!xfs_verify_dir_ino(mp, ino))
310 return false;
311
312 /* No weird looking names or dot entries. */
313 if (namelen >= MAXNAMELEN || namelen <= 0)
314 return false;
315 if (namelen == 1 && name[0] == '.')
316 return false;
317 if (!xfs_dir2_namecheck(name, namelen))
318 return false;
319
320 return true;
321 }
322
323 /*
324 * Remember that we want to create a dirent in the tempdir. These stashed
325 * actions will be replayed later.
326 */
327 STATIC int
xrep_dir_stash_createname(struct xrep_dir * rd,const struct xfs_name * name,xfs_ino_t ino)328 xrep_dir_stash_createname(
329 struct xrep_dir *rd,
330 const struct xfs_name *name,
331 xfs_ino_t ino)
332 {
333 struct xrep_dirent dirent = {
334 .action = XREP_DIRENT_ADD,
335 .ino = ino,
336 .namelen = name->len,
337 .ftype = name->type,
338 };
339 int error;
340
341 trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino);
342
343 error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
344 if (error)
345 return error;
346
347 return xfarray_append(rd->dir_entries, &dirent);
348 }
349
350 /*
351 * Remember that we want to remove a dirent from the tempdir. These stashed
352 * actions will be replayed later.
353 */
354 STATIC int
xrep_dir_stash_removename(struct xrep_dir * rd,const struct xfs_name * name,xfs_ino_t ino)355 xrep_dir_stash_removename(
356 struct xrep_dir *rd,
357 const struct xfs_name *name,
358 xfs_ino_t ino)
359 {
360 struct xrep_dirent dirent = {
361 .action = XREP_DIRENT_REMOVE,
362 .ino = ino,
363 .namelen = name->len,
364 .ftype = name->type,
365 };
366 int error;
367
368 trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino);
369
370 error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
371 if (error)
372 return error;
373
374 return xfarray_append(rd->dir_entries, &dirent);
375 }
376
377 /* Allocate an in-core record to hold entries while we rebuild the dir data. */
378 STATIC int
xrep_dir_salvage_entry(struct xrep_dir * rd,unsigned char * name,unsigned int namelen,xfs_ino_t ino)379 xrep_dir_salvage_entry(
380 struct xrep_dir *rd,
381 unsigned char *name,
382 unsigned int namelen,
383 xfs_ino_t ino)
384 {
385 struct xfs_name xname = {
386 .name = name,
387 };
388 struct xfs_scrub *sc = rd->sc;
389 struct xfs_inode *ip;
390 unsigned int i = 0;
391 int error = 0;
392
393 if (xchk_should_terminate(sc, &error))
394 return error;
395
396 /*
397 * Truncate the name to the first character that would trip namecheck.
398 * If we no longer have a name after that, ignore this entry.
399 */
400 while (i < namelen && name[i] != 0 && name[i] != '/')
401 i++;
402 if (i == 0)
403 return 0;
404 xname.len = i;
405
406 /* Ignore '..' entries; we already picked the new parent. */
407 if (xname.len == 2 && name[0] == '.' && name[1] == '.') {
408 trace_xrep_dir_salvaged_parent(sc->ip, ino);
409 return 0;
410 }
411
412 trace_xrep_dir_salvage_entry(sc->ip, &xname, ino);
413
414 /*
415 * Compute the ftype or dump the entry if we can't. We don't lock the
416 * inode because inodes can't change type while we have a reference.
417 */
418 error = xchk_iget(sc, ino, &ip);
419 if (error)
420 return 0;
421
422 /* Don't mix metadata and regular directory trees. */
423 if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(rd->sc->ip)) {
424 xchk_irele(sc, ip);
425 return 0;
426 }
427
428 xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
429 xchk_irele(sc, ip);
430
431 return xrep_dir_stash_createname(rd, &xname, ino);
432 }
433
434 /* Record a shortform directory entry for later reinsertion. */
435 STATIC int
xrep_dir_salvage_sf_entry(struct xrep_dir * rd,struct xfs_dir2_sf_hdr * sfp,struct xfs_dir2_sf_entry * sfep)436 xrep_dir_salvage_sf_entry(
437 struct xrep_dir *rd,
438 struct xfs_dir2_sf_hdr *sfp,
439 struct xfs_dir2_sf_entry *sfep)
440 {
441 xfs_ino_t ino;
442
443 ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep);
444 if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino))
445 return 0;
446
447 return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino);
448 }
449
450 /* Record a regular directory entry for later reinsertion. */
451 STATIC int
xrep_dir_salvage_data_entry(struct xrep_dir * rd,struct xfs_dir2_data_entry * dep)452 xrep_dir_salvage_data_entry(
453 struct xrep_dir *rd,
454 struct xfs_dir2_data_entry *dep)
455 {
456 xfs_ino_t ino;
457
458 ino = be64_to_cpu(dep->inumber);
459 if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino))
460 return 0;
461
462 return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino);
463 }
464
465 /* Try to recover block/data format directory entries. */
466 STATIC int
xrep_dir_recover_data(struct xrep_dir * rd,struct xfs_buf * bp)467 xrep_dir_recover_data(
468 struct xrep_dir *rd,
469 struct xfs_buf *bp)
470 {
471 struct xfs_da_geometry *geo = rd->sc->mp->m_dir_geo;
472 unsigned int offset;
473 unsigned int end;
474 int error = 0;
475
476 /*
477 * Loop over the data portion of the block.
478 * Each object is a real entry (dep) or an unused one (dup).
479 */
480 offset = geo->data_entry_offset;
481 end = min_t(unsigned int, BBTOB(bp->b_length),
482 xfs_dir3_data_end_offset(geo, bp->b_addr));
483
484 while (offset < end) {
485 struct xfs_dir2_data_unused *dup = bp->b_addr + offset;
486 struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
487
488 if (xchk_should_terminate(rd->sc, &error))
489 return error;
490
491 /* Skip unused entries. */
492 if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
493 offset += be16_to_cpu(dup->length);
494 continue;
495 }
496
497 /* Don't walk off the end of the block. */
498 offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
499 if (offset > end)
500 break;
501
502 /* Ok, let's save this entry. */
503 error = xrep_dir_salvage_data_entry(rd, dep);
504 if (error)
505 return error;
506
507 }
508
509 return 0;
510 }
511
512 /* Try to recover shortform directory entries. */
513 STATIC int
xrep_dir_recover_sf(struct xrep_dir * rd)514 xrep_dir_recover_sf(
515 struct xrep_dir *rd)
516 {
517 struct xfs_dir2_sf_hdr *hdr;
518 struct xfs_dir2_sf_entry *sfep;
519 struct xfs_dir2_sf_entry *next;
520 struct xfs_ifork *ifp;
521 xfs_ino_t ino;
522 unsigned char *end;
523 int error = 0;
524
525 ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK);
526 hdr = ifp->if_data;
527 end = (unsigned char *)ifp->if_data + ifp->if_bytes;
528
529 ino = xfs_dir2_sf_get_parent_ino(hdr);
530 trace_xrep_dir_salvaged_parent(rd->sc->ip, ino);
531
532 sfep = xfs_dir2_sf_firstentry(hdr);
533 while ((unsigned char *)sfep < end) {
534 if (xchk_should_terminate(rd->sc, &error))
535 return error;
536
537 next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep);
538 if ((unsigned char *)next > end)
539 break;
540
541 /* Ok, let's save this entry. */
542 error = xrep_dir_salvage_sf_entry(rd, hdr, sfep);
543 if (error)
544 return error;
545
546 sfep = next;
547 }
548
549 return 0;
550 }
551
552 /*
553 * Try to figure out the format of this directory from the data fork mappings
554 * and the directory size. If we can be reasonably sure of format, we can be
555 * more aggressive in salvaging directory entries. On return, @magic_guess
556 * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format"
557 * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory,
558 * and 0 if we can't tell.
559 */
560 STATIC void
xrep_dir_guess_format(struct xrep_dir * rd,__be32 * magic_guess)561 xrep_dir_guess_format(
562 struct xrep_dir *rd,
563 __be32 *magic_guess)
564 {
565 struct xfs_inode *dp = rd->sc->ip;
566 struct xfs_mount *mp = rd->sc->mp;
567 struct xfs_da_geometry *geo = mp->m_dir_geo;
568 xfs_fileoff_t last;
569 int error;
570
571 ASSERT(xfs_has_crc(mp));
572
573 *magic_guess = 0;
574
575 /*
576 * If there's a single directory block and the directory size is
577 * exactly one block, this has to be a single block format directory.
578 */
579 error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK);
580 if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize &&
581 dp->i_disk_size == geo->blksize) {
582 *magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
583 return;
584 }
585
586 /*
587 * If the last extent before the leaf offset matches the directory
588 * size and the directory size is larger than 1 block, this is a
589 * data format directory.
590 */
591 last = geo->leafblk;
592 error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK);
593 if (!error &&
594 XFS_FSB_TO_B(mp, last) > geo->blksize &&
595 XFS_FSB_TO_B(mp, last) == dp->i_disk_size) {
596 *magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
597 return;
598 }
599 }
600
601 /* Recover directory entries from a specific directory block. */
602 STATIC int
xrep_dir_recover_dirblock(struct xrep_dir * rd,__be32 magic_guess,xfs_dablk_t dabno)603 xrep_dir_recover_dirblock(
604 struct xrep_dir *rd,
605 __be32 magic_guess,
606 xfs_dablk_t dabno)
607 {
608 struct xfs_dir2_data_hdr *hdr;
609 struct xfs_buf *bp;
610 __be32 oldmagic;
611 int error;
612
613 /*
614 * Try to read buffer. We invalidate them in the next step so we don't
615 * bother to set a buffer type or ops.
616 */
617 error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno,
618 XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL);
619 if (error || !bp)
620 return error;
621
622 hdr = bp->b_addr;
623 oldmagic = hdr->magic;
624
625 trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno,
626 be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));
627
628 /*
629 * If we're sure of the block's format, proceed with the salvage
630 * operation using the specified magic number.
631 */
632 if (magic_guess) {
633 hdr->magic = magic_guess;
634 goto recover;
635 }
636
637 /*
638 * If we couldn't guess what type of directory this is, then we will
639 * only salvage entries from directory blocks that match the magic
640 * number and pass verifiers.
641 */
642 switch (hdr->magic) {
643 case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
644 case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
645 if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops))
646 goto out;
647 if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL)
648 goto out;
649 break;
650 case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
651 case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
652 if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops))
653 goto out;
654 if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL)
655 goto out;
656 break;
657 default:
658 goto out;
659 }
660
661 recover:
662 error = xrep_dir_recover_data(rd, bp);
663
664 out:
665 hdr->magic = oldmagic;
666 xfs_trans_brelse(rd->sc->tp, bp);
667 return error;
668 }
669
670 static inline void
xrep_dir_init_args(struct xrep_dir * rd,struct xfs_inode * dp,const struct xfs_name * name)671 xrep_dir_init_args(
672 struct xrep_dir *rd,
673 struct xfs_inode *dp,
674 const struct xfs_name *name)
675 {
676 memset(&rd->args, 0, sizeof(struct xfs_da_args));
677 rd->args.geo = rd->sc->mp->m_dir_geo;
678 rd->args.whichfork = XFS_DATA_FORK;
679 rd->args.owner = rd->sc->ip->i_ino;
680 rd->args.trans = rd->sc->tp;
681 rd->args.dp = dp;
682 if (!name)
683 return;
684 rd->args.name = name->name;
685 rd->args.namelen = name->len;
686 rd->args.filetype = name->type;
687 rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name);
688 }
689
690 /* Replay a stashed createname into the temporary directory. */
691 STATIC int
xrep_dir_replay_createname(struct xrep_dir * rd,const struct xfs_name * name,xfs_ino_t inum,xfs_extlen_t total)692 xrep_dir_replay_createname(
693 struct xrep_dir *rd,
694 const struct xfs_name *name,
695 xfs_ino_t inum,
696 xfs_extlen_t total)
697 {
698 struct xfs_scrub *sc = rd->sc;
699 struct xfs_inode *dp = rd->sc->tempip;
700 int error;
701
702 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
703
704 error = xfs_dir_ino_validate(sc->mp, inum);
705 if (error)
706 return error;
707
708 trace_xrep_dir_replay_createname(dp, name, inum);
709
710 xrep_dir_init_args(rd, dp, name);
711 rd->args.inumber = inum;
712 rd->args.total = total;
713 rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
714 return xfs_dir_createname_args(&rd->args);
715 }
716
717 /* Replay a stashed removename onto the temporary directory. */
718 STATIC int
xrep_dir_replay_removename(struct xrep_dir * rd,const struct xfs_name * name,xfs_extlen_t total)719 xrep_dir_replay_removename(
720 struct xrep_dir *rd,
721 const struct xfs_name *name,
722 xfs_extlen_t total)
723 {
724 struct xfs_inode *dp = rd->args.dp;
725
726 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
727
728 xrep_dir_init_args(rd, dp, name);
729 rd->args.op_flags = 0;
730 rd->args.total = total;
731
732 trace_xrep_dir_replay_removename(dp, name, 0);
733 return xfs_dir_removename_args(&rd->args);
734 }
735
736 /*
737 * Add this stashed incore directory entry to the temporary directory.
738 * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
739 * must not be in transaction context.
740 */
741 STATIC int
xrep_dir_replay_update(struct xrep_dir * rd,const struct xfs_name * xname,const struct xrep_dirent * dirent)742 xrep_dir_replay_update(
743 struct xrep_dir *rd,
744 const struct xfs_name *xname,
745 const struct xrep_dirent *dirent)
746 {
747 struct xfs_mount *mp = rd->sc->mp;
748 #ifdef DEBUG
749 xfs_ino_t ino;
750 #endif
751 uint resblks;
752 int error;
753
754 resblks = xfs_link_space_res(mp, xname->len);
755 error = xchk_trans_alloc(rd->sc, resblks);
756 if (error)
757 return error;
758
759 /* Lock the temporary directory and join it to the transaction */
760 xrep_tempfile_ilock(rd->sc);
761 xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);
762
763 switch (dirent->action) {
764 case XREP_DIRENT_ADD:
765 /*
766 * Create a replacement dirent in the temporary directory.
767 * Note that _createname doesn't check for existing entries.
768 * There shouldn't be any in the temporary dir, but we'll
769 * verify this in debug mode.
770 */
771 #ifdef DEBUG
772 error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
773 if (error != -ENOENT) {
774 ASSERT(error != -ENOENT);
775 goto out_cancel;
776 }
777 #endif
778
779 error = xrep_dir_replay_createname(rd, xname, dirent->ino,
780 resblks);
781 if (error)
782 goto out_cancel;
783
784 if (xname->type == XFS_DIR3_FT_DIR)
785 rd->subdirs++;
786 rd->dirents++;
787 break;
788 case XREP_DIRENT_REMOVE:
789 /*
790 * Remove a dirent from the temporary directory. Note that
791 * _removename doesn't check the inode target of the exist
792 * entry. There should be a perfect match in the temporary
793 * dir, but we'll verify this in debug mode.
794 */
795 #ifdef DEBUG
796 error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
797 if (error) {
798 ASSERT(error != 0);
799 goto out_cancel;
800 }
801 if (ino != dirent->ino) {
802 ASSERT(ino == dirent->ino);
803 error = -EIO;
804 goto out_cancel;
805 }
806 #endif
807
808 error = xrep_dir_replay_removename(rd, xname, resblks);
809 if (error)
810 goto out_cancel;
811
812 if (xname->type == XFS_DIR3_FT_DIR)
813 rd->subdirs--;
814 rd->dirents--;
815 break;
816 default:
817 ASSERT(0);
818 error = -EIO;
819 goto out_cancel;
820 }
821
822 /* Commit and unlock. */
823 error = xrep_trans_commit(rd->sc);
824 if (error)
825 return error;
826
827 xrep_tempfile_iunlock(rd->sc);
828 return 0;
829 out_cancel:
830 xchk_trans_cancel(rd->sc);
831 xrep_tempfile_iunlock(rd->sc);
832 return error;
833 }
834
835 /*
836 * Flush stashed incore dirent updates that have been recorded by the scanner.
837 * This is done to reduce the memory requirements of the directory rebuild,
838 * since directories can contain up to 32GB of directory data.
839 *
840 * Caller must not hold transactions or ILOCKs. Caller must hold the tempdir
841 * IOLOCK.
842 */
843 STATIC int
xrep_dir_replay_updates(struct xrep_dir * rd)844 xrep_dir_replay_updates(
845 struct xrep_dir *rd)
846 {
847 xfarray_idx_t array_cur;
848 int error;
849
850 /* Add all the salvaged dirents to the temporary directory. */
851 mutex_lock(&rd->pscan.lock);
852 foreach_xfarray_idx(rd->dir_entries, array_cur) {
853 struct xrep_dirent dirent;
854
855 error = xfarray_load(rd->dir_entries, array_cur, &dirent);
856 if (error)
857 goto out_unlock;
858
859 error = xfblob_loadname(rd->dir_names, dirent.name_cookie,
860 &rd->xname, dirent.namelen);
861 if (error)
862 goto out_unlock;
863 rd->xname.type = dirent.ftype;
864 mutex_unlock(&rd->pscan.lock);
865
866 error = xrep_dir_replay_update(rd, &rd->xname, &dirent);
867 if (error)
868 return error;
869 mutex_lock(&rd->pscan.lock);
870 }
871
872 /* Empty out both arrays now that we've added the entries. */
873 xfarray_truncate(rd->dir_entries);
874 xfblob_truncate(rd->dir_names);
875 mutex_unlock(&rd->pscan.lock);
876 return 0;
877 out_unlock:
878 mutex_unlock(&rd->pscan.lock);
879 return error;
880 }
881
882 /*
883 * Periodically flush stashed directory entries to the temporary dir. This
884 * is done to reduce the memory requirements of the directory rebuild, since
885 * directories can contain up to 32GB of directory data.
886 */
887 STATIC int
xrep_dir_flush_stashed(struct xrep_dir * rd)888 xrep_dir_flush_stashed(
889 struct xrep_dir *rd)
890 {
891 int error;
892
893 /*
894 * Entering this function, the scrub context has a reference to the
895 * inode being repaired, the temporary file, and a scrub transaction
896 * that we use during dirent salvaging to avoid livelocking if there
897 * are cycles in the directory structures. We hold ILOCK_EXCL on both
898 * the inode being repaired and the temporary file, though they are
899 * not ijoined to the scrub transaction.
900 *
901 * To constrain kernel memory use, we occasionally write salvaged
902 * dirents from the xfarray and xfblob structures into the temporary
903 * directory in preparation for exchanging the directory structures at
904 * the end. Updating the temporary file requires a transaction, so we
905 * commit the scrub transaction and drop the two ILOCKs so that
906 * we can allocate whatever transaction we want.
907 *
908 * We still hold IOLOCK_EXCL on the inode being repaired, which
909 * prevents anyone from accessing the damaged directory data while we
910 * repair it.
911 */
912 error = xrep_trans_commit(rd->sc);
913 if (error)
914 return error;
915 xchk_iunlock(rd->sc, XFS_ILOCK_EXCL);
916
917 /*
918 * Take the IOLOCK of the temporary file while we modify dirents. This
919 * isn't strictly required because the temporary file is never revealed
920 * to userspace, but we follow the same locking rules. We still hold
921 * sc->ip's IOLOCK.
922 */
923 error = xrep_tempfile_iolock_polled(rd->sc);
924 if (error)
925 return error;
926
927 /* Write to the tempdir all the updates that we've stashed. */
928 error = xrep_dir_replay_updates(rd);
929 xrep_tempfile_iounlock(rd->sc);
930 if (error)
931 return error;
932
933 /*
934 * Recreate the salvage transaction and relock the dir we're salvaging.
935 */
936 error = xchk_trans_alloc(rd->sc, 0);
937 if (error)
938 return error;
939 xchk_ilock(rd->sc, XFS_ILOCK_EXCL);
940 return 0;
941 }
942
943 /* Decide if we've stashed too much dirent data in memory. */
944 static inline bool
xrep_dir_want_flush_stashed(struct xrep_dir * rd)945 xrep_dir_want_flush_stashed(
946 struct xrep_dir *rd)
947 {
948 unsigned long long bytes;
949
950 bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names);
951 return bytes > XREP_DIR_MAX_STASH_BYTES;
952 }
953
954 /* Extract as many directory entries as we can. */
955 STATIC int
xrep_dir_recover(struct xrep_dir * rd)956 xrep_dir_recover(
957 struct xrep_dir *rd)
958 {
959 struct xfs_bmbt_irec got;
960 struct xfs_scrub *sc = rd->sc;
961 struct xfs_da_geometry *geo = sc->mp->m_dir_geo;
962 xfs_fileoff_t offset;
963 xfs_dablk_t dabno;
964 __be32 magic_guess;
965 int nmap;
966 int error;
967
968 xrep_dir_guess_format(rd, &magic_guess);
969
970 /* Iterate each directory data block in the data fork. */
971 for (offset = 0;
972 offset < geo->leafblk;
973 offset = got.br_startoff + got.br_blockcount) {
974 nmap = 1;
975 error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset,
976 &got, &nmap, 0);
977 if (error)
978 return error;
979 if (nmap != 1)
980 return -EFSCORRUPTED;
981 if (!xfs_bmap_is_written_extent(&got))
982 continue;
983
984 for (dabno = round_up(got.br_startoff, geo->fsbcount);
985 dabno < got.br_startoff + got.br_blockcount;
986 dabno += geo->fsbcount) {
987 if (xchk_should_terminate(rd->sc, &error))
988 return error;
989
990 error = xrep_dir_recover_dirblock(rd,
991 magic_guess, dabno);
992 if (error)
993 return error;
994
995 /* Flush dirents to constrain memory usage. */
996 if (xrep_dir_want_flush_stashed(rd)) {
997 error = xrep_dir_flush_stashed(rd);
998 if (error)
999 return error;
1000 }
1001 }
1002 }
1003
1004 return 0;
1005 }
1006
1007 /*
1008 * Find all the directory entries for this inode by scraping them out of the
1009 * directory leaf blocks by hand, and flushing them into the temp dir.
1010 */
1011 STATIC int
xrep_dir_find_entries(struct xrep_dir * rd)1012 xrep_dir_find_entries(
1013 struct xrep_dir *rd)
1014 {
1015 struct xfs_inode *dp = rd->sc->ip;
1016 int error;
1017
1018 /*
1019 * Salvage directory entries from the old directory, and write them to
1020 * the temporary directory.
1021 */
1022 if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
1023 error = xrep_dir_recover_sf(rd);
1024 } else {
1025 error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK);
1026 if (error)
1027 return error;
1028
1029 error = xrep_dir_recover(rd);
1030 }
1031 if (error)
1032 return error;
1033
1034 return xrep_dir_flush_stashed(rd);
1035 }
1036
1037 /* Scan all files in the filesystem for dirents. */
1038 STATIC int
xrep_dir_salvage_entries(struct xrep_dir * rd)1039 xrep_dir_salvage_entries(
1040 struct xrep_dir *rd)
1041 {
1042 struct xfs_scrub *sc = rd->sc;
1043 int error;
1044
1045 /*
1046 * Drop the ILOCK on this directory so that we can scan for this
1047 * directory's parent. Figure out who is going to be the parent of
1048 * this directory, then retake the ILOCK so that we can salvage
1049 * directory entries.
1050 */
1051 xchk_iunlock(sc, XFS_ILOCK_EXCL);
1052 error = xrep_dir_find_parent(rd);
1053 xchk_ilock(sc, XFS_ILOCK_EXCL);
1054 if (error)
1055 return error;
1056
1057 /*
1058 * Collect directory entries by parsing raw leaf blocks to salvage
1059 * whatever we can. When we're done, free the staging memory before
1060 * exchanging the directories to reduce memory usage.
1061 */
1062 error = xrep_dir_find_entries(rd);
1063 if (error)
1064 return error;
1065
1066 /*
1067 * Cancel the repair transaction and drop the ILOCK so that we can
1068 * (later) use the atomic mapping exchange functions to compute the
1069 * correct block reservations and re-lock the inodes.
1070 *
1071 * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory
1072 * modifications, but there's nothing to prevent userspace from reading
1073 * the directory until we're ready for the exchange operation. Reads
1074 * will return -EIO without shutting down the fs, so we're ok with
1075 * that.
1076 *
1077 * The VFS can change dotdot on us, but the findparent scan will keep
1078 * our incore parent inode up to date. See the note on locking issues
1079 * for more details.
1080 */
1081 error = xrep_trans_commit(sc);
1082 if (error)
1083 return error;
1084
1085 xchk_iunlock(sc, XFS_ILOCK_EXCL);
1086 return 0;
1087 }
1088
1089
1090 /*
1091 * Examine a parent pointer of a file. If it leads us back to the directory
1092 * that we're rebuilding, create an incore dirent from the parent pointer and
1093 * stash it.
1094 */
1095 STATIC int
xrep_dir_scan_pptr(struct xfs_scrub * sc,struct xfs_inode * ip,unsigned int attr_flags,const unsigned char * name,unsigned int namelen,const void * value,unsigned int valuelen,void * priv)1096 xrep_dir_scan_pptr(
1097 struct xfs_scrub *sc,
1098 struct xfs_inode *ip,
1099 unsigned int attr_flags,
1100 const unsigned char *name,
1101 unsigned int namelen,
1102 const void *value,
1103 unsigned int valuelen,
1104 void *priv)
1105 {
1106 struct xfs_name xname = {
1107 .name = name,
1108 .len = namelen,
1109 .type = xfs_mode_to_ftype(VFS_I(ip)->i_mode),
1110 };
1111 xfs_ino_t parent_ino;
1112 uint32_t parent_gen;
1113 struct xrep_dir *rd = priv;
1114 int error;
1115
1116 if (!(attr_flags & XFS_ATTR_PARENT))
1117 return 0;
1118
1119 /*
1120 * Ignore parent pointers that point back to a different dir, list the
1121 * wrong generation number, or are invalid.
1122 */
1123 error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
1124 valuelen, &parent_ino, &parent_gen);
1125 if (error)
1126 return error;
1127
1128 if (parent_ino != sc->ip->i_ino ||
1129 parent_gen != VFS_I(sc->ip)->i_generation)
1130 return 0;
1131
1132 mutex_lock(&rd->pscan.lock);
1133 error = xrep_dir_stash_createname(rd, &xname, ip->i_ino);
1134 mutex_unlock(&rd->pscan.lock);
1135 return error;
1136 }
1137
1138 /*
1139 * If this child dirent points to the directory being repaired, remember that
1140 * fact so that we can reset the dotdot entry if necessary.
1141 */
1142 STATIC int
xrep_dir_scan_dirent(struct xfs_scrub * sc,struct xfs_inode * dp,xfs_dir2_dataptr_t dapos,const struct xfs_name * name,xfs_ino_t ino,void * priv)1143 xrep_dir_scan_dirent(
1144 struct xfs_scrub *sc,
1145 struct xfs_inode *dp,
1146 xfs_dir2_dataptr_t dapos,
1147 const struct xfs_name *name,
1148 xfs_ino_t ino,
1149 void *priv)
1150 {
1151 struct xrep_dir *rd = priv;
1152
1153 /* Dirent doesn't point to this directory. */
1154 if (ino != rd->sc->ip->i_ino)
1155 return 0;
1156
1157 /* Ignore garbage inum. */
1158 if (!xfs_verify_dir_ino(rd->sc->mp, ino))
1159 return 0;
1160
1161 /* No weird looking names. */
1162 if (name->len >= MAXNAMELEN || name->len <= 0)
1163 return 0;
1164
1165 /* Don't pick up dot or dotdot entries; we only want child dirents. */
1166 if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
1167 xfs_dir2_samename(name, &xfs_name_dot))
1168 return 0;
1169
1170 trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot,
1171 dp->i_ino);
1172
1173 xrep_findparent_scan_found(&rd->pscan, dp->i_ino);
1174 return 0;
1175 }
1176
1177 /*
1178 * Decide if we want to look for child dirents or parent pointers in this file.
1179 * Skip the dir being repaired and any files being used to stage repairs.
1180 */
1181 static inline bool
xrep_dir_want_scan(struct xrep_dir * rd,const struct xfs_inode * ip)1182 xrep_dir_want_scan(
1183 struct xrep_dir *rd,
1184 const struct xfs_inode *ip)
1185 {
1186 return ip != rd->sc->ip && !xrep_is_tempfile(ip);
1187 }
1188
1189 /*
1190 * Take ILOCK on a file that we want to scan.
1191 *
1192 * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or
1193 * has an unloaded attr bmbt. Otherwise, take ILOCK_SHARED.
1194 */
1195 static inline unsigned int
xrep_dir_scan_ilock(struct xrep_dir * rd,struct xfs_inode * ip)1196 xrep_dir_scan_ilock(
1197 struct xrep_dir *rd,
1198 struct xfs_inode *ip)
1199 {
1200 uint lock_mode = XFS_ILOCK_SHARED;
1201
1202 /* Need to take the shared ILOCK to advance the iscan cursor. */
1203 if (!xrep_dir_want_scan(rd, ip))
1204 goto lock;
1205
1206 if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) {
1207 lock_mode = XFS_ILOCK_EXCL;
1208 goto lock;
1209 }
1210
1211 if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
1212 lock_mode = XFS_ILOCK_EXCL;
1213
1214 lock:
1215 xfs_ilock(ip, lock_mode);
1216 return lock_mode;
1217 }
1218
1219 /*
1220 * Scan this file for relevant child dirents or parent pointers that point to
1221 * the directory we're rebuilding.
1222 */
1223 STATIC int
xrep_dir_scan_file(struct xrep_dir * rd,struct xfs_inode * ip)1224 xrep_dir_scan_file(
1225 struct xrep_dir *rd,
1226 struct xfs_inode *ip)
1227 {
1228 unsigned int lock_mode;
1229 int error = 0;
1230
1231 lock_mode = xrep_dir_scan_ilock(rd, ip);
1232
1233 if (!xrep_dir_want_scan(rd, ip))
1234 goto scan_done;
1235
1236 /*
1237 * If the extended attributes look as though they has been zapped by
1238 * the inode record repair code, we cannot scan for parent pointers.
1239 */
1240 if (xchk_pptr_looks_zapped(ip)) {
1241 error = -EBUSY;
1242 goto scan_done;
1243 }
1244
1245 error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd);
1246 if (error)
1247 goto scan_done;
1248
1249 if (S_ISDIR(VFS_I(ip)->i_mode)) {
1250 /*
1251 * If the directory looks as though it has been zapped by the
1252 * inode record repair code, we cannot scan for child dirents.
1253 */
1254 if (xchk_dir_looks_zapped(ip)) {
1255 error = -EBUSY;
1256 goto scan_done;
1257 }
1258
1259 error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd);
1260 if (error)
1261 goto scan_done;
1262 }
1263
1264 scan_done:
1265 xchk_iscan_mark_visited(&rd->pscan.iscan, ip);
1266 xfs_iunlock(ip, lock_mode);
1267 return error;
1268 }
1269
1270 /*
1271 * Scan all files in the filesystem for parent pointers that we can turn into
1272 * replacement dirents, and a dirent that we can use to set the dotdot pointer.
1273 */
1274 STATIC int
xrep_dir_scan_dirtree(struct xrep_dir * rd)1275 xrep_dir_scan_dirtree(
1276 struct xrep_dir *rd)
1277 {
1278 struct xfs_scrub *sc = rd->sc;
1279 struct xfs_inode *ip;
1280 int error;
1281
1282 /* Roots of directory trees are their own parents. */
1283 if (xchk_inode_is_dirtree_root(sc->ip))
1284 xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino);
1285
1286 /*
1287 * Filesystem scans are time consuming. Drop the directory ILOCK and
1288 * all other resources for the duration of the scan and hope for the
1289 * best. The live update hooks will keep our scan information up to
1290 * date even though we've dropped the locks.
1291 */
1292 xchk_trans_cancel(sc);
1293 if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
1294 xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
1295 XFS_ILOCK_EXCL));
1296 xchk_trans_alloc_empty(sc);
1297
1298 while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) {
1299 bool flush;
1300
1301 error = xrep_dir_scan_file(rd, ip);
1302 xchk_irele(sc, ip);
1303 if (error)
1304 break;
1305
1306 /* Flush stashed dirent updates to constrain memory usage. */
1307 mutex_lock(&rd->pscan.lock);
1308 flush = xrep_dir_want_flush_stashed(rd);
1309 mutex_unlock(&rd->pscan.lock);
1310 if (flush) {
1311 xchk_trans_cancel(sc);
1312
1313 error = xrep_tempfile_iolock_polled(sc);
1314 if (error)
1315 break;
1316
1317 error = xrep_dir_replay_updates(rd);
1318 xrep_tempfile_iounlock(sc);
1319 if (error)
1320 break;
1321
1322 xchk_trans_alloc_empty(sc);
1323 }
1324
1325 if (xchk_should_terminate(sc, &error))
1326 break;
1327 }
1328 xchk_iscan_iter_finish(&rd->pscan.iscan);
1329 if (error) {
1330 /*
1331 * If we couldn't grab an inode that was busy with a state
1332 * change, change the error code so that we exit to userspace
1333 * as quickly as possible.
1334 */
1335 if (error == -EBUSY)
1336 return -ECANCELED;
1337 return error;
1338 }
1339
1340 /*
1341 * Cancel the empty transaction so that we can (later) use the atomic
1342 * file mapping exchange functions to lock files and commit the new
1343 * directory.
1344 */
1345 xchk_trans_cancel(rd->sc);
1346 return 0;
1347 }
1348
1349 /*
1350 * Capture dirent updates being made by other threads which are relevant to the
1351 * directory being repaired.
1352 */
1353 STATIC int
xrep_dir_live_update(struct notifier_block * nb,unsigned long action,void * data)1354 xrep_dir_live_update(
1355 struct notifier_block *nb,
1356 unsigned long action,
1357 void *data)
1358 {
1359 struct xfs_dir_update_params *p = data;
1360 struct xrep_dir *rd;
1361 struct xfs_scrub *sc;
1362 int error = 0;
1363
1364 rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb);
1365 sc = rd->sc;
1366
1367 /*
1368 * This thread updated a child dirent in the directory that we're
1369 * rebuilding. Stash the update for replay against the temporary
1370 * directory.
1371 */
1372 if (p->dp->i_ino == sc->ip->i_ino &&
1373 xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) {
1374 mutex_lock(&rd->pscan.lock);
1375 if (p->delta > 0)
1376 error = xrep_dir_stash_createname(rd, p->name,
1377 p->ip->i_ino);
1378 else
1379 error = xrep_dir_stash_removename(rd, p->name,
1380 p->ip->i_ino);
1381 mutex_unlock(&rd->pscan.lock);
1382 if (error)
1383 goto out_abort;
1384 }
1385
1386 /*
1387 * This thread updated another directory's child dirent that points to
1388 * the directory that we're rebuilding, so remember the new dotdot
1389 * target.
1390 */
1391 if (p->ip->i_ino == sc->ip->i_ino &&
1392 xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) {
1393 if (p->delta > 0) {
1394 trace_xrep_dir_stash_createname(sc->tempip,
1395 &xfs_name_dotdot,
1396 p->dp->i_ino);
1397
1398 xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino);
1399 } else {
1400 trace_xrep_dir_stash_removename(sc->tempip,
1401 &xfs_name_dotdot,
1402 rd->pscan.parent_ino);
1403
1404 xrep_findparent_scan_found(&rd->pscan, NULLFSINO);
1405 }
1406 }
1407
1408 return NOTIFY_DONE;
1409 out_abort:
1410 xchk_iscan_abort(&rd->pscan.iscan);
1411 return NOTIFY_DONE;
1412 }
1413
1414 /*
1415 * Free all the directory blocks and reset the data fork. The caller must
1416 * join the inode to the transaction. This function returns with the inode
1417 * joined to a clean scrub transaction.
1418 */
1419 STATIC int
xrep_dir_reset_fork(struct xrep_dir * rd,xfs_ino_t parent_ino)1420 xrep_dir_reset_fork(
1421 struct xrep_dir *rd,
1422 xfs_ino_t parent_ino)
1423 {
1424 struct xfs_scrub *sc = rd->sc;
1425 struct xfs_ifork *ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
1426 int error;
1427
1428 /* Unmap all the directory buffers. */
1429 if (xfs_ifork_has_extents(ifp)) {
1430 error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
1431 if (error)
1432 return error;
1433 }
1434
1435 trace_xrep_dir_reset_fork(sc->tempip, parent_ino);
1436
1437 /* Reset the data fork to an empty data fork. */
1438 xfs_idestroy_fork(ifp);
1439 ifp->if_bytes = 0;
1440 sc->tempip->i_disk_size = 0;
1441
1442 /* Reinitialize the short form directory. */
1443 xrep_dir_init_args(rd, sc->tempip, NULL);
1444 return xfs_dir2_sf_create(&rd->args, parent_ino);
1445 }
1446
1447 /*
1448 * Prepare both inodes' directory forks for exchanging mappings. Promote the
1449 * tempfile from short format to leaf format, and if the file being repaired
1450 * has a short format data fork, turn it into an empty extent list.
1451 */
1452 STATIC int
xrep_dir_swap_prep(struct xfs_scrub * sc,bool temp_local,bool ip_local)1453 xrep_dir_swap_prep(
1454 struct xfs_scrub *sc,
1455 bool temp_local,
1456 bool ip_local)
1457 {
1458 int error;
1459
1460 /*
1461 * If the tempfile's directory is in shortform format, convert that to
1462 * a single leaf extent so that we can use the atomic mapping exchange.
1463 */
1464 if (temp_local) {
1465 struct xfs_da_args args = {
1466 .dp = sc->tempip,
1467 .geo = sc->mp->m_dir_geo,
1468 .whichfork = XFS_DATA_FORK,
1469 .trans = sc->tp,
1470 .total = 1,
1471 .owner = sc->ip->i_ino,
1472 };
1473
1474 error = xfs_dir2_sf_to_block(&args);
1475 if (error)
1476 return error;
1477
1478 /*
1479 * Roll the deferred log items to get us back to a clean
1480 * transaction.
1481 */
1482 error = xfs_defer_finish(&sc->tp);
1483 if (error)
1484 return error;
1485 }
1486
1487 /*
1488 * If the file being repaired had a shortform data fork, convert that
1489 * to an empty extent list in preparation for the atomic mapping
1490 * exchange.
1491 */
1492 if (ip_local) {
1493 struct xfs_ifork *ifp;
1494
1495 ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1496 xfs_idestroy_fork(ifp);
1497 ifp->if_format = XFS_DINODE_FMT_EXTENTS;
1498 ifp->if_nextents = 0;
1499 ifp->if_bytes = 0;
1500 ifp->if_data = NULL;
1501 ifp->if_height = 0;
1502
1503 xfs_trans_log_inode(sc->tp, sc->ip,
1504 XFS_ILOG_CORE | XFS_ILOG_DDATA);
1505 }
1506
1507 return 0;
1508 }
1509
1510 /*
1511 * Replace the inode number of a directory entry.
1512 */
1513 static int
xrep_dir_replace(struct xrep_dir * rd,struct xfs_inode * dp,const struct xfs_name * name,xfs_ino_t inum,xfs_extlen_t total)1514 xrep_dir_replace(
1515 struct xrep_dir *rd,
1516 struct xfs_inode *dp,
1517 const struct xfs_name *name,
1518 xfs_ino_t inum,
1519 xfs_extlen_t total)
1520 {
1521 struct xfs_scrub *sc = rd->sc;
1522 int error;
1523
1524 ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
1525
1526 error = xfs_dir_ino_validate(sc->mp, inum);
1527 if (error)
1528 return error;
1529
1530 xrep_dir_init_args(rd, dp, name);
1531 rd->args.inumber = inum;
1532 rd->args.total = total;
1533 return xfs_dir_replace_args(&rd->args);
1534 }
1535
1536 /*
1537 * Reset the link count of this directory and adjust the unlinked list pointers
1538 * as needed.
1539 */
1540 STATIC int
xrep_dir_set_nlink(struct xrep_dir * rd)1541 xrep_dir_set_nlink(
1542 struct xrep_dir *rd)
1543 {
1544 struct xfs_scrub *sc = rd->sc;
1545 struct xfs_inode *dp = sc->ip;
1546 struct xfs_perag *pag;
1547 unsigned int new_nlink = min_t(unsigned long long,
1548 rd->subdirs + 2,
1549 XFS_NLINK_PINNED);
1550 int error;
1551
1552 /*
1553 * The directory is not on the incore unlinked list, which means that
1554 * it needs to be reachable via the directory tree. Update the nlink
1555 * with our observed link count. If the directory has no parent, it
1556 * will be moved to the orphanage.
1557 */
1558 if (!xfs_inode_on_unlinked_list(dp))
1559 goto reset_nlink;
1560
1561 /*
1562 * The directory is on the unlinked list and we did not find any
1563 * dirents. Set the link count to zero and let the directory
1564 * inactivate when the last reference drops.
1565 */
1566 if (rd->dirents == 0) {
1567 rd->needs_adoption = false;
1568 new_nlink = 0;
1569 goto reset_nlink;
1570 }
1571
1572 /*
1573 * The directory is on the unlinked list and we found dirents. This
1574 * directory needs to be reachable via the directory tree. Remove the
1575 * dir from the unlinked list and update nlink with the observed link
1576 * count. If the directory has no parent, it will be moved to the
1577 * orphanage.
1578 */
1579 pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino));
1580 if (!pag) {
1581 ASSERT(0);
1582 return -EFSCORRUPTED;
1583 }
1584
1585 error = xfs_iunlink_remove(sc->tp, pag, dp);
1586 xfs_perag_put(pag);
1587 if (error)
1588 return error;
1589
1590 reset_nlink:
1591 if (VFS_I(dp)->i_nlink != new_nlink)
1592 set_nlink(VFS_I(dp), new_nlink);
1593 return 0;
1594 }
1595
1596 /*
1597 * Finish replaying stashed dirent updates, allocate a transaction for
1598 * exchanging data fork mappings, and take the ILOCKs of both directories
1599 * before we commit the new directory structure.
1600 */
1601 STATIC int
xrep_dir_finalize_tempdir(struct xrep_dir * rd)1602 xrep_dir_finalize_tempdir(
1603 struct xrep_dir *rd)
1604 {
1605 struct xfs_scrub *sc = rd->sc;
1606 int error;
1607
1608 if (!xfs_has_parent(sc->mp))
1609 return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1610
1611 /*
1612 * Repair relies on the ILOCK to quiesce all possible dirent updates.
1613 * Replay all queued dirent updates into the tempdir before exchanging
1614 * the contents, even if that means dropping the ILOCKs and the
1615 * transaction.
1616 */
1617 do {
1618 error = xrep_dir_replay_updates(rd);
1619 if (error)
1620 return error;
1621
1622 error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1623 if (error)
1624 return error;
1625
1626 if (xfarray_length(rd->dir_entries) == 0)
1627 break;
1628
1629 xchk_trans_cancel(sc);
1630 xrep_tempfile_iunlock_both(sc);
1631 } while (!xchk_should_terminate(sc, &error));
1632 return error;
1633 }
1634
1635 /* Exchange the temporary directory's data fork with the one being repaired. */
1636 STATIC int
xrep_dir_swap(struct xrep_dir * rd)1637 xrep_dir_swap(
1638 struct xrep_dir *rd)
1639 {
1640 struct xfs_scrub *sc = rd->sc;
1641 xfs_ino_t ino;
1642 bool ip_local, temp_local;
1643 int error = 0;
1644
1645 /*
1646 * If we never found the parent for this directory, temporarily assign
1647 * the root dir as the parent; we'll move this to the orphanage after
1648 * exchanging the dir contents. We hold the ILOCK of the dir being
1649 * repaired, so we're not worried about racy updates of dotdot.
1650 */
1651 ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
1652 if (rd->pscan.parent_ino == NULLFSINO) {
1653 rd->needs_adoption = true;
1654 rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino;
1655 }
1656
1657 /*
1658 * Reset the temporary directory's '..' entry to point to the parent
1659 * that we found. The dirent replace code asserts if the dirent
1660 * already points at the new inumber, so we look it up here.
1661 *
1662 * It's also possible that this replacement could also expand a sf
1663 * tempdir into block format.
1664 */
1665 error = xchk_dir_lookup(sc, rd->sc->tempip, &xfs_name_dotdot, &ino);
1666 if (error)
1667 return error;
1668
1669 if (rd->pscan.parent_ino != ino) {
1670 error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
1671 rd->pscan.parent_ino, rd->tx.req.resblks);
1672 if (error)
1673 return error;
1674 }
1675
1676 /*
1677 * Changing the dot and dotdot entries could have changed the shape of
1678 * the directory, so we recompute these.
1679 */
1680 ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1681 temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1682
1683 /*
1684 * If the both files have a local format data fork and the rebuilt
1685 * directory data would fit in the repaired file's data fork, copy
1686 * the contents from the tempfile and update the directory link count.
1687 * We're done now.
1688 */
1689 if (ip_local && temp_local &&
1690 sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
1691 xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
1692 return xrep_dir_set_nlink(rd);
1693 }
1694
1695 /*
1696 * Clean the transaction before we start working on exchanging
1697 * directory contents.
1698 */
1699 error = xrep_tempfile_roll_trans(rd->sc);
1700 if (error)
1701 return error;
1702
1703 /* Otherwise, make sure both data forks are in block-mapping mode. */
1704 error = xrep_dir_swap_prep(sc, temp_local, ip_local);
1705 if (error)
1706 return error;
1707
1708 /*
1709 * Set nlink of the directory in the same transaction sequence that
1710 * (atomically) commits the new directory data.
1711 */
1712 error = xrep_dir_set_nlink(rd);
1713 if (error)
1714 return error;
1715
1716 return xrep_tempexch_contents(sc, &rd->tx);
1717 }
1718
1719 /*
1720 * Exchange the new directory contents (which we created in the tempfile) with
1721 * the directory being repaired.
1722 */
1723 STATIC int
xrep_dir_rebuild_tree(struct xrep_dir * rd)1724 xrep_dir_rebuild_tree(
1725 struct xrep_dir *rd)
1726 {
1727 struct xfs_scrub *sc = rd->sc;
1728 int error;
1729
1730 trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino);
1731
1732 /*
1733 * Take the IOLOCK on the temporary file so that we can run dir
1734 * operations with the same locks held as we would for a normal file.
1735 * We still hold sc->ip's IOLOCK.
1736 */
1737 error = xrep_tempfile_iolock_polled(rd->sc);
1738 if (error)
1739 return error;
1740
1741 /*
1742 * Allocate transaction, lock inodes, and make sure that we've replayed
1743 * all the stashed dirent updates to the tempdir. After this point,
1744 * we're ready to exchange data fork mappings.
1745 */
1746 error = xrep_dir_finalize_tempdir(rd);
1747 if (error)
1748 return error;
1749
1750 if (xchk_iscan_aborted(&rd->pscan.iscan))
1751 return -ECANCELED;
1752
1753 /*
1754 * Exchange the tempdir's data fork with the file being repaired. This
1755 * recreates the transaction and re-takes the ILOCK in the scrub
1756 * context.
1757 */
1758 error = xrep_dir_swap(rd);
1759 if (error)
1760 return error;
1761
1762 /*
1763 * Release the old directory blocks and reset the data fork of the temp
1764 * directory to an empty shortform directory because inactivation does
1765 * nothing for directories.
1766 */
1767 error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino);
1768 if (error)
1769 return error;
1770
1771 /*
1772 * Roll to get a transaction without any inodes joined to it. Then we
1773 * can drop the tempfile's ILOCK and IOLOCK before doing more work on
1774 * the scrub target directory.
1775 */
1776 error = xfs_trans_roll(&sc->tp);
1777 if (error)
1778 return error;
1779
1780 xrep_tempfile_iunlock(sc);
1781 xrep_tempfile_iounlock(sc);
1782 return 0;
1783 }
1784
1785 /* Set up the filesystem scan so we can regenerate directory entries. */
1786 STATIC int
xrep_dir_setup_scan(struct xrep_dir * rd)1787 xrep_dir_setup_scan(
1788 struct xrep_dir *rd)
1789 {
1790 struct xfs_scrub *sc = rd->sc;
1791 int error;
1792
1793 /* Set up some staging memory for salvaging dirents. */
1794 error = xfarray_create("directory entries", 0,
1795 sizeof(struct xrep_dirent), &rd->dir_entries);
1796 if (error)
1797 return error;
1798
1799 error = xfblob_create("directory entry names", &rd->dir_names);
1800 if (error)
1801 goto out_xfarray;
1802
1803 if (xfs_has_parent(sc->mp))
1804 error = __xrep_findparent_scan_start(sc, &rd->pscan,
1805 xrep_dir_live_update);
1806 else
1807 error = xrep_findparent_scan_start(sc, &rd->pscan);
1808 if (error)
1809 goto out_xfblob;
1810
1811 return 0;
1812
1813 out_xfblob:
1814 xfblob_destroy(rd->dir_names);
1815 rd->dir_names = NULL;
1816 out_xfarray:
1817 xfarray_destroy(rd->dir_entries);
1818 rd->dir_entries = NULL;
1819 return error;
1820 }
1821
/*
 * Move the current file to the orphanage.
 *
 * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks.  Upon
 * successful return, the scrub transaction will have enough extra reservation
 * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
 * orphanage; and both inodes will be ijoined.
 *
 * NOTE(review): the body looks up dotdot, commits the scrub transaction and
 * then drops XFS_ILOCK_EXCL on @sc->ip, and it releases the orphanage's ILOCK
 * and IOLOCK before returning.  That suggests callers actually arrive holding
 * the ILOCK of @sc->ip with a transaction, and leave holding only the locks
 * of @sc->ip -- confirm against the adoption helpers and update the contract
 * above if so.
 *
 * Returns 0 (whether or not the directory actually had to be moved) or a
 * negative errno.
 */
STATIC int
xrep_dir_move_to_orphanage(
	struct xrep_dir		*rd)
{
	struct xfs_scrub	*sc = rd->sc;
	xfs_ino_t		orig_parent, new_parent;
	int			error;

	/*
	 * We are about to drop the ILOCK on sc->ip to lock the orphanage and
	 * prepare for the adoption.  Therefore, look up the old dotdot entry
	 * for sc->ip so that we can compare it after we re-lock sc->ip.
	 */
	error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent);
	if (error)
		return error;

	/*
	 * Drop the ILOCK on the scrub target and commit the transaction.
	 * Adoption computes its own resource requirements and gathers the
	 * necessary components.
	 */
	error = xrep_trans_commit(sc);
	if (error)
		return error;
	xchk_iunlock(sc, XFS_ILOCK_EXCL);

	/*
	 * If we can take the orphanage's iolock then we're ready to move.
	 * Otherwise release every lock that the scrub context still records
	 * for sc->ip and take the IOLOCKs of both files via the two-inode
	 * helper.
	 */
	if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
		xchk_iunlock(sc, sc->ilock_flags);
		error = xrep_orphanage_iolock_two(sc);
		if (error)
			return error;
	}

	/* Grab transaction and ILOCK the two files. */
	error = xrep_adoption_trans_alloc(sc, &rd->adoption);
	if (error)
		return error;

	/* Pick the name that the directory would get in the orphanage. */
	error = xrep_adoption_compute_name(&rd->adoption, &rd->xname);
	if (error)
		return error;

	/*
	 * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
	 * entry again.  If the parent changed or the child was unlinked while
	 * the child directory was unlocked, we don't need to move the child to
	 * the orphanage after all.
	 */
	error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent);
	if (error)
		return error;

	/*
	 * Attach to the orphanage if we still have a linked directory and it
	 * hasn't been moved.
	 */
	if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
		error = xrep_adoption_move(&rd->adoption);
		if (error)
			return error;
	}

	/*
	 * Launder the scrub transaction so we can drop the orphanage ILOCK
	 * and IOLOCK.  Return holding the scrub target's ILOCK and IOLOCK.
	 */
	error = xrep_adoption_trans_roll(&rd->adoption);
	if (error)
		return error;

	xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
	xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
	return 0;
}
1906
1907 /*
1908 * Repair the directory metadata.
1909 *
1910 * XXX: Directory entry buffers can be multiple fsblocks in size. The buffer
1911 * cache in XFS can't handle aliased multiblock buffers, so this might
1912 * misbehave if the directory blocks are crosslinked with other filesystem
1913 * metadata.
1914 *
1915 * XXX: Is it necessary to check the dcache for this directory to make sure
1916 * that we always recreate every cached entry?
1917 */
1918 int
xrep_directory(struct xfs_scrub * sc)1919 xrep_directory(
1920 struct xfs_scrub *sc)
1921 {
1922 struct xrep_dir *rd = sc->buf;
1923 int error;
1924
1925 /* The rmapbt is required to reap the old data fork. */
1926 if (!xfs_has_rmapbt(sc->mp))
1927 return -EOPNOTSUPP;
1928 /* We require atomic file exchange range to rebuild anything. */
1929 if (!xfs_has_exchange_range(sc->mp))
1930 return -EOPNOTSUPP;
1931
1932 error = xrep_dir_setup_scan(rd);
1933 if (error)
1934 return error;
1935
1936 if (xfs_has_parent(sc->mp))
1937 error = xrep_dir_scan_dirtree(rd);
1938 else
1939 error = xrep_dir_salvage_entries(rd);
1940 if (error)
1941 goto out_teardown;
1942
1943 /* Last chance to abort before we start committing fixes. */
1944 if (xchk_should_terminate(sc, &error))
1945 goto out_teardown;
1946
1947 error = xrep_dir_rebuild_tree(rd);
1948 if (error)
1949 goto out_teardown;
1950
1951 if (rd->needs_adoption) {
1952 if (!xrep_orphanage_can_adopt(rd->sc))
1953 error = -EFSCORRUPTED;
1954 else
1955 error = xrep_dir_move_to_orphanage(rd);
1956 if (error)
1957 goto out_teardown;
1958 }
1959
1960 out_teardown:
1961 xrep_dir_teardown(sc);
1962 return error;
1963 }
1964