// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2021-2024 Oracle. All Rights Reserved. * Author: Darrick J. Wong */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_log_format.h" #include "xfs_trans.h" #include "xfs_inode.h" #include "xfs_ialloc.h" #include "xfs_quota.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" #include "xfs_trans_space.h" #include "xfs_dir2.h" #include "xfs_exchrange.h" #include "xfs_exchmaps.h" #include "xfs_defer.h" #include "xfs_symlink_remote.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" #include "scrub/trace.h" #include "scrub/tempfile.h" #include "scrub/tempexch.h" #include "scrub/xfile.h" /* * Create a temporary file for reconstructing metadata, with the intention of * atomically exchanging the temporary file's contents with the file that's * being repaired. */ int xrep_tempfile_create( struct xfs_scrub *sc, uint16_t mode) { struct xfs_icreate_args args = { .pip = sc->mp->m_rootip, .mode = mode, .flags = XFS_ICREATE_TMPFILE | XFS_ICREATE_UNLINKABLE, }; struct xfs_mount *mp = sc->mp; struct xfs_trans *tp = NULL; struct xfs_dquot *udqp; struct xfs_dquot *gdqp; struct xfs_dquot *pdqp; struct xfs_trans_res *tres; struct xfs_inode *dp = mp->m_rootip; xfs_ino_t ino; unsigned int resblks; bool is_dir = S_ISDIR(mode); int error; if (xfs_is_shutdown(mp)) return -EIO; if (xfs_is_readonly(mp)) return -EROFS; ASSERT(sc->tp == NULL); ASSERT(sc->tempip == NULL); /* * Make sure that we have allocated dquot(s) on disk. The temporary * inode should be completely root owned so that we don't fail due to * quota limits. 
*/ error = xfs_icreate_dqalloc(&args, &udqp, &gdqp, &pdqp); if (error) return error; if (is_dir) { resblks = xfs_mkdir_space_res(mp, 0); tres = &M_RES(mp)->tr_mkdir; } else { resblks = XFS_IALLOC_SPACE_RES(mp); tres = &M_RES(mp)->tr_create_tmpfile; } error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks, &tp); if (error) goto out_release_dquots; /* Allocate inode, set up directory. */ error = xfs_dialloc(&tp, &args, &ino); if (error) goto out_trans_cancel; error = xfs_icreate(tp, ino, &args, &sc->tempip); if (error) goto out_trans_cancel; /* We don't touch file data, so drop the realtime flags. */ sc->tempip->i_diflags &= ~(XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT); xfs_trans_log_inode(tp, sc->tempip, XFS_ILOG_CORE); /* * Mark our temporary file as private so that LSMs and the ACL code * don't try to add their own metadata or reason about these files. * The file should never be exposed to userspace. */ VFS_I(sc->tempip)->i_flags |= S_PRIVATE; VFS_I(sc->tempip)->i_opflags &= ~IOP_XATTR; if (is_dir) { error = xfs_dir_init(tp, sc->tempip, dp); if (error) goto out_trans_cancel; } else if (S_ISLNK(VFS_I(sc->tempip)->i_mode)) { /* * Initialize the temporary symlink with a meaningless target * that won't trip the verifiers. Repair must rewrite the * target with meaningful content before swapping with the file * being repaired. A single-byte target will not write a * remote target block, so the owner is irrelevant. */ error = xfs_symlink_write_target(tp, sc->tempip, sc->tempip->i_ino, ".", 1, 0, 0); if (error) goto out_trans_cancel; } /* * Attach the dquot(s) to the inodes and modify them incore. * These ids of the inode couldn't have changed since the new * inode has been locked ever since it was created. */ xfs_qm_vop_create_dqattach(tp, sc->tempip, udqp, gdqp, pdqp); /* * Put our temp file on the unlinked list so it's purged automatically. 
* All file-based metadata being reconstructed using this file must be * atomically exchanged with the original file because the contents * here will be purged when the inode is dropped or log recovery cleans * out the unlinked list. */ error = xfs_iunlink(tp, sc->tempip); if (error) goto out_trans_cancel; error = xfs_trans_commit(tp); if (error) goto out_release_inode; trace_xrep_tempfile_create(sc); xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); /* Finish setting up the incore / vfs context. */ xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); xfs_setup_iops(sc->tempip); xfs_finish_inode_setup(sc->tempip); sc->temp_ilock_flags = 0; return error; out_trans_cancel: xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the * setup of the inode and release the inode. This prevents recursive * transactions and deadlocks from xfs_inactive. */ if (sc->tempip) { xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); xfs_finish_inode_setup(sc->tempip); xchk_irele(sc, sc->tempip); } out_release_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); return error; } /* Take IOLOCK_EXCL on the temporary file, maybe. */ bool xrep_tempfile_iolock_nowait( struct xfs_scrub *sc) { if (xfs_ilock_nowait(sc->tempip, XFS_IOLOCK_EXCL)) { sc->temp_ilock_flags |= XFS_IOLOCK_EXCL; return true; } return false; } /* * Take the temporary file's IOLOCK while holding a different inode's IOLOCK. * In theory nobody else should hold the tempfile's IOLOCK, but we use trylock * to avoid deadlocks and lockdep complaints. */ int xrep_tempfile_iolock_polled( struct xfs_scrub *sc) { int error = 0; while (!xrep_tempfile_iolock_nowait(sc)) { if (xchk_should_terminate(sc, &error)) return error; delay(1); } return 0; } /* Release IOLOCK_EXCL on the temporary file. 
*/ void xrep_tempfile_iounlock( struct xfs_scrub *sc) { xfs_iunlock(sc->tempip, XFS_IOLOCK_EXCL); sc->temp_ilock_flags &= ~XFS_IOLOCK_EXCL; } /* Prepare the temporary file for metadata updates by grabbing ILOCK_EXCL. */ void xrep_tempfile_ilock( struct xfs_scrub *sc) { sc->temp_ilock_flags |= XFS_ILOCK_EXCL; xfs_ilock(sc->tempip, XFS_ILOCK_EXCL); } /* Try to grab ILOCK_EXCL on the temporary file. */ bool xrep_tempfile_ilock_nowait( struct xfs_scrub *sc) { if (xfs_ilock_nowait(sc->tempip, XFS_ILOCK_EXCL)) { sc->temp_ilock_flags |= XFS_ILOCK_EXCL; return true; } return false; } /* Unlock ILOCK_EXCL on the temporary file after an update. */ void xrep_tempfile_iunlock( struct xfs_scrub *sc) { xfs_iunlock(sc->tempip, XFS_ILOCK_EXCL); sc->temp_ilock_flags &= ~XFS_ILOCK_EXCL; } /* * Begin the process of making changes to both the file being scrubbed and * the temporary file by taking ILOCK_EXCL on both. */ void xrep_tempfile_ilock_both( struct xfs_scrub *sc) { xfs_lock_two_inodes(sc->ip, XFS_ILOCK_EXCL, sc->tempip, XFS_ILOCK_EXCL); sc->ilock_flags |= XFS_ILOCK_EXCL; sc->temp_ilock_flags |= XFS_ILOCK_EXCL; } /* Unlock ILOCK_EXCL on both files. */ void xrep_tempfile_iunlock_both( struct xfs_scrub *sc) { xrep_tempfile_iunlock(sc); xchk_iunlock(sc, XFS_ILOCK_EXCL); } /* Release the temporary file. */ void xrep_tempfile_rele( struct xfs_scrub *sc) { if (!sc->tempip) return; if (sc->temp_ilock_flags) { xfs_iunlock(sc->tempip, sc->temp_ilock_flags); sc->temp_ilock_flags = 0; } xchk_irele(sc, sc->tempip); sc->tempip = NULL; } /* * Make sure that the given range of the data fork of the temporary file is * mapped to written blocks. The caller must ensure that both inodes are * joined to the transaction. 
*/ int xrep_tempfile_prealloc( struct xfs_scrub *sc, xfs_fileoff_t off, xfs_filblks_t len) { struct xfs_bmbt_irec map; xfs_fileoff_t end = off + len; int error; ASSERT(sc->tempip != NULL); ASSERT(!XFS_NOT_DQATTACHED(sc->mp, sc->tempip)); for (; off < end; off = map.br_startoff + map.br_blockcount) { int nmaps = 1; /* * If we have a real extent mapping this block then we're * in ok shape. */ error = xfs_bmapi_read(sc->tempip, off, end - off, &map, &nmaps, XFS_DATA_FORK); if (error) return error; if (nmaps == 0) { ASSERT(nmaps != 0); return -EFSCORRUPTED; } if (xfs_bmap_is_written_extent(&map)) continue; /* * If we find a delalloc reservation then something is very * very wrong. Bail out. */ if (map.br_startblock == DELAYSTARTBLOCK) return -EFSCORRUPTED; /* * Make sure this block has a real zeroed extent allocated to * it. */ nmaps = 1; error = xfs_bmapi_write(sc->tp, sc->tempip, off, end - off, XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, 0, &map, &nmaps); if (error) return error; if (nmaps != 1) return -EFSCORRUPTED; trace_xrep_tempfile_prealloc(sc, XFS_DATA_FORK, &map); /* Commit new extent and all deferred work. */ error = xfs_defer_finish(&sc->tp); if (error) return error; } return 0; } /* * Write data to each block of a file. The given range of the tempfile's data * fork must already be populated with written extents. */ int xrep_tempfile_copyin( struct xfs_scrub *sc, xfs_fileoff_t off, xfs_filblks_t len, xrep_tempfile_copyin_fn prep_fn, void *data) { LIST_HEAD(buffers_list); struct xfs_mount *mp = sc->mp; struct xfs_buf *bp; xfs_fileoff_t flush_mask; xfs_fileoff_t end = off + len; loff_t pos = XFS_FSB_TO_B(mp, off); int error = 0; ASSERT(S_ISREG(VFS_I(sc->tempip)->i_mode)); /* Flush buffers to disk every 512K */ flush_mask = XFS_B_TO_FSBT(mp, (1U << 19)) - 1; for (; off < end; off++, pos += mp->m_sb.sb_blocksize) { struct xfs_bmbt_irec map; int nmaps = 1; /* Read block mapping for this file block. 
*/ error = xfs_bmapi_read(sc->tempip, off, 1, &map, &nmaps, 0); if (error) goto out_err; if (nmaps == 0 || !xfs_bmap_is_written_extent(&map)) { error = -EFSCORRUPTED; goto out_err; } /* Get the metadata buffer for this offset in the file. */ error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), mp->m_bsize, 0, &bp); if (error) goto out_err; trace_xrep_tempfile_copyin(sc, XFS_DATA_FORK, &map); /* Read in a block's worth of data from the xfile. */ error = prep_fn(sc, bp, data); if (error) { xfs_trans_brelse(sc->tp, bp); goto out_err; } /* Queue buffer, and flush if we have too much dirty data. */ xfs_buf_delwri_queue_here(bp, &buffers_list); xfs_trans_brelse(sc->tp, bp); if (!(off & flush_mask)) { error = xfs_buf_delwri_submit(&buffers_list); if (error) goto out_err; } } /* * Write the new blocks to disk. If the ordered list isn't empty after * that, then something went wrong and we have to fail. This should * never happen, but we'll check anyway. */ error = xfs_buf_delwri_submit(&buffers_list); if (error) goto out_err; if (!list_empty(&buffers_list)) { ASSERT(list_empty(&buffers_list)); error = -EIO; goto out_err; } return 0; out_err: xfs_buf_delwri_cancel(&buffers_list); return error; } /* * Set the temporary file's size. Caller must join the tempfile to the scrub * transaction and is responsible for adjusting block mappings as needed. */ int xrep_tempfile_set_isize( struct xfs_scrub *sc, unsigned long long isize) { if (sc->tempip->i_disk_size == isize) return 0; sc->tempip->i_disk_size = isize; i_size_write(VFS_I(sc->tempip), isize); return xrep_tempfile_roll_trans(sc); } /* * Roll a repair transaction involving the temporary file. Caller must join * both the temporary file and the file being scrubbed to the transaction. * This function return with both inodes joined to a new scrub transaction, * or the usual negative errno. 
*/ int xrep_tempfile_roll_trans( struct xfs_scrub *sc) { int error; xfs_trans_log_inode(sc->tp, sc->tempip, XFS_ILOG_CORE); error = xrep_roll_trans(sc); if (error) return error; xfs_trans_ijoin(sc->tp, sc->tempip, 0); return 0; } /* * Fill out the mapping exchange request in preparation for atomically * committing the contents of a metadata file that we've rebuilt in the temp * file. */ STATIC int xrep_tempexch_prep_request( struct xfs_scrub *sc, int whichfork, struct xrep_tempexch *tx) { struct xfs_exchmaps_req *req = &tx->req; memset(tx, 0, sizeof(struct xrep_tempexch)); /* COW forks don't exist on disk. */ if (whichfork == XFS_COW_FORK) { ASSERT(0); return -EINVAL; } /* Both files should have the relevant forks. */ if (!xfs_ifork_ptr(sc->ip, whichfork) || !xfs_ifork_ptr(sc->tempip, whichfork)) { ASSERT(xfs_ifork_ptr(sc->ip, whichfork) != NULL); ASSERT(xfs_ifork_ptr(sc->tempip, whichfork) != NULL); return -EINVAL; } /* Exchange all mappings in both forks. */ req->ip1 = sc->tempip; req->ip2 = sc->ip; req->startoff1 = 0; req->startoff2 = 0; switch (whichfork) { case XFS_ATTR_FORK: req->flags |= XFS_EXCHMAPS_ATTR_FORK; break; case XFS_DATA_FORK: /* Always exchange sizes when exchanging data fork mappings. */ req->flags |= XFS_EXCHMAPS_SET_SIZES; break; } req->blockcount = XFS_MAX_FILEOFF; return 0; } /* * Fill out the mapping exchange resource estimation structures in preparation * for exchanging the contents of a metadata file that we've rebuilt in the * temp file. Caller must hold IOLOCK_EXCL but not ILOCK_EXCL on both files. */ STATIC int xrep_tempexch_estimate( struct xfs_scrub *sc, struct xrep_tempexch *tx) { struct xfs_exchmaps_req *req = &tx->req; struct xfs_ifork *ifp; struct xfs_ifork *tifp; int whichfork = xfs_exchmaps_reqfork(req); int state = 0; /* * The exchmaps code only knows how to exchange file fork space * mappings. Any fork data in local format must be promoted to a * single block before the exchange can take place. 
*/ ifp = xfs_ifork_ptr(sc->ip, whichfork); if (ifp->if_format == XFS_DINODE_FMT_LOCAL) state |= 1; tifp = xfs_ifork_ptr(sc->tempip, whichfork); if (tifp->if_format == XFS_DINODE_FMT_LOCAL) state |= 2; switch (state) { case 0: /* Both files have mapped extents; use the regular estimate. */ return xfs_exchrange_estimate(req); case 1: /* * The file being repaired is in local format, but the temp * file has mapped extents. To perform the exchange, the file * being repaired must have its shorform data converted to an * ondisk block so that the forks will be in extents format. * We need one resblk for the conversion; the number of * exchanges is (worst case) the temporary file's extent count * plus the block we converted. */ req->ip1_bcount = sc->tempip->i_nblocks; req->ip2_bcount = 1; req->nr_exchanges = 1 + tifp->if_nextents; req->resblks = 1; break; case 2: /* * The temporary file is in local format, but the file being * repaired has mapped extents. To perform the exchange, the * temp file must have its shortform data converted to an * ondisk block, and the fork changed to extents format. We * need one resblk for the conversion; the number of exchanges * is (worst case) the extent count of the file being repaired * plus the block we converted. */ req->ip1_bcount = 1; req->ip2_bcount = sc->ip->i_nblocks; req->nr_exchanges = 1 + ifp->if_nextents; req->resblks = 1; break; case 3: /* * Both forks are in local format. To perform the exchange, * both files must have their shortform data converted to * fsblocks, and both forks must be converted to extents * format. We need two resblks for the two conversions, and * the number of exchanges is 1 since there's only one block at * fileoff 0. Presumably, the caller could not exchange the * two inode fork areas directly. */ req->ip1_bcount = 1; req->ip2_bcount = 1; req->nr_exchanges = 1; req->resblks = 2; break; } return xfs_exchmaps_estimate_overhead(req); } /* * Obtain a quota reservation to make sure we don't hit EDQUOT. 
We can skip * this if quota enforcement is disabled or if both inodes' dquots are the * same. The qretry structure must be initialized to zeroes before the first * call to this function. */ STATIC int xrep_tempexch_reserve_quota( struct xfs_scrub *sc, const struct xrep_tempexch *tx) { struct xfs_trans *tp = sc->tp; const struct xfs_exchmaps_req *req = &tx->req; int64_t ddelta, rdelta; int error; /* * Don't bother with a quota reservation if we're not enforcing them * or the two inodes have the same dquots. */ if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || (req->ip1->i_udquot == req->ip2->i_udquot && req->ip1->i_gdquot == req->ip2->i_gdquot && req->ip1->i_pdquot == req->ip2->i_pdquot)) return 0; /* * Quota reservation for each file comes from two sources. First, we * need to account for any net gain in mapped blocks during the * exchange. Second, we need reservation for the gross gain in mapped * blocks so that we don't trip over any quota block reservation * assertions. We must reserve the gross gain because the quota code * subtracts from bcount the number of blocks that we unmap; it does * not add that quantity back to the quota block reservation. */ ddelta = max_t(int64_t, 0, req->ip2_bcount - req->ip1_bcount); rdelta = max_t(int64_t, 0, req->ip2_rtbcount - req->ip1_rtbcount); error = xfs_trans_reserve_quota_nblks(tp, req->ip1, ddelta + req->ip1_bcount, rdelta + req->ip1_rtbcount, true); if (error) return error; ddelta = max_t(int64_t, 0, req->ip1_bcount - req->ip2_bcount); rdelta = max_t(int64_t, 0, req->ip1_rtbcount - req->ip2_rtbcount); return xfs_trans_reserve_quota_nblks(tp, req->ip2, ddelta + req->ip2_bcount, rdelta + req->ip2_rtbcount, true); } /* * Prepare an existing transaction for an atomic file contents exchange. * * This function fills out the mapping exchange request and resource estimation * structures in preparation for exchanging the contents of a metadata file * that has been rebuilt in the temp file. 
Next, it reserves space and quota
 * for the transaction.
 *
 * The caller must hold ILOCK_EXCL of the scrub target file and the temporary
 * file.  The caller must join both inodes to the transaction with no unlock
 * flags, and is responsible for dropping both ILOCKs when appropriate.  Only
 * use this when those ILOCKs cannot be dropped.
 *
 * Returns 0 or the negative errno of the first step that failed.
 */
int
xrep_tempexch_trans_reserve(
	struct xfs_scrub	*sc,
	int			whichfork,
	struct xrep_tempexch	*tx)
{
	int			error;

	ASSERT(sc->tp != NULL);
	xfs_assert_ilocked(sc->ip, XFS_ILOCK_EXCL);
	xfs_assert_ilocked(sc->tempip, XFS_ILOCK_EXCL);

	/* Each stage runs only if every earlier stage succeeded. */
	error = xrep_tempexch_prep_request(sc, whichfork, tx);
	if (!error)
		error = xfs_exchmaps_estimate(&tx->req);
	if (!error)
		error = xfs_trans_reserve_more(sc->tp, tx->req.resblks, 0);
	if (!error)
		error = xrep_tempexch_reserve_quota(sc, tx);

	return error;
}

/*
 * Create a new transaction for a file contents exchange.
 *
 * This function fills out the mapping exchange request and resource
 * estimation structures in preparation for exchanging the contents of a
 * metadata file that has been rebuilt in the temp file.  Next, it reserves
 * space, takes ILOCK_EXCL of both inodes, joins them to the transaction and
 * reserves quota for the transaction.
 *
 * The caller is responsible for dropping both ILOCKs when appropriate.
*/ int xrep_tempexch_trans_alloc( struct xfs_scrub *sc, int whichfork, struct xrep_tempexch *tx) { unsigned int flags = 0; int error; ASSERT(sc->tp == NULL); ASSERT(xfs_has_exchange_range(sc->mp)); error = xrep_tempexch_prep_request(sc, whichfork, tx); if (error) return error; error = xrep_tempexch_estimate(sc, tx); if (error) return error; if (xfs_has_lazysbcount(sc->mp)) flags |= XFS_TRANS_RES_FDBLKS; error = xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, tx->req.resblks, 0, flags, &sc->tp); if (error) return error; sc->temp_ilock_flags |= XFS_ILOCK_EXCL; sc->ilock_flags |= XFS_ILOCK_EXCL; xfs_exchrange_ilock(sc->tp, sc->ip, sc->tempip); return xrep_tempexch_reserve_quota(sc, tx); } /* * Exchange file mappings (and hence file contents) between the file being * repaired and the temporary file. Returns with both inodes locked and joined * to a clean scrub transaction. */ int xrep_tempexch_contents( struct xfs_scrub *sc, struct xrep_tempexch *tx) { int error; ASSERT(xfs_has_exchange_range(sc->mp)); xfs_exchange_mappings(sc->tp, &tx->req); error = xfs_defer_finish(&sc->tp); if (error) return error; /* * If we exchanged the ondisk sizes of two metadata files, we must * exchanged the incore sizes as well. */ if (tx->req.flags & XFS_EXCHMAPS_SET_SIZES) { loff_t temp; temp = i_size_read(VFS_I(sc->ip)); i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip))); i_size_write(VFS_I(sc->tempip), temp); } return 0; } /* * Write local format data from one of the temporary file's forks into the same * fork of file being repaired, and exchange the file sizes, if appropriate. * Caller must ensure that the file being repaired has enough fork space to * hold all the bytes. 
*/ void xrep_tempfile_copyout_local( struct xfs_scrub *sc, int whichfork) { struct xfs_ifork *temp_ifp; struct xfs_ifork *ifp; unsigned int ilog_flags = XFS_ILOG_CORE; temp_ifp = xfs_ifork_ptr(sc->tempip, whichfork); ifp = xfs_ifork_ptr(sc->ip, whichfork); ASSERT(temp_ifp != NULL); ASSERT(ifp != NULL); ASSERT(temp_ifp->if_format == XFS_DINODE_FMT_LOCAL); ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL); switch (whichfork) { case XFS_DATA_FORK: ASSERT(sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)); break; case XFS_ATTR_FORK: ASSERT(sc->tempip->i_forkoff >= sc->ip->i_forkoff); break; default: ASSERT(0); return; } /* Recreate @sc->ip's incore fork (ifp) with data from temp_ifp. */ xfs_idestroy_fork(ifp); xfs_init_local_fork(sc->ip, whichfork, temp_ifp->if_data, temp_ifp->if_bytes); if (whichfork == XFS_DATA_FORK) { i_size_write(VFS_I(sc->ip), i_size_read(VFS_I(sc->tempip))); sc->ip->i_disk_size = sc->tempip->i_disk_size; } ilog_flags |= xfs_ilog_fdata(whichfork); xfs_trans_log_inode(sc->tp, sc->ip, ilog_flags); } /* Decide if a given XFS inode is a temporary file for a repair. */ bool xrep_is_tempfile( const struct xfs_inode *ip) { const struct inode *inode = &ip->i_vnode; if (IS_PRIVATE(inode) && !(inode->i_opflags & IOP_XATTR)) return true; return false; }