/*
 * Copyright (C) 2017 Oracle. All Rights Reserved.
 *
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_itable.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/scrub.h"
#include "scrub/btree.h"

/*
 * Online Scrub and Repair
 *
 * Traditionally, XFS (the kernel driver) did not know how to check or
 * repair on-disk data structures. That task was left to the xfs_check
 * and xfs_repair tools, both of which require taking the filesystem
 * offline for a thorough but time-consuming examination. Online
 * scrub & repair, on the other hand, enables us to check the metadata
 * for obvious errors while carefully stepping around the filesystem's
 * ongoing operations, locking rules, etc.
 *
 * Given that most XFS metadata consists of records stored in a btree,
 * most of the checking functions iterate the btree blocks themselves
 * looking for irregularities. When a record block is encountered, each
 * record can be checked for obviously bad values. Record values can
 * also be cross-referenced against other btrees to look for potential
 * misunderstandings between pieces of metadata.
 *
 * It is expected that the checkers responsible for per-AG metadata
 * structures will lock the AG headers (AGI, AGF, AGFL), iterate the
 * metadata structure, and perform any relevant cross-referencing before
 * unlocking the AG and returning the results to userspace. These
 * scrubbers must not keep an AG locked for too long to avoid tying up
 * the block and inode allocators.
 *
 * Block maps and b-trees rooted in an inode present a special challenge
 * because they can involve extents from any AG. The general scrubber
 * structure of lock -> check -> xref -> unlock still holds, but AG
 * locking order rules /must/ be obeyed to avoid deadlocks. The
 * ordering rule, of course, is that we must lock in increasing AG
 * order. Helper functions are provided to track which AG headers we've
 * already locked. If we detect an imminent locking order violation, we
 * can signal a potential deadlock, in which case the scrubber can jump
 * out to the top level, lock all the AGs in order, and retry the scrub.
 *
 * For file data (directories, extended attributes, symlinks) scrub, we
 * can simply lock the inode and walk the data. For btree data
 * (directories and attributes) we follow the same btree-scrubbing
 * strategy outlined previously to check the records.
 *
 * We use a bit of trickery with transactions to avoid buffer deadlocks
 * if there is a cycle in the metadata. The basic problem is that
 * travelling down a btree involves locking the current buffer at each
 * tree level. If a pointer should somehow point back to a buffer that
 * we've already examined, we will deadlock due to the second buffer
 * locking attempt. Note however that grabbing a buffer in transaction
 * context links the locked buffer to the transaction. If we try to
 * re-grab the buffer in the context of the same transaction, we avoid
 * the second lock attempt and continue. Between the verifier and the
 * scrubber, something will notice that something is amiss and report
 * the corruption. Therefore, each scrubber will allocate an empty
 * transaction, attach buffers to it, and cancel the transaction at the
 * end of the scrub run. Cancelling a non-dirty transaction simply
 * unlocks the buffers.
 *
 * There are four pieces of data that scrub can communicate to
 * userspace. The first is the error code (errno), which can be used to
 * communicate operational errors in performing the scrub. There are
 * also three flags that can be set in the scrub context. If the data
 * structure itself is corrupt, the CORRUPT flag will be set. If
 * the metadata is correct but otherwise suboptimal, the PREEN flag
 * will be set.
 */

/*
 * Scrub probe -- userspace uses this to probe if we're willing to scrub
 * or repair a given mountpoint. This will be used by xfs_scrub to
 * probe the kernel's abilities to scrub (and repair) the metadata. We
 * do this by validating the ioctl inputs from userspace, preparing the
 * filesystem for a scrub (or a repair) operation, and immediately
 * returning to userspace.
Userspace can use the returned errno and 123 * structure state to decide (in broad terms) if scrub/repair are 124 * supported by the running kernel. 125 */ 126 int 127 xfs_scrub_probe( 128 struct xfs_scrub_context *sc) 129 { 130 int error = 0; 131 132 if (sc->sm->sm_ino || sc->sm->sm_agno) 133 return -EINVAL; 134 if (xfs_scrub_should_terminate(sc, &error)) 135 return error; 136 137 return 0; 138 } 139 140 /* Scrub setup and teardown */ 141 142 /* Free all the resources and finish the transactions. */ 143 STATIC int 144 xfs_scrub_teardown( 145 struct xfs_scrub_context *sc, 146 struct xfs_inode *ip_in, 147 int error) 148 { 149 xfs_scrub_ag_free(sc, &sc->sa); 150 if (sc->tp) { 151 xfs_trans_cancel(sc->tp); 152 sc->tp = NULL; 153 } 154 if (sc->ip) { 155 xfs_iunlock(sc->ip, sc->ilock_flags); 156 if (sc->ip != ip_in && 157 !xfs_internal_inum(sc->mp, sc->ip->i_ino)) 158 iput(VFS_I(sc->ip)); 159 sc->ip = NULL; 160 } 161 return error; 162 } 163 164 /* Scrubbing dispatch. */ 165 166 static const struct xfs_scrub_meta_ops meta_scrub_ops[] = { 167 { /* ioctl presence test */ 168 .setup = xfs_scrub_setup_fs, 169 .scrub = xfs_scrub_probe, 170 }, 171 { /* superblock */ 172 .setup = xfs_scrub_setup_ag_header, 173 .scrub = xfs_scrub_superblock, 174 }, 175 { /* agf */ 176 .setup = xfs_scrub_setup_ag_header, 177 .scrub = xfs_scrub_agf, 178 }, 179 { /* agfl */ 180 .setup = xfs_scrub_setup_ag_header, 181 .scrub = xfs_scrub_agfl, 182 }, 183 { /* agi */ 184 .setup = xfs_scrub_setup_ag_header, 185 .scrub = xfs_scrub_agi, 186 }, 187 { /* bnobt */ 188 .setup = xfs_scrub_setup_ag_allocbt, 189 .scrub = xfs_scrub_bnobt, 190 }, 191 { /* cntbt */ 192 .setup = xfs_scrub_setup_ag_allocbt, 193 .scrub = xfs_scrub_cntbt, 194 }, 195 { /* inobt */ 196 .setup = xfs_scrub_setup_ag_iallocbt, 197 .scrub = xfs_scrub_inobt, 198 }, 199 { /* finobt */ 200 .setup = xfs_scrub_setup_ag_iallocbt, 201 .scrub = xfs_scrub_finobt, 202 .has = xfs_sb_version_hasfinobt, 203 }, 204 { /* rmapbt */ 205 .setup = 
xfs_scrub_setup_ag_rmapbt, 206 .scrub = xfs_scrub_rmapbt, 207 .has = xfs_sb_version_hasrmapbt, 208 }, 209 { /* refcountbt */ 210 .setup = xfs_scrub_setup_ag_refcountbt, 211 .scrub = xfs_scrub_refcountbt, 212 .has = xfs_sb_version_hasreflink, 213 }, 214 { /* inode record */ 215 .setup = xfs_scrub_setup_inode, 216 .scrub = xfs_scrub_inode, 217 }, 218 }; 219 220 /* This isn't a stable feature, warn once per day. */ 221 static inline void 222 xfs_scrub_experimental_warning( 223 struct xfs_mount *mp) 224 { 225 static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT( 226 "xfs_scrub_warning", 86400 * HZ, 1); 227 ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE); 228 229 if (__ratelimit(&scrub_warning)) 230 xfs_alert(mp, 231 "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); 232 } 233 234 /* Dispatch metadata scrubbing. */ 235 int 236 xfs_scrub_metadata( 237 struct xfs_inode *ip, 238 struct xfs_scrub_metadata *sm) 239 { 240 struct xfs_scrub_context sc; 241 struct xfs_mount *mp = ip->i_mount; 242 const struct xfs_scrub_meta_ops *ops; 243 bool try_harder = false; 244 int error = 0; 245 246 trace_xfs_scrub_start(ip, sm, error); 247 248 /* Forbidden if we are shut down or mounted norecovery. */ 249 error = -ESHUTDOWN; 250 if (XFS_FORCED_SHUTDOWN(mp)) 251 goto out; 252 error = -ENOTRECOVERABLE; 253 if (mp->m_flags & XFS_MOUNT_NORECOVERY) 254 goto out; 255 256 /* Check our inputs. */ 257 error = -EINVAL; 258 sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; 259 if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN) 260 goto out; 261 if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved))) 262 goto out; 263 264 /* Do we know about this type of metadata? */ 265 error = -ENOENT; 266 if (sm->sm_type >= XFS_SCRUB_TYPE_NR) 267 goto out; 268 ops = &meta_scrub_ops[sm->sm_type]; 269 if (ops->scrub == NULL) 270 goto out; 271 272 /* 273 * We won't scrub any filesystem that doesn't have the ability 274 * to record unwritten extents. 
The option was made default in 275 * 2003, removed from mkfs in 2007, and cannot be disabled in 276 * v5, so if we find a filesystem without this flag it's either 277 * really old or totally unsupported. Avoid it either way. 278 * We also don't support v1-v3 filesystems, which aren't 279 * mountable. 280 */ 281 error = -EOPNOTSUPP; 282 if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) 283 goto out; 284 285 /* Does this fs even support this type of metadata? */ 286 error = -ENOENT; 287 if (ops->has && !ops->has(&mp->m_sb)) 288 goto out; 289 290 /* We don't know how to repair anything yet. */ 291 error = -EOPNOTSUPP; 292 if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) 293 goto out; 294 295 xfs_scrub_experimental_warning(mp); 296 297 retry_op: 298 /* Set up for the operation. */ 299 memset(&sc, 0, sizeof(sc)); 300 sc.mp = ip->i_mount; 301 sc.sm = sm; 302 sc.ops = ops; 303 sc.try_harder = try_harder; 304 sc.sa.agno = NULLAGNUMBER; 305 error = sc.ops->setup(&sc, ip); 306 if (error) 307 goto out_teardown; 308 309 /* Scrub for errors. */ 310 error = sc.ops->scrub(&sc); 311 if (!try_harder && error == -EDEADLOCK) { 312 /* 313 * Scrubbers return -EDEADLOCK to mean 'try harder'. 314 * Tear down everything we hold, then set up again with 315 * preparation for worst-case scenarios. 316 */ 317 error = xfs_scrub_teardown(&sc, ip, 0); 318 if (error) 319 goto out; 320 try_harder = true; 321 goto retry_op; 322 } else if (error) 323 goto out_teardown; 324 325 if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | 326 XFS_SCRUB_OFLAG_XCORRUPT)) 327 xfs_alert_ratelimited(mp, "Corruption detected during scrub."); 328 329 out_teardown: 330 error = xfs_scrub_teardown(&sc, ip, error); 331 out: 332 trace_xfs_scrub_done(ip, sm, error); 333 if (error == -EFSCORRUPTED || error == -EFSBADCRC) { 334 sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; 335 error = 0; 336 } 337 return error; 338 } 339