/*
 * Copyright (C) 2017 Oracle.  All Rights Reserved.
 *
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_bit.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "xfs_itable.h"
#include "xfs_alloc.h"
#include "xfs_alloc_btree.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/btree.h"

/*
 * Online Scrub and Repair
 *
 * Traditionally, XFS (the kernel driver) did not know how to check or
 * repair on-disk data structures.  That task was left to the xfs_check
 * and xfs_repair tools, both of which require taking the filesystem
 * offline for a thorough but time-consuming examination.  Online
 * scrub & repair, on the other hand, enables us to check the metadata
 * for obvious errors while carefully stepping around the filesystem's
 * ongoing operations, locking rules, etc.
 *
 * Given that most XFS metadata consist of records stored in a btree,
 * most of the checking functions iterate the btree blocks themselves
 * looking for irregularities.  When a record block is encountered, each
 * record can be checked for obviously bad values.  Record values can
 * also be cross-referenced against other btrees to look for potential
 * misunderstandings between pieces of metadata.
 *
 * It is expected that the checkers responsible for per-AG metadata
 * structures will lock the AG headers (AGI, AGF, AGFL), iterate the
 * metadata structure, and perform any relevant cross-referencing before
 * unlocking the AG and returning the results to userspace.  These
 * scrubbers must not keep an AG locked for too long to avoid tying up
 * the block and inode allocators.
 *
 * Block maps and b-trees rooted in an inode present a special challenge
 * because they can involve extents from any AG.  The general scrubber
 * structure of lock -> check -> xref -> unlock still holds, but AG
 * locking order rules /must/ be obeyed to avoid deadlocks.  The
 * ordering rule, of course, is that we must lock in increasing AG
 * order.  Helper functions are provided to track which AG headers we've
 * already locked.  If we detect an imminent locking order violation, we
 * can signal a potential deadlock, in which case the scrubber can jump
 * out to the top level, lock all the AGs in order, and retry the scrub.
 *
 * For file data (directories, extended attributes, symlinks) scrub, we
 * can simply lock the inode and walk the data.  For btree data
 * (directories and attributes) we follow the same btree-scrubbing
 * strategy outlined previously to check the records.
 *
 * We use a bit of trickery with transactions to avoid buffer deadlocks
 * if there is a cycle in the metadata.  The basic problem is that
 * travelling down a btree involves locking the current buffer at each
 * tree level.  If a pointer should somehow point back to a buffer that
 * we've already examined, we will deadlock due to the second buffer
 * locking attempt.  Note however that grabbing a buffer in transaction
 * context links the locked buffer to the transaction.  If we try to
 * re-grab the buffer in the context of the same transaction, we avoid
 * the second lock attempt and continue.  Between the buffer verifier
 * and the scrubber, one or the other will notice that something is
 * amiss and report the corruption.  Therefore, each scrubber will
 * allocate an empty transaction, attach buffers to it, and cancel the
 * transaction at the end of the scrub run.  Cancelling a non-dirty
 * transaction simply unlocks the buffers.
 *
 * There are four pieces of data that scrub can communicate to
 * userspace.  The first is the error code (errno), which can be used to
 * communicate operational errors in performing the scrub.  There are
 * also three flags that can be set in the scrub context.  If the data
 * structure itself is corrupt, the CORRUPT flag will be set.  If the
 * metadata is internally consistent but disagrees with another piece of
 * metadata found during cross-referencing, the XCORRUPT flag will be
 * set.  If the metadata is correct but otherwise suboptimal, the PREEN
 * flag will be set.
 */
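/*
 * For illustration only -- a minimal sketch of the empty-transaction
 * trick described above, not code used by the build.  It assumes the
 * xfs_trans_alloc_empty() helper that the scrub setup functions use to
 * create a transaction with no reservation, and a scrub context that
 * carries the transaction in sc->tp:
 *
 *	error = xfs_trans_alloc_empty(sc->mp, &sc->tp);
 *	if (error)
 *		return error;
 *
 *	(read metadata buffers through sc->tp; revisiting a buffer that
 *	 this transaction already holds re-uses the existing lock rather
 *	 than deadlocking on a second lock attempt)
 *
 *	xfs_trans_cancel(sc->tp);
 *
 * The cancel at teardown time simply unlocks the attached buffers
 * because the scrubbers never dirty the transaction.
 */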
/*
 * Scrub probe -- userspace uses this to probe if we're willing to scrub
 * or repair a given mountpoint.  This will be used by xfs_scrub to
 * probe the kernel's abilities to scrub (and repair) the metadata.  We
 * do this by validating the ioctl inputs from userspace, preparing the
 * filesystem for a scrub (or a repair) operation, and immediately
 * returning to userspace.  Userspace can use the returned errno and
 * structure state to decide (in broad terms) if scrub/repair are
 * supported by the running kernel.
 */
static int
xfs_scrub_probe(
	struct xfs_scrub_context	*sc)
{
	int				error = 0;

	if (sc->sm->sm_ino || sc->sm->sm_agno)
		return -EINVAL;
	if (xfs_scrub_should_terminate(sc, &error))
		return error;

	return 0;
}

/* Scrub setup and teardown */

/* Free all the resources and finish the transactions. */
STATIC int
xfs_scrub_teardown(
	struct xfs_scrub_context	*sc,
	struct xfs_inode		*ip_in,
	int				error)
{
	xfs_scrub_ag_free(sc, &sc->sa);
	if (sc->tp) {
		xfs_trans_cancel(sc->tp);
		sc->tp = NULL;
	}
	if (sc->ip) {
		xfs_iunlock(sc->ip, sc->ilock_flags);
		if (sc->ip != ip_in &&
		    !xfs_internal_inum(sc->mp, sc->ip->i_ino))
			iput(VFS_I(sc->ip));
		sc->ip = NULL;
	}
	if (sc->buf) {
		kmem_free(sc->buf);
		sc->buf = NULL;
	}
	return error;
}
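/*
 * For illustration only -- the -EDEADLOCK/"try harder" convention that
 * links the scrubbers below to xfs_scrub_metadata().  The want_agno
 * variable here is hypothetical; only sc->try_harder and the error code
 * are part of the real interface.  A scrubber that finds it would have
 * to lock an AG header out of order (a lower AG number than one it
 * already holds) bails out instead of deadlocking:
 *
 *	if (want_agno < sc->sa.agno) {
 *		if (!sc->try_harder)
 *			return -EDEADLOCK;
 *		(on the retry, setup already took every lock in order)
 *	}
 *
 * xfs_scrub_metadata() reacts to -EDEADLOCK by tearing everything down,
 * setting sc.try_harder, and re-running setup and scrub.
 */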
/* Scrubbing dispatch. */

static const struct xfs_scrub_meta_ops meta_scrub_ops[] = {
	{ /* ioctl presence test */
		.setup	= xfs_scrub_setup_fs,
		.scrub	= xfs_scrub_probe,
	},
	{ /* superblock */
		.setup	= xfs_scrub_setup_ag_header,
		.scrub	= xfs_scrub_superblock,
	},
	{ /* agf */
		.setup	= xfs_scrub_setup_ag_header,
		.scrub	= xfs_scrub_agf,
	},
	{ /* agfl */
		.setup	= xfs_scrub_setup_ag_header,
		.scrub	= xfs_scrub_agfl,
	},
	{ /* agi */
		.setup	= xfs_scrub_setup_ag_header,
		.scrub	= xfs_scrub_agi,
	},
	{ /* bnobt */
		.setup	= xfs_scrub_setup_ag_allocbt,
		.scrub	= xfs_scrub_bnobt,
	},
	{ /* cntbt */
		.setup	= xfs_scrub_setup_ag_allocbt,
		.scrub	= xfs_scrub_cntbt,
	},
	{ /* inobt */
		.setup	= xfs_scrub_setup_ag_iallocbt,
		.scrub	= xfs_scrub_inobt,
	},
	{ /* finobt */
		.setup	= xfs_scrub_setup_ag_iallocbt,
		.scrub	= xfs_scrub_finobt,
		.has	= xfs_sb_version_hasfinobt,
	},
	{ /* rmapbt */
		.setup	= xfs_scrub_setup_ag_rmapbt,
		.scrub	= xfs_scrub_rmapbt,
		.has	= xfs_sb_version_hasrmapbt,
	},
	{ /* refcountbt */
		.setup	= xfs_scrub_setup_ag_refcountbt,
		.scrub	= xfs_scrub_refcountbt,
		.has	= xfs_sb_version_hasreflink,
	},
	{ /* inode record */
		.setup	= xfs_scrub_setup_inode,
		.scrub	= xfs_scrub_inode,
	},
	{ /* inode data fork */
		.setup	= xfs_scrub_setup_inode_bmap,
		.scrub	= xfs_scrub_bmap_data,
	},
	{ /* inode attr fork */
		.setup	= xfs_scrub_setup_inode_bmap,
		.scrub	= xfs_scrub_bmap_attr,
	},
	{ /* inode CoW fork */
		.setup	= xfs_scrub_setup_inode_bmap,
		.scrub	= xfs_scrub_bmap_cow,
	},
	{ /* directory */
		.setup	= xfs_scrub_setup_directory,
		.scrub	= xfs_scrub_directory,
	},
	{ /* extended attributes */
		.setup	= xfs_scrub_setup_xattr,
		.scrub	= xfs_scrub_xattr,
	},
	{ /* symbolic link */
		.setup	= xfs_scrub_setup_symlink,
		.scrub	= xfs_scrub_symlink,
	},
	{ /* parent pointers */
		.setup	= xfs_scrub_setup_parent,
		.scrub	= xfs_scrub_parent,
	},
	{ /* realtime bitmap */
		.setup	= xfs_scrub_setup_rt,
		.scrub	= xfs_scrub_rtbitmap,
		.has	= xfs_sb_version_hasrealtime,
	},
	{ /* realtime summary */
		.setup	= xfs_scrub_setup_rt,
		.scrub	= xfs_scrub_rtsummary,
		.has	= xfs_sb_version_hasrealtime,
	},
	{ /* user quota */
		.setup	= xfs_scrub_setup_quota,
		.scrub	= xfs_scrub_quota,
	},
	{ /* group quota */
		.setup	= xfs_scrub_setup_quota,
		.scrub	= xfs_scrub_quota,
	},
	{ /* project quota */
		.setup	= xfs_scrub_setup_quota,
		.scrub	= xfs_scrub_quota,
	},
};

/* This isn't a stable feature, warn once per day. */
static inline void
xfs_scrub_experimental_warning(
	struct xfs_mount	*mp)
{
	static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT(
			"xfs_scrub_warning", 86400 * HZ, 1);
	ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE);

	if (__ratelimit(&scrub_warning))
		xfs_alert(mp,
"EXPERIMENTAL online scrub feature in use. Use at your own risk!");
}
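/*
 * For illustration only -- how userspace might drive this interface,
 * assuming the XFS_IOC_SCRUB_METADATA ioctl and the XFS_SCRUB_TYPE_* /
 * XFS_SCRUB_OFLAG_* definitions exported through xfs_fs.h.  The caller
 * fills in sm_type (plus sm_ino/sm_gen or sm_agno where the type needs
 * them), zeroes everything else, and then looks at both the errno and
 * the returned flags, where fd refers to an open file on the filesystem
 * being checked:
 *
 *	struct xfs_scrub_metadata sm = { .sm_type = XFS_SCRUB_TYPE_PROBE };
 *
 *	if (ioctl(fd, XFS_IOC_SCRUB_METADATA, &sm) < 0)
 *		(operational error, or scrub not supported at all)
 *	else if (sm.sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
 *		(metadata is corrupt)
 *	else if (sm.sm_flags & XFS_SCRUB_OFLAG_PREEN)
 *		(metadata is fine but could be optimized)
 */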
/* Dispatch metadata scrubbing. */
int
xfs_scrub_metadata(
	struct xfs_inode		*ip,
	struct xfs_scrub_metadata	*sm)
{
	struct xfs_scrub_context	sc;
	struct xfs_mount		*mp = ip->i_mount;
	const struct xfs_scrub_meta_ops	*ops;
	bool				try_harder = false;
	int				error = 0;

	trace_xfs_scrub_start(ip, sm, error);

	/* Forbidden if we are shut down or mounted norecovery. */
	error = -ESHUTDOWN;
	if (XFS_FORCED_SHUTDOWN(mp))
		goto out;
	error = -ENOTRECOVERABLE;
	if (mp->m_flags & XFS_MOUNT_NORECOVERY)
		goto out;

	/* Check our inputs. */
	error = -EINVAL;
	sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
	if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN)
		goto out;
	if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved)))
		goto out;

	/* Do we know about this type of metadata? */
	error = -ENOENT;
	if (sm->sm_type >= XFS_SCRUB_TYPE_NR)
		goto out;
	ops = &meta_scrub_ops[sm->sm_type];
	if (ops->scrub == NULL)
		goto out;

	/*
	 * We won't scrub any filesystem that doesn't have the ability
	 * to record unwritten extents.  The option was made default in
	 * 2003, removed from mkfs in 2007, and cannot be disabled in
	 * v5, so if we find a filesystem without this flag it's either
	 * really old or totally unsupported.  Avoid it either way.
	 * We also don't support v1-v3 filesystems, which aren't
	 * mountable.
	 */
	error = -EOPNOTSUPP;
	if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
		goto out;

	/* Does this fs even support this type of metadata? */
	error = -ENOENT;
	if (ops->has && !ops->has(&mp->m_sb))
		goto out;

	/* We don't know how to repair anything yet. */
	error = -EOPNOTSUPP;
	if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
		goto out;

	xfs_scrub_experimental_warning(mp);

retry_op:
	/* Set up for the operation. */
	memset(&sc, 0, sizeof(sc));
	sc.mp = ip->i_mount;
	sc.sm = sm;
	sc.ops = ops;
	sc.try_harder = try_harder;
	sc.sa.agno = NULLAGNUMBER;
	error = sc.ops->setup(&sc, ip);
	if (error)
		goto out_teardown;

	/* Scrub for errors. */
	error = sc.ops->scrub(&sc);
	if (!try_harder && error == -EDEADLOCK) {
		/*
		 * Scrubbers return -EDEADLOCK to mean 'try harder'.
		 * Tear down everything we hold, then set up again with
		 * preparation for worst-case scenarios.
		 */
		error = xfs_scrub_teardown(&sc, ip, 0);
		if (error)
			goto out;
		try_harder = true;
		goto retry_op;
	} else if (error)
		goto out_teardown;

	if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
			       XFS_SCRUB_OFLAG_XCORRUPT))
		xfs_alert_ratelimited(mp, "Corruption detected during scrub.");

out_teardown:
	error = xfs_scrub_teardown(&sc, ip, error);
out:
	trace_xfs_scrub_done(ip, sm, error);
	if (error == -EFSCORRUPTED || error == -EFSBADCRC) {
		sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
		error = 0;
	}
	return error;
}