/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_quota.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/gsb_crc32.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/fcntl.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>

#include <geom/geom.h>
#include <geom/geom_vfs.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#define KERNCRED thread0.td_ucred

#include "opt_ffs.h"

#ifdef NO_FFS_SNAPSHOT
int
ffs_snapshot(struct mount *mp, char *snapfile)
{
	return (EINVAL);
}

int
ffs_snapblkfree(struct fs *fs,
	struct vnode *devvp,
	ufs2_daddr_t bno,
	long size,
	ino_t inum,
	__enum_uint8(vtype) vtype,
	struct workhead *wkhd)
{
	return (EINVAL);
}

void
ffs_snapremove(struct vnode *vp)
{
}

void
ffs_snapshot_mount(struct mount *mp)
{
}

void
ffs_snapshot_unmount(struct mount *mp)
{
}

void
ffs_snapgone(struct inode *ip)
{
}

int
ffs_copyonwrite(struct vnode *devvp, struct buf *bp)
{
	return (EINVAL);
}

void
ffs_sync_snap(struct mount *mp, int waitfor)
{
}

#else
FEATURE(ffs_snapshot, "FFS snapshot support");
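
/*
 * List of free snapdata structures, protected by snapfree_lock
 * (initialized below as the "snapdata free list" mutex).
 */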
LIST_HEAD(, snapdata) snapfree;
static struct mtx snapfree_lock;
MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF);

static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int, int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int, int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
static void try_free_snapdata(struct vnode *devvp);
static void revert_snaplock(struct vnode *, struct vnode *, struct snapdata *);
static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp);
static int ffs_bp_snapblk(struct vnode *, struct buf *);

/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the code normally only ensures
 * persistence for the filesystem metadata contained within a
 * snapshot. Setting the following flag allows this crash
 * persistence to be enabled for file contents.
 */
int dopersistence = 0;

#ifdef DIAGNOSTIC
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
static int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
    0, "");
#endif /* DIAGNOSTIC */
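
/*
 * Note: on a kernel built with DIAGNOSTIC, the knobs above are
 * exported under the debug sysctl tree, e.g.:
 *
 *	sysctl debug.dopersistence=1	# also persist file contents
 *	sysctl debug.snapdebug=1	# verbose snapshot diagnostics
 */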

/*
 * Create a snapshot file and initialize it for the filesystem.
 */
int
ffs_snapshot(struct mount *mp, char *snapfile)
{
	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
	int error, cg, snaploc;
	int i, size, len, loc;
	ufs2_daddr_t blockno;
	uint64_t flag;
	char saved_nice = 0;
#ifdef DIAGNOSTIC
	long redo = 0;
#endif
	long snaplistsize = 0;
	int32_t *lp;
	void *space;
	struct fs *copy_fs = NULL, *fs, *bpfs;
	struct thread *td = curthread;
	struct inode *ip, *xp;
	struct buf *bp, *nbp, *ibp;
	struct nameidata nd;
	struct mount *wrtmp;
	struct vattr vat;
	struct vnode *vp, *xvp, *mvp, *devvp;
	struct uio auio;
	struct iovec aiov;
	struct snapdata *sn;
	struct ufsmount *ump;
#ifdef DIAGNOSTIC
	struct timespec starttime = {0, 0}, endtime;
#endif

	ump = VFSTOUFS(mp);
	fs = ump->um_fs;
	sn = NULL;
	MNT_ILOCK(mp);
	flag = mp->mnt_flag;
	MNT_IUNLOCK(mp);
	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	UFS_LOCK(ump);
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	UFS_UNLOCK(ump);
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Create the snapshot file.
	 */
restart:
	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF | NOCACHE, UIO_SYSSPACE,
	    snapfile);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		vput(nd.ni_vp);
		error = EEXIST;
	}
	if (nd.ni_dvp->v_mount != mp)
		error = EXDEV;
	if (error) {
		NDFREE_PNBUF(&nd);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		return (error);
	}
	VATTR_NULL(&vat);
	vat.va_type = VREG;
	vat.va_mode = S_IRUSR;
	vat.va_vaflags |= VA_EXCLUSIVE;
	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
		wrtmp = NULL;
	if (wrtmp != mp)
		panic("ffs_snapshot: mount mismatch");
	vfs_rel(wrtmp);
	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
		NDFREE_PNBUF(&nd);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &wrtmp,
		    V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
	if (error) {
		VOP_VPUT_PAIR(nd.ni_dvp, NULL, true);
		NDFREE_PNBUF(&nd);
		vn_finished_write(wrtmp);
		if (error == ERELOOKUP)
			goto restart;
		return (error);
	}
	vp = nd.ni_vp;
	vref(nd.ni_dvp);
	VOP_VPUT_PAIR(nd.ni_dvp, &vp, false);
	if (VN_IS_DOOMED(vp)) {
		error = EBADF;
		goto out;
	}
	vnode_create_vobject(nd.ni_vp, fs->fs_size, td);
	vp->v_vflag |= VV_SYSTEM;
	ip = VTOI(vp);
	devvp = ITODEVVP(ip);
	/*
	 * Calculate the size of the filesystem then allocate the block
	 * immediately following the last block of the filesystem that
	 * will contain the snapshot list. This operation allows us to
	 * set the size of the snapshot.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)numblks),
	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
	if (error)
		goto out;
	bawrite(bp);
	ip->i_size = lblktosize(fs, (off_t)(numblks + 1));
	vnode_pager_setsize(vp, ip->i_size);
	DIP_SET(ip, i_size, ip->i_size);
	UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE);
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
		if (error)
			goto out;
		bawrite(ibp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
		if (cg % 10 == 0) {
			error = ffs_syncvnode(vp, MNT_WAIT, 0);
			/* vp possibly reclaimed if unlocked */
			if (error != 0)
				goto out;
		}
	}
	/*
	 * Change inode to snapshot type file. Before setting its block
	 * pointers to BLK_SNAP and BLK_NOCOPY in cgaccount, we have to
	 * set its type to SF_SNAPSHOT so that VOP_REMOVE will know that
	 * they need to be rolled back before attempting deletion.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	DIP_SET(ip, i_flags, ip->i_flags);
	UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	len = roundup2(howmany(fs->fs_ncg, NBBY), sizeof(uint64_t));
	space = malloc(len, M_DEVBUF, M_WAITOK | M_ZERO);
	UFS_LOCK(ump);
	fs->fs_active = space;
	UFS_UNLOCK(ump);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		error = cgaccount(cg, vp, nbp, 1);
		bawrite(nbp);
		if (cg % 10 == 0 && error == 0)
			error = ffs_syncvnode(vp, MNT_WAIT, 0);
		if (error)
			goto out;
	}
	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
		goto out;
	VOP_UNLOCK(vp);
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Rescind nice scheduling while running with the filesystem suspended.
	 */
	if (td->td_proc->p_nice > 0) {
		struct proc *p;

		p = td->td_proc;
		PROC_LOCK(p);
		saved_nice = p->p_nice;
		sched_nice(p, 0);
		PROC_UNLOCK(p);
	}
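	/*
	 * Note: vfs_write_suspend() can succeed and yet the suspension
	 * be lifted again before we get to look, so the loop below
	 * retries until this mount is actually observed with
	 * MNTK_SUSPENDED set.
	 */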
	/*
	 * Suspend operation on filesystem.
	 */
	for (;;) {
		vn_finished_write(wrtmp);
		if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) {
			vn_start_write(NULL, &wrtmp, V_WAIT);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			goto out;
		}
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (ip->i_effnlink == 0) {
		error = ENOENT;		/* Snapshot file unlinked */
		goto resumefs;
	}
#ifdef DIAGNOSTIC
	if (collectsnapstats)
		nanotime(&starttime);
#endif

	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
			continue;
#ifdef DIAGNOSTIC
		redo++;
#endif
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto resumefs;
		error = cgaccount(cg, vp, nbp, 2);
		bawrite(nbp);
		if (error)
			goto resumefs;
	}
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	copy_fs = malloc((uint64_t)fs->fs_bsize, M_UFSMNT, M_WAITOK);
	bcopy(fs, copy_fs, fs->fs_sbsize);
	copy_fs->fs_si = malloc(sizeof(struct fs_summary_info), M_UFSMNT,
	    M_ZERO | M_WAITOK);
	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
		copy_fs->fs_clean = 1;
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		bzero(&((char *)copy_fs)[fs->fs_sbsize],
		    size - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((uint64_t)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
		    len, KERNCRED, &bp)) != 0) {
			brelse(bp);
			goto resumefs;
		}
		bcopy(bp->b_data, space, (uint64_t)len);
		space = (char *)space + len;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
	}
	if (fs->fs_contigsumsize > 0) {
		copy_fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/*
	 * We must check for active files that have been unlinked
	 * (e.g., with a zero link count). We have to expunge all
	 * trace of these files from the snapshot so that they are
	 * not reclaimed prematurely by fsck or unnecessarily dumped.
	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
	 * spec_strategy about writing on a suspended filesystem.
	 * Note that we skip unlinked snapshot files as they will
	 * be handled separately below.
	 *
	 * We also calculate the size needed for the snapshot list.
	 * Initial number of entries is composed of:
	 * - one for each cylinder group map
	 * - one for each block used by superblock summary table
	 * - one for each snapshot inode block
	 * - one for the superblock
	 * - one for the snapshot list
	 * The direct block entries in the snapshot are always
	 * copied (see reason below). Note that the superblock and
	 * the first cylinder group will almost always be allocated
	 * in the direct blocks, but we add the slop for them in case
	 * they do not end up there. The snapshot list size may get
	 * expanded by one because of an update of an inode block for
	 * an unlinked but still open file when it is expunged.
	 *
	 * Because the direct block pointers are always copied, they
	 * are not added to the list. Instead ffs_copyonwrite()
	 * explicitly checks for them before checking the snapshot list.
	 */
	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + /* superblock */ 1 + /* snaplist */ 1;
	MNT_ILOCK(mp);
	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
	MNT_IUNLOCK(mp);
loop:
	MNT_VNODE_FOREACH_ALL(xvp, mp, mvp) {
		if ((xvp->v_usecount == 0 &&
		     (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) ||
		    xvp->v_type == VNON ||
		    IS_SNAPSHOT(VTOI(xvp))) {
			VI_UNLOCK(xvp);
			continue;
		}
		/*
		 * We can skip parent directory vnode because it must have
		 * this snapshot file in it.
		 */
		if (xvp == nd.ni_dvp) {
			VI_UNLOCK(xvp);
			continue;
		}
		vholdl(xvp);
		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) {
			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
			vdrop(xvp);
			goto loop;
		}
		VI_LOCK(xvp);
		if (xvp->v_usecount == 0 &&
		    (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) {
			VI_UNLOCK(xvp);
			VOP_UNLOCK(xvp);
			vdrop(xvp);
			continue;
		}
		VI_UNLOCK(xvp);
#ifdef DIAGNOSTIC
		if (snapdebug)
			vn_printf(xvp, "ffs_snapshot: busy vnode ");
#endif
		if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 &&
		    vat.va_nlink > 0) {
			VOP_UNLOCK(xvp);
			vdrop(xvp);
			continue;
		}
		xp = VTOI(xvp);
		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
			VOP_UNLOCK(xvp);
			vdrop(xvp);
			continue;
		}
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < UFS_NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len != 0 && len < fs->fs_bsize) {
				ffs_blkfree(ump, copy_fs, vp,
				    DIP(xp, i_db[loc]), len, xp->i_number,
				    xvp->v_type, NULL, SINGLETON_KEY);
				blkno = DIP(xp, i_db[loc]);
				DIP_SET(xp, i_db[loc], 0);
			}
		}
		snaplistsize += 1;
		if (I_IS_UFS1(xp))
			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
			    BLK_NOCOPY, 1);
		else
			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
			    BLK_NOCOPY, 1);
		if (blkno)
			DIP_SET(xp, i_db[loc], blkno);
		if (!error)
			error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
			    xp->i_mode, NULL);
		VOP_UNLOCK(xvp);
		vdrop(xvp);
		if (error) {
			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
			goto resumefs;
		}
	}
	/*
	 * Erase the journal file from the snapshot.
	 */
	if (fs->fs_flags & FS_SUJ) {
		error = softdep_journal_lookup(mp, &xvp);
		if (error)
			goto resumefs;
		xp = VTOI(xvp);
		if (I_IS_UFS1(xp))
			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
			    BLK_NOCOPY, 0);
		else
			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
			    BLK_NOCOPY, 0);
		vput(xvp);
	}
	/*
	 * Preallocate all the direct blocks in the snapshot inode so
	 * that we never have to write the inode itself to commit an
	 * update to the contents of the snapshot. Note that once
	 * created, the size of the snapshot will never change, so
	 * there will never be a need to write the inode except to
	 * update the non-integrity-critical time fields and
	 * allocated-block count.
	 */
	for (blockno = 0; blockno < UFS_NDADDR; blockno++) {
		if (DIP(ip, i_db[blockno]) != 0)
			continue;
		error = UFS_BALLOC(vp, lblktosize(fs, blockno),
		    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
		if (error)
			goto resumefs;
		error = readblock(vp, bp, blockno);
		bawrite(bp);
		if (error != 0)
			goto resumefs;
	}
	/*
	 * Acquire a lock on the snapdata structure, creating it if necessary.
	 */
	sn = ffs_snapdata_acquire(devvp);
	/*
	 * Change vnode to use shared snapshot lock instead of the original
	 * private lock.
	 */
	vp->v_vnlock = &sn->sn_lock;
	lockmgr(&vp->v_lock, LK_RELEASE, NULL);
	xp = TAILQ_FIRST(&sn->sn_head);
	/*
	 * If this is the first snapshot on this filesystem, then we need
	 * to allocate the space for the list of preallocated snapshot blocks.
	 * This list will be refined below, but this preliminary one will
	 * keep us out of deadlock until the full one is ready.
	 */
	if (xp == NULL) {
		snapblklist = malloc(snaplistsize * sizeof(daddr_t),
		    M_UFSMNT, M_WAITOK);
		blkp = &snapblklist[1];
		*blkp++ = lblkno(fs, fs->fs_sblockloc);
		blkno = fragstoblks(fs, fs->fs_csaddr);
		for (cg = 0; cg < fs->fs_ncg; cg++) {
			if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
				break;
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		}
		len = howmany(fs->fs_cssize, fs->fs_bsize);
		for (loc = 0; loc < len; loc++)
			*blkp++ = blkno + loc;
		for (; cg < fs->fs_ncg; cg++)
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		snapblklist[0] = blkp - snapblklist;
		VI_LOCK(devvp);
		if (sn->sn_blklist != NULL)
			panic("ffs_snapshot: non-empty list");
		sn->sn_blklist = snapblklist;
		sn->sn_listsize = blkp - snapblklist;
		VI_UNLOCK(devvp);
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	VI_LOCK(devvp);
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %ju already on list",
		    (uintmax_t)ip->i_number);
	TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
resumefs:
	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
	if (error != 0 && copy_fs != NULL) {
		free(copy_fs->fs_csp, M_UFSMNT);
		free(copy_fs->fs_si, M_UFSMNT);
		free(copy_fs, M_UFSMNT);
		copy_fs = NULL;
	}
	KASSERT(error != 0 || (sn != NULL && copy_fs != NULL),
	    ("missing snapshot setup parameters"));
	/*
	 * Resume operation on filesystem.
	 */
	vfs_write_resume(vp->v_mount, VR_START_WRITE | VR_NO_SUSPCLR);
#ifdef DIAGNOSTIC
	if (collectsnapstats && starttime.tv_sec > 0) {
		nanotime(&endtime);
		timespecsub(&endtime, &starttime, &endtime);
		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
	}
#endif
	if (copy_fs == NULL)
		goto out;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) {
		if (xp == ip)
			break;
		if (I_IS_UFS1(xp))
			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
			    BLK_SNAP, 0);
		else
			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
			    BLK_SNAP, 0);
		if (error == 0 && xp->i_effnlink == 0) {
			error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
			    xp->i_mode, NULL);
		}
		if (error) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	snapblklist = malloc(snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snapblklist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	if (I_IS_UFS1(ip))
		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1,
		    BLK_SNAP, 0);
	else
		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2,
		    BLK_SNAP, 0);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		free(snapblklist, M_UFSMNT);
		goto done;
	}
	if (snaplistsize < ip->i_snapblklist - snapblklist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snapblklist;
	snapblklist[0] = snaplistsize;
	ip->i_snapblklist = 0;
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)snapblklist;
	aiov.iov_len = snaplistsize * sizeof(daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset = lblktosize(fs, (off_t)numblks);
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		fs->fs_snapinum[snaploc] = 0;
		free(snapblklist, M_UFSMNT);
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
		if (error) {
			fs->fs_snapinum[snaploc] = 0;
			free(snapblklist, M_UFSMNT);
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
	    KERNCRED, &nbp);
	if (error) {
		brelse(nbp);
	} else {
		loc = blkoff(fs, fs->fs_sblockloc);
		copy_fs->fs_fmod = 0;
		bpfs = (struct fs *)&nbp->b_data[loc];
		bcopy((caddr_t)copy_fs, (caddr_t)bpfs, (uint64_t)fs->fs_sbsize);
		ffs_oldfscompat_write(bpfs, ump);
		bpfs->fs_ckhash = ffs_calc_sbhash(bpfs);
		bawrite(nbp);
	}
	/*
	 * As this is the newest list, it is the most inclusive, so
	 * should replace the previous list.
	 */
	VI_LOCK(devvp);
	space = sn->sn_blklist;
	sn->sn_blklist = snapblklist;
	sn->sn_listsize = snaplistsize;
	VI_UNLOCK(devvp);
	if (space != NULL)
		free(space, M_UFSMNT);
done:
	free(copy_fs->fs_csp, M_UFSMNT);
	free(copy_fs->fs_si, M_UFSMNT);
	free(copy_fs, M_UFSMNT);
	copy_fs = NULL;
out:
	if (saved_nice > 0) {
		struct proc *p;

		p = td->td_proc;
		PROC_LOCK(p);
		sched_nice(td->td_proc, saved_nice);
		PROC_UNLOCK(td->td_proc);
	}
	UFS_LOCK(ump);
	if (fs->fs_active != 0) {
		free(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	UFS_UNLOCK(ump);
	MNT_ILOCK(mp);
	mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
	MNT_IUNLOCK(mp);
	NDFREE_PNBUF(&nd);
	vrele(nd.ni_dvp);
	if (error == 0) {
		(void) ffs_syncvnode(vp, MNT_WAIT, 0);
		VOP_UNLOCK(vp);
	} else if (VN_IS_DOOMED(vp)) {
		vput(vp);
	} else {
		int rmerr;

		/* Remove snapshot as its creation has failed. */
		vput(vp);
		NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE,
		    snapfile);
		if ((rmerr = namei(&nd)) != 0 ||
		    (rmerr = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd)) != 0)
			printf("Delete of %s failed with error %d\n",
			    nd.ni_dirp, rmerr);
		NDFREE_PNBUF(&nd);
		if (nd.ni_dvp != NULL)
			vput(nd.ni_dvp);
		if (nd.ni_vp != NULL)
			vput(nd.ni_vp);
	}
	vn_finished_write(wrtmp);
	process_deferred_inactive(mp);
	return (error);
}
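
/*
 * The fs_active bitmap allocated in ffs_snapshot() records which
 * cylinder groups already have a current copy in the snapshot:
 * cgaccount() sets a group's bit via ACTIVESET() when its map is
 * copied, and allocation activity in a group clears the bit again,
 * so the pass-2 loop above recopies only the groups whose bit is
 * clear.
 */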

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
static int
cgaccount(int cg,
	struct vnode *vp,
	struct buf *nbp,
	int passno)
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	if ((error = ffs_getcg(fs, ITODEVVP(ip), cg, 0, &bp, &cgp)) != 0)
		return (error);
	UFS_LOCK(ITOUMP(ip));
	ACTIVESET(fs, cg);
	/*
	 * Recomputation of summary information might not have been performed
	 * at mount time. Sync up summary information for current cylinder
	 * group while data is in memory to ensure that result of background
	 * fsck is slightly more consistent.
	 */
	fs->fs_cs(fs, cg) = cgp->cg_cs;
	UFS_UNLOCK(ITOUMP(ip));
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	cgp = (struct cg *)nbp->b_data;
	bqrelse(bp);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cgbase(fs, cg) / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < UFS_NDADDR) {
		for ( ; loc < UFS_NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP_SET(ip, i_db[loc], BLK_NOCOPY);
			else if (passno == 2 &&
			    DIP(ip, i_db[loc]) == BLK_NOCOPY)
				DIP_SET(ip, i_db[loc], 0);
			else if (passno == 1 &&
			    DIP(ip, i_db[loc]) == BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error) {
		goto out;
	}
	indiroff = (base + loc - UFS_NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error) {
				goto out;
			}
			indiroff = 0;
		}
		if (I_IS_UFS1(ip)) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
out:
	/*
	 * We have to calculate the crc32c here rather than just setting the
	 * BX_CYLGRP b_xflags because the allocation of the block for the
	 * cylinder group map will always be a full size block (fs_bsize)
	 * even though the cylinder group may be smaller (fs_cgsize). The
	 * crc32c must be computed only over fs_cgsize whereas the BX_CYLGRP
	 * flag causes it to be computed over the size of the buffer.
	 */
	if ((fs->fs_metackhash & CK_CYLGRP) != 0) {
		((struct cg *)nbp->b_data)->cg_ckhash = 0;
		((struct cg *)nbp->b_data)->cg_ckhash =
		    calculate_crc32c(~0L, nbp->b_data, fs->fs_cgsize);
	}
	return (error);
}
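
/*
 * Two distinguished block pointer values appear inside snapshot
 * inodes: BLK_NOCOPY marks a block that was free when the snapshot
 * was taken and therefore never needs to be copied, while BLK_SNAP
 * marks a block claimed by another snapshot.
 */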

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs1(struct vnode *snapvp,
	struct inode *cancelip,
	struct fs *fs,
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int),
	int expungetype,
	int clearmode)
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < UFS_NDADDR) {
		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
	} else {
		if (DOINGSOFTDEP(snapvp))
			softdep_prealloc(snapvp, MNT_WAIT);
		td->td_pflags |= TDP_COWINPROGRESS;
		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(snapvp, bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * or unlinked snapshots to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (clearmode || cancelip->i_effnlink == 0)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(dip->di_db, UFS_NDADDR * sizeof(ufs1_daddr_t));
	bzero(dip->di_ib, UFS_NIADDR * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
	    &cancelip->i_din1->di_db[UFS_NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
	    &cancelip->i_din1->di_ib[UFS_NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -UFS_NDADDR;
	len = numblks - UFS_NDADDR;
	rlbn = UFS_NDADDR;
	for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}
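
/*
 * UFS addresses indirect blocks with negative logical block numbers,
 * which is why the lbn cursor in the loop above starts at -UFS_NDADDR
 * and moves further negative by blksperindir + 1 at each level.
 */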

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs1(struct vnode *snapvp,
	struct vnode *cancelvp,
	int level,
	ufs1_daddr_t blkno,
	ufs_lbn_t lbn,
	ufs_lbn_t rlbn,
	ufs_lbn_t remblks,
	ufs_lbn_t blksperindir,
	struct fs *fs,
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int),
	int expungetype)
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[UFS_NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs1: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs1: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	free(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(struct vnode *vp,
	ufs1_daddr_t *oldblkp,
	ufs1_daddr_t *lastblkp,
	struct fs *fs,
	ufs_lbn_t lblkno,
	int exptype)	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs1(struct vnode *vp,
	ufs1_daddr_t *oldblkp,
	ufs1_daddr_t *lastblkp,
	struct fs *fs,
	ufs_lbn_t lblkno,
	int expungetype)	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < UFS_NDADDR) {
			blkp = &ip->i_din1->di_db[lbn];
			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
		} else {
			error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - UFS_NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= UFS_NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct_ufs1: bad block");
			*blkp = expungetype;
			if (lbn >= UFS_NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs1(struct vnode *vp,
	ufs1_daddr_t *oldblkp,
	ufs1_daddr_t *lastblkp,
	struct fs *fs,
	ufs_lbn_t lblkno,
	int expungetype)
{
	ufs1_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
		    vp->v_type, NULL, SINGLETON_KEY);
	}
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs2(struct vnode *snapvp,
	struct inode *cancelip,
	struct fs *fs,
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int),
	int expungetype,
	int clearmode)
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < UFS_NDADDR) {
		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
	} else {
		if (DOINGSOFTDEP(snapvp))
			softdep_prealloc(snapvp, MNT_WAIT);
		td->td_pflags |= TDP_COWINPROGRESS;
		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(snapvp, bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(dip->di_db, UFS_NDADDR * sizeof(ufs2_daddr_t));
	bzero(dip->di_ib, UFS_NIADDR * sizeof(ufs2_daddr_t));
	if (clearmode || cancelip->i_effnlink == 0)
		dip->di_mode = 0;
	else
		ffs_update_dinode_ckhash(fs, dip);
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
	    &cancelip->i_din2->di_db[UFS_NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
	    &cancelip->i_din2->di_ib[UFS_NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -UFS_NDADDR;
	len = numblks - UFS_NDADDR;
	rlbn = UFS_NDADDR;
	for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs2(struct vnode *snapvp,
	struct vnode *cancelvp,
	int level,
	ufs2_daddr_t blkno,
	ufs_lbn_t lbn,
	ufs_lbn_t rlbn,
	ufs_lbn_t remblks,
	ufs_lbn_t blksperindir,
	struct fs *fs,
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int),
	int expungetype)
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[UFS_NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs2: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs2: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & B_CACHE) == 0 &&
	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	free(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs2(struct vnode *vp,
	ufs2_daddr_t *oldblkp,
	ufs2_daddr_t *lastblkp,
	struct fs *fs,
	ufs_lbn_t lblkno,
	int exptype)	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs2(struct vnode *vp,
	ufs2_daddr_t *oldblkp,
	ufs2_daddr_t *lastblkp,
	struct fs *fs,
	ufs_lbn_t lblkno,
	int expungetype)	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < UFS_NDADDR) {
			blkp = &ip->i_din2->di_db[lbn];
			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
		} else {
			error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - UFS_NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= UFS_NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct_ufs2: bad block");
			*blkp = expungetype;
			if (lbn >= UFS_NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs2(struct vnode *vp,
	ufs2_daddr_t *oldblkp,
	ufs2_daddr_t *lastblkp,
	struct fs *fs,
	ufs_lbn_t lblkno,
	int expungetype)
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP &&
		    lblkno >= UFS_NDADDR)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
		    vp->v_type, NULL, SINGLETON_KEY);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(struct inode *ip)
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;
	struct snapdata *sn;
	struct ufsmount *ump;

	/*
	 * Find snapshot in incore list.
	 */
	xp = NULL;
	sn = ITODEVVP(ip)->v_rdev->si_snapdata;
	if (sn != NULL)
		TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
			if (xp == ip)
				break;
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DIAGNOSTIC
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %ju\n",
		    (uintmax_t)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	ump = ITOUMP(ip);
	fs = ump->um_fs;
	UFS_LOCK(ump);
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	UFS_UNLOCK(ump);
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(struct vnode *vp)
{
	struct inode *ip;
	struct vnode *devvp;
	struct buf *ibp;
	struct fs *fs;
	ufs2_daddr_t numblks, blkno, dblk;
	int error, last, loc;
	struct snapdata *sn;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	devvp = ITODEVVP(ip);
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	VI_LOCK(devvp);
	if (ip->i_nextsnap.tqe_prev != 0) {
		sn = devvp->v_rdev->si_snapdata;
		TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		revert_snaplock(vp, devvp, sn);
		try_free_snapdata(devvp);
	}
	VI_UNLOCK(devvp);
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == 0)
			continue;
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP_SET(ip, i_db[blkno], 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		    ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL))) {
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
			    btodb(fs->fs_bsize));
			DIP_SET(ip, i_db[blkno], 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (I_IS_UFS1(ip)) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == 0)
					continue;
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc] = 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				    ffs_snapblkfree(fs, ITODEVVP(ip), dblk,
				    fs->fs_bsize, ip->i_number, vp->v_type,
				    NULL))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc] = 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == 0)
				continue;
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ITODEVVP(ip), dblk,
			    fs->fs_bsize, ip->i_number, vp->v_type, NULL))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_SET(ip, i_flags, ip->i_flags);
	UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
	/*
	 * The dirtied indirects must be written out before
	 * softdep_setup_freeblocks() is called. Otherwise indir_trunc()
	 * may find indirect pointers using the magic BLK_* values.
	 */
	if (DOINGSOFTDEP(vp))
		ffs_syncvnode(vp, MNT_WAIT, 0);
#ifdef QUOTA
	/*
	 * Reenable disk quotas for ex-snapshot file.
	 */
	if (!getinoquota(ip))
		(void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
#endif
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(struct fs *fs,
	struct vnode *devvp,
	ufs2_daddr_t bno,
	long size,
	ino_t inum,
	__enum_uint8(vtype) vtype,
	struct workhead *wkhd)
{
	struct buf *ibp, *cbp, *savedcbp = NULL;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int indiroff = 0, error = 0, claimedblk = 0;
	struct snapdata *sn;

	lbn = fragstoblks(fs, bno);
retry:
	VI_LOCK(devvp);
	sn = devvp->v_rdev->si_snapdata;
	if (sn == NULL) {
		VI_UNLOCK(devvp);
		return (0);
	}

	/*
	 * Use LK_SLEEPFAIL because sn might be freed under us while
	 * both devvp interlock and snaplk are not owned.
	 */
	if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
	    VI_MTX(devvp)) != 0)
		goto retry;

	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
		vp = ITOV(ip);
		if (DOINGSOFTDEP(vp))
			softdep_prealloc(vp, MNT_WAIT);
		/*
		 * Lookup block being written.
		 */
		if (lbn < UFS_NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
			if (I_IS_UFS1(ip))
				blkno = ((ufs1_daddr_t *)
				    (ibp->b_data))[indiroff];
			else
				blkno = ((ufs2_daddr_t *)
				    (ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed.  If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (lbn < UFS_NDADDR) {
				DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
				UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
			} else if (I_IS_UFS1(ip)) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= UFS_NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode.  Otherwise we
		 * will proceed to copy it.  See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DIAGNOSTIC
			if (snapdebug)
				printf("%s %ju lbn %jd from inum %ju\n",
				    "Grabonremove: snapino",
				    (uintmax_t)ip->i_number,
				    (intmax_t)lbn, (uintmax_t)inum);
#endif
			/*
			 * If journaling is tracking this write we must add
			 * the work to the inode or indirect being written.
			 */
			if (wkhd != NULL) {
				if (lbn < UFS_NDADDR)
					softdep_inode_append(ip,
					    curthread->td_ucred, wkhd);
				else
					softdep_buf_append(ibp, wkhd);
			}
			if (lbn < UFS_NDADDR) {
				DIP_SET(ip, i_db[lbn], bno);
			} else if (I_IS_UFS1(ip)) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
			lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
			return (1);
		}
		if (lbn >= UFS_NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy.  Note
		 * that this allocation will never require any additional
		 * allocations for the snapshot inode.
		 */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DIAGNOSTIC
		if (snapdebug)
			printf("%s%ju lbn %jd %s %ju size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", (uintmax_t)ip->i_number,
			    (intmax_t)lbn, "for inum", (uintmax_t)inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block.  Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.  At a minimum we ensure the
		 * integrity of the filesystem metadata, but use the
		 * dopersistence sysctl-settable flag to decide on the
		 * persistence needed for file content data.
		 */
		if (savedcbp != NULL) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if ((vtype == VDIR || dopersistence) &&
			    ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(vp, cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if ((vtype == VDIR || dopersistence) &&
			    ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.  At a minimum we
	 * ensure the integrity of the filesystem metadata, but
	 * use the dopersistence sysctl-settable flag to decide on
	 * the persistence needed for file content data.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if ((vtype == VDIR || dopersistence) &&
		    VTOI(vp)->i_effnlink > 0)
			(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed.  Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (error != 0 && wkhd != NULL)
		softdep_freework(wkhd);
	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(struct mount *mp)
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snapdata *sn;
	struct vnode *vp;
	struct vnode *lastvp;
	struct inode *ip;
	struct uio auio;
	struct iovec aiov;
	void *snapblklist;
	char *reason;
	daddr_t snaplistsize;
	int error, snaploc, loc;

	/*
	 * XXX The following needs to be set before ffs_truncate or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	lastvp = NULL;
	sn = NULL;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if (vp->v_type != VREG) {
			reason = "non-file snapshot";
		} else if (!IS_SNAPSHOT(ip)) {
			reason = "non-snapshot";
		} else if (ip->i_size ==
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
			reason = "old format snapshot";
			(void)ffs_truncate(vp, (off_t)0, 0, NOCRED);
			(void)ffs_syncvnode(vp, MNT_WAIT, 0);
		} else {
			reason = NULL;
		}
		if (reason != NULL) {
			printf("ffs_snapshot_mount: %s inode %d\n",
			    reason, fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		/*
		 * Acquire a lock on the snapdata structure, creating it if
		 * necessary.
		 */
		sn = ffs_snapdata_acquire(devvp);
		/*
		 * Change vnode to use shared snapshot lock instead of the
		 * original private lock.
		 */
		vp->v_vnlock = &sn->sn_lock;
		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %ju already on list",
			    (uintmax_t)ip->i_number);
		else
			TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp);
		lastvp = vp;
	}
	vp = lastvp;
	/*
	 * No usable snapshots found.
	 */
	if (sn == NULL || vp == NULL)
		return;
	/*
	 * Allocate the space for the block hints list.  We always want to
	 * use the list from the newest snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)&snaplistsize;
	aiov.iov_len = sizeof(snaplistsize);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset =
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp);
		return;
	}
	snapblklist = malloc(snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	auio.uio_iovcnt = 1;
	aiov.iov_base = snapblklist;
	aiov.iov_len = snaplistsize * sizeof(daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset -= sizeof(snaplistsize);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp);
		free(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp);
	VI_LOCK(devvp);
	sn->sn_listsize = snaplistsize;
	sn->sn_blklist = (daddr_t *)snapblklist;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(struct mount *mp)
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct snapdata *sn;
	struct inode *xp;
	struct vnode *vp;

	VI_LOCK(devvp);
	sn = devvp->v_rdev->si_snapdata;
	while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
		vp = ITOV(xp);
		TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp));
		VI_LOCK(devvp);
		revert_snaplock(vp, devvp, sn);
		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
		if (xp->i_effnlink > 0) {
			VI_UNLOCK(devvp);
			vrele(vp);
			VI_LOCK(devvp);
		}
		sn = devvp->v_rdev->si_snapdata;
	}
	try_free_snapdata(devvp);
	VI_UNLOCK(devvp);
}
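
/*
 * A note on the block hints list searched below: the list is written by
 * the snapshot creation code (outside this excerpt), which is assumed to
 * store the element count in entry 0 and the sorted logical block
 * numbers in entries 1 through sn_listsize - 1.  That layout is why both
 * binary searches in this file run over [1, sn_listsize - 1], e.g.:
 *
 *	lower = 1;			// entry 0 is the count, skip it
 *	upper = sn->sn_listsize - 1;	// last valid entry
 */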

/*
 * Check whether the buffer block belongs to a device buffer that must
 * be locked after snaplk.  devvp shall be locked on entry and is left
 * locked upon exit.
 */
static int
ffs_bp_snapblk(struct vnode *devvp, struct buf *bp)
{
	struct snapdata *sn;
	struct fs *fs;
	ufs2_daddr_t lbn, *snapblklist;
	int lower, upper, mid;

	ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
	KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));
	sn = devvp->v_rdev->si_snapdata;
	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL)
		return (0);
	fs = ITOFS(TAILQ_FIRST(&sn->sn_head));
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = sn->sn_blklist;
	upper = sn->sn_listsize - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper)
		return (1);
	return (0);
}

void
ffs_bdflush(struct bufobj *bo, struct buf *bp)
{
	struct thread *td;
	struct vnode *vp, *devvp;
	struct buf *nbp;
	int bp_bdskip;

	if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
		return;

	td = curthread;
	vp = bp->b_vp;
	devvp = bo2vnode(bo);
	KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));

	VI_LOCK(devvp);
	bp_bdskip = ffs_bp_snapblk(devvp, bp);
	if (bp_bdskip)
		bdwriteskip++;
	VI_UNLOCK(devvp);
	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
		(void) VOP_FSYNC(vp, MNT_NOWAIT, td);
		altbufferflushes++;
	} else {
		BO_LOCK(bo);
		/*
		 * Try to find a buffer to flush.
		 */
		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
			    BUF_LOCK(nbp,
			    LK_EXCLUSIVE | LK_NOWAIT, NULL))
				continue;
			if (bp == nbp)
				panic("bdwrite: found ourselves");
			BO_UNLOCK(bo);
			/*
			 * Don't call buf_countdeps() with the bo lock
			 * held.
			 */
			if (buf_countdeps(nbp, 0)) {
				BO_LOCK(bo);
				BUF_UNLOCK(nbp);
				continue;
			}
			if (bp_bdskip) {
				VI_LOCK(devvp);
				if (!ffs_bp_snapblk(vp, nbp)) {
					VI_UNLOCK(devvp);
					BO_LOCK(bo);
					BUF_UNLOCK(nbp);
					continue;
				}
				VI_UNLOCK(devvp);
			}
			if (nbp->b_flags & B_CLUSTEROK) {
				vfs_bio_awrite(nbp);
			} else {
				bremfree(nbp);
				bawrite(nbp);
			}
			dirtybufferflushes++;
			break;
		}
		if (nbp == NULL)
			BO_UNLOCK(bo);
	}
}
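
/*
 * A hypothetical caller's view of the copy-on-write hook below (a sketch
 * only; the real call site is in the FFS write path outside this
 * excerpt).  Before a write to a device vnode flagged VV_COPYONWRITE is
 * issued, the hook runs so that every snapshot gets a pre-image of the
 * block about to be overwritten:
 *
 *	if ((devvp->v_vflag & VV_COPYONWRITE) != 0 &&
 *	    bp->b_iocmd == BIO_WRITE &&
 *	    (error = ffs_copyonwrite(devvp, bp)) != 0)
 *		fail the write;		// pre-image copy could not be made
 *	... otherwise issue the write of bp ...
 */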

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
int
ffs_copyonwrite(struct vnode *devvp, struct buf *bp)
{
	struct snapdata *sn;
	struct buf *ibp, *cbp, *savedcbp = NULL;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp = NULL;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, indiroff, error = 0;
	int launched_async_io, prev_norunningbuf;
	long saved_runningbufspace;

	if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp)))
		return (0);		/* Update on a snapshot file */
	if (td->td_pflags & TDP_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	VI_LOCK(devvp);
	sn = devvp->v_rdev->si_snapdata;
	if (sn == NULL ||
	    TAILQ_EMPTY(&sn->sn_head)) {
		VI_UNLOCK(devvp);
		return (0);		/* No snapshot */
	}
	ip = TAILQ_FIRST(&sn->sn_head);
	fs = ITOFS(ip);
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	if (lbn < UFS_NDADDR) {
		VI_UNLOCK(devvp);
		return (0);		/* Direct blocks are always copied */
	}
	snapblklist = sn->sn_blklist;
	upper = sn->sn_listsize - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		VI_UNLOCK(devvp);
		return (0);
	}
	launched_async_io = 0;
	prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF;
	/*
	 * Since I/O on bp isn't yet in progress and it may be blocked
	 * for a long time waiting on snaplk, back it out of
	 * runningbufspace, possibly waking other threads waiting for space.
	 */
	saved_runningbufspace = bp->b_runningbufspace;
	if (saved_runningbufspace != 0)
		runningbufwakeup(bp);
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
	while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
	    VI_MTX(devvp)) != 0) {
		VI_LOCK(devvp);
		sn = devvp->v_rdev->si_snapdata;
		if (sn == NULL ||
		    TAILQ_EMPTY(&sn->sn_head)) {
			VI_UNLOCK(devvp);
			if (saved_runningbufspace != 0) {
				bp->b_runningbufspace = saved_runningbufspace;
				atomic_add_long(&runningbufspace,
				    bp->b_runningbufspace);
			}
			return (0);		/* Snapshot gone */
		}
	}
	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
		vp = ITOV(ip);
		if (DOINGSOFTDEP(vp))
			softdep_prealloc(vp, MNT_WAIT);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called.  Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied.  We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < UFS_NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
			if (I_IS_UFS1(ip))
				blkno = ((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno = ((ufs2_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef INVARIANTS
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy.  Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DIAGNOSTIC
		if (snapdebug) {
			printf("Copyonwrite: snapino %ju lbn %jd for ",
			    (uintmax_t)ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %ju",
				    (uintmax_t)VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd to blkno %jd\n",
			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block.  Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.  At a minimum we ensure the
		 * integrity of the filesystem metadata, but use the
		 * dopersistence sysctl-settable flag to decide on the
		 * persistence needed for file content data.
		 */
		if (savedcbp != NULL) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
			    dopersistence) && ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
			else
				launched_async_io = 1;
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(vp, cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
			    dopersistence) && ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
			else
				launched_async_io = 1;
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.  At a minimum we
	 * ensure the integrity of the filesystem metadata, but
	 * use the dopersistence sysctl-settable flag to decide on
	 * the persistence needed for file content data.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
		    dopersistence) && VTOI(vp)->i_effnlink > 0)
			(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
		else
			launched_async_io = 1;
	}
	lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
	td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) |
	    prev_norunningbuf;
	if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0)
		waitrunningbufspace();
	/*
	 * I/O on bp will now be started, so count it in runningbufspace.
	 */
	if (saved_runningbufspace != 0) {
		bp->b_runningbufspace = saved_runningbufspace;
		atomic_add_long(&runningbufspace, bp->b_runningbufspace);
	}
	return (error);
}
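
/*
 * For example, on a kernel built with DIAGNOSTIC the persistence policy
 * referred to above can be widened at run time to cover file contents
 * as well as metadata, at the cost of a synchronous write per copied
 * block:
 *
 *	sysctl debug.dopersistence=1
 */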

/*
 * Sync snapshots to force freework records waiting on snapshots to claim
 * blocks to free.
 */
void
ffs_sync_snap(struct mount *mp, int waitfor)
{
	struct snapdata *sn;
	struct vnode *devvp;
	struct vnode *vp;
	struct inode *ip;

	devvp = VFSTOUFS(mp)->um_devvp;
	if ((devvp->v_vflag & VV_COPYONWRITE) == 0)
		return;
	for (;;) {
		VI_LOCK(devvp);
		sn = devvp->v_rdev->si_snapdata;
		if (sn == NULL) {
			VI_UNLOCK(devvp);
			return;
		}
		if (lockmgr(&sn->sn_lock,
		    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
		    VI_MTX(devvp)) == 0)
			break;
	}
	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
		vp = ITOV(ip);
		ffs_syncvnode(vp, waitfor, NO_INO_UPDT);
	}
	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boilerplate comes from bwrite().
 */
static int
readblock(struct vnode *vp,
	struct buf *bp,
	ufs2_daddr_t lbn)
{
	struct inode *ip;
	struct fs *fs;

	ip = VTOI(vp);
	fs = ITOFS(ip);

	bp->b_iocmd = BIO_READ;
	bp->b_iooffset = dbtob(fsbtodb(fs, blkstofrags(fs, lbn)));
	bp->b_iodone = bdone;
	g_vfs_strategy(&ITODEVVP(ip)->v_bufobj, bp);
	bufwait(bp);
	return (bp->b_error);
}

#endif

/*
 * Process file deletes that were deferred by ufs_inactive() due to
 * the file system being suspended.  Transfer IN_LAZYACCESS into
 * IN_MODIFIED for vnodes that were accessed during suspension.
 */
void
process_deferred_inactive(struct mount *mp)
{
	struct vnode *vp, *mvp;
	struct inode *ip;
	int error;

	(void) vn_start_secondary_write(NULL, &mp, V_WAIT);
loop:
	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		/*
		 * IN_LAZYACCESS is checked here without holding any
		 * vnode lock, but this flag is set only while holding
		 * vnode interlock.
		 */
		if (vp->v_type == VNON ||
		    ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 &&
		    ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) {
			VI_UNLOCK(vp);
			continue;
		}
		vholdl(vp);
retry_vnode:
		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
		if (error != 0) {
			vdrop(vp);
			if (error == ENOENT)
				continue;	/* vnode recycled */
			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
			goto loop;
		}
		ip = VTOI(vp);
		if ((ip->i_flag & IN_LAZYACCESS) != 0) {
			ip->i_flag &= ~IN_LAZYACCESS;
			UFS_INODE_SET_FLAG(ip, IN_MODIFIED);
		}
		VI_LOCK(vp);
		error = vinactive(vp);
		if (error == ERELOOKUP && vp->v_usecount == 0) {
			VI_UNLOCK(vp);
			VOP_UNLOCK(vp);
			goto retry_vnode;
		}
		VI_UNLOCK(vp);
		VOP_UNLOCK(vp);
		vdrop(vp);
	}
	vn_finished_secondary_write(mp);
}

#ifndef NO_FFS_SNAPSHOT
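
/*
 * The allocator below and ffs_snapdata_free() together implement a
 * type-stable cache: snapdata structures are only ever parked on the
 * mutex-protected snapfree list, never returned to malloc(9), so a
 * thread still sleeping on a snap lock may safely wake after the
 * structure is "freed".  The lifecycle, in sketch form:
 *
 *	sn = ffs_snapdata_alloc();	// pop from snapfree, else malloc
 *	... publish as si_snapdata and use ...
 *	ffs_snapdata_free(sn);		// park on snapfree; never free(9)
 */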

static struct snapdata *
ffs_snapdata_alloc(void)
{
	struct snapdata *sn;

	/*
	 * Fetch a snapdata from the free list if there is one available.
	 */
	mtx_lock(&snapfree_lock);
	sn = LIST_FIRST(&snapfree);
	if (sn != NULL)
		LIST_REMOVE(sn, sn_link);
	mtx_unlock(&snapfree_lock);
	if (sn != NULL)
		return (sn);
	/*
	 * If there were no free snapdatas, allocate one.
	 */
	sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
	TAILQ_INIT(&sn->sn_head);
	lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
	    LK_CANRECURSE | LK_NOSHARE);
	return (sn);
}

/*
 * The snapdata is never truly freed because we cannot be certain that
 * there are no threads sleeping on the snap lock.  Persisting
 * them permanently avoids costly synchronization in ffs_lock().
 */
static void
ffs_snapdata_free(struct snapdata *sn)
{
	mtx_lock(&snapfree_lock);
	LIST_INSERT_HEAD(&snapfree, sn, sn_link);
	mtx_unlock(&snapfree_lock);
}

/* Try to free snapdata associated with devvp */
static void
try_free_snapdata(struct vnode *devvp)
{
	struct snapdata *sn;
	ufs2_daddr_t *snapblklist;

	ASSERT_VI_LOCKED(devvp, "try_free_snapdata");
	sn = devvp->v_rdev->si_snapdata;

	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL ||
	    (devvp->v_vflag & VV_COPYONWRITE) == 0)
		return;

	devvp->v_rdev->si_snapdata = NULL;
	devvp->v_vflag &= ~VV_COPYONWRITE;
	lockmgr(&sn->sn_lock, LK_DRAIN | LK_INTERLOCK, VI_MTX(devvp));
	snapblklist = sn->sn_blklist;
	sn->sn_blklist = NULL;
	sn->sn_listsize = 0;
	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
	if (snapblklist != NULL)
		free(snapblklist, M_UFSMNT);
	ffs_snapdata_free(sn);
	VI_LOCK(devvp);
}

/*
 * Revert a vnode lock from using the snapshot lock back to its own lock.
 *
 * Acquire a lock on the vnode's own lock and release the lock on the
 * snapshot lock.  If there are any recursions on the snapshot lock,
 * get the same number of recursions on the vnode's own lock.
 */
static void
revert_snaplock(struct vnode *vp,
	struct vnode *devvp,
	struct snapdata *sn)
{
	int i;

	ASSERT_VI_LOCKED(devvp, "revert_snaplock");
	/*
	 * Avoid LOR with snapshot lock.  The LK_NOWAIT should
	 * never fail as the lock is currently unused.  Rather than
	 * panic, we recover by doing the blocking lock.
	 */
	for (i = 0; i <= sn->sn_lock.lk_recurse; i++) {
		if (lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT |
		    LK_INTERLOCK, VI_MTX(devvp)) != 0) {
			printf("revert_snaplock: Unexpected LK_NOWAIT "
			    "failure\n");
			lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_INTERLOCK,
			    VI_MTX(devvp));
		}
		VI_LOCK(devvp);
	}
	KASSERT(vp->v_vnlock == &sn->sn_lock,
	    ("revert_snaplock: lost lock mutation"));
	vp->v_vnlock = &vp->v_lock;
	while (sn->sn_lock.lk_recurse > 0)
		lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
}

static struct snapdata *
ffs_snapdata_acquire(struct vnode *devvp)
{
	struct snapdata *nsn, *sn;
	int error;

	/*
	 * Allocate a free snapdata.  This is done before acquiring the
	 * devvp lock to avoid allocation while the devvp interlock is
	 * held.
	 */
	nsn = ffs_snapdata_alloc();

	for (;;) {
		VI_LOCK(devvp);
		sn = devvp->v_rdev->si_snapdata;
		if (sn == NULL) {
			/*
			 * This is the first snapshot on this
			 * filesystem and we use our pre-allocated
			 * snapdata.  Publish sn with the sn_lock
			 * owned by us, to avoid the race.
			 */
			error = lockmgr(&nsn->sn_lock, LK_EXCLUSIVE |
			    LK_NOWAIT, NULL);
			if (error != 0)
				panic("leaked sn, lockmgr error %d", error);
			sn = devvp->v_rdev->si_snapdata = nsn;
			VI_UNLOCK(devvp);
			nsn = NULL;
			break;
		}

		/*
		 * A snapshot already exists on this filesystem,
		 * so grab a reference to its common lock.
		 */
		error = lockmgr(&sn->sn_lock, LK_INTERLOCK |
		    LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp));
		if (error == 0)
			break;
	}

	/*
	 * Free any unused snapdata.
	 */
	if (nsn != NULL)
		ffs_snapdata_free(nsn);

	return (sn);
}

#endif
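
/*
 * Userland usage note (a sketch, not an interface of this file): the
 * snapshots serviced by the code above are normally created either with
 * mksnap_ffs(8) or through an update mount, e.g.:
 *
 *	mount -u -o snapshot /var/snapshot/snap1 /var
 *
 * after which the snapshot is an ordinary, unchanging file that can be
 * attached with mdconfig(8) and mounted read-only for inspection.
 */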