1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 5 * 6 * Further information about snapshots can be obtained from: 7 * 8 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 9 * 1614 Oxford Street mckusick@mckusick.com 10 * Berkeley, CA 94709-1608 +1-510-843-9542 11 * USA 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 24 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 25 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 27 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 
34 * 35 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 36 */ 37 38 #include <sys/cdefs.h> 39 __FBSDID("$FreeBSD$"); 40 41 #include "opt_quota.h" 42 43 #include <sys/param.h> 44 #include <sys/kernel.h> 45 #include <sys/systm.h> 46 #include <sys/conf.h> 47 #include <sys/gsb_crc32.h> 48 #include <sys/bio.h> 49 #include <sys/buf.h> 50 #include <sys/fcntl.h> 51 #include <sys/proc.h> 52 #include <sys/namei.h> 53 #include <sys/sched.h> 54 #include <sys/stat.h> 55 #include <sys/malloc.h> 56 #include <sys/mount.h> 57 #include <sys/resource.h> 58 #include <sys/resourcevar.h> 59 #include <sys/rwlock.h> 60 #include <sys/vnode.h> 61 62 #include <geom/geom.h> 63 64 #include <ufs/ufs/extattr.h> 65 #include <ufs/ufs/quota.h> 66 #include <ufs/ufs/ufsmount.h> 67 #include <ufs/ufs/inode.h> 68 #include <ufs/ufs/ufs_extern.h> 69 70 #include <ufs/ffs/fs.h> 71 #include <ufs/ffs/ffs_extern.h> 72 73 #define KERNCRED thread0.td_ucred 74 75 #include "opt_ffs.h" 76 77 #ifdef NO_FFS_SNAPSHOT 78 int 79 ffs_snapshot(mp, snapfile) 80 struct mount *mp; 81 char *snapfile; 82 { 83 return (EINVAL); 84 } 85 86 int 87 ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd) 88 struct fs *fs; 89 struct vnode *devvp; 90 ufs2_daddr_t bno; 91 long size; 92 ino_t inum; 93 enum vtype vtype; 94 struct workhead *wkhd; 95 { 96 return (EINVAL); 97 } 98 99 void 100 ffs_snapremove(vp) 101 struct vnode *vp; 102 { 103 } 104 105 void 106 ffs_snapshot_mount(mp) 107 struct mount *mp; 108 { 109 } 110 111 void 112 ffs_snapshot_unmount(mp) 113 struct mount *mp; 114 { 115 } 116 117 void 118 ffs_snapgone(ip) 119 struct inode *ip; 120 { 121 } 122 123 int 124 ffs_copyonwrite(devvp, bp) 125 struct vnode *devvp; 126 struct buf *bp; 127 { 128 return (EINVAL); 129 } 130 131 void 132 ffs_sync_snap(mp, waitfor) 133 struct mount *mp; 134 int waitfor; 135 { 136 } 137 138 #else 139 FEATURE(ffs_snapshot, "FFS snapshot support"); 140 141 LIST_HEAD(, snapdata) snapfree; 142 static struct mtx snapfree_lock; 143 MTX_SYSINIT(ffs_snapfree, 
&snapfree_lock, "snapdata free list", MTX_DEF); 144 145 static int cgaccount(int, struct vnode *, struct buf *, int); 146 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 147 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 148 ufs_lbn_t, int), int, int); 149 static int indiracct_ufs1(struct vnode *, struct vnode *, int, 150 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 151 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 152 ufs_lbn_t, int), int); 153 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 154 struct fs *, ufs_lbn_t, int); 155 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 156 struct fs *, ufs_lbn_t, int); 157 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 158 struct fs *, ufs_lbn_t, int); 159 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 160 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 161 ufs_lbn_t, int), int, int); 162 static int indiracct_ufs2(struct vnode *, struct vnode *, int, 163 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 164 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 165 ufs_lbn_t, int), int); 166 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 167 struct fs *, ufs_lbn_t, int); 168 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 169 struct fs *, ufs_lbn_t, int); 170 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 171 struct fs *, ufs_lbn_t, int); 172 static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t); 173 static void try_free_snapdata(struct vnode *devvp); 174 static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp); 175 static int ffs_bp_snapblk(struct vnode *, struct buf *); 176 177 /* 178 * To ensure the consistency of snapshots across crashes, we must 179 * synchronously write out copied 
blocks before allowing the 180 * originals to be modified. Because of the rather severe speed 181 * penalty that this imposes, the code normally only ensures 182 * persistence for the filesystem metadata contained within a 183 * snapshot. Setting the following flag allows this crash 184 * persistence to be enabled for file contents. 185 */ 186 int dopersistence = 0; 187 188 #ifdef DIAGNOSTIC 189 #include <sys/sysctl.h> 190 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); 191 static int snapdebug = 0; 192 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); 193 int collectsnapstats = 0; 194 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 195 0, ""); 196 #endif /* DIAGNOSTIC */ 197 198 /* 199 * Create a snapshot file and initialize it for the filesystem. 200 */ 201 int 202 ffs_snapshot(mp, snapfile) 203 struct mount *mp; 204 char *snapfile; 205 { 206 ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; 207 int error, cg, snaploc; 208 int i, size, len, loc; 209 ufs2_daddr_t blockno; 210 uint64_t flag; 211 char saved_nice = 0; 212 long redo = 0, snaplistsize = 0; 213 int32_t *lp; 214 void *space; 215 struct fs *copy_fs = NULL, *fs; 216 struct thread *td = curthread; 217 struct inode *ip, *xp; 218 struct buf *bp, *nbp, *ibp; 219 struct nameidata nd; 220 struct mount *wrtmp; 221 struct vattr vat; 222 struct vnode *vp, *xvp, *mvp, *devvp; 223 struct uio auio; 224 struct iovec aiov; 225 struct snapdata *sn; 226 struct ufsmount *ump; 227 #ifdef DIAGNOSTIC 228 struct timespec starttime = {0, 0}, endtime; 229 #endif 230 231 ump = VFSTOUFS(mp); 232 fs = ump->um_fs; 233 sn = NULL; 234 /* 235 * At the moment, journaled soft updates cannot support 236 * taking snapshots. 
237 */ 238 if (MOUNTEDSUJ(mp)) { 239 vfs_mount_error(mp, "%s: Snapshots are not yet supported when " 240 "running with journaled soft updates", fs->fs_fsmnt); 241 return (EOPNOTSUPP); 242 } 243 MNT_ILOCK(mp); 244 flag = mp->mnt_flag; 245 MNT_IUNLOCK(mp); 246 /* 247 * Need to serialize access to snapshot code per filesystem. 248 */ 249 /* 250 * Assign a snapshot slot in the superblock. 251 */ 252 UFS_LOCK(ump); 253 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 254 if (fs->fs_snapinum[snaploc] == 0) 255 break; 256 UFS_UNLOCK(ump); 257 if (snaploc == FSMAXSNAP) 258 return (ENOSPC); 259 /* 260 * Create the snapshot file. 261 */ 262 restart: 263 NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF | NOCACHE, UIO_SYSSPACE, 264 snapfile, td); 265 if ((error = namei(&nd)) != 0) 266 return (error); 267 if (nd.ni_vp != NULL) { 268 vput(nd.ni_vp); 269 error = EEXIST; 270 } 271 if (nd.ni_dvp->v_mount != mp) 272 error = EXDEV; 273 if (error) { 274 NDFREE(&nd, NDF_ONLY_PNBUF); 275 if (nd.ni_dvp == nd.ni_vp) 276 vrele(nd.ni_dvp); 277 else 278 vput(nd.ni_dvp); 279 return (error); 280 } 281 VATTR_NULL(&vat); 282 vat.va_type = VREG; 283 vat.va_mode = S_IRUSR; 284 vat.va_vaflags |= VA_EXCLUSIVE; 285 if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) 286 wrtmp = NULL; 287 if (wrtmp != mp) 288 panic("ffs_snapshot: mount mismatch"); 289 vfs_rel(wrtmp); 290 if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { 291 NDFREE(&nd, NDF_ONLY_PNBUF); 292 vput(nd.ni_dvp); 293 if ((error = vn_start_write(NULL, &wrtmp, 294 V_XSLEEP | PCATCH)) != 0) 295 return (error); 296 goto restart; 297 } 298 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); 299 VOP_UNLOCK(nd.ni_dvp); 300 if (error) { 301 NDFREE(&nd, NDF_ONLY_PNBUF); 302 vn_finished_write(wrtmp); 303 vrele(nd.ni_dvp); 304 return (error); 305 } 306 vp = nd.ni_vp; 307 vnode_create_vobject(nd.ni_vp, fs->fs_size, td); 308 vp->v_vflag |= VV_SYSTEM; 309 ip = VTOI(vp); 310 devvp = ITODEVVP(ip); 311 /* 312 * Allocate and copy the last block contents so as to be 
able 313 * to set size to that of the filesystem. 314 */ 315 numblks = howmany(fs->fs_size, fs->fs_frag); 316 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 317 fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 318 if (error) 319 goto out; 320 ip->i_size = lblktosize(fs, (off_t)numblks); 321 DIP_SET(ip, i_size, ip->i_size); 322 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 323 error = readblock(vp, bp, numblks - 1); 324 bawrite(bp); 325 if (error != 0) 326 goto out; 327 /* 328 * Preallocate critical data structures so that we can copy 329 * them in without further allocation after we suspend all 330 * operations on the filesystem. We would like to just release 331 * the allocated buffers without writing them since they will 332 * be filled in below once we are ready to go, but this upsets 333 * the soft update code, so we go ahead and write the new buffers. 334 * 335 * Allocate all indirect blocks and mark all of them as not 336 * needing to be copied. 337 */ 338 for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 339 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 340 fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); 341 if (error) 342 goto out; 343 bawrite(ibp); 344 } 345 /* 346 * Allocate copies for the superblock and its summary information. 347 */ 348 error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 349 0, &nbp); 350 if (error) 351 goto out; 352 bawrite(nbp); 353 blkno = fragstoblks(fs, fs->fs_csaddr); 354 len = howmany(fs->fs_cssize, fs->fs_bsize); 355 for (loc = 0; loc < len; loc++) { 356 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 357 fs->fs_bsize, KERNCRED, 0, &nbp); 358 if (error) 359 goto out; 360 bawrite(nbp); 361 } 362 /* 363 * Allocate all cylinder group blocks. 
364 */ 365 for (cg = 0; cg < fs->fs_ncg; cg++) { 366 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 367 fs->fs_bsize, KERNCRED, 0, &nbp); 368 if (error) 369 goto out; 370 bawrite(nbp); 371 if (cg % 10 == 0) 372 ffs_syncvnode(vp, MNT_WAIT, 0); 373 } 374 /* 375 * Copy all the cylinder group maps. Although the 376 * filesystem is still active, we hope that only a few 377 * cylinder groups will change between now and when we 378 * suspend operations. Thus, we will be able to quickly 379 * touch up the few cylinder groups that changed during 380 * the suspension period. 381 */ 382 len = howmany(fs->fs_ncg, NBBY); 383 space = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO); 384 UFS_LOCK(ump); 385 fs->fs_active = space; 386 UFS_UNLOCK(ump); 387 for (cg = 0; cg < fs->fs_ncg; cg++) { 388 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 389 fs->fs_bsize, KERNCRED, 0, &nbp); 390 if (error) 391 goto out; 392 error = cgaccount(cg, vp, nbp, 1); 393 bawrite(nbp); 394 if (cg % 10 == 0) 395 ffs_syncvnode(vp, MNT_WAIT, 0); 396 if (error) 397 goto out; 398 } 399 /* 400 * Change inode to snapshot type file. 401 */ 402 ip->i_flags |= SF_SNAPSHOT; 403 DIP_SET(ip, i_flags, ip->i_flags); 404 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 405 /* 406 * Ensure that the snapshot is completely on disk. 407 * Since we have marked it as a snapshot it is safe to 408 * unlock it as no process will be allowed to write to it. 409 */ 410 if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) 411 goto out; 412 VOP_UNLOCK(vp); 413 /* 414 * All allocations are done, so we can now snapshot the system. 415 * 416 * Recind nice scheduling while running with the filesystem suspended. 417 */ 418 if (td->td_proc->p_nice > 0) { 419 struct proc *p; 420 421 p = td->td_proc; 422 PROC_LOCK(p); 423 saved_nice = p->p_nice; 424 sched_nice(p, 0); 425 PROC_UNLOCK(p); 426 } 427 /* 428 * Suspend operation on filesystem. 
429 */ 430 for (;;) { 431 vn_finished_write(wrtmp); 432 if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) { 433 vn_start_write(NULL, &wrtmp, V_WAIT); 434 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 435 goto out; 436 } 437 if (mp->mnt_kern_flag & MNTK_SUSPENDED) 438 break; 439 vn_start_write(NULL, &wrtmp, V_WAIT); 440 } 441 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 442 if (ip->i_effnlink == 0) { 443 error = ENOENT; /* Snapshot file unlinked */ 444 goto out1; 445 } 446 #ifdef DIAGNOSTIC 447 if (collectsnapstats) 448 nanotime(&starttime); 449 #endif 450 451 /* The last block might have changed. Copy it again to be sure. */ 452 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 453 fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 454 if (error != 0) 455 goto out1; 456 error = readblock(vp, bp, numblks - 1); 457 bp->b_flags |= B_VALIDSUSPWRT; 458 bawrite(bp); 459 if (error != 0) 460 goto out1; 461 /* 462 * First, copy all the cylinder group maps that have changed. 463 */ 464 for (cg = 0; cg < fs->fs_ncg; cg++) { 465 if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) 466 continue; 467 redo++; 468 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 469 fs->fs_bsize, KERNCRED, 0, &nbp); 470 if (error) 471 goto out1; 472 error = cgaccount(cg, vp, nbp, 2); 473 bawrite(nbp); 474 if (error) 475 goto out1; 476 } 477 /* 478 * Grab a copy of the superblock and its summary information. 479 * We delay writing it until the suspension is released below. 480 */ 481 copy_fs = malloc((u_long)fs->fs_bsize, M_UFSMNT, M_WAITOK); 482 bcopy(fs, copy_fs, fs->fs_sbsize); 483 if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) 484 copy_fs->fs_clean = 1; 485 size = fs->fs_bsize < SBLOCKSIZE ? 
fs->fs_bsize : SBLOCKSIZE; 486 if (fs->fs_sbsize < size) 487 bzero(&((char *)copy_fs)[fs->fs_sbsize], 488 size - fs->fs_sbsize); 489 size = blkroundup(fs, fs->fs_cssize); 490 if (fs->fs_contigsumsize > 0) 491 size += fs->fs_ncg * sizeof(int32_t); 492 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 493 copy_fs->fs_csp = space; 494 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 495 space = (char *)space + fs->fs_cssize; 496 loc = howmany(fs->fs_cssize, fs->fs_fsize); 497 i = fs->fs_frag - loc % fs->fs_frag; 498 len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; 499 if (len > 0) { 500 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 501 len, KERNCRED, &bp)) != 0) { 502 brelse(bp); 503 free(copy_fs->fs_csp, M_UFSMNT); 504 free(copy_fs, M_UFSMNT); 505 copy_fs = NULL; 506 goto out1; 507 } 508 bcopy(bp->b_data, space, (u_int)len); 509 space = (char *)space + len; 510 bp->b_flags |= B_INVAL | B_NOCACHE; 511 brelse(bp); 512 } 513 if (fs->fs_contigsumsize > 0) { 514 copy_fs->fs_maxcluster = lp = space; 515 for (i = 0; i < fs->fs_ncg; i++) 516 *lp++ = fs->fs_contigsumsize; 517 } 518 /* 519 * We must check for active files that have been unlinked 520 * (e.g., with a zero link count). We have to expunge all 521 * trace of these files from the snapshot so that they are 522 * not reclaimed prematurely by fsck or unnecessarily dumped. 523 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 524 * spec_strategy about writing on a suspended filesystem. 525 * Note that we skip unlinked snapshot files as they will 526 * be handled separately below. 527 * 528 * We also calculate the needed size for the snapshot list. 
529 */ 530 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 531 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 532 MNT_ILOCK(mp); 533 mp->mnt_kern_flag &= ~MNTK_SUSPENDED; 534 MNT_IUNLOCK(mp); 535 loop: 536 MNT_VNODE_FOREACH_ALL(xvp, mp, mvp) { 537 if ((xvp->v_usecount == 0 && 538 (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) || 539 xvp->v_type == VNON || 540 IS_SNAPSHOT(VTOI(xvp))) { 541 VI_UNLOCK(xvp); 542 continue; 543 } 544 /* 545 * We can skip parent directory vnode because it must have 546 * this snapshot file in it. 547 */ 548 if (xvp == nd.ni_dvp) { 549 VI_UNLOCK(xvp); 550 continue; 551 } 552 vholdl(xvp); 553 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { 554 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 555 vdrop(xvp); 556 goto loop; 557 } 558 VI_LOCK(xvp); 559 if (xvp->v_usecount == 0 && 560 (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) { 561 VI_UNLOCK(xvp); 562 VOP_UNLOCK(xvp); 563 vdrop(xvp); 564 continue; 565 } 566 VI_UNLOCK(xvp); 567 #ifdef DIAGNOSTIC 568 if (snapdebug) 569 vn_printf(xvp, "ffs_snapshot: busy vnode "); 570 #endif 571 if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 && 572 vat.va_nlink > 0) { 573 VOP_UNLOCK(xvp); 574 vdrop(xvp); 575 continue; 576 } 577 xp = VTOI(xvp); 578 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 579 VOP_UNLOCK(xvp); 580 vdrop(xvp); 581 continue; 582 } 583 /* 584 * If there is a fragment, clear it here. 
585 */ 586 blkno = 0; 587 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 588 if (loc < UFS_NDADDR) { 589 len = fragroundup(fs, blkoff(fs, xp->i_size)); 590 if (len != 0 && len < fs->fs_bsize) { 591 ffs_blkfree(ump, copy_fs, vp, 592 DIP(xp, i_db[loc]), len, xp->i_number, 593 xvp->v_type, NULL, SINGLETON_KEY); 594 blkno = DIP(xp, i_db[loc]); 595 DIP_SET(xp, i_db[loc], 0); 596 } 597 } 598 snaplistsize += 1; 599 if (I_IS_UFS1(xp)) 600 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 601 BLK_NOCOPY, 1); 602 else 603 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 604 BLK_NOCOPY, 1); 605 if (blkno) 606 DIP_SET(xp, i_db[loc], blkno); 607 if (!error) 608 error = ffs_freefile(ump, copy_fs, vp, xp->i_number, 609 xp->i_mode, NULL); 610 VOP_UNLOCK(xvp); 611 vdrop(xvp); 612 if (error) { 613 free(copy_fs->fs_csp, M_UFSMNT); 614 free(copy_fs, M_UFSMNT); 615 copy_fs = NULL; 616 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 617 goto out1; 618 } 619 } 620 /* 621 * Erase the journal file from the snapshot. 622 */ 623 if (fs->fs_flags & FS_SUJ) { 624 error = softdep_journal_lookup(mp, &xvp); 625 if (error) { 626 free(copy_fs->fs_csp, M_UFSMNT); 627 free(copy_fs, M_UFSMNT); 628 copy_fs = NULL; 629 goto out1; 630 } 631 xp = VTOI(xvp); 632 if (I_IS_UFS1(xp)) 633 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 634 BLK_NOCOPY, 0); 635 else 636 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 637 BLK_NOCOPY, 0); 638 vput(xvp); 639 } 640 /* 641 * Acquire a lock on the snapdata structure, creating it if necessary. 642 */ 643 sn = ffs_snapdata_acquire(devvp); 644 /* 645 * Change vnode to use shared snapshot lock instead of the original 646 * private lock. 647 */ 648 vp->v_vnlock = &sn->sn_lock; 649 lockmgr(&vp->v_lock, LK_RELEASE, NULL); 650 xp = TAILQ_FIRST(&sn->sn_head); 651 /* 652 * If this is the first snapshot on this filesystem, then we need 653 * to allocate the space for the list of preallocated snapshot blocks. 
654 * This list will be refined below, but this preliminary one will 655 * keep us out of deadlock until the full one is ready. 656 */ 657 if (xp == NULL) { 658 snapblklist = malloc(snaplistsize * sizeof(daddr_t), 659 M_UFSMNT, M_WAITOK); 660 blkp = &snapblklist[1]; 661 *blkp++ = lblkno(fs, fs->fs_sblockloc); 662 blkno = fragstoblks(fs, fs->fs_csaddr); 663 for (cg = 0; cg < fs->fs_ncg; cg++) { 664 if (fragstoblks(fs, cgtod(fs, cg) > blkno)) 665 break; 666 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 667 } 668 len = howmany(fs->fs_cssize, fs->fs_bsize); 669 for (loc = 0; loc < len; loc++) 670 *blkp++ = blkno + loc; 671 for (; cg < fs->fs_ncg; cg++) 672 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 673 snapblklist[0] = blkp - snapblklist; 674 VI_LOCK(devvp); 675 if (sn->sn_blklist != NULL) 676 panic("ffs_snapshot: non-empty list"); 677 sn->sn_blklist = snapblklist; 678 sn->sn_listsize = blkp - snapblklist; 679 VI_UNLOCK(devvp); 680 } 681 /* 682 * Record snapshot inode. Since this is the newest snapshot, 683 * it must be placed at the end of the list. 684 */ 685 VI_LOCK(devvp); 686 fs->fs_snapinum[snaploc] = ip->i_number; 687 if (ip->i_nextsnap.tqe_prev != 0) 688 panic("ffs_snapshot: %ju already on list", 689 (uintmax_t)ip->i_number); 690 TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); 691 devvp->v_vflag |= VV_COPYONWRITE; 692 VI_UNLOCK(devvp); 693 ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); 694 out1: 695 KASSERT((sn != NULL && copy_fs != NULL && error == 0) || 696 (sn == NULL && copy_fs == NULL && error != 0), 697 ("email phk@ and mckusick@")); 698 /* 699 * Resume operation on filesystem. 
700 */ 701 vfs_write_resume(vp->v_mount, VR_START_WRITE | VR_NO_SUSPCLR); 702 #ifdef DIAGNOSTIC 703 if (collectsnapstats && starttime.tv_sec > 0) { 704 nanotime(&endtime); 705 timespecsub(&endtime, &starttime, &endtime); 706 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 707 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 708 endtime.tv_nsec / 1000000, redo, fs->fs_ncg); 709 } 710 #endif 711 if (copy_fs == NULL) 712 goto out; 713 /* 714 * Copy allocation information from all the snapshots in 715 * this snapshot and then expunge them from its view. 716 */ 717 TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { 718 if (xp == ip) 719 break; 720 if (I_IS_UFS1(xp)) 721 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 722 BLK_SNAP, 0); 723 else 724 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 725 BLK_SNAP, 0); 726 if (error == 0 && xp->i_effnlink == 0) { 727 error = ffs_freefile(ump, 728 copy_fs, 729 vp, 730 xp->i_number, 731 xp->i_mode, NULL); 732 } 733 if (error) { 734 fs->fs_snapinum[snaploc] = 0; 735 goto done; 736 } 737 } 738 /* 739 * Allocate space for the full list of preallocated snapshot blocks. 740 */ 741 snapblklist = malloc(snaplistsize * sizeof(daddr_t), 742 M_UFSMNT, M_WAITOK); 743 ip->i_snapblklist = &snapblklist[1]; 744 /* 745 * Expunge the blocks used by the snapshots from the set of 746 * blocks marked as used in the snapshot bitmaps. Also, collect 747 * the list of allocated blocks in i_snapblklist. 
748 */ 749 if (I_IS_UFS1(ip)) 750 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, 751 BLK_SNAP, 0); 752 else 753 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, 754 BLK_SNAP, 0); 755 if (error) { 756 fs->fs_snapinum[snaploc] = 0; 757 free(snapblklist, M_UFSMNT); 758 goto done; 759 } 760 if (snaplistsize < ip->i_snapblklist - snapblklist) 761 panic("ffs_snapshot: list too small"); 762 snaplistsize = ip->i_snapblklist - snapblklist; 763 snapblklist[0] = snaplistsize; 764 ip->i_snapblklist = 0; 765 /* 766 * Write out the list of allocated blocks to the end of the snapshot. 767 */ 768 auio.uio_iov = &aiov; 769 auio.uio_iovcnt = 1; 770 aiov.iov_base = (void *)snapblklist; 771 aiov.iov_len = snaplistsize * sizeof(daddr_t); 772 auio.uio_resid = aiov.iov_len; 773 auio.uio_offset = ip->i_size; 774 auio.uio_segflg = UIO_SYSSPACE; 775 auio.uio_rw = UIO_WRITE; 776 auio.uio_td = td; 777 if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 778 fs->fs_snapinum[snaploc] = 0; 779 free(snapblklist, M_UFSMNT); 780 goto done; 781 } 782 /* 783 * Write the superblock and its summary information 784 * to the snapshot. 
785 */ 786 blkno = fragstoblks(fs, fs->fs_csaddr); 787 len = howmany(fs->fs_cssize, fs->fs_bsize); 788 space = copy_fs->fs_csp; 789 for (loc = 0; loc < len; loc++) { 790 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); 791 if (error) { 792 fs->fs_snapinum[snaploc] = 0; 793 free(snapblklist, M_UFSMNT); 794 goto done; 795 } 796 bcopy(space, nbp->b_data, fs->fs_bsize); 797 space = (char *)space + fs->fs_bsize; 798 bawrite(nbp); 799 } 800 error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, 801 KERNCRED, &nbp); 802 if (error) { 803 brelse(nbp); 804 } else { 805 loc = blkoff(fs, fs->fs_sblockloc); 806 copy_fs->fs_fmod = 0; 807 copy_fs->fs_ckhash = ffs_calc_sbhash(copy_fs); 808 bcopy((char *)copy_fs, &nbp->b_data[loc], (u_int)fs->fs_sbsize); 809 bawrite(nbp); 810 } 811 /* 812 * As this is the newest list, it is the most inclusive, so 813 * should replace the previous list. 814 */ 815 VI_LOCK(devvp); 816 space = sn->sn_blklist; 817 sn->sn_blklist = snapblklist; 818 sn->sn_listsize = snaplistsize; 819 VI_UNLOCK(devvp); 820 if (space != NULL) 821 free(space, M_UFSMNT); 822 /* 823 * Preallocate all the direct blocks in the snapshot inode so 824 * that we never have to write the inode itself to commit an 825 * update to the contents of the snapshot. Note that once 826 * created, the size of the snapshot will never change, so 827 * there will never be a need to write the inode except to 828 * update the non-integrity-critical time fields and 829 * allocated-block count. 
830 */ 831 for (blockno = 0; blockno < UFS_NDADDR; blockno++) { 832 if (DIP(ip, i_db[blockno]) != 0) 833 continue; 834 error = UFS_BALLOC(vp, lblktosize(fs, blockno), 835 fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 836 if (error) 837 break; 838 error = readblock(vp, bp, blockno); 839 bawrite(bp); 840 if (error != 0) 841 break; 842 } 843 done: 844 free(copy_fs->fs_csp, M_UFSMNT); 845 free(copy_fs, M_UFSMNT); 846 copy_fs = NULL; 847 out: 848 NDFREE(&nd, NDF_ONLY_PNBUF); 849 if (saved_nice > 0) { 850 struct proc *p; 851 852 p = td->td_proc; 853 PROC_LOCK(p); 854 sched_nice(td->td_proc, saved_nice); 855 PROC_UNLOCK(td->td_proc); 856 } 857 UFS_LOCK(ump); 858 if (fs->fs_active != 0) { 859 free(fs->fs_active, M_DEVBUF); 860 fs->fs_active = 0; 861 } 862 UFS_UNLOCK(ump); 863 MNT_ILOCK(mp); 864 mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); 865 MNT_IUNLOCK(mp); 866 if (error) 867 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); 868 (void) ffs_syncvnode(vp, MNT_WAIT, 0); 869 if (error) 870 vput(vp); 871 else 872 VOP_UNLOCK(vp); 873 vrele(nd.ni_dvp); 874 vn_finished_write(wrtmp); 875 process_deferred_inactive(mp); 876 return (error); 877 } 878 879 /* 880 * Copy a cylinder group map. All the unallocated blocks are marked 881 * BLK_NOCOPY so that the snapshot knows that it need not copy them 882 * if they are later written. If passno is one, then this is a first 883 * pass, so only setting needs to be done. If passno is 2, then this 884 * is a revision to a previous pass which must be undone as the 885 * replacement pass is done. 
*/
/*
 * cg is the cylinder group number, vp the snapshot vnode and nbp the
 * buffer of the snapshot's copy of that cylinder group block, which is
 * filled in here.  Returns 0, or the error from ffs_getcg() or
 * UFS_BALLOC().  The caller still owns nbp on return; the indirect
 * block buffers obtained here are written out by this function.
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;
	struct vnode *vp;
	struct buf *nbp;
	int passno;
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	if ((error = ffs_getcg(fs, ITODEVVP(ip), cg, 0, &bp, &cgp)) != 0)
		return (error);
	UFS_LOCK(ITOUMP(ip));
	/* Flag this cylinder group as accounted for in fs_active. */
	ACTIVESET(fs, cg);
	/*
	 * Recomputation of summary information might not have been performed
	 * at mount time.  Sync up summary information for current cylinder
	 * group while data is in memory to ensure that result of background
	 * fsck is slightly more consistent.
	 */
	fs->fs_cs(fs, cg) = cgp->cg_cs;
	UFS_UNLOCK(ITOUMP(ip));
	/* Copy the map and zero the slack up to a full block. */
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	/* From here on cgp refers to the snapshot's copy. */
	cgp = (struct cg *)nbp->b_data;
	bqrelse(bp);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	/*
	 * Work in units of full blocks: base is the first block of this
	 * cylinder group and len the number of its blocks to examine,
	 * clipped so that the final block of the filesystem is left out.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cgbase(fs, cg) / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	/* Blocks mapped by the direct pointers of the snapshot inode. */
	if (base < UFS_NDADDR) {
		for ( ; loc < UFS_NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP_SET(ip, i_db[loc], BLK_NOCOPY);
			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
				DIP_SET(ip, i_db[loc], 0);
			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	/* Remaining blocks are mapped through indirect blocks. */
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error) {
		goto out;
	}
	indiroff = (base + loc - UFS_NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		/* Ran off the end of this indirect block; fetch the next. */
		if (indiroff >= NINDIR(fs)) {
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error) {
				goto out;
			}
			indiroff = 0;
		}
		/* Same marking as above, on 32-bit (UFS1) pointers. */
		if (I_IS_UFS1(ip)) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		/* UFS2: 64-bit pointers. */
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	/* The last indirect block is only scheduled as a delayed write. */
	bdwrite(ibp);
out:
	/*
	 * We have to calculate the crc32c here rather than just setting the
	 * BX_CYLGRP b_xflags because the allocation of the block for the
	 * cylinder group map will always be a full size block (fs_bsize)
	 * even though the cylinder group may be smaller (fs_cgsize). The
	 * crc32c must be computed only over fs_cgsize whereas the BX_CYLGRP
	 * flag causes it to be computed over the size of the buffer.
*/
	if ((fs->fs_metackhash & CK_CYLGRP) != 0) {
		/* The hash field must be zero while the hash is computed. */
		((struct cg *)nbp->b_data)->cg_ckhash = 0;
		((struct cg *)nbp->b_data)->cg_ckhash =
		    calculate_crc32c(~0L, nbp->b_data, fs->fs_cgsize);
	}
	return (error);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 *
 * snapvp is the snapshot being built and cancelip the inode to remove
 * from its view.  fs is the superblock used for the address
 * calculations and handed on to acctfunc, which is called on the
 * direct pointers, the indirect pointers and, through
 * indiracct_ufs1(), the contents of the indirect blocks, each time
 * with expungetype.  A non-zero clearmode forces the mode of the
 * copied inode to zero.  Returns 0 or the first error encountered.
 */
static int
expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
	int clearmode;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < UFS_NDADDR) {
		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
	} else {
		if (DOINGSOFTDEP(snapvp))
			softdep_prealloc(snapvp, MNT_WAIT);
		/* Look the pointer up in the snapshot's indirect block. */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		/* Already copied: read the snapshot's copy. */
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		/* Not copied yet: allocate the block and fill it in. */
		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		/*
		 * NOTE(review): on failure we return with bp still held.
		 * ffs_snapshot() in this file releases its buffer with
		 * bawrite() before looking at the readblock() result;
		 * confirm whether bp has to be released here as well.
		 */
		if ((error = readblock(snapvp, bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * or unlinked snapshots to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (clearmode || cancelip->i_effnlink == 0)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct pointers, passed with logical block number 0. */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
	    &cancelip->i_din1->di_db[UFS_NDADDR], fs, 0, expungetype)))
		return (error);
	/* The indirect pointers themselves, passed with lbn -1. */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
	    &cancelip->i_din1->di_ib[UFS_NIADDR], fs, -1, expungetype)))
		return (error);
	/*
	 * Walk each level of indirection in turn.  lbn is the (negative)
	 * logical block number of the indirect block at this level, rlbn
	 * the first data block it maps, len the number of data blocks
	 * still to cover and blksperindir the number of data blocks
	 * mapped by one pointer at this level.
	 */
	blksperindir = 1;
	lbn = -UFS_NDADDR;
	len = numblks - UFS_NDADDR;
	rlbn = UFS_NDADDR;
	for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */ 
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs1_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[UFS_NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs1: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs1: botched params");
	/*
	 * We have to expand bread here since
it will deadlock looking 1135 * up the block number for any blocks that are not in the cache. 1136 */ 1137 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); 1138 bp->b_blkno = fsbtodb(fs, blkno); 1139 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 1140 (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { 1141 brelse(bp); 1142 return (error); 1143 } 1144 /* 1145 * Account for the block pointers in this indirect block. 1146 */ 1147 last = howmany(remblks, blksperindir); 1148 if (last > NINDIR(fs)) 1149 last = NINDIR(fs); 1150 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); 1151 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 1152 bqrelse(bp); 1153 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1154 level == 0 ? rlbn : -1, expungetype); 1155 if (error || level == 0) 1156 goto out; 1157 /* 1158 * Account for the block pointers in each of the indirect blocks 1159 * in the levels below us. 1160 */ 1161 subblksperindir = blksperindir / NINDIR(fs); 1162 for (lbn++, level--, i = 0; i < last; i++) { 1163 error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, 1164 rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); 1165 if (error) 1166 goto out; 1167 rlbn += blksperindir; 1168 lbn -= blksperindir; 1169 remblks -= blksperindir; 1170 } 1171 out: 1172 free(bap, M_DEVBUF); 1173 return (error); 1174 } 1175 1176 /* 1177 * Do both snap accounting and map accounting. 1178 */ 1179 static int 1180 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) 1181 struct vnode *vp; 1182 ufs1_daddr_t *oldblkp, *lastblkp; 1183 struct fs *fs; 1184 ufs_lbn_t lblkno; 1185 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 1186 { 1187 int error; 1188 1189 if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 1190 return (error); 1191 return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 1192 } 1193 1194 /* 1195 * Identify a set of blocks allocated in a snapshot inode. 
1196 */ 1197 static int 1198 snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1199 struct vnode *vp; 1200 ufs1_daddr_t *oldblkp, *lastblkp; 1201 struct fs *fs; 1202 ufs_lbn_t lblkno; 1203 int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 1204 { 1205 struct inode *ip = VTOI(vp); 1206 ufs1_daddr_t blkno, *blkp; 1207 ufs_lbn_t lbn; 1208 struct buf *ibp; 1209 int error; 1210 1211 for ( ; oldblkp < lastblkp; oldblkp++) { 1212 blkno = *oldblkp; 1213 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1214 continue; 1215 lbn = fragstoblks(fs, blkno); 1216 if (lbn < UFS_NDADDR) { 1217 blkp = &ip->i_din1->di_db[lbn]; 1218 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 1219 } else { 1220 error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn), 1221 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 1222 if (error) 1223 return (error); 1224 blkp = &((ufs1_daddr_t *)(ibp->b_data)) 1225 [(lbn - UFS_NDADDR) % NINDIR(fs)]; 1226 } 1227 /* 1228 * If we are expunging a snapshot vnode and we 1229 * find a block marked BLK_NOCOPY, then it is 1230 * one that has been allocated to this snapshot after 1231 * we took our current snapshot and can be ignored. 1232 */ 1233 if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { 1234 if (lbn >= UFS_NDADDR) 1235 brelse(ibp); 1236 } else { 1237 if (*blkp != 0) 1238 panic("snapacct_ufs1: bad block"); 1239 *blkp = expungetype; 1240 if (lbn >= UFS_NDADDR) 1241 bdwrite(ibp); 1242 } 1243 } 1244 return (0); 1245 } 1246 1247 /* 1248 * Account for a set of blocks allocated in a snapshot inode. 
1249 */ 1250 static int 1251 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1252 struct vnode *vp; 1253 ufs1_daddr_t *oldblkp, *lastblkp; 1254 struct fs *fs; 1255 ufs_lbn_t lblkno; 1256 int expungetype; 1257 { 1258 ufs1_daddr_t blkno; 1259 struct inode *ip; 1260 ino_t inum; 1261 int acctit; 1262 1263 ip = VTOI(vp); 1264 inum = ip->i_number; 1265 if (lblkno == -1) 1266 acctit = 0; 1267 else 1268 acctit = 1; 1269 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1270 blkno = *oldblkp; 1271 if (blkno == 0 || blkno == BLK_NOCOPY) 1272 continue; 1273 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1274 *ip->i_snapblklist++ = lblkno; 1275 if (blkno == BLK_SNAP) 1276 blkno = blkstofrags(fs, lblkno); 1277 ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, 1278 vp->v_type, NULL, SINGLETON_KEY); 1279 } 1280 return (0); 1281 } 1282 1283 /* 1284 * Before expunging a snapshot inode, note all the 1285 * blocks that it claims with BLK_SNAP so that fsck will 1286 * be able to account for those blocks properly and so 1287 * that this snapshot knows that it need not copy them 1288 * if the other snapshot holding them is freed. This code 1289 * is reproduced once each for UFS1 and UFS2. 1290 */ 1291 static int 1292 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode) 1293 struct vnode *snapvp; 1294 struct inode *cancelip; 1295 struct fs *fs; 1296 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1297 struct fs *, ufs_lbn_t, int); 1298 int expungetype; 1299 int clearmode; 1300 { 1301 int i, error, indiroff; 1302 ufs_lbn_t lbn, rlbn; 1303 ufs2_daddr_t len, blkno, numblks, blksperindir; 1304 struct ufs2_dinode *dip; 1305 struct thread *td = curthread; 1306 struct buf *bp; 1307 1308 /* 1309 * Prepare to expunge the inode. If its inode block has not 1310 * yet been copied, then allocate and fill the copy. 
1311 */ 1312 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1313 blkno = 0; 1314 if (lbn < UFS_NDADDR) { 1315 blkno = VTOI(snapvp)->i_din2->di_db[lbn]; 1316 } else { 1317 if (DOINGSOFTDEP(snapvp)) 1318 softdep_prealloc(snapvp, MNT_WAIT); 1319 td->td_pflags |= TDP_COWINPROGRESS; 1320 error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), 1321 fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 1322 td->td_pflags &= ~TDP_COWINPROGRESS; 1323 if (error) 1324 return (error); 1325 indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); 1326 blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; 1327 bqrelse(bp); 1328 } 1329 if (blkno != 0) { 1330 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) 1331 return (error); 1332 } else { 1333 error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), 1334 fs->fs_bsize, KERNCRED, 0, &bp); 1335 if (error) 1336 return (error); 1337 if ((error = readblock(snapvp, bp, lbn)) != 0) 1338 return (error); 1339 } 1340 /* 1341 * Set a snapshot inode to be a zero length file, regular files 1342 * to be completely unallocated. 1343 */ 1344 dip = (struct ufs2_dinode *)bp->b_data + 1345 ino_to_fsbo(fs, cancelip->i_number); 1346 dip->di_size = 0; 1347 dip->di_blocks = 0; 1348 dip->di_flags &= ~SF_SNAPSHOT; 1349 bzero(&dip->di_db[0], (UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t)); 1350 if (clearmode || cancelip->i_effnlink == 0) 1351 dip->di_mode = 0; 1352 else 1353 ffs_update_dinode_ckhash(fs, dip); 1354 bdwrite(bp); 1355 /* 1356 * Now go through and expunge all the blocks in the file 1357 * using the function requested. 
1358 */ 1359 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1360 if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], 1361 &cancelip->i_din2->di_db[UFS_NDADDR], fs, 0, expungetype))) 1362 return (error); 1363 if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], 1364 &cancelip->i_din2->di_ib[UFS_NIADDR], fs, -1, expungetype))) 1365 return (error); 1366 blksperindir = 1; 1367 lbn = -UFS_NDADDR; 1368 len = numblks - UFS_NDADDR; 1369 rlbn = UFS_NDADDR; 1370 for (i = 0; len > 0 && i < UFS_NIADDR; i++) { 1371 error = indiracct_ufs2(snapvp, ITOV(cancelip), i, 1372 cancelip->i_din2->di_ib[i], lbn, rlbn, len, 1373 blksperindir, fs, acctfunc, expungetype); 1374 if (error) 1375 return (error); 1376 blksperindir *= NINDIR(fs); 1377 lbn -= blksperindir + 1; 1378 len -= blksperindir; 1379 rlbn += blksperindir; 1380 } 1381 return (0); 1382 } 1383 1384 /* 1385 * Descend an indirect block chain for vnode cancelvp accounting for all 1386 * its indirect blocks in snapvp. 1387 */ 1388 static int 1389 indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 1390 blksperindir, fs, acctfunc, expungetype) 1391 struct vnode *snapvp; 1392 struct vnode *cancelvp; 1393 int level; 1394 ufs2_daddr_t blkno; 1395 ufs_lbn_t lbn; 1396 ufs_lbn_t rlbn; 1397 ufs_lbn_t remblks; 1398 ufs_lbn_t blksperindir; 1399 struct fs *fs; 1400 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1401 struct fs *, ufs_lbn_t, int); 1402 int expungetype; 1403 { 1404 int error, num, i; 1405 ufs_lbn_t subblksperindir; 1406 struct indir indirs[UFS_NIADDR + 2]; 1407 ufs2_daddr_t last, *bap; 1408 struct buf *bp; 1409 1410 if (blkno == 0) { 1411 if (expungetype == BLK_NOCOPY) 1412 return (0); 1413 panic("indiracct_ufs2: missing indir"); 1414 } 1415 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1416 return (error); 1417 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1418 panic("indiracct_ufs2: botched params"); 1419 /* 1420 * We have to expand bread here since 
it will deadlock looking 1421 * up the block number for any blocks that are not in the cache. 1422 */ 1423 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); 1424 bp->b_blkno = fsbtodb(fs, blkno); 1425 if ((bp->b_flags & B_CACHE) == 0 && 1426 (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { 1427 brelse(bp); 1428 return (error); 1429 } 1430 /* 1431 * Account for the block pointers in this indirect block. 1432 */ 1433 last = howmany(remblks, blksperindir); 1434 if (last > NINDIR(fs)) 1435 last = NINDIR(fs); 1436 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); 1437 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 1438 bqrelse(bp); 1439 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1440 level == 0 ? rlbn : -1, expungetype); 1441 if (error || level == 0) 1442 goto out; 1443 /* 1444 * Account for the block pointers in each of the indirect blocks 1445 * in the levels below us. 1446 */ 1447 subblksperindir = blksperindir / NINDIR(fs); 1448 for (lbn++, level--, i = 0; i < last; i++) { 1449 error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, 1450 rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); 1451 if (error) 1452 goto out; 1453 rlbn += blksperindir; 1454 lbn -= blksperindir; 1455 remblks -= blksperindir; 1456 } 1457 out: 1458 free(bap, M_DEVBUF); 1459 return (error); 1460 } 1461 1462 /* 1463 * Do both snap accounting and map accounting. 1464 */ 1465 static int 1466 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) 1467 struct vnode *vp; 1468 ufs2_daddr_t *oldblkp, *lastblkp; 1469 struct fs *fs; 1470 ufs_lbn_t lblkno; 1471 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 1472 { 1473 int error; 1474 1475 if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 1476 return (error); 1477 return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 1478 } 1479 1480 /* 1481 * Identify a set of blocks allocated in a snapshot inode. 
1482 */ 1483 static int 1484 snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1485 struct vnode *vp; 1486 ufs2_daddr_t *oldblkp, *lastblkp; 1487 struct fs *fs; 1488 ufs_lbn_t lblkno; 1489 int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 1490 { 1491 struct inode *ip = VTOI(vp); 1492 ufs2_daddr_t blkno, *blkp; 1493 ufs_lbn_t lbn; 1494 struct buf *ibp; 1495 int error; 1496 1497 for ( ; oldblkp < lastblkp; oldblkp++) { 1498 blkno = *oldblkp; 1499 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1500 continue; 1501 lbn = fragstoblks(fs, blkno); 1502 if (lbn < UFS_NDADDR) { 1503 blkp = &ip->i_din2->di_db[lbn]; 1504 UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); 1505 } else { 1506 error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn), 1507 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 1508 if (error) 1509 return (error); 1510 blkp = &((ufs2_daddr_t *)(ibp->b_data)) 1511 [(lbn - UFS_NDADDR) % NINDIR(fs)]; 1512 } 1513 /* 1514 * If we are expunging a snapshot vnode and we 1515 * find a block marked BLK_NOCOPY, then it is 1516 * one that has been allocated to this snapshot after 1517 * we took our current snapshot and can be ignored. 1518 */ 1519 if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { 1520 if (lbn >= UFS_NDADDR) 1521 brelse(ibp); 1522 } else { 1523 if (*blkp != 0) 1524 panic("snapacct_ufs2: bad block"); 1525 *blkp = expungetype; 1526 if (lbn >= UFS_NDADDR) 1527 bdwrite(ibp); 1528 } 1529 } 1530 return (0); 1531 } 1532 1533 /* 1534 * Account for a set of blocks allocated in a snapshot inode. 
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	/*
	 * Callers pass -1 for runs of indirect-block pointers; those are
	 * freed below but never recorded in the snapshot block list.
	 */
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		/* Record the logical block in the list being built. */
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		/* A BLK_SNAP entry stands for the block at its own lbn. */
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum,
		    vp->v_type, NULL, SINGLETON_KEY);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 *
 * The reference is dropped only if ip is found on the device's
 * in-core snapshot list. The inode number is then removed from
 * fs_snapinum[] in the superblock, under the UFS lock.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;
	struct snapdata *sn;
	struct ufsmount *ump;

	/*
	 * Find snapshot in incore list.
	 */
	xp = NULL;
	sn = ITODEVVP(ip)->v_rdev->si_snapdata;
	if (sn != NULL)
		TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
			if (xp == ip)
				break;
	/* xp is NULL here when the list was exhausted without a match. */
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DIAGNOSTIC
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %ju\n",
		    (uintmax_t)ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	ump = ITOUMP(ip);
	fs = ump->um_fs;
	UFS_LOCK(ump);
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the following entries down over the removed one. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	UFS_UNLOCK(ump);
}

/*
 * Prepare a snapshot file for being removed.
 *
 * The inode is unlinked from the device's in-core snapshot list and
 * its vnode is switched from the shared snapshot lock back to its
 * private lock. Every BLK_NOCOPY/BLK_SNAP marker in its block
 * pointers is then cleared, blocks it had claimed are offered to the
 * remaining snapshots through ffs_snapblkfree(), and the SF_SNAPSHOT
 * flag is removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct buf *ibp;
	struct fs *fs;
	ufs2_daddr_t numblks, blkno, dblk;
	int error, i, last, loc;
	struct snapdata *sn;

	ip = VTOI(vp);
	fs = ITOFS(ip);
	devvp = ITODEVVP(ip);
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	VI_LOCK(devvp);
	if (ip->i_nextsnap.tqe_prev != 0) {
		sn = devvp->v_rdev->si_snapdata;
		TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		VI_UNLOCK(devvp);
		/*
		 * Take the private lock once, plus once per recursion
		 * level currently held on the shared snapshot lock,
		 * before pointing v_vnlock back at the private lock.
		 */
		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
		for (i = 0; i < sn->sn_lock.lk_recurse; i++)
			lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
		KASSERT(vp->v_vnlock == &sn->sn_lock,
			("ffs_snapremove: lost lock mutation"));
		vp->v_vnlock = &vp->v_lock;
		VI_LOCK(devvp);
		/* Drop every hold on the shared lock, recursions first. */
		while (sn->sn_lock.lk_recurse > 0)
			lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
		lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
		try_free_snapdata(devvp);
	} else
		VI_UNLOCK(devvp);
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == 0)
			continue;
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP_SET(ip, i_db[blkno], 0);
		/*
		 * A pointer equal to its own logical block number is a
		 * claimed block; drop it here only if another snapshot
		 * took it over (ffs_snapblkfree() returned non-zero).
		 */
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize,
		     ip->i_number, vp->v_type, NULL))) {
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
			    btodb(fs->fs_bsize));
			DIP_SET(ip, i_db[blkno], 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	/* Same treatment for each indirect block's worth of pointers. */
	for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		/* An indirect block that cannot be fetched is skipped. */
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (I_IS_UFS1(ip)) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == 0)
					continue;
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				/*
				 * NOTE(review): the claimed-block test uses
				 * blkno, the first block of this indirect,
				 * not blkno + loc; confirm this is intended.
				 */
				else if ((dblk == blkstofrags(fs, blkno) &&
				     ffs_snapblkfree(fs, ITODEVVP(ip), dblk,
				     fs->fs_bsize, ip->i_number, vp->v_type,
				     NULL))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == 0)
				continue;
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ITODEVVP(ip), dblk,
			     fs->fs_bsize, ip->i_number, vp->v_type, NULL))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_SET(ip, i_flags, ip->i_flags);
	UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
	/*
	 * The dirtied indirects must be written out before
	 * softdep_setup_freeblocks() is called.  Otherwise indir_trunc()
	 * may find indirect pointers using the magic BLK_* values.
	 */
	if (DOINGSOFTDEP(vp))
		ffs_syncvnode(vp, MNT_WAIT, 0);
#ifdef QUOTA
	/*
	 * Reenable disk quotas for ex-snapshot file.
	 */
	if (!getinoquota(ip))
		(void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
#endif
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd)
	struct fs *fs;
	struct vnode *devvp;
	ufs2_daddr_t bno;
	long size;
	ino_t inum;
	enum vtype vtype;
	struct workhead *wkhd;
{
	struct buf *ibp, *cbp, *savedcbp = NULL;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int indiroff = 0, error = 0, claimedblk = 0;
	struct snapdata *sn;

	lbn = fragstoblks(fs, bno);
retry:
	/* No snapshot data on the device: nothing can claim the block. */
	VI_LOCK(devvp);
	sn = devvp->v_rdev->si_snapdata;
	if (sn == NULL) {
		VI_UNLOCK(devvp);
		return (0);
	}
	/*
	 * The snapshot lock is taken with LK_SLEEPFAIL; if the request
	 * fails, sn is looked up again from scratch.
	 */
	if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
	    VI_MTX(devvp)) != 0)
		goto retry;
	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
		vp = ITOV(ip);
		if (DOINGSOFTDEP(vp))
			softdep_prealloc(vp, MNT_WAIT);
		/*
		 * Lookup block being written.
		 */
		if (lbn < UFS_NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			/* ibp stays held until one of the branches below. */
			indiroff = (lbn - UFS_NDADDR) % NINDIR(fs);
			if (I_IS_UFS1(ip))
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (lbn < UFS_NDADDR) {
				DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
				UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
			} else if (I_IS_UFS1(ip)) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= UFS_NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DIAGNOSTIC
			if (snapdebug)
				printf("%s %ju lbn %jd from inum %ju\n",
				    "Grabonremove: snapino",
				    (uintmax_t)ip->i_number,
				    (intmax_t)lbn, (uintmax_t)inum);
#endif
			/*
			 * If journaling is tracking this write we must add
			 * the work to the inode or indirect being written.
			 */
			if (wkhd != NULL) {
				if (lbn < UFS_NDADDR)
					softdep_inode_append(ip,
					    curthread->td_ucred, wkhd);
				else
					softdep_buf_append(ibp, wkhd);
			}
			if (lbn < UFS_NDADDR) {
				DIP_SET(ip, i_db[lbn], bno);
			} else if (I_IS_UFS1(ip)) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
			UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
			/* Claimed: release the lock and stop the free. */
			lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
			return (1);
		}
		if (lbn >= UFS_NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DIAGNOSTIC
		if (snapdebug)
			printf("%s%ju lbn %jd %s %ju size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", (uintmax_t)ip->i_number,
			    (intmax_t)lbn, "for inum", (uintmax_t)inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity. At a minimum we ensure the
		 * integrity of the filesystem metadata, but use the
		 * dopersistence sysctl-setable flag to decide on the
		 * persistence needed for file content data.
		 */
		if (savedcbp != NULL) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if ((vtype == VDIR || dopersistence) &&
			    ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(vp, cbp, lbn)) != 0) {
			/* Write zeroes rather than leave the copy unfilled. */
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if ((vtype == VDIR || dopersistence) &&
			    ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
			break;
		}
		/* Keep the first good copy as the source for later ones. */
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity. At a minimum we
	 * ensure the integrity of the filesystem metadata, but
	 * use the dopersistence sysctl-setable flag to decide on
	 * the persistence needed for file content data.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if ((vtype == VDIR || dopersistence) &&
		    VTOI(vp)->i_effnlink > 0)
			(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (error != 0 && wkhd != NULL)
		softdep_freework(wkhd);
	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snapdata *sn;
	struct vnode *vp;
	struct vnode *lastvp;
	struct inode *ip;
	struct uio auio;
	struct iovec aiov;
	void *snapblklist;
	char *reason;
	daddr_t snaplistsize;
	int error, snaploc, loc;

	/*
	 * XXX The following needs to be set before ffs_truncate or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	lastvp = NULL;
	sn = NULL;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0){
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		/* Decide whether this inode is usable as a snapshot. */
		if (vp->v_type != VREG) {
			reason = "non-file snapshot";
		} else if (!IS_SNAPSHOT(ip)) {
			reason = "non-snapshot";
		} else if (ip->i_size ==
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
			reason = "old format snapshot";
			(void)ffs_truncate(vp, (off_t)0, 0, NOCRED);
			(void)ffs_syncvnode(vp, MNT_WAIT, 0);
		} else {
			reason = NULL;
		}
		if (reason != NULL) {
			printf("ffs_snapshot_mount: %s inode %d\n",
			    reason, fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			/*
			 * Drop the rejected entry and keep the list dense;
			 * snaploc is stepped back so the entry shifted
			 * into this slot is examined next.
			 */
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		/*
		 * Acquire a lock on the snapdata structure, creating it if
		 * necessary.
		 */
		sn = ffs_snapdata_acquire(devvp);
		/*
		 * Change vnode to use shared snapshot lock instead of the
		 * original private lock.
		 */
		vp->v_vnlock = &sn->sn_lock;
		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %ju already on list",
			    (uintmax_t)ip->i_number);
		else
			TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp);
		lastvp = vp;
	}
	vp = lastvp;
	/*
	 * No usable snapshots found.
	 */
	if (sn == NULL || vp == NULL)
		return;
	/*
	 * Allocate the space for the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	/* First read: the list size, stored at this file offset. */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)&snaplistsize;
	aiov.iov_len = sizeof(snaplistsize);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset =
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp);
		return;
	}
	/*
	 * NOTE(review): snaplistsize comes straight from the snapshot
	 * file and is used unchecked as the allocation size; confirm
	 * whether a sanity check is wanted for damaged filesystems.
	 */
	snapblklist = malloc(snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	/*
	 * Second read: the whole list. The offset is rewound over the
	 * size word, so the size is read again as element 0.
	 */
	auio.uio_iovcnt = 1;
	aiov.iov_base = snapblklist;
	aiov.iov_len = snaplistsize * sizeof (daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset -= sizeof(snaplistsize);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp);
		free(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp);
	/* Publish the list and set VV_COPYONWRITE on the device vnode. */
	VI_LOCK(devvp);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
	sn->sn_listsize = snaplistsize;
	sn->sn_blklist = (daddr_t *)snapblklist;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Disassociate snapshot files when unmounting.
 *
 * Each inode is removed from the device's in-core snapshot list and
 * its vnode is switched from the shared snapshot lock back to its
 * private lock. Snapshots that still have a link have a reference
 * dropped with vrele().
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct snapdata *sn;
	struct inode *xp;
	struct vnode *vp;

	VI_LOCK(devvp);
	sn = devvp->v_rdev->si_snapdata;
	while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
		vp = ITOV(xp);
		TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		/* LK_INTERLOCK hands the vnode interlock to lockmgr(). */
		lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp));
		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
		KASSERT(vp->v_vnlock == &sn->sn_lock,
		("ffs_snapshot_unmount: lost lock mutation"));
		vp->v_vnlock = &vp->v_lock;
		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
		lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
		if (xp->i_effnlink > 0)
			vrele(vp);
		/* Re-read si_snapdata after retaking the interlock. */
		VI_LOCK(devvp);
		sn = devvp->v_rdev->si_snapdata;
	}
	try_free_snapdata(devvp);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
}

/*
 * Check whether the buffer belongs to the device buffers that shall be
 * locked after snaplk. devvp shall be locked on entry, and will be
 * left locked upon exit.
2152 */ 2153 static int 2154 ffs_bp_snapblk(devvp, bp) 2155 struct vnode *devvp; 2156 struct buf *bp; 2157 { 2158 struct snapdata *sn; 2159 struct fs *fs; 2160 ufs2_daddr_t lbn, *snapblklist; 2161 int lower, upper, mid; 2162 2163 ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk"); 2164 KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp)); 2165 sn = devvp->v_rdev->si_snapdata; 2166 if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) 2167 return (0); 2168 fs = ITOFS(TAILQ_FIRST(&sn->sn_head)); 2169 lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); 2170 snapblklist = sn->sn_blklist; 2171 upper = sn->sn_listsize - 1; 2172 lower = 1; 2173 while (lower <= upper) { 2174 mid = (lower + upper) / 2; 2175 if (snapblklist[mid] == lbn) 2176 break; 2177 if (snapblklist[mid] < lbn) 2178 lower = mid + 1; 2179 else 2180 upper = mid - 1; 2181 } 2182 if (lower <= upper) 2183 return (1); 2184 return (0); 2185 } 2186 2187 void 2188 ffs_bdflush(bo, bp) 2189 struct bufobj *bo; 2190 struct buf *bp; 2191 { 2192 struct thread *td; 2193 struct vnode *vp, *devvp; 2194 struct buf *nbp; 2195 int bp_bdskip; 2196 2197 if (bo->bo_dirty.bv_cnt <= dirtybufthresh) 2198 return; 2199 2200 td = curthread; 2201 vp = bp->b_vp; 2202 devvp = bo2vnode(bo); 2203 KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp)); 2204 2205 VI_LOCK(devvp); 2206 bp_bdskip = ffs_bp_snapblk(devvp, bp); 2207 if (bp_bdskip) 2208 bdwriteskip++; 2209 VI_UNLOCK(devvp); 2210 if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) { 2211 (void) VOP_FSYNC(vp, MNT_NOWAIT, td); 2212 altbufferflushes++; 2213 } else { 2214 BO_LOCK(bo); 2215 /* 2216 * Try to find a buffer to flush. 2217 */ 2218 TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { 2219 if ((nbp->b_vflags & BV_BKGRDINPROG) || 2220 BUF_LOCK(nbp, 2221 LK_EXCLUSIVE | LK_NOWAIT, NULL)) 2222 continue; 2223 if (bp == nbp) 2224 panic("bdwrite: found ourselves"); 2225 BO_UNLOCK(bo); 2226 /* 2227 * Don't countdeps with the bo lock 2228 * held. 
2229 */ 2230 if (buf_countdeps(nbp, 0)) { 2231 BO_LOCK(bo); 2232 BUF_UNLOCK(nbp); 2233 continue; 2234 } 2235 if (bp_bdskip) { 2236 VI_LOCK(devvp); 2237 if (!ffs_bp_snapblk(vp, nbp)) { 2238 VI_UNLOCK(devvp); 2239 BO_LOCK(bo); 2240 BUF_UNLOCK(nbp); 2241 continue; 2242 } 2243 VI_UNLOCK(devvp); 2244 } 2245 if (nbp->b_flags & B_CLUSTEROK) { 2246 vfs_bio_awrite(nbp); 2247 } else { 2248 bremfree(nbp); 2249 bawrite(nbp); 2250 } 2251 dirtybufferflushes++; 2252 break; 2253 } 2254 if (nbp == NULL) 2255 BO_UNLOCK(bo); 2256 } 2257 } 2258 2259 /* 2260 * Check for need to copy block that is about to be written, 2261 * copying the block if necessary. 2262 */ 2263 int 2264 ffs_copyonwrite(devvp, bp) 2265 struct vnode *devvp; 2266 struct buf *bp; 2267 { 2268 struct snapdata *sn; 2269 struct buf *ibp, *cbp, *savedcbp = NULL; 2270 struct thread *td = curthread; 2271 struct fs *fs; 2272 struct inode *ip; 2273 struct vnode *vp = NULL; 2274 ufs2_daddr_t lbn, blkno, *snapblklist; 2275 int lower, upper, mid, indiroff, error = 0; 2276 int launched_async_io, prev_norunningbuf; 2277 long saved_runningbufspace; 2278 2279 if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp))) 2280 return (0); /* Update on a snapshot file */ 2281 if (td->td_pflags & TDP_COWINPROGRESS) 2282 panic("ffs_copyonwrite: recursive call"); 2283 /* 2284 * First check to see if it is in the preallocated list. 2285 * By doing this check we avoid several potential deadlocks. 
2286 */ 2287 VI_LOCK(devvp); 2288 sn = devvp->v_rdev->si_snapdata; 2289 if (sn == NULL || 2290 TAILQ_EMPTY(&sn->sn_head)) { 2291 VI_UNLOCK(devvp); 2292 return (0); /* No snapshot */ 2293 } 2294 ip = TAILQ_FIRST(&sn->sn_head); 2295 fs = ITOFS(ip); 2296 lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); 2297 snapblklist = sn->sn_blklist; 2298 upper = sn->sn_listsize - 1; 2299 lower = 1; 2300 while (lower <= upper) { 2301 mid = (lower + upper) / 2; 2302 if (snapblklist[mid] == lbn) 2303 break; 2304 if (snapblklist[mid] < lbn) 2305 lower = mid + 1; 2306 else 2307 upper = mid - 1; 2308 } 2309 if (lower <= upper) { 2310 VI_UNLOCK(devvp); 2311 return (0); 2312 } 2313 launched_async_io = 0; 2314 prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF; 2315 /* 2316 * Since I/O on bp isn't yet in progress and it may be blocked 2317 * for a long time waiting on snaplk, back it out of 2318 * runningbufspace, possibly waking other threads waiting for space. 2319 */ 2320 saved_runningbufspace = bp->b_runningbufspace; 2321 if (saved_runningbufspace != 0) 2322 runningbufwakeup(bp); 2323 /* 2324 * Not in the precomputed list, so check the snapshots. 2325 */ 2326 while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 2327 VI_MTX(devvp)) != 0) { 2328 VI_LOCK(devvp); 2329 sn = devvp->v_rdev->si_snapdata; 2330 if (sn == NULL || 2331 TAILQ_EMPTY(&sn->sn_head)) { 2332 VI_UNLOCK(devvp); 2333 if (saved_runningbufspace != 0) { 2334 bp->b_runningbufspace = saved_runningbufspace; 2335 atomic_add_long(&runningbufspace, 2336 bp->b_runningbufspace); 2337 } 2338 return (0); /* Snapshot gone */ 2339 } 2340 } 2341 TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { 2342 vp = ITOV(ip); 2343 if (DOINGSOFTDEP(vp)) 2344 softdep_prealloc(vp, MNT_WAIT); 2345 /* 2346 * We ensure that everything of our own that needs to be 2347 * copied will be done at the time that ffs_snapshot is 2348 * called. Thus we can skip the check here which can 2349 * deadlock in doing the lookup in UFS_BALLOC. 
2350 */ 2351 if (bp->b_vp == vp) 2352 continue; 2353 /* 2354 * Check to see if block needs to be copied. We do not have 2355 * to hold the snapshot lock while doing this lookup as it 2356 * will never require any additional allocations for the 2357 * snapshot inode. 2358 */ 2359 if (lbn < UFS_NDADDR) { 2360 blkno = DIP(ip, i_db[lbn]); 2361 } else { 2362 td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; 2363 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 2364 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 2365 td->td_pflags &= ~TDP_COWINPROGRESS; 2366 if (error) 2367 break; 2368 indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); 2369 if (I_IS_UFS1(ip)) 2370 blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; 2371 else 2372 blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; 2373 bqrelse(ibp); 2374 } 2375 #ifdef INVARIANTS 2376 if (blkno == BLK_SNAP && bp->b_lblkno >= 0) 2377 panic("ffs_copyonwrite: bad copy block"); 2378 #endif 2379 if (blkno != 0) 2380 continue; 2381 /* 2382 * Allocate the block into which to do the copy. Since 2383 * multiple processes may all try to copy the same block, 2384 * we have to recheck our need to do a copy if we sleep 2385 * waiting for the lock. 2386 * 2387 * Because all snapshots on a filesystem share a single 2388 * lock, we ensure that we will never be in competition 2389 * with another process to allocate a block. 
2390 */ 2391 td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; 2392 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 2393 fs->fs_bsize, KERNCRED, 0, &cbp); 2394 td->td_pflags &= ~TDP_COWINPROGRESS; 2395 if (error) 2396 break; 2397 #ifdef DIAGNOSTIC 2398 if (snapdebug) { 2399 printf("Copyonwrite: snapino %ju lbn %jd for ", 2400 (uintmax_t)ip->i_number, (intmax_t)lbn); 2401 if (bp->b_vp == devvp) 2402 printf("fs metadata"); 2403 else 2404 printf("inum %ju", 2405 (uintmax_t)VTOI(bp->b_vp)->i_number); 2406 printf(" lblkno %jd to blkno %jd\n", 2407 (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); 2408 } 2409 #endif 2410 /* 2411 * If we have already read the old block contents, then 2412 * simply copy them to the new block. Note that we need 2413 * to synchronously write snapshots that have not been 2414 * unlinked, and hence will be visible after a crash, 2415 * to ensure their integrity. At a minimum we ensure the 2416 * integrity of the filesystem metadata, but use the 2417 * dopersistence sysctl-setable flag to decide on the 2418 * persistence needed for file content data. 2419 */ 2420 if (savedcbp != NULL) { 2421 bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); 2422 bawrite(cbp); 2423 if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || 2424 dopersistence) && ip->i_effnlink > 0) 2425 (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); 2426 else 2427 launched_async_io = 1; 2428 continue; 2429 } 2430 /* 2431 * Otherwise, read the old block contents into the buffer. 
2432 */ 2433 if ((error = readblock(vp, cbp, lbn)) != 0) { 2434 bzero(cbp->b_data, fs->fs_bsize); 2435 bawrite(cbp); 2436 if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || 2437 dopersistence) && ip->i_effnlink > 0) 2438 (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); 2439 else 2440 launched_async_io = 1; 2441 break; 2442 } 2443 savedcbp = cbp; 2444 } 2445 /* 2446 * Note that we need to synchronously write snapshots that 2447 * have not been unlinked, and hence will be visible after 2448 * a crash, to ensure their integrity. At a minimum we 2449 * ensure the integrity of the filesystem metadata, but 2450 * use the dopersistence sysctl-setable flag to decide on 2451 * the persistence needed for file content data. 2452 */ 2453 if (savedcbp) { 2454 vp = savedcbp->b_vp; 2455 bawrite(savedcbp); 2456 if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || 2457 dopersistence) && VTOI(vp)->i_effnlink > 0) 2458 (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); 2459 else 2460 launched_async_io = 1; 2461 } 2462 lockmgr(vp->v_vnlock, LK_RELEASE, NULL); 2463 td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) | 2464 prev_norunningbuf; 2465 if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0) 2466 waitrunningbufspace(); 2467 /* 2468 * I/O on bp will now be started, so count it in runningbufspace. 2469 */ 2470 if (saved_runningbufspace != 0) { 2471 bp->b_runningbufspace = saved_runningbufspace; 2472 atomic_add_long(&runningbufspace, bp->b_runningbufspace); 2473 } 2474 return (error); 2475 } 2476 2477 /* 2478 * sync snapshots to force freework records waiting on snapshots to claim 2479 * blocks to free. 
2480 */ 2481 void 2482 ffs_sync_snap(mp, waitfor) 2483 struct mount *mp; 2484 int waitfor; 2485 { 2486 struct snapdata *sn; 2487 struct vnode *devvp; 2488 struct vnode *vp; 2489 struct inode *ip; 2490 2491 devvp = VFSTOUFS(mp)->um_devvp; 2492 if ((devvp->v_vflag & VV_COPYONWRITE) == 0) 2493 return; 2494 for (;;) { 2495 VI_LOCK(devvp); 2496 sn = devvp->v_rdev->si_snapdata; 2497 if (sn == NULL) { 2498 VI_UNLOCK(devvp); 2499 return; 2500 } 2501 if (lockmgr(&sn->sn_lock, 2502 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 2503 VI_MTX(devvp)) == 0) 2504 break; 2505 } 2506 TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { 2507 vp = ITOV(ip); 2508 ffs_syncvnode(vp, waitfor, NO_INO_UPDT); 2509 } 2510 lockmgr(&sn->sn_lock, LK_RELEASE, NULL); 2511 } 2512 2513 /* 2514 * Read the specified block into the given buffer. 2515 * Much of this boiler-plate comes from bwrite(). 2516 */ 2517 static int 2518 readblock(vp, bp, lbn) 2519 struct vnode *vp; 2520 struct buf *bp; 2521 ufs2_daddr_t lbn; 2522 { 2523 struct inode *ip; 2524 struct bio *bip; 2525 struct fs *fs; 2526 2527 ip = VTOI(vp); 2528 fs = ITOFS(ip); 2529 2530 bip = g_alloc_bio(); 2531 bip->bio_cmd = BIO_READ; 2532 bip->bio_offset = dbtob(fsbtodb(fs, blkstofrags(fs, lbn))); 2533 bip->bio_data = bp->b_data; 2534 bip->bio_length = bp->b_bcount; 2535 bip->bio_done = NULL; 2536 2537 g_io_request(bip, ITODEVVP(ip)->v_bufobj.bo_private); 2538 bp->b_error = biowait(bip, "snaprdb"); 2539 g_destroy_bio(bip); 2540 return (bp->b_error); 2541 } 2542 2543 #endif 2544 2545 /* 2546 * Process file deletes that were deferred by ufs_inactive() due to 2547 * the file system being suspended. Transfer IN_LAZYACCESS into 2548 * IN_MODIFIED for vnodes that were accessed during suspension. 
2549 */ 2550 void 2551 process_deferred_inactive(struct mount *mp) 2552 { 2553 struct vnode *vp, *mvp; 2554 struct inode *ip; 2555 int error; 2556 2557 (void) vn_start_secondary_write(NULL, &mp, V_WAIT); 2558 loop: 2559 MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { 2560 /* 2561 * IN_LAZYACCESS is checked here without holding any 2562 * vnode lock, but this flag is set only while holding 2563 * vnode interlock. 2564 */ 2565 if (vp->v_type == VNON || 2566 ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 && 2567 ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) { 2568 VI_UNLOCK(vp); 2569 continue; 2570 } 2571 vholdl(vp); 2572 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 2573 if (error != 0) { 2574 vdrop(vp); 2575 if (error == ENOENT) 2576 continue; /* vnode recycled */ 2577 MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); 2578 goto loop; 2579 } 2580 ip = VTOI(vp); 2581 if ((ip->i_flag & IN_LAZYACCESS) != 0) { 2582 ip->i_flag &= ~IN_LAZYACCESS; 2583 UFS_INODE_SET_FLAG(ip, IN_MODIFIED); 2584 } 2585 VI_LOCK(vp); 2586 if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) { 2587 VI_UNLOCK(vp); 2588 VOP_UNLOCK(vp); 2589 vdrop(vp); 2590 continue; 2591 } 2592 vinactive(vp); 2593 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2594 ("process_deferred_inactive: got VI_OWEINACT")); 2595 VI_UNLOCK(vp); 2596 VOP_UNLOCK(vp); 2597 vdrop(vp); 2598 } 2599 vn_finished_secondary_write(mp); 2600 } 2601 2602 #ifndef NO_FFS_SNAPSHOT 2603 2604 static struct snapdata * 2605 ffs_snapdata_alloc(void) 2606 { 2607 struct snapdata *sn; 2608 2609 /* 2610 * Fetch a snapdata from the free list if there is one available. 2611 */ 2612 mtx_lock(&snapfree_lock); 2613 sn = LIST_FIRST(&snapfree); 2614 if (sn != NULL) 2615 LIST_REMOVE(sn, sn_link); 2616 mtx_unlock(&snapfree_lock); 2617 if (sn != NULL) 2618 return (sn); 2619 /* 2620 * If there were no free snapdatas allocate one. 
2621 */ 2622 sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); 2623 TAILQ_INIT(&sn->sn_head); 2624 lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, 2625 LK_CANRECURSE | LK_NOSHARE); 2626 return (sn); 2627 } 2628 2629 /* 2630 * The snapdata is never freed because we can not be certain that 2631 * there are no threads sleeping on the snap lock. Persisting 2632 * them permanently avoids costly synchronization in ffs_lock(). 2633 */ 2634 static void 2635 ffs_snapdata_free(struct snapdata *sn) 2636 { 2637 mtx_lock(&snapfree_lock); 2638 LIST_INSERT_HEAD(&snapfree, sn, sn_link); 2639 mtx_unlock(&snapfree_lock); 2640 } 2641 2642 /* Try to free snapdata associated with devvp */ 2643 static void 2644 try_free_snapdata(struct vnode *devvp) 2645 { 2646 struct snapdata *sn; 2647 ufs2_daddr_t *snapblklist; 2648 2649 ASSERT_VI_LOCKED(devvp, "try_free_snapdata"); 2650 sn = devvp->v_rdev->si_snapdata; 2651 2652 if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL || 2653 (devvp->v_vflag & VV_COPYONWRITE) == 0) { 2654 VI_UNLOCK(devvp); 2655 return; 2656 } 2657 2658 devvp->v_rdev->si_snapdata = NULL; 2659 devvp->v_vflag &= ~VV_COPYONWRITE; 2660 lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp)); 2661 snapblklist = sn->sn_blklist; 2662 sn->sn_blklist = NULL; 2663 sn->sn_listsize = 0; 2664 lockmgr(&sn->sn_lock, LK_RELEASE, NULL); 2665 if (snapblklist != NULL) 2666 free(snapblklist, M_UFSMNT); 2667 ffs_snapdata_free(sn); 2668 } 2669 2670 static struct snapdata * 2671 ffs_snapdata_acquire(struct vnode *devvp) 2672 { 2673 struct snapdata *nsn, *sn; 2674 int error; 2675 2676 /* 2677 * Allocate a free snapdata. This is done before acquiring the 2678 * devvp lock to avoid allocation while the devvp interlock is 2679 * held. 2680 */ 2681 nsn = ffs_snapdata_alloc(); 2682 2683 for (;;) { 2684 VI_LOCK(devvp); 2685 sn = devvp->v_rdev->si_snapdata; 2686 if (sn == NULL) { 2687 /* 2688 * This is the first snapshot on this 2689 * filesystem and we use our pre-allocated 2690 * snapdata. 
Publish sn with the sn_lock 2691 * owned by us, to avoid the race. 2692 */ 2693 error = lockmgr(&nsn->sn_lock, LK_EXCLUSIVE | 2694 LK_NOWAIT, NULL); 2695 if (error != 0) 2696 panic("leaked sn, lockmgr error %d", error); 2697 sn = devvp->v_rdev->si_snapdata = nsn; 2698 VI_UNLOCK(devvp); 2699 nsn = NULL; 2700 break; 2701 } 2702 2703 /* 2704 * There is a snapshots which already exists on this 2705 * filesystem, grab a reference to the common lock. 2706 */ 2707 error = lockmgr(&sn->sn_lock, LK_INTERLOCK | 2708 LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)); 2709 if (error == 0) 2710 break; 2711 } 2712 2713 /* 2714 * Free any unused snapdata. 2715 */ 2716 if (nsn != NULL) 2717 ffs_snapdata_free(nsn); 2718 2719 return (sn); 2720 } 2721 2722 #endif 2723