1 /*- 2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * Further information about snapshots can be obtained from: 5 * 6 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 7 * 1614 Oxford Street mckusick@mckusick.com 8 * Berkeley, CA 94709-1608 +1-510-843-9542 9 * USA 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 34 */ 35 36 #include <sys/cdefs.h> 37 __FBSDID("$FreeBSD$"); 38 39 #include "opt_quota.h" 40 41 #include <sys/param.h> 42 #include <sys/kernel.h> 43 #include <sys/systm.h> 44 #include <sys/conf.h> 45 #include <sys/bio.h> 46 #include <sys/buf.h> 47 #include <sys/fcntl.h> 48 #include <sys/proc.h> 49 #include <sys/namei.h> 50 #include <sys/sched.h> 51 #include <sys/stat.h> 52 #include <sys/malloc.h> 53 #include <sys/mount.h> 54 #include <sys/resource.h> 55 #include <sys/resourcevar.h> 56 #include <sys/vnode.h> 57 58 #include <geom/geom.h> 59 60 #include <ufs/ufs/extattr.h> 61 #include <ufs/ufs/quota.h> 62 #include <ufs/ufs/ufsmount.h> 63 #include <ufs/ufs/inode.h> 64 #include <ufs/ufs/ufs_extern.h> 65 66 #include <ufs/ffs/fs.h> 67 #include <ufs/ffs/ffs_extern.h> 68 69 #define KERNCRED thread0.td_ucred 70 #define DEBUG 1 71 72 #include "opt_ffs.h" 73 74 #ifdef NO_FFS_SNAPSHOT 75 int 76 ffs_snapshot(mp, snapfile) 77 struct mount *mp; 78 char *snapfile; 79 { 80 return (EINVAL); 81 } 82 83 int 84 ffs_snapblkfree(fs, devvp, bno, size, inum) 85 struct fs *fs; 86 struct vnode *devvp; 87 ufs2_daddr_t bno; 88 long size; 89 ino_t inum; 90 { 91 return (EINVAL); 92 } 93 94 void 95 ffs_snapremove(vp) 96 struct vnode *vp; 97 { 98 } 99 100 void 101 ffs_snapshot_mount(mp) 102 struct mount *mp; 103 { 104 } 105 106 void 107 ffs_snapshot_unmount(mp) 108 struct mount *mp; 109 { 110 } 111 112 void 113 ffs_snapgone(ip) 114 struct inode *ip; 115 { 116 } 117 118 int 119 ffs_copyonwrite(devvp, bp) 120 struct vnode *devvp; 121 struct buf *bp; 122 { 123 return (EINVAL); 124 } 125 126 #else 127 128 TAILQ_HEAD(snaphead, inode); 129 130 struct snapdata { 131 LIST_ENTRY(snapdata) sn_link; 132 struct snaphead sn_head; 133 daddr_t sn_listsize; 134 daddr_t *sn_blklist; 135 struct lock sn_lock; 136 }; 137 138 LIST_HEAD(, snapdata) snapfree; 139 static struct mtx snapfree_lock; 140 MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF); 141 142 static int cgaccount(int, struct vnode *, struct buf *, int); 143 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 144 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 145 ufs_lbn_t, int), int); 146 static int indiracct_ufs1(struct vnode *, struct vnode *, int, 147 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 148 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 149 ufs_lbn_t, int), int); 150 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 151 struct fs *, ufs_lbn_t, int); 152 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 153 struct fs *, ufs_lbn_t, int); 154 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 155 struct fs *, ufs_lbn_t, int); 156 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 157 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 158 ufs_lbn_t, int), int); 159 static int indiracct_ufs2(struct vnode *, struct vnode *, int, 160 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 161 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 162 ufs_lbn_t, int), int); 163 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 164 struct fs *, ufs_lbn_t, int); 165 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 166 struct fs *, ufs_lbn_t, int); 167 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 168 struct fs *, ufs_lbn_t, int); 169 static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t); 170 static void try_free_snapdata(struct vnode *devvp); 171 static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp); 172 static int ffs_bp_snapblk(struct vnode *, struct buf *); 173 174 /* 175 * To ensure the consistency of snapshots across crashes, we must 176 * synchronously write out copied blocks before allowing the 177 * originals to be modified. Because of the rather severe speed 178 * penalty that this imposes, the following flag allows this 179 * crash persistence to be disabled. 180 */ 181 int dopersistence = 0; 182 183 #ifdef DEBUG 184 #include <sys/sysctl.h> 185 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); 186 static int snapdebug = 0; 187 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); 188 int collectsnapstats = 0; 189 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 190 0, ""); 191 #endif /* DEBUG */ 192 193 /* 194 * Create a snapshot file and initialize it for the filesystem. 195 */ 196 int 197 ffs_snapshot(mp, snapfile) 198 struct mount *mp; 199 char *snapfile; 200 { 201 ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; 202 int error, cg, snaploc; 203 int i, size, len, loc; 204 int flag; 205 struct timespec starttime = {0, 0}, endtime; 206 char saved_nice = 0; 207 long redo = 0, snaplistsize = 0; 208 int32_t *lp; 209 void *space; 210 struct fs *copy_fs = NULL, *fs; 211 struct thread *td = curthread; 212 struct inode *ip, *xp; 213 struct buf *bp, *nbp, *ibp, *sbp = NULL; 214 struct nameidata nd; 215 struct mount *wrtmp; 216 struct vattr vat; 217 struct vnode *vp, *xvp, *mvp, *devvp; 218 struct uio auio; 219 struct iovec aiov; 220 struct snapdata *sn; 221 struct ufsmount *ump; 222 223 ump = VFSTOUFS(mp); 224 fs = ump->um_fs; 225 sn = NULL; 226 MNT_ILOCK(mp); 227 flag = mp->mnt_flag; 228 MNT_IUNLOCK(mp); 229 230 /* 231 * Need to serialize access to snapshot code per filesystem. 232 */ 233 /* 234 * Assign a snapshot slot in the superblock. 235 */ 236 UFS_LOCK(ump); 237 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 238 if (fs->fs_snapinum[snaploc] == 0) 239 break; 240 UFS_UNLOCK(ump); 241 if (snaploc == FSMAXSNAP) 242 return (ENOSPC); 243 /* 244 * Create the snapshot file. 245 */ 246 restart: 247 NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile, td); 248 if ((error = namei(&nd)) != 0) 249 return (error); 250 if (nd.ni_vp != NULL) { 251 vput(nd.ni_vp); 252 error = EEXIST; 253 } 254 if (nd.ni_dvp->v_mount != mp) 255 error = EXDEV; 256 if (error) { 257 NDFREE(&nd, NDF_ONLY_PNBUF); 258 if (nd.ni_dvp == nd.ni_vp) 259 vrele(nd.ni_dvp); 260 else 261 vput(nd.ni_dvp); 262 return (error); 263 } 264 VATTR_NULL(&vat); 265 vat.va_type = VREG; 266 vat.va_mode = S_IRUSR; 267 vat.va_vaflags |= VA_EXCLUSIVE; 268 if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) 269 wrtmp = NULL; 270 if (wrtmp != mp) 271 panic("ffs_snapshot: mount mismatch"); 272 vfs_rel(wrtmp); 273 if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { 274 NDFREE(&nd, NDF_ONLY_PNBUF); 275 vput(nd.ni_dvp); 276 if ((error = vn_start_write(NULL, &wrtmp, 277 V_XSLEEP | PCATCH)) != 0) 278 return (error); 279 goto restart; 280 } 281 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); 282 VOP_UNLOCK(nd.ni_dvp, 0); 283 if (error) { 284 NDFREE(&nd, NDF_ONLY_PNBUF); 285 vn_finished_write(wrtmp); 286 vrele(nd.ni_dvp); 287 return (error); 288 } 289 vp = nd.ni_vp; 290 vp->v_vflag |= VV_SYSTEM; 291 ip = VTOI(vp); 292 devvp = ip->i_devvp; 293 /* 294 * Allocate and copy the last block contents so as to be able 295 * to set size to that of the filesystem. 296 */ 297 numblks = howmany(fs->fs_size, fs->fs_frag); 298 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 299 fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 300 if (error) 301 goto out; 302 ip->i_size = lblktosize(fs, (off_t)numblks); 303 DIP_SET(ip, i_size, ip->i_size); 304 ip->i_flag |= IN_CHANGE | IN_UPDATE; 305 error = readblock(vp, bp, numblks - 1); 306 bawrite(bp); 307 if (error != 0) 308 goto out; 309 /* 310 * Preallocate critical data structures so that we can copy 311 * them in without further allocation after we suspend all 312 * operations on the filesystem. We would like to just release 313 * the allocated buffers without writing them since they will 314 * be filled in below once we are ready to go, but this upsets 315 * the soft update code, so we go ahead and write the new buffers. 316 * 317 * Allocate all indirect blocks and mark all of them as not 318 * needing to be copied. 319 */ 320 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 321 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 322 fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); 323 if (error) 324 goto out; 325 bawrite(ibp); 326 } 327 /* 328 * Allocate copies for the superblock and its summary information. 329 */ 330 error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 331 0, &nbp); 332 if (error) 333 goto out; 334 bawrite(nbp); 335 blkno = fragstoblks(fs, fs->fs_csaddr); 336 len = howmany(fs->fs_cssize, fs->fs_bsize); 337 for (loc = 0; loc < len; loc++) { 338 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 339 fs->fs_bsize, KERNCRED, 0, &nbp); 340 if (error) 341 goto out; 342 bawrite(nbp); 343 } 344 /* 345 * Allocate all cylinder group blocks. 346 */ 347 for (cg = 0; cg < fs->fs_ncg; cg++) { 348 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 349 fs->fs_bsize, KERNCRED, 0, &nbp); 350 if (error) 351 goto out; 352 bawrite(nbp); 353 if (cg % 10 == 0) 354 ffs_syncvnode(vp, MNT_WAIT); 355 } 356 /* 357 * Copy all the cylinder group maps. Although the 358 * filesystem is still active, we hope that only a few 359 * cylinder groups will change between now and when we 360 * suspend operations. Thus, we will be able to quickly 361 * touch up the few cylinder groups that changed during 362 * the suspension period. 363 */ 364 len = howmany(fs->fs_ncg, NBBY); 365 space = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO); 366 UFS_LOCK(ump); 367 fs->fs_active = space; 368 UFS_UNLOCK(ump); 369 for (cg = 0; cg < fs->fs_ncg; cg++) { 370 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 371 fs->fs_bsize, KERNCRED, 0, &nbp); 372 if (error) 373 goto out; 374 error = cgaccount(cg, vp, nbp, 1); 375 bawrite(nbp); 376 if (cg % 10 == 0) 377 ffs_syncvnode(vp, MNT_WAIT); 378 if (error) 379 goto out; 380 } 381 /* 382 * Change inode to snapshot type file. 383 */ 384 ip->i_flags |= SF_SNAPSHOT; 385 DIP_SET(ip, i_flags, ip->i_flags); 386 ip->i_flag |= IN_CHANGE | IN_UPDATE; 387 /* 388 * Ensure that the snapshot is completely on disk. 389 * Since we have marked it as a snapshot it is safe to 390 * unlock it as no process will be allowed to write to it. 391 */ 392 if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) 393 goto out; 394 VOP_UNLOCK(vp, 0); 395 /* 396 * All allocations are done, so we can now snapshot the system. 397 * 398 * Recind nice scheduling while running with the filesystem suspended. 399 */ 400 if (td->td_proc->p_nice > 0) { 401 struct proc *p; 402 403 p = td->td_proc; 404 PROC_LOCK(p); 405 saved_nice = p->p_nice; 406 sched_nice(p, 0); 407 PROC_UNLOCK(p); 408 } 409 /* 410 * Suspend operation on filesystem. 411 */ 412 for (;;) { 413 vn_finished_write(wrtmp); 414 if ((error = vfs_write_suspend(vp->v_mount)) != 0) { 415 vn_start_write(NULL, &wrtmp, V_WAIT); 416 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 417 goto out; 418 } 419 if (mp->mnt_kern_flag & MNTK_SUSPENDED) 420 break; 421 vn_start_write(NULL, &wrtmp, V_WAIT); 422 } 423 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 424 if (ip->i_effnlink == 0) { 425 error = ENOENT; /* Snapshot file unlinked */ 426 goto out1; 427 } 428 if (collectsnapstats) 429 nanotime(&starttime); 430 431 /* The last block might have changed. Copy it again to be sure. */ 432 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 433 fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 434 if (error != 0) 435 goto out1; 436 error = readblock(vp, bp, numblks - 1); 437 bp->b_flags |= B_VALIDSUSPWRT; 438 bawrite(bp); 439 if (error != 0) 440 goto out1; 441 /* 442 * First, copy all the cylinder group maps that have changed. 443 */ 444 for (cg = 0; cg < fs->fs_ncg; cg++) { 445 if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) 446 continue; 447 redo++; 448 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 449 fs->fs_bsize, KERNCRED, 0, &nbp); 450 if (error) 451 goto out1; 452 error = cgaccount(cg, vp, nbp, 2); 453 bawrite(nbp); 454 if (error) 455 goto out1; 456 } 457 /* 458 * Grab a copy of the superblock and its summary information. 459 * We delay writing it until the suspension is released below. 460 */ 461 error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, 462 KERNCRED, &sbp); 463 if (error) { 464 brelse(sbp); 465 sbp = NULL; 466 goto out1; 467 } 468 loc = blkoff(fs, fs->fs_sblockloc); 469 copy_fs = (struct fs *)(sbp->b_data + loc); 470 bcopy(fs, copy_fs, fs->fs_sbsize); 471 if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) 472 copy_fs->fs_clean = 1; 473 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 474 if (fs->fs_sbsize < size) 475 bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize); 476 size = blkroundup(fs, fs->fs_cssize); 477 if (fs->fs_contigsumsize > 0) 478 size += fs->fs_ncg * sizeof(int32_t); 479 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 480 copy_fs->fs_csp = space; 481 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 482 space = (char *)space + fs->fs_cssize; 483 loc = howmany(fs->fs_cssize, fs->fs_fsize); 484 i = fs->fs_frag - loc % fs->fs_frag; 485 len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; 486 if (len > 0) { 487 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 488 len, KERNCRED, &bp)) != 0) { 489 brelse(bp); 490 free(copy_fs->fs_csp, M_UFSMNT); 491 bawrite(sbp); 492 sbp = NULL; 493 goto out1; 494 } 495 bcopy(bp->b_data, space, (u_int)len); 496 space = (char *)space + len; 497 bp->b_flags |= B_INVAL | B_NOCACHE; 498 brelse(bp); 499 } 500 if (fs->fs_contigsumsize > 0) { 501 copy_fs->fs_maxcluster = lp = space; 502 for (i = 0; i < fs->fs_ncg; i++) 503 *lp++ = fs->fs_contigsumsize; 504 } 505 /* 506 * We must check for active files that have been unlinked 507 * (e.g., with a zero link count). We have to expunge all 508 * trace of these files from the snapshot so that they are 509 * not reclaimed prematurely by fsck or unnecessarily dumped. 510 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 511 * spec_strategy about writing on a suspended filesystem. 512 * Note that we skip unlinked snapshot files as they will 513 * be handled separately below. 514 * 515 * We also calculate the needed size for the snapshot list. 516 */ 517 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 518 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 519 MNT_ILOCK(mp); 520 mp->mnt_kern_flag &= ~MNTK_SUSPENDED; 521 loop: 522 MNT_VNODE_FOREACH(xvp, mp, mvp) { 523 VI_LOCK(xvp); 524 MNT_IUNLOCK(mp); 525 if ((xvp->v_iflag & VI_DOOMED) || 526 (xvp->v_usecount == 0 && 527 (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) || 528 xvp->v_type == VNON || 529 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 530 VI_UNLOCK(xvp); 531 MNT_ILOCK(mp); 532 continue; 533 } 534 /* 535 * We can skip parent directory vnode because it must have 536 * this snapshot file in it. 537 */ 538 if (xvp == nd.ni_dvp) { 539 VI_UNLOCK(xvp); 540 MNT_ILOCK(mp); 541 continue; 542 } 543 vholdl(xvp); 544 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { 545 MNT_ILOCK(mp); 546 MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); 547 vdrop(xvp); 548 goto loop; 549 } 550 VI_LOCK(xvp); 551 if (xvp->v_usecount == 0 && 552 (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) { 553 VI_UNLOCK(xvp); 554 VOP_UNLOCK(xvp, 0); 555 vdrop(xvp); 556 MNT_ILOCK(mp); 557 continue; 558 } 559 VI_UNLOCK(xvp); 560 if (snapdebug) 561 vprint("ffs_snapshot: busy vnode", xvp); 562 if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 && 563 vat.va_nlink > 0) { 564 VOP_UNLOCK(xvp, 0); 565 vdrop(xvp); 566 MNT_ILOCK(mp); 567 continue; 568 } 569 xp = VTOI(xvp); 570 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 571 VOP_UNLOCK(xvp, 0); 572 vdrop(xvp); 573 MNT_ILOCK(mp); 574 continue; 575 } 576 /* 577 * If there is a fragment, clear it here. 578 */ 579 blkno = 0; 580 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 581 if (loc < NDADDR) { 582 len = fragroundup(fs, blkoff(fs, xp->i_size)); 583 if (len != 0 && len < fs->fs_bsize) { 584 ffs_blkfree(ump, copy_fs, vp, 585 DIP(xp, i_db[loc]), len, xp->i_number); 586 blkno = DIP(xp, i_db[loc]); 587 DIP_SET(xp, i_db[loc], 0); 588 } 589 } 590 snaplistsize += 1; 591 if (xp->i_ump->um_fstype == UFS1) 592 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 593 BLK_NOCOPY); 594 else 595 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 596 BLK_NOCOPY); 597 if (blkno) 598 DIP_SET(xp, i_db[loc], blkno); 599 if (!error) 600 error = ffs_freefile(ump, copy_fs, vp, xp->i_number, 601 xp->i_mode); 602 VOP_UNLOCK(xvp, 0); 603 vdrop(xvp); 604 if (error) { 605 free(copy_fs->fs_csp, M_UFSMNT); 606 bawrite(sbp); 607 sbp = NULL; 608 MNT_VNODE_FOREACH_ABORT(mp, mvp); 609 goto out1; 610 } 611 MNT_ILOCK(mp); 612 } 613 MNT_IUNLOCK(mp); 614 /* 615 * Acquire a lock on the snapdata structure, creating it if necessary. 616 */ 617 sn = ffs_snapdata_acquire(devvp); 618 /* 619 * Change vnode to use shared snapshot lock instead of the original 620 * private lock. 621 */ 622 vp->v_vnlock = &sn->sn_lock; 623 lockmgr(&vp->v_lock, LK_RELEASE, NULL); 624 xp = TAILQ_FIRST(&sn->sn_head); 625 /* 626 * If this is the first snapshot on this filesystem, then we need 627 * to allocate the space for the list of preallocated snapshot blocks. 628 * This list will be refined below, but this preliminary one will 629 * keep us out of deadlock until the full one is ready. 630 */ 631 if (xp == NULL) { 632 snapblklist = malloc(snaplistsize * sizeof(daddr_t), 633 M_UFSMNT, M_WAITOK); 634 blkp = &snapblklist[1]; 635 *blkp++ = lblkno(fs, fs->fs_sblockloc); 636 blkno = fragstoblks(fs, fs->fs_csaddr); 637 for (cg = 0; cg < fs->fs_ncg; cg++) { 638 if (fragstoblks(fs, cgtod(fs, cg) > blkno)) 639 break; 640 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 641 } 642 len = howmany(fs->fs_cssize, fs->fs_bsize); 643 for (loc = 0; loc < len; loc++) 644 *blkp++ = blkno + loc; 645 for (; cg < fs->fs_ncg; cg++) 646 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 647 snapblklist[0] = blkp - snapblklist; 648 VI_LOCK(devvp); 649 if (sn->sn_blklist != NULL) 650 panic("ffs_snapshot: non-empty list"); 651 sn->sn_blklist = snapblklist; 652 sn->sn_listsize = blkp - snapblklist; 653 VI_UNLOCK(devvp); 654 } 655 /* 656 * Record snapshot inode. Since this is the newest snapshot, 657 * it must be placed at the end of the list. 658 */ 659 VI_LOCK(devvp); 660 fs->fs_snapinum[snaploc] = ip->i_number; 661 if (ip->i_nextsnap.tqe_prev != 0) 662 panic("ffs_snapshot: %d already on list", ip->i_number); 663 TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); 664 devvp->v_vflag |= VV_COPYONWRITE; 665 VI_UNLOCK(devvp); 666 ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); 667 out1: 668 KASSERT((sn != NULL && sbp != NULL && error == 0) || 669 (sn == NULL && sbp == NULL && error != 0), 670 ("email phk@ and mckusick@")); 671 /* 672 * Resume operation on filesystem. 673 */ 674 vfs_write_resume(vp->v_mount); 675 vn_start_write(NULL, &wrtmp, V_WAIT); 676 if (collectsnapstats && starttime.tv_sec > 0) { 677 nanotime(&endtime); 678 timespecsub(&endtime, &starttime); 679 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 680 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 681 endtime.tv_nsec / 1000000, redo, fs->fs_ncg); 682 } 683 if (sbp == NULL) 684 goto out; 685 /* 686 * Copy allocation information from all the snapshots in 687 * this snapshot and then expunge them from its view. 688 */ 689 TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { 690 if (xp == ip) 691 break; 692 if (xp->i_ump->um_fstype == UFS1) 693 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 694 BLK_SNAP); 695 else 696 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 697 BLK_SNAP); 698 if (error == 0 && xp->i_effnlink == 0) { 699 error = ffs_freefile(ump, 700 copy_fs, 701 vp, 702 xp->i_number, 703 xp->i_mode); 704 } 705 if (error) { 706 fs->fs_snapinum[snaploc] = 0; 707 goto done; 708 } 709 } 710 /* 711 * Allocate space for the full list of preallocated snapshot blocks. 712 */ 713 snapblklist = malloc(snaplistsize * sizeof(daddr_t), 714 M_UFSMNT, M_WAITOK); 715 ip->i_snapblklist = &snapblklist[1]; 716 /* 717 * Expunge the blocks used by the snapshots from the set of 718 * blocks marked as used in the snapshot bitmaps. Also, collect 719 * the list of allocated blocks in i_snapblklist. 720 */ 721 if (ip->i_ump->um_fstype == UFS1) 722 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 723 else 724 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 725 if (error) { 726 fs->fs_snapinum[snaploc] = 0; 727 free(snapblklist, M_UFSMNT); 728 goto done; 729 } 730 if (snaplistsize < ip->i_snapblklist - snapblklist) 731 panic("ffs_snapshot: list too small"); 732 snaplistsize = ip->i_snapblklist - snapblklist; 733 snapblklist[0] = snaplistsize; 734 ip->i_snapblklist = 0; 735 /* 736 * Write out the list of allocated blocks to the end of the snapshot. 737 */ 738 auio.uio_iov = &aiov; 739 auio.uio_iovcnt = 1; 740 aiov.iov_base = (void *)snapblklist; 741 aiov.iov_len = snaplistsize * sizeof(daddr_t); 742 auio.uio_resid = aiov.iov_len; 743 auio.uio_offset = ip->i_size; 744 auio.uio_segflg = UIO_SYSSPACE; 745 auio.uio_rw = UIO_WRITE; 746 auio.uio_td = td; 747 if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 748 fs->fs_snapinum[snaploc] = 0; 749 free(snapblklist, M_UFSMNT); 750 goto done; 751 } 752 /* 753 * Write the superblock and its summary information 754 * to the snapshot. 755 */ 756 blkno = fragstoblks(fs, fs->fs_csaddr); 757 len = howmany(fs->fs_cssize, fs->fs_bsize); 758 space = copy_fs->fs_csp; 759 for (loc = 0; loc < len; loc++) { 760 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); 761 if (error) { 762 brelse(nbp); 763 fs->fs_snapinum[snaploc] = 0; 764 free(snapblklist, M_UFSMNT); 765 goto done; 766 } 767 bcopy(space, nbp->b_data, fs->fs_bsize); 768 space = (char *)space + fs->fs_bsize; 769 bawrite(nbp); 770 } 771 /* 772 * As this is the newest list, it is the most inclusive, so 773 * should replace the previous list. 774 */ 775 VI_LOCK(devvp); 776 space = sn->sn_blklist; 777 sn->sn_blklist = snapblklist; 778 sn->sn_listsize = snaplistsize; 779 VI_UNLOCK(devvp); 780 if (space != NULL) 781 free(space, M_UFSMNT); 782 /* 783 * If another process is currently writing the buffer containing 784 * the inode for this snapshot then a deadlock can occur. Drop 785 * the snapshot lock until the buffer has been written. 786 */ 787 VREF(vp); /* Protect against ffs_snapgone() */ 788 VOP_UNLOCK(vp, 0); 789 (void) bread(ip->i_devvp, 790 fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), 791 (int) fs->fs_bsize, NOCRED, &nbp); 792 brelse(nbp); 793 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 794 if (ip->i_effnlink == 0) 795 error = ENOENT; /* Snapshot file unlinked */ 796 else 797 vrele(vp); /* Drop extra reference */ 798 done: 799 free(copy_fs->fs_csp, M_UFSMNT); 800 bawrite(sbp); 801 out: 802 NDFREE(&nd, NDF_ONLY_PNBUF); 803 if (saved_nice > 0) { 804 struct proc *p; 805 806 p = td->td_proc; 807 PROC_LOCK(p); 808 sched_nice(td->td_proc, saved_nice); 809 PROC_UNLOCK(td->td_proc); 810 } 811 UFS_LOCK(ump); 812 if (fs->fs_active != 0) { 813 free(fs->fs_active, M_DEVBUF); 814 fs->fs_active = 0; 815 } 816 UFS_UNLOCK(ump); 817 MNT_ILOCK(mp); 818 mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); 819 MNT_IUNLOCK(mp); 820 if (error) 821 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td); 822 (void) ffs_syncvnode(vp, MNT_WAIT); 823 if (error) 824 vput(vp); 825 else 826 VOP_UNLOCK(vp, 0); 827 vrele(nd.ni_dvp); 828 vn_finished_write(wrtmp); 829 process_deferred_inactive(mp); 830 return (error); 831 } 832 833 /* 834 * Copy a cylinder group map. All the unallocated blocks are marked 835 * BLK_NOCOPY so that the snapshot knows that it need not copy them 836 * if they are later written. If passno is one, then this is a first 837 * pass, so only setting needs to be done. If passno is 2, then this 838 * is a revision to a previous pass which must be undone as the 839 * replacement pass is done. 840 */ 841 static int 842 cgaccount(cg, vp, nbp, passno) 843 int cg; 844 struct vnode *vp; 845 struct buf *nbp; 846 int passno; 847 { 848 struct buf *bp, *ibp; 849 struct inode *ip; 850 struct cg *cgp; 851 struct fs *fs; 852 ufs2_daddr_t base, numblks; 853 int error, len, loc, indiroff; 854 855 ip = VTOI(vp); 856 fs = ip->i_fs; 857 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 858 (int)fs->fs_cgsize, KERNCRED, &bp); 859 if (error) { 860 brelse(bp); 861 return (error); 862 } 863 cgp = (struct cg *)bp->b_data; 864 if (!cg_chkmagic(cgp)) { 865 brelse(bp); 866 return (EIO); 867 } 868 UFS_LOCK(ip->i_ump); 869 ACTIVESET(fs, cg); 870 /* 871 * Recomputation of summary information might not have been performed 872 * at mount time. Sync up summary information for current cylinder 873 * group while data is in memory to ensure that result of background 874 * fsck is slightly more consistent. 875 */ 876 fs->fs_cs(fs, cg) = cgp->cg_cs; 877 UFS_UNLOCK(ip->i_ump); 878 bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); 879 if (fs->fs_cgsize < fs->fs_bsize) 880 bzero(&nbp->b_data[fs->fs_cgsize], 881 fs->fs_bsize - fs->fs_cgsize); 882 cgp = (struct cg *)nbp->b_data; 883 bqrelse(bp); 884 if (passno == 2) 885 nbp->b_flags |= B_VALIDSUSPWRT; 886 numblks = howmany(fs->fs_size, fs->fs_frag); 887 len = howmany(fs->fs_fpg, fs->fs_frag); 888 base = cgbase(fs, cg) / fs->fs_frag; 889 if (base + len >= numblks) 890 len = numblks - base - 1; 891 loc = 0; 892 if (base < NDADDR) { 893 for ( ; loc < NDADDR; loc++) { 894 if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 895 DIP_SET(ip, i_db[loc], BLK_NOCOPY); 896 else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) 897 DIP_SET(ip, i_db[loc], 0); 898 else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) 899 panic("ffs_snapshot: lost direct block"); 900 } 901 } 902 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), 903 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 904 if (error) { 905 return (error); 906 } 907 indiroff = (base + loc - NDADDR) % NINDIR(fs); 908 for ( ; loc < len; loc++, indiroff++) { 909 if (indiroff >= NINDIR(fs)) { 910 if (passno == 2) 911 ibp->b_flags |= B_VALIDSUSPWRT; 912 bawrite(ibp); 913 error = UFS_BALLOC(vp, 914 lblktosize(fs, (off_t)(base + loc)), 915 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 916 if (error) { 917 return (error); 918 } 919 indiroff = 0; 920 } 921 if (ip->i_ump->um_fstype == UFS1) { 922 if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 923 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 924 BLK_NOCOPY; 925 else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) 926 [indiroff] == BLK_NOCOPY) 927 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; 928 else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) 929 [indiroff] == BLK_NOCOPY) 930 panic("ffs_snapshot: lost indirect block"); 931 continue; 932 } 933 if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 934 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; 935 else if (passno == 2 && 936 ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) 937 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; 938 else if (passno == 1 && 939 ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) 940 panic("ffs_snapshot: lost indirect block"); 941 } 942 if (passno == 2) 943 ibp->b_flags |= B_VALIDSUSPWRT; 944 bdwrite(ibp); 945 return (0); 946 } 947 948 /* 949 * Before expunging a snapshot inode, note all the 950 * blocks that it claims with BLK_SNAP so that fsck will 951 * be able to account for those blocks properly and so 952 * that this snapshot knows that it need not copy them 953 * if the other snapshot holding them is freed. This code 954 * is reproduced once each for UFS1 and UFS2. 955 */ 956 static int 957 expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype) 958 struct vnode *snapvp; 959 struct inode *cancelip; 960 struct fs *fs; 961 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 962 struct fs *, ufs_lbn_t, int); 963 int expungetype; 964 { 965 int i, error, indiroff; 966 ufs_lbn_t lbn, rlbn; 967 ufs2_daddr_t len, blkno, numblks, blksperindir; 968 struct ufs1_dinode *dip; 969 struct thread *td = curthread; 970 struct buf *bp; 971 972 /* 973 * Prepare to expunge the inode. If its inode block has not 974 * yet been copied, then allocate and fill the copy. 975 */ 976 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 977 blkno = 0; 978 if (lbn < NDADDR) { 979 blkno = VTOI(snapvp)->i_din1->di_db[lbn]; 980 } else { 981 td->td_pflags |= TDP_COWINPROGRESS; 982 error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), 983 fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 984 td->td_pflags &= ~TDP_COWINPROGRESS; 985 if (error) 986 return (error); 987 indiroff = (lbn - NDADDR) % NINDIR(fs); 988 blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; 989 bqrelse(bp); 990 } 991 if (blkno != 0) { 992 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) 993 return (error); 994 } else { 995 error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), 996 fs->fs_bsize, KERNCRED, 0, &bp); 997 if (error) 998 return (error); 999 if ((error = readblock(snapvp, bp, lbn)) != 0) 1000 return (error); 1001 } 1002 /* 1003 * Set a snapshot inode to be a zero length file, regular files 1004 * or unlinked snapshots to be completely unallocated. 1005 */ 1006 dip = (struct ufs1_dinode *)bp->b_data + 1007 ino_to_fsbo(fs, cancelip->i_number); 1008 if (expungetype == BLK_NOCOPY || cancelip->i_effnlink == 0) 1009 dip->di_mode = 0; 1010 dip->di_size = 0; 1011 dip->di_blocks = 0; 1012 dip->di_flags &= ~SF_SNAPSHOT; 1013 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); 1014 bdwrite(bp); 1015 /* 1016 * Now go through and expunge all the blocks in the file 1017 * using the function requested. 1018 */ 1019 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1020 if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], 1021 &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype))) 1022 return (error); 1023 if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], 1024 &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype))) 1025 return (error); 1026 blksperindir = 1; 1027 lbn = -NDADDR; 1028 len = numblks - NDADDR; 1029 rlbn = NDADDR; 1030 for (i = 0; len > 0 && i < NIADDR; i++) { 1031 error = indiracct_ufs1(snapvp, ITOV(cancelip), i, 1032 cancelip->i_din1->di_ib[i], lbn, rlbn, len, 1033 blksperindir, fs, acctfunc, expungetype); 1034 if (error) 1035 return (error); 1036 blksperindir *= NINDIR(fs); 1037 lbn -= blksperindir + 1; 1038 len -= blksperindir; 1039 rlbn += blksperindir; 1040 } 1041 return (0); 1042 } 1043 1044 /* 1045 * Descend an indirect block chain for vnode cancelvp accounting for all 1046 * its indirect blocks in snapvp. 1047 */ 1048 static int 1049 indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 1050 blksperindir, fs, acctfunc, expungetype) 1051 struct vnode *snapvp; 1052 struct vnode *cancelvp; 1053 int level; 1054 ufs1_daddr_t blkno; 1055 ufs_lbn_t lbn; 1056 ufs_lbn_t rlbn; 1057 ufs_lbn_t remblks; 1058 ufs_lbn_t blksperindir; 1059 struct fs *fs; 1060 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 1061 struct fs *, ufs_lbn_t, int); 1062 int expungetype; 1063 { 1064 int error, num, i; 1065 ufs_lbn_t subblksperindir; 1066 struct indir indirs[NIADDR + 2]; 1067 ufs1_daddr_t last, *bap; 1068 struct buf *bp; 1069 1070 if (blkno == 0) { 1071 if (expungetype == BLK_NOCOPY) 1072 return (0); 1073 panic("indiracct_ufs1: missing indir"); 1074 } 1075 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1076 return (error); 1077 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1078 panic("indiracct_ufs1: botched params"); 1079 /* 1080 * We have to expand bread here since it will deadlock looking 1081 * up the block number for any blocks that are not in the cache. 1082 */ 1083 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); 1084 bp->b_blkno = fsbtodb(fs, blkno); 1085 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 1086 (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { 1087 brelse(bp); 1088 return (error); 1089 } 1090 /* 1091 * Account for the block pointers in this indirect block. 1092 */ 1093 last = howmany(remblks, blksperindir); 1094 if (last > NINDIR(fs)) 1095 last = NINDIR(fs); 1096 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); 1097 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 1098 bqrelse(bp); 1099 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1100 level == 0 ? rlbn : -1, expungetype); 1101 if (error || level == 0) 1102 goto out; 1103 /* 1104 * Account for the block pointers in each of the indirect blocks 1105 * in the levels below us. 1106 */ 1107 subblksperindir = blksperindir / NINDIR(fs); 1108 for (lbn++, level--, i = 0; i < last; i++) { 1109 error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, 1110 rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); 1111 if (error) 1112 goto out; 1113 rlbn += blksperindir; 1114 lbn -= blksperindir; 1115 remblks -= blksperindir; 1116 } 1117 out: 1118 free(bap, M_DEVBUF); 1119 return (error); 1120 } 1121 1122 /* 1123 * Do both snap accounting and map accounting. 1124 */ 1125 static int 1126 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) 1127 struct vnode *vp; 1128 ufs1_daddr_t *oldblkp, *lastblkp; 1129 struct fs *fs; 1130 ufs_lbn_t lblkno; 1131 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 1132 { 1133 int error; 1134 1135 if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 1136 return (error); 1137 return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 1138 } 1139 1140 /* 1141 * Identify a set of blocks allocated in a snapshot inode. 1142 */ 1143 static int 1144 snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1145 struct vnode *vp; 1146 ufs1_daddr_t *oldblkp, *lastblkp; 1147 struct fs *fs; 1148 ufs_lbn_t lblkno; 1149 int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 1150 { 1151 struct inode *ip = VTOI(vp); 1152 ufs1_daddr_t blkno, *blkp; 1153 ufs_lbn_t lbn; 1154 struct buf *ibp; 1155 int error; 1156 1157 for ( ; oldblkp < lastblkp; oldblkp++) { 1158 blkno = *oldblkp; 1159 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1160 continue; 1161 lbn = fragstoblks(fs, blkno); 1162 if (lbn < NDADDR) { 1163 blkp = &ip->i_din1->di_db[lbn]; 1164 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1165 } else { 1166 error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn), 1167 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 1168 if (error) 1169 return (error); 1170 blkp = &((ufs1_daddr_t *)(ibp->b_data)) 1171 [(lbn - NDADDR) % NINDIR(fs)]; 1172 } 1173 /* 1174 * If we are expunging a snapshot vnode and we 1175 * find a block marked BLK_NOCOPY, then it is 1176 * one that has been allocated to this snapshot after 1177 * we took our current snapshot and can be ignored. 1178 */ 1179 if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { 1180 if (lbn >= NDADDR) 1181 brelse(ibp); 1182 } else { 1183 if (*blkp != 0) 1184 panic("snapacct_ufs1: bad block"); 1185 *blkp = expungetype; 1186 if (lbn >= NDADDR) 1187 bdwrite(ibp); 1188 } 1189 } 1190 return (0); 1191 } 1192 1193 /* 1194 * Account for a set of blocks allocated in a snapshot inode. 1195 */ 1196 static int 1197 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1198 struct vnode *vp; 1199 ufs1_daddr_t *oldblkp, *lastblkp; 1200 struct fs *fs; 1201 ufs_lbn_t lblkno; 1202 int expungetype; 1203 { 1204 ufs1_daddr_t blkno; 1205 struct inode *ip; 1206 ino_t inum; 1207 int acctit; 1208 1209 ip = VTOI(vp); 1210 inum = ip->i_number; 1211 if (lblkno == -1) 1212 acctit = 0; 1213 else 1214 acctit = 1; 1215 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1216 blkno = *oldblkp; 1217 if (blkno == 0 || blkno == BLK_NOCOPY) 1218 continue; 1219 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1220 *ip->i_snapblklist++ = lblkno; 1221 if (blkno == BLK_SNAP) 1222 blkno = blkstofrags(fs, lblkno); 1223 ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); 1224 } 1225 return (0); 1226 } 1227 1228 /* 1229 * Before expunging a snapshot inode, note all the 1230 * blocks that it claims with BLK_SNAP so that fsck will 1231 * be able to account for those blocks properly and so 1232 * that this snapshot knows that it need not copy them 1233 * if the other snapshot holding them is freed. This code 1234 * is reproduced once each for UFS1 and UFS2. 1235 */ 1236 static int 1237 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype) 1238 struct vnode *snapvp; 1239 struct inode *cancelip; 1240 struct fs *fs; 1241 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1242 struct fs *, ufs_lbn_t, int); 1243 int expungetype; 1244 { 1245 int i, error, indiroff; 1246 ufs_lbn_t lbn, rlbn; 1247 ufs2_daddr_t len, blkno, numblks, blksperindir; 1248 struct ufs2_dinode *dip; 1249 struct thread *td = curthread; 1250 struct buf *bp; 1251 1252 /* 1253 * Prepare to expunge the inode. If its inode block has not 1254 * yet been copied, then allocate and fill the copy. 1255 */ 1256 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1257 blkno = 0; 1258 if (lbn < NDADDR) { 1259 blkno = VTOI(snapvp)->i_din2->di_db[lbn]; 1260 } else { 1261 td->td_pflags |= TDP_COWINPROGRESS; 1262 error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), 1263 fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 1264 td->td_pflags &= ~TDP_COWINPROGRESS; 1265 if (error) 1266 return (error); 1267 indiroff = (lbn - NDADDR) % NINDIR(fs); 1268 blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; 1269 bqrelse(bp); 1270 } 1271 if (blkno != 0) { 1272 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) 1273 return (error); 1274 } else { 1275 error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), 1276 fs->fs_bsize, KERNCRED, 0, &bp); 1277 if (error) 1278 return (error); 1279 if ((error = readblock(snapvp, bp, lbn)) != 0) 1280 return (error); 1281 } 1282 /* 1283 * Set a snapshot inode to be a zero length file, regular files 1284 * to be completely unallocated. 1285 */ 1286 dip = (struct ufs2_dinode *)bp->b_data + 1287 ino_to_fsbo(fs, cancelip->i_number); 1288 if (expungetype == BLK_NOCOPY) 1289 dip->di_mode = 0; 1290 dip->di_size = 0; 1291 dip->di_blocks = 0; 1292 dip->di_flags &= ~SF_SNAPSHOT; 1293 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); 1294 bdwrite(bp); 1295 /* 1296 * Now go through and expunge all the blocks in the file 1297 * using the function requested. 1298 */ 1299 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1300 if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], 1301 &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype))) 1302 return (error); 1303 if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], 1304 &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype))) 1305 return (error); 1306 blksperindir = 1; 1307 lbn = -NDADDR; 1308 len = numblks - NDADDR; 1309 rlbn = NDADDR; 1310 for (i = 0; len > 0 && i < NIADDR; i++) { 1311 error = indiracct_ufs2(snapvp, ITOV(cancelip), i, 1312 cancelip->i_din2->di_ib[i], lbn, rlbn, len, 1313 blksperindir, fs, acctfunc, expungetype); 1314 if (error) 1315 return (error); 1316 blksperindir *= NINDIR(fs); 1317 lbn -= blksperindir + 1; 1318 len -= blksperindir; 1319 rlbn += blksperindir; 1320 } 1321 return (0); 1322 } 1323 1324 /* 1325 * Descend an indirect block chain for vnode cancelvp accounting for all 1326 * its indirect blocks in snapvp. 1327 */ 1328 static int 1329 indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 1330 blksperindir, fs, acctfunc, expungetype) 1331 struct vnode *snapvp; 1332 struct vnode *cancelvp; 1333 int level; 1334 ufs2_daddr_t blkno; 1335 ufs_lbn_t lbn; 1336 ufs_lbn_t rlbn; 1337 ufs_lbn_t remblks; 1338 ufs_lbn_t blksperindir; 1339 struct fs *fs; 1340 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1341 struct fs *, ufs_lbn_t, int); 1342 int expungetype; 1343 { 1344 int error, num, i; 1345 ufs_lbn_t subblksperindir; 1346 struct indir indirs[NIADDR + 2]; 1347 ufs2_daddr_t last, *bap; 1348 struct buf *bp; 1349 1350 if (blkno == 0) { 1351 if (expungetype == BLK_NOCOPY) 1352 return (0); 1353 panic("indiracct_ufs2: missing indir"); 1354 } 1355 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1356 return (error); 1357 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1358 panic("indiracct_ufs2: botched params"); 1359 /* 1360 * We have to expand bread here since it will deadlock looking 1361 * up the block number for any blocks that are not in the cache. 1362 */ 1363 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); 1364 bp->b_blkno = fsbtodb(fs, blkno); 1365 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 1366 (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { 1367 brelse(bp); 1368 return (error); 1369 } 1370 /* 1371 * Account for the block pointers in this indirect block. 1372 */ 1373 last = howmany(remblks, blksperindir); 1374 if (last > NINDIR(fs)) 1375 last = NINDIR(fs); 1376 bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); 1377 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 1378 bqrelse(bp); 1379 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1380 level == 0 ? rlbn : -1, expungetype); 1381 if (error || level == 0) 1382 goto out; 1383 /* 1384 * Account for the block pointers in each of the indirect blocks 1385 * in the levels below us. 1386 */ 1387 subblksperindir = blksperindir / NINDIR(fs); 1388 for (lbn++, level--, i = 0; i < last; i++) { 1389 error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, 1390 rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); 1391 if (error) 1392 goto out; 1393 rlbn += blksperindir; 1394 lbn -= blksperindir; 1395 remblks -= blksperindir; 1396 } 1397 out: 1398 free(bap, M_DEVBUF); 1399 return (error); 1400 } 1401 1402 /* 1403 * Do both snap accounting and map accounting. 1404 */ 1405 static int 1406 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) 1407 struct vnode *vp; 1408 ufs2_daddr_t *oldblkp, *lastblkp; 1409 struct fs *fs; 1410 ufs_lbn_t lblkno; 1411 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 1412 { 1413 int error; 1414 1415 if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 1416 return (error); 1417 return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 1418 } 1419 1420 /* 1421 * Identify a set of blocks allocated in a snapshot inode. 1422 */ 1423 static int 1424 snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1425 struct vnode *vp; 1426 ufs2_daddr_t *oldblkp, *lastblkp; 1427 struct fs *fs; 1428 ufs_lbn_t lblkno; 1429 int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 1430 { 1431 struct inode *ip = VTOI(vp); 1432 ufs2_daddr_t blkno, *blkp; 1433 ufs_lbn_t lbn; 1434 struct buf *ibp; 1435 int error; 1436 1437 for ( ; oldblkp < lastblkp; oldblkp++) { 1438 blkno = *oldblkp; 1439 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1440 continue; 1441 lbn = fragstoblks(fs, blkno); 1442 if (lbn < NDADDR) { 1443 blkp = &ip->i_din2->di_db[lbn]; 1444 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1445 } else { 1446 error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn), 1447 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 1448 if (error) 1449 return (error); 1450 blkp = &((ufs2_daddr_t *)(ibp->b_data)) 1451 [(lbn - NDADDR) % NINDIR(fs)]; 1452 } 1453 /* 1454 * If we are expunging a snapshot vnode and we 1455 * find a block marked BLK_NOCOPY, then it is 1456 * one that has been allocated to this snapshot after 1457 * we took our current snapshot and can be ignored. 1458 */ 1459 if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { 1460 if (lbn >= NDADDR) 1461 brelse(ibp); 1462 } else { 1463 if (*blkp != 0) 1464 panic("snapacct_ufs2: bad block"); 1465 *blkp = expungetype; 1466 if (lbn >= NDADDR) 1467 bdwrite(ibp); 1468 } 1469 } 1470 return (0); 1471 } 1472 1473 /* 1474 * Account for a set of blocks allocated in a snapshot inode. 1475 */ 1476 static int 1477 mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1478 struct vnode *vp; 1479 ufs2_daddr_t *oldblkp, *lastblkp; 1480 struct fs *fs; 1481 ufs_lbn_t lblkno; 1482 int expungetype; 1483 { 1484 ufs2_daddr_t blkno; 1485 struct inode *ip; 1486 ino_t inum; 1487 int acctit; 1488 1489 ip = VTOI(vp); 1490 inum = ip->i_number; 1491 if (lblkno == -1) 1492 acctit = 0; 1493 else 1494 acctit = 1; 1495 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1496 blkno = *oldblkp; 1497 if (blkno == 0 || blkno == BLK_NOCOPY) 1498 continue; 1499 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1500 *ip->i_snapblklist++ = lblkno; 1501 if (blkno == BLK_SNAP) 1502 blkno = blkstofrags(fs, lblkno); 1503 ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); 1504 } 1505 return (0); 1506 } 1507 1508 /* 1509 * Decrement extra reference on snapshot when last name is removed. 1510 * It will not be freed until the last open reference goes away. 1511 */ 1512 void 1513 ffs_snapgone(ip) 1514 struct inode *ip; 1515 { 1516 struct inode *xp; 1517 struct fs *fs; 1518 int snaploc; 1519 struct snapdata *sn; 1520 struct ufsmount *ump; 1521 1522 /* 1523 * Find snapshot in incore list. 1524 */ 1525 xp = NULL; 1526 sn = ip->i_devvp->v_rdev->si_snapdata; 1527 if (sn != NULL) 1528 TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) 1529 if (xp == ip) 1530 break; 1531 if (xp != NULL) 1532 vrele(ITOV(ip)); 1533 else if (snapdebug) 1534 printf("ffs_snapgone: lost snapshot vnode %d\n", 1535 ip->i_number); 1536 /* 1537 * Delete snapshot inode from superblock. Keep list dense. 1538 */ 1539 fs = ip->i_fs; 1540 ump = ip->i_ump; 1541 UFS_LOCK(ump); 1542 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 1543 if (fs->fs_snapinum[snaploc] == ip->i_number) 1544 break; 1545 if (snaploc < FSMAXSNAP) { 1546 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { 1547 if (fs->fs_snapinum[snaploc] == 0) 1548 break; 1549 fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; 1550 } 1551 fs->fs_snapinum[snaploc - 1] = 0; 1552 } 1553 UFS_UNLOCK(ump); 1554 } 1555 1556 /* 1557 * Prepare a snapshot file for being removed. 1558 */ 1559 void 1560 ffs_snapremove(vp) 1561 struct vnode *vp; 1562 { 1563 struct inode *ip; 1564 struct vnode *devvp; 1565 struct buf *ibp; 1566 struct fs *fs; 1567 ufs2_daddr_t numblks, blkno, dblk; 1568 int error, loc, last; 1569 struct snapdata *sn; 1570 1571 ip = VTOI(vp); 1572 fs = ip->i_fs; 1573 devvp = ip->i_devvp; 1574 /* 1575 * If active, delete from incore list (this snapshot may 1576 * already have been in the process of being deleted, so 1577 * would not have been active). 1578 * 1579 * Clear copy-on-write flag if last snapshot. 1580 */ 1581 VI_LOCK(devvp); 1582 if (ip->i_nextsnap.tqe_prev != 0) { 1583 sn = devvp->v_rdev->si_snapdata; 1584 TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap); 1585 ip->i_nextsnap.tqe_prev = 0; 1586 VI_UNLOCK(devvp); 1587 lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); 1588 KASSERT(vp->v_vnlock == &sn->sn_lock, 1589 ("ffs_snapremove: lost lock mutation")); 1590 vp->v_vnlock = &vp->v_lock; 1591 VI_LOCK(devvp); 1592 lockmgr(&sn->sn_lock, LK_RELEASE, NULL); 1593 try_free_snapdata(devvp); 1594 } else 1595 VI_UNLOCK(devvp); 1596 /* 1597 * Clear all BLK_NOCOPY fields. Pass any block claims to other 1598 * snapshots that want them (see ffs_snapblkfree below). 1599 */ 1600 for (blkno = 1; blkno < NDADDR; blkno++) { 1601 dblk = DIP(ip, i_db[blkno]); 1602 if (dblk == 0) 1603 continue; 1604 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1605 DIP_SET(ip, i_db[blkno], 0); 1606 else if ((dblk == blkstofrags(fs, blkno) && 1607 ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, 1608 ip->i_number))) { 1609 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - 1610 btodb(fs->fs_bsize)); 1611 DIP_SET(ip, i_db[blkno], 0); 1612 } 1613 } 1614 numblks = howmany(ip->i_size, fs->fs_bsize); 1615 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 1616 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 1617 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 1618 if (error) 1619 continue; 1620 if (fs->fs_size - blkno > NINDIR(fs)) 1621 last = NINDIR(fs); 1622 else 1623 last = fs->fs_size - blkno; 1624 for (loc = 0; loc < last; loc++) { 1625 if (ip->i_ump->um_fstype == UFS1) { 1626 dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc]; 1627 if (dblk == 0) 1628 continue; 1629 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1630 ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; 1631 else if ((dblk == blkstofrags(fs, blkno) && 1632 ffs_snapblkfree(fs, ip->i_devvp, dblk, 1633 fs->fs_bsize, ip->i_number))) { 1634 ip->i_din1->di_blocks -= 1635 btodb(fs->fs_bsize); 1636 ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; 1637 } 1638 continue; 1639 } 1640 dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc]; 1641 if (dblk == 0) 1642 continue; 1643 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1644 ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; 1645 else if ((dblk == blkstofrags(fs, blkno) && 1646 ffs_snapblkfree(fs, ip->i_devvp, dblk, 1647 fs->fs_bsize, ip->i_number))) { 1648 ip->i_din2->di_blocks -= btodb(fs->fs_bsize); 1649 ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; 1650 } 1651 } 1652 bawrite(ibp); 1653 } 1654 /* 1655 * Clear snapshot flag and drop reference. 1656 */ 1657 ip->i_flags &= ~SF_SNAPSHOT; 1658 DIP_SET(ip, i_flags, ip->i_flags); 1659 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1660 #ifdef QUOTA 1661 /* 1662 * Reenable disk quotas for ex-snapshot file. 1663 */ 1664 if (!getinoquota(ip)) 1665 (void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE); 1666 #endif 1667 } 1668 1669 /* 1670 * Notification that a block is being freed. Return zero if the free 1671 * should be allowed to proceed. Return non-zero if the snapshot file 1672 * wants to claim the block. The block will be claimed if it is an 1673 * uncopied part of one of the snapshots. It will be freed if it is 1674 * either a BLK_NOCOPY or has already been copied in all of the snapshots. 1675 * If a fragment is being freed, then all snapshots that care about 1676 * it must make a copy since a snapshot file can only claim full sized 1677 * blocks. Note that if more than one snapshot file maps the block, 1678 * we can pick one at random to claim it. Since none of the snapshots 1679 * can change, we are assurred that they will all see the same unmodified 1680 * image. When deleting a snapshot file (see ffs_snapremove above), we 1681 * must push any of these claimed blocks to one of the other snapshots 1682 * that maps it. These claimed blocks are easily identified as they will 1683 * have a block number equal to their logical block number within the 1684 * snapshot. A copied block can never have this property because they 1685 * must always have been allocated from a BLK_NOCOPY location. 1686 */ 1687 int 1688 ffs_snapblkfree(fs, devvp, bno, size, inum) 1689 struct fs *fs; 1690 struct vnode *devvp; 1691 ufs2_daddr_t bno; 1692 long size; 1693 ino_t inum; 1694 { 1695 struct buf *ibp, *cbp, *savedcbp = 0; 1696 struct thread *td = curthread; 1697 struct inode *ip; 1698 struct vnode *vp = NULL; 1699 ufs_lbn_t lbn; 1700 ufs2_daddr_t blkno; 1701 int indiroff = 0, error = 0, claimedblk = 0; 1702 struct snapdata *sn; 1703 1704 lbn = fragstoblks(fs, bno); 1705 retry: 1706 VI_LOCK(devvp); 1707 sn = devvp->v_rdev->si_snapdata; 1708 if (sn == NULL) { 1709 VI_UNLOCK(devvp); 1710 return (0); 1711 } 1712 if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1713 VI_MTX(devvp)) != 0) 1714 goto retry; 1715 TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { 1716 vp = ITOV(ip); 1717 /* 1718 * Lookup block being written. 1719 */ 1720 if (lbn < NDADDR) { 1721 blkno = DIP(ip, i_db[lbn]); 1722 } else { 1723 td->td_pflags |= TDP_COWINPROGRESS; 1724 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1725 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 1726 td->td_pflags &= ~TDP_COWINPROGRESS; 1727 if (error) 1728 break; 1729 indiroff = (lbn - NDADDR) % NINDIR(fs); 1730 if (ip->i_ump->um_fstype == UFS1) 1731 blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; 1732 else 1733 blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; 1734 } 1735 /* 1736 * Check to see if block needs to be copied. 1737 */ 1738 if (blkno == 0) { 1739 /* 1740 * A block that we map is being freed. If it has not 1741 * been claimed yet, we will claim or copy it (below). 1742 */ 1743 claimedblk = 1; 1744 } else if (blkno == BLK_SNAP) { 1745 /* 1746 * No previous snapshot claimed the block, 1747 * so it will be freed and become a BLK_NOCOPY 1748 * (don't care) for us. 1749 */ 1750 if (claimedblk) 1751 panic("snapblkfree: inconsistent block type"); 1752 if (lbn < NDADDR) { 1753 DIP_SET(ip, i_db[lbn], BLK_NOCOPY); 1754 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1755 } else if (ip->i_ump->um_fstype == UFS1) { 1756 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 1757 BLK_NOCOPY; 1758 bdwrite(ibp); 1759 } else { 1760 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 1761 BLK_NOCOPY; 1762 bdwrite(ibp); 1763 } 1764 continue; 1765 } else /* BLK_NOCOPY or default */ { 1766 /* 1767 * If the snapshot has already copied the block 1768 * (default), or does not care about the block, 1769 * it is not needed. 1770 */ 1771 if (lbn >= NDADDR) 1772 bqrelse(ibp); 1773 continue; 1774 } 1775 /* 1776 * If this is a full size block, we will just grab it 1777 * and assign it to the snapshot inode. Otherwise we 1778 * will proceed to copy it. See explanation for this 1779 * routine as to why only a single snapshot needs to 1780 * claim this block. 1781 */ 1782 if (size == fs->fs_bsize) { 1783 #ifdef DEBUG 1784 if (snapdebug) 1785 printf("%s %d lbn %jd from inum %d\n", 1786 "Grabonremove: snapino", ip->i_number, 1787 (intmax_t)lbn, inum); 1788 #endif 1789 if (lbn < NDADDR) { 1790 DIP_SET(ip, i_db[lbn], bno); 1791 } else if (ip->i_ump->um_fstype == UFS1) { 1792 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; 1793 bdwrite(ibp); 1794 } else { 1795 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno; 1796 bdwrite(ibp); 1797 } 1798 DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size)); 1799 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1800 lockmgr(vp->v_vnlock, LK_RELEASE, NULL); 1801 return (1); 1802 } 1803 if (lbn >= NDADDR) 1804 bqrelse(ibp); 1805 /* 1806 * Allocate the block into which to do the copy. Note that this 1807 * allocation will never require any additional allocations for 1808 * the snapshot inode. 1809 */ 1810 td->td_pflags |= TDP_COWINPROGRESS; 1811 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1812 fs->fs_bsize, KERNCRED, 0, &cbp); 1813 td->td_pflags &= ~TDP_COWINPROGRESS; 1814 if (error) 1815 break; 1816 #ifdef DEBUG 1817 if (snapdebug) 1818 printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n", 1819 "Copyonremove: snapino ", ip->i_number, 1820 (intmax_t)lbn, "for inum", inum, size, 1821 (intmax_t)cbp->b_blkno); 1822 #endif 1823 /* 1824 * If we have already read the old block contents, then 1825 * simply copy them to the new block. Note that we need 1826 * to synchronously write snapshots that have not been 1827 * unlinked, and hence will be visible after a crash, 1828 * to ensure their integrity. 1829 */ 1830 if (savedcbp != 0) { 1831 bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); 1832 bawrite(cbp); 1833 if (dopersistence && ip->i_effnlink > 0) 1834 (void) ffs_syncvnode(vp, MNT_WAIT); 1835 continue; 1836 } 1837 /* 1838 * Otherwise, read the old block contents into the buffer. 1839 */ 1840 if ((error = readblock(vp, cbp, lbn)) != 0) { 1841 bzero(cbp->b_data, fs->fs_bsize); 1842 bawrite(cbp); 1843 if (dopersistence && ip->i_effnlink > 0) 1844 (void) ffs_syncvnode(vp, MNT_WAIT); 1845 break; 1846 } 1847 savedcbp = cbp; 1848 } 1849 /* 1850 * Note that we need to synchronously write snapshots that 1851 * have not been unlinked, and hence will be visible after 1852 * a crash, to ensure their integrity. 1853 */ 1854 if (savedcbp) { 1855 vp = savedcbp->b_vp; 1856 bawrite(savedcbp); 1857 if (dopersistence && VTOI(vp)->i_effnlink > 0) 1858 (void) ffs_syncvnode(vp, MNT_WAIT); 1859 } 1860 /* 1861 * If we have been unable to allocate a block in which to do 1862 * the copy, then return non-zero so that the fragment will 1863 * not be freed. Although space will be lost, the snapshot 1864 * will stay consistent. 1865 */ 1866 lockmgr(vp->v_vnlock, LK_RELEASE, NULL); 1867 return (error); 1868 } 1869 1870 /* 1871 * Associate snapshot files when mounting. 1872 */ 1873 void 1874 ffs_snapshot_mount(mp) 1875 struct mount *mp; 1876 { 1877 struct ufsmount *ump = VFSTOUFS(mp); 1878 struct vnode *devvp = ump->um_devvp; 1879 struct fs *fs = ump->um_fs; 1880 struct thread *td = curthread; 1881 struct snapdata *sn; 1882 struct vnode *vp; 1883 struct vnode *lastvp; 1884 struct inode *ip; 1885 struct uio auio; 1886 struct iovec aiov; 1887 void *snapblklist; 1888 char *reason; 1889 daddr_t snaplistsize; 1890 int error, snaploc, loc; 1891 1892 /* 1893 * XXX The following needs to be set before ffs_truncate or 1894 * VOP_READ can be called. 1895 */ 1896 mp->mnt_stat.f_iosize = fs->fs_bsize; 1897 /* 1898 * Process each snapshot listed in the superblock. 1899 */ 1900 vp = NULL; 1901 lastvp = NULL; 1902 sn = NULL; 1903 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 1904 if (fs->fs_snapinum[snaploc] == 0) 1905 break; 1906 if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc], 1907 LK_EXCLUSIVE, &vp)) != 0){ 1908 printf("ffs_snapshot_mount: vget failed %d\n", error); 1909 continue; 1910 } 1911 ip = VTOI(vp); 1912 if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size == 1913 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { 1914 if ((ip->i_flags & SF_SNAPSHOT) == 0) { 1915 reason = "non-snapshot"; 1916 } else { 1917 reason = "old format snapshot"; 1918 (void)ffs_truncate(vp, (off_t)0, 0, NOCRED, td); 1919 (void)ffs_syncvnode(vp, MNT_WAIT); 1920 } 1921 printf("ffs_snapshot_mount: %s inode %d\n", 1922 reason, fs->fs_snapinum[snaploc]); 1923 vput(vp); 1924 vp = NULL; 1925 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 1926 if (fs->fs_snapinum[loc] == 0) 1927 break; 1928 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 1929 } 1930 fs->fs_snapinum[loc - 1] = 0; 1931 snaploc--; 1932 continue; 1933 } 1934 /* 1935 * Acquire a lock on the snapdata structure, creating it if 1936 * necessary. 1937 */ 1938 sn = ffs_snapdata_acquire(devvp); 1939 /* 1940 * Change vnode to use shared snapshot lock instead of the 1941 * original private lock. 1942 */ 1943 vp->v_vnlock = &sn->sn_lock; 1944 lockmgr(&vp->v_lock, LK_RELEASE, NULL); 1945 /* 1946 * Link it onto the active snapshot list. 1947 */ 1948 VI_LOCK(devvp); 1949 if (ip->i_nextsnap.tqe_prev != 0) 1950 panic("ffs_snapshot_mount: %d already on list", 1951 ip->i_number); 1952 else 1953 TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); 1954 vp->v_vflag |= VV_SYSTEM; 1955 VI_UNLOCK(devvp); 1956 VOP_UNLOCK(vp, 0); 1957 lastvp = vp; 1958 } 1959 vp = lastvp; 1960 /* 1961 * No usable snapshots found. 1962 */ 1963 if (sn == NULL || vp == NULL) 1964 return; 1965 /* 1966 * Allocate the space for the block hints list. We always want to 1967 * use the list from the newest snapshot. 1968 */ 1969 auio.uio_iov = &aiov; 1970 auio.uio_iovcnt = 1; 1971 aiov.iov_base = (void *)&snaplistsize; 1972 aiov.iov_len = sizeof(snaplistsize); 1973 auio.uio_resid = aiov.iov_len; 1974 auio.uio_offset = 1975 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)); 1976 auio.uio_segflg = UIO_SYSSPACE; 1977 auio.uio_rw = UIO_READ; 1978 auio.uio_td = td; 1979 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); 1980 if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 1981 printf("ffs_snapshot_mount: read_1 failed %d\n", error); 1982 VOP_UNLOCK(vp, 0); 1983 return; 1984 } 1985 snapblklist = malloc(snaplistsize * sizeof(daddr_t), 1986 M_UFSMNT, M_WAITOK); 1987 auio.uio_iovcnt = 1; 1988 aiov.iov_base = snapblklist; 1989 aiov.iov_len = snaplistsize * sizeof (daddr_t); 1990 auio.uio_resid = aiov.iov_len; 1991 auio.uio_offset -= sizeof(snaplistsize); 1992 if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 1993 printf("ffs_snapshot_mount: read_2 failed %d\n", error); 1994 VOP_UNLOCK(vp, 0); 1995 free(snapblklist, M_UFSMNT); 1996 return; 1997 } 1998 VOP_UNLOCK(vp, 0); 1999 VI_LOCK(devvp); 2000 ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount"); 2001 sn->sn_listsize = snaplistsize; 2002 sn->sn_blklist = (daddr_t *)snapblklist; 2003 devvp->v_vflag |= VV_COPYONWRITE; 2004 VI_UNLOCK(devvp); 2005 } 2006 2007 /* 2008 * Disassociate snapshot files when unmounting. 2009 */ 2010 void 2011 ffs_snapshot_unmount(mp) 2012 struct mount *mp; 2013 { 2014 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 2015 struct snapdata *sn; 2016 struct inode *xp; 2017 struct vnode *vp; 2018 2019 VI_LOCK(devvp); 2020 sn = devvp->v_rdev->si_snapdata; 2021 while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) { 2022 vp = ITOV(xp); 2023 TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap); 2024 xp->i_nextsnap.tqe_prev = 0; 2025 lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE, 2026 VI_MTX(devvp)); 2027 lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); 2028 KASSERT(vp->v_vnlock == &sn->sn_lock, 2029 ("ffs_snapshot_unmount: lost lock mutation")); 2030 vp->v_vnlock = &vp->v_lock; 2031 lockmgr(&vp->v_lock, LK_RELEASE, NULL); 2032 lockmgr(&sn->sn_lock, LK_RELEASE, NULL); 2033 if (xp->i_effnlink > 0) 2034 vrele(vp); 2035 VI_LOCK(devvp); 2036 sn = devvp->v_rdev->si_snapdata; 2037 } 2038 try_free_snapdata(devvp); 2039 ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount"); 2040 } 2041 2042 /* 2043 * Check the buffer block to be belong to device buffer that shall be 2044 * locked after snaplk. devvp shall be locked on entry, and will be 2045 * leaved locked upon exit. 2046 */ 2047 static int 2048 ffs_bp_snapblk(devvp, bp) 2049 struct vnode *devvp; 2050 struct buf *bp; 2051 { 2052 struct snapdata *sn; 2053 struct fs *fs; 2054 ufs2_daddr_t lbn, *snapblklist; 2055 int lower, upper, mid; 2056 2057 ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk"); 2058 KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp)); 2059 sn = devvp->v_rdev->si_snapdata; 2060 if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) 2061 return (0); 2062 fs = TAILQ_FIRST(&sn->sn_head)->i_fs; 2063 lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); 2064 snapblklist = sn->sn_blklist; 2065 upper = sn->sn_listsize - 1; 2066 lower = 1; 2067 while (lower <= upper) { 2068 mid = (lower + upper) / 2; 2069 if (snapblklist[mid] == lbn) 2070 break; 2071 if (snapblklist[mid] < lbn) 2072 lower = mid + 1; 2073 else 2074 upper = mid - 1; 2075 } 2076 if (lower <= upper) 2077 return (1); 2078 return (0); 2079 } 2080 2081 void 2082 ffs_bdflush(bo, bp) 2083 struct bufobj *bo; 2084 struct buf *bp; 2085 { 2086 struct thread *td; 2087 struct vnode *vp, *devvp; 2088 struct buf *nbp; 2089 int bp_bdskip; 2090 2091 if (bo->bo_dirty.bv_cnt <= dirtybufthresh) 2092 return; 2093 2094 td = curthread; 2095 vp = bp->b_vp; 2096 devvp = bo->__bo_vnode; 2097 KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp)); 2098 2099 VI_LOCK(devvp); 2100 bp_bdskip = ffs_bp_snapblk(devvp, bp); 2101 if (bp_bdskip) 2102 bdwriteskip++; 2103 VI_UNLOCK(devvp); 2104 if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) { 2105 (void) VOP_FSYNC(vp, MNT_NOWAIT, td); 2106 altbufferflushes++; 2107 } else { 2108 BO_LOCK(bo); 2109 /* 2110 * Try to find a buffer to flush. 2111 */ 2112 TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { 2113 if ((nbp->b_vflags & BV_BKGRDINPROG) || 2114 BUF_LOCK(nbp, 2115 LK_EXCLUSIVE | LK_NOWAIT, NULL)) 2116 continue; 2117 if (bp == nbp) 2118 panic("bdwrite: found ourselves"); 2119 BO_UNLOCK(bo); 2120 /* 2121 * Don't countdeps with the bo lock 2122 * held. 2123 */ 2124 if (buf_countdeps(nbp, 0)) { 2125 BO_LOCK(bo); 2126 BUF_UNLOCK(nbp); 2127 continue; 2128 } 2129 if (bp_bdskip) { 2130 VI_LOCK(devvp); 2131 if (!ffs_bp_snapblk(vp, nbp)) { 2132 if (BO_MTX(bo) != VI_MTX(vp)) { 2133 VI_UNLOCK(devvp); 2134 BO_LOCK(bo); 2135 } 2136 BUF_UNLOCK(nbp); 2137 continue; 2138 } 2139 VI_UNLOCK(devvp); 2140 } 2141 if (nbp->b_flags & B_CLUSTEROK) { 2142 vfs_bio_awrite(nbp); 2143 } else { 2144 bremfree(nbp); 2145 bawrite(nbp); 2146 } 2147 dirtybufferflushes++; 2148 break; 2149 } 2150 if (nbp == NULL) 2151 BO_UNLOCK(bo); 2152 } 2153 } 2154 2155 /* 2156 * Check for need to copy block that is about to be written, 2157 * copying the block if necessary. 2158 */ 2159 int 2160 ffs_copyonwrite(devvp, bp) 2161 struct vnode *devvp; 2162 struct buf *bp; 2163 { 2164 struct snapdata *sn; 2165 struct buf *ibp, *cbp, *savedcbp = 0; 2166 struct thread *td = curthread; 2167 struct fs *fs; 2168 struct inode *ip; 2169 struct vnode *vp = 0; 2170 ufs2_daddr_t lbn, blkno, *snapblklist; 2171 int lower, upper, mid, indiroff, error = 0; 2172 int launched_async_io, prev_norunningbuf; 2173 long saved_runningbufspace; 2174 2175 if (devvp != bp->b_vp && (VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0) 2176 return (0); /* Update on a snapshot file */ 2177 if (td->td_pflags & TDP_COWINPROGRESS) 2178 panic("ffs_copyonwrite: recursive call"); 2179 /* 2180 * First check to see if it is in the preallocated list. 2181 * By doing this check we avoid several potential deadlocks. 2182 */ 2183 VI_LOCK(devvp); 2184 sn = devvp->v_rdev->si_snapdata; 2185 if (sn == NULL || 2186 TAILQ_EMPTY(&sn->sn_head)) { 2187 VI_UNLOCK(devvp); 2188 return (0); /* No snapshot */ 2189 } 2190 ip = TAILQ_FIRST(&sn->sn_head); 2191 fs = ip->i_fs; 2192 lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); 2193 snapblklist = sn->sn_blklist; 2194 upper = sn->sn_listsize - 1; 2195 lower = 1; 2196 while (lower <= upper) { 2197 mid = (lower + upper) / 2; 2198 if (snapblklist[mid] == lbn) 2199 break; 2200 if (snapblklist[mid] < lbn) 2201 lower = mid + 1; 2202 else 2203 upper = mid - 1; 2204 } 2205 if (lower <= upper) { 2206 VI_UNLOCK(devvp); 2207 return (0); 2208 } 2209 launched_async_io = 0; 2210 prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF; 2211 /* 2212 * Since I/O on bp isn't yet in progress and it may be blocked 2213 * for a long time waiting on snaplk, back it out of 2214 * runningbufspace, possibly waking other threads waiting for space. 2215 */ 2216 saved_runningbufspace = bp->b_runningbufspace; 2217 if (saved_runningbufspace != 0) 2218 runningbufwakeup(bp); 2219 /* 2220 * Not in the precomputed list, so check the snapshots. 2221 */ 2222 while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 2223 VI_MTX(devvp)) != 0) { 2224 VI_LOCK(devvp); 2225 sn = devvp->v_rdev->si_snapdata; 2226 if (sn == NULL || 2227 TAILQ_EMPTY(&sn->sn_head)) { 2228 VI_UNLOCK(devvp); 2229 if (saved_runningbufspace != 0) { 2230 bp->b_runningbufspace = saved_runningbufspace; 2231 atomic_add_long(&runningbufspace, 2232 bp->b_runningbufspace); 2233 } 2234 return (0); /* Snapshot gone */ 2235 } 2236 } 2237 TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { 2238 vp = ITOV(ip); 2239 /* 2240 * We ensure that everything of our own that needs to be 2241 * copied will be done at the time that ffs_snapshot is 2242 * called. Thus we can skip the check here which can 2243 * deadlock in doing the lookup in UFS_BALLOC. 2244 */ 2245 if (bp->b_vp == vp) 2246 continue; 2247 /* 2248 * Check to see if block needs to be copied. We do not have 2249 * to hold the snapshot lock while doing this lookup as it 2250 * will never require any additional allocations for the 2251 * snapshot inode. 2252 */ 2253 if (lbn < NDADDR) { 2254 blkno = DIP(ip, i_db[lbn]); 2255 } else { 2256 td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; 2257 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 2258 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 2259 td->td_pflags &= ~TDP_COWINPROGRESS; 2260 if (error) 2261 break; 2262 indiroff = (lbn - NDADDR) % NINDIR(fs); 2263 if (ip->i_ump->um_fstype == UFS1) 2264 blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; 2265 else 2266 blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; 2267 bqrelse(ibp); 2268 } 2269 #ifdef INVARIANTS 2270 if (blkno == BLK_SNAP && bp->b_lblkno >= 0) 2271 panic("ffs_copyonwrite: bad copy block"); 2272 #endif 2273 if (blkno != 0) 2274 continue; 2275 /* 2276 * Allocate the block into which to do the copy. Since 2277 * multiple processes may all try to copy the same block, 2278 * we have to recheck our need to do a copy if we sleep 2279 * waiting for the lock. 2280 * 2281 * Because all snapshots on a filesystem share a single 2282 * lock, we ensure that we will never be in competition 2283 * with another process to allocate a block. 2284 */ 2285 td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; 2286 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 2287 fs->fs_bsize, KERNCRED, 0, &cbp); 2288 td->td_pflags &= ~TDP_COWINPROGRESS; 2289 if (error) 2290 break; 2291 #ifdef DEBUG 2292 if (snapdebug) { 2293 printf("Copyonwrite: snapino %d lbn %jd for ", 2294 ip->i_number, (intmax_t)lbn); 2295 if (bp->b_vp == devvp) 2296 printf("fs metadata"); 2297 else 2298 printf("inum %d", VTOI(bp->b_vp)->i_number); 2299 printf(" lblkno %jd to blkno %jd\n", 2300 (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); 2301 } 2302 #endif 2303 /* 2304 * If we have already read the old block contents, then 2305 * simply copy them to the new block. Note that we need 2306 * to synchronously write snapshots that have not been 2307 * unlinked, and hence will be visible after a crash, 2308 * to ensure their integrity. 2309 */ 2310 if (savedcbp != 0) { 2311 bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); 2312 bawrite(cbp); 2313 if (dopersistence && ip->i_effnlink > 0) 2314 (void) ffs_syncvnode(vp, MNT_WAIT); 2315 else 2316 launched_async_io = 1; 2317 continue; 2318 } 2319 /* 2320 * Otherwise, read the old block contents into the buffer. 2321 */ 2322 if ((error = readblock(vp, cbp, lbn)) != 0) { 2323 bzero(cbp->b_data, fs->fs_bsize); 2324 bawrite(cbp); 2325 if (dopersistence && ip->i_effnlink > 0) 2326 (void) ffs_syncvnode(vp, MNT_WAIT); 2327 else 2328 launched_async_io = 1; 2329 break; 2330 } 2331 savedcbp = cbp; 2332 } 2333 /* 2334 * Note that we need to synchronously write snapshots that 2335 * have not been unlinked, and hence will be visible after 2336 * a crash, to ensure their integrity. 2337 */ 2338 if (savedcbp) { 2339 vp = savedcbp->b_vp; 2340 bawrite(savedcbp); 2341 if (dopersistence && VTOI(vp)->i_effnlink > 0) 2342 (void) ffs_syncvnode(vp, MNT_WAIT); 2343 else 2344 launched_async_io = 1; 2345 } 2346 lockmgr(vp->v_vnlock, LK_RELEASE, NULL); 2347 td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) | 2348 prev_norunningbuf; 2349 if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0) 2350 waitrunningbufspace(); 2351 /* 2352 * I/O on bp will now be started, so count it in runningbufspace. 2353 */ 2354 if (saved_runningbufspace != 0) { 2355 bp->b_runningbufspace = saved_runningbufspace; 2356 atomic_add_long(&runningbufspace, bp->b_runningbufspace); 2357 } 2358 return (error); 2359 } 2360 2361 /* 2362 * Read the specified block into the given buffer. 2363 * Much of this boiler-plate comes from bwrite(). 2364 */ 2365 static int 2366 readblock(vp, bp, lbn) 2367 struct vnode *vp; 2368 struct buf *bp; 2369 ufs2_daddr_t lbn; 2370 { 2371 struct inode *ip = VTOI(vp); 2372 struct bio *bip; 2373 2374 bip = g_alloc_bio(); 2375 bip->bio_cmd = BIO_READ; 2376 bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); 2377 bip->bio_data = bp->b_data; 2378 bip->bio_length = bp->b_bcount; 2379 bip->bio_done = NULL; 2380 2381 g_io_request(bip, ip->i_devvp->v_bufobj.bo_private); 2382 bp->b_error = biowait(bip, "snaprdb"); 2383 g_destroy_bio(bip); 2384 return (bp->b_error); 2385 } 2386 2387 #endif 2388 2389 /* 2390 * Process file deletes that were deferred by ufs_inactive() due to 2391 * the file system being suspended. Transfer IN_LAZYACCESS into 2392 * IN_MODIFIED for vnodes that were accessed during suspension. 2393 */ 2394 void 2395 process_deferred_inactive(struct mount *mp) 2396 { 2397 struct vnode *vp, *mvp; 2398 struct inode *ip; 2399 struct thread *td; 2400 int error; 2401 2402 td = curthread; 2403 (void) vn_start_secondary_write(NULL, &mp, V_WAIT); 2404 MNT_ILOCK(mp); 2405 loop: 2406 MNT_VNODE_FOREACH(vp, mp, mvp) { 2407 VI_LOCK(vp); 2408 /* 2409 * IN_LAZYACCESS is checked here without holding any 2410 * vnode lock, but this flag is set only while holding 2411 * vnode interlock. 2412 */ 2413 if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED) != 0 || 2414 ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 && 2415 ((vp->v_iflag & VI_OWEINACT) == 0 || 2416 vp->v_usecount > 0))) { 2417 VI_UNLOCK(vp); 2418 continue; 2419 } 2420 MNT_IUNLOCK(mp); 2421 vholdl(vp); 2422 error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); 2423 if (error != 0) { 2424 vdrop(vp); 2425 MNT_ILOCK(mp); 2426 if (error == ENOENT) 2427 continue; /* vnode recycled */ 2428 MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp); 2429 goto loop; 2430 } 2431 ip = VTOI(vp); 2432 if ((ip->i_flag & IN_LAZYACCESS) != 0) { 2433 ip->i_flag &= ~IN_LAZYACCESS; 2434 ip->i_flag |= IN_MODIFIED; 2435 } 2436 VI_LOCK(vp); 2437 if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) { 2438 VI_UNLOCK(vp); 2439 VOP_UNLOCK(vp, 0); 2440 vdrop(vp); 2441 MNT_ILOCK(mp); 2442 continue; 2443 } 2444 2445 VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, 2446 ("process_deferred_inactive: " 2447 "recursed on VI_DOINGINACT")); 2448 vp->v_iflag |= VI_DOINGINACT; 2449 vp->v_iflag &= ~VI_OWEINACT; 2450 VI_UNLOCK(vp); 2451 (void) VOP_INACTIVE(vp, td); 2452 VI_LOCK(vp); 2453 VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, 2454 ("process_deferred_inactive: lost VI_DOINGINACT")); 2455 VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, 2456 ("process_deferred_inactive: got VI_OWEINACT")); 2457 vp->v_iflag &= ~VI_DOINGINACT; 2458 VI_UNLOCK(vp); 2459 VOP_UNLOCK(vp, 0); 2460 vdrop(vp); 2461 MNT_ILOCK(mp); 2462 } 2463 MNT_IUNLOCK(mp); 2464 vn_finished_secondary_write(mp); 2465 } 2466 2467 #ifndef NO_FFS_SNAPSHOT 2468 2469 static struct snapdata * 2470 ffs_snapdata_alloc(void) 2471 { 2472 struct snapdata *sn; 2473 2474 /* 2475 * Fetch a snapdata from the free list if there is one available. 2476 */ 2477 mtx_lock(&snapfree_lock); 2478 sn = LIST_FIRST(&snapfree); 2479 if (sn != NULL) 2480 LIST_REMOVE(sn, sn_link); 2481 mtx_unlock(&snapfree_lock); 2482 if (sn != NULL) 2483 return (sn); 2484 /* 2485 * If there were no free snapdatas allocate one. 2486 */ 2487 sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); 2488 TAILQ_INIT(&sn->sn_head); 2489 lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, 2490 LK_CANRECURSE | LK_NOSHARE); 2491 return (sn); 2492 } 2493 2494 /* 2495 * The snapdata is never freed because we can not be certain that 2496 * there are no threads sleeping on the snap lock. Persisting 2497 * them permanently avoids costly synchronization in ffs_lock(). 2498 */ 2499 static void 2500 ffs_snapdata_free(struct snapdata *sn) 2501 { 2502 mtx_lock(&snapfree_lock); 2503 LIST_INSERT_HEAD(&snapfree, sn, sn_link); 2504 mtx_unlock(&snapfree_lock); 2505 } 2506 2507 /* Try to free snapdata associated with devvp */ 2508 static void 2509 try_free_snapdata(struct vnode *devvp) 2510 { 2511 struct snapdata *sn; 2512 ufs2_daddr_t *snapblklist; 2513 2514 ASSERT_VI_LOCKED(devvp, "try_free_snapdata"); 2515 sn = devvp->v_rdev->si_snapdata; 2516 2517 if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL || 2518 (devvp->v_vflag & VV_COPYONWRITE) == 0) { 2519 VI_UNLOCK(devvp); 2520 return; 2521 } 2522 2523 devvp->v_rdev->si_snapdata = NULL; 2524 devvp->v_vflag &= ~VV_COPYONWRITE; 2525 lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp)); 2526 snapblklist = sn->sn_blklist; 2527 sn->sn_blklist = NULL; 2528 sn->sn_listsize = 0; 2529 lockmgr(&sn->sn_lock, LK_RELEASE, NULL); 2530 if (snapblklist != NULL) 2531 free(snapblklist, M_UFSMNT); 2532 ffs_snapdata_free(sn); 2533 } 2534 2535 static struct snapdata * 2536 ffs_snapdata_acquire(struct vnode *devvp) 2537 { 2538 struct snapdata *nsn; 2539 struct snapdata *sn; 2540 2541 /* 2542 * Allocate a free snapdata. This is done before acquiring the 2543 * devvp lock to avoid allocation while the devvp interlock is 2544 * held. 2545 */ 2546 nsn = ffs_snapdata_alloc(); 2547 /* 2548 * If there snapshots already exist on this filesystem grab a 2549 * reference to the shared lock. Otherwise this is the first 2550 * snapshot on this filesystem and we need to use our 2551 * pre-allocated snapdata. 2552 */ 2553 VI_LOCK(devvp); 2554 if (devvp->v_rdev->si_snapdata == NULL) { 2555 devvp->v_rdev->si_snapdata = nsn; 2556 nsn = NULL; 2557 } 2558 sn = devvp->v_rdev->si_snapdata; 2559 /* 2560 * Acquire the snapshot lock. 2561 */ 2562 lockmgr(&sn->sn_lock, 2563 LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(devvp)); 2564 /* 2565 * Free any unused snapdata. 2566 */ 2567 if (nsn != NULL) 2568 ffs_snapdata_free(nsn); 2569 2570 return (sn); 2571 } 2572 2573 #endif 2574