/*-
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_quota.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/fcntl.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <geom/geom.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#define KERNCRED thread0.td_ucred
#define DEBUG 1

#include "opt_ffs.h"

#ifdef NO_FFS_SNAPSHOT
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	return (EINVAL);
}

int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;
	ufs2_daddr_t bno;
	long size;
	ino_t inum;
{
	return (EINVAL);
}

void
ffs_snapremove(vp)
	struct vnode *vp;
{
}

void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
}

void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
}

void
ffs_snapgone(ip)
	struct inode *ip;
{
}

int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	return (EINVAL);
}

#else
FEATURE(ffs_snapshot, "FFS snapshot support");

TAILQ_HEAD(snaphead, inode);

struct snapdata {
	LIST_ENTRY(snapdata) sn_link;
	struct snaphead sn_head;
	daddr_t sn_listsize;
	daddr_t *sn_blklist;
	struct lock sn_lock;
};

LIST_HEAD(, snapdata) snapfree;
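/*
 * One snapdata structure is maintained per snapshotted device. The
 * sn_head list links the inodes of all active snapshots on that device,
 * sn_blklist points to an array of the logical block numbers already
 * allocated to snapshots (consulted by the copy-on-write code), and
 * sn_lock is the single lock shared by every snapshot vnode on the
 * device. Retired snapdata structures appear to be cached on the
 * snapfree list, protected by snapfree_lock below (see
 * try_free_snapdata() and ffs_snapdata_acquire()).
 */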
static struct mtx snapfree_lock;
MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF);

static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int, int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int, int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
static void try_free_snapdata(struct vnode *devvp);
static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp);
static int ffs_bp_snapblk(struct vnode *, struct buf *);

/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the following flag allows this
 * crash persistence to be disabled.
 */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
static int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
    0, "");
#endif /* DEBUG */

/*
 * Create a snapshot file and initialize it for the filesystem.
 */
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
	int error, cg, snaploc;
	int i, size, len, loc;
	int flag;
	struct timespec starttime = {0, 0}, endtime;
	char saved_nice = 0;
	long redo = 0, snaplistsize = 0;
	int32_t *lp;
	void *space;
	struct fs *copy_fs = NULL, *fs;
	struct thread *td = curthread;
	struct inode *ip, *xp;
	struct buf *bp, *nbp, *ibp, *sbp = NULL;
	struct nameidata nd;
	struct mount *wrtmp;
	struct vattr vat;
	struct vnode *vp, *xvp, *mvp, *devvp;
	struct uio auio;
	struct iovec aiov;
	struct snapdata *sn;
	struct ufsmount *ump;

	ump = VFSTOUFS(mp);
	fs = ump->um_fs;
	sn = NULL;
	MNT_ILOCK(mp);
	flag = mp->mnt_flag;
	MNT_IUNLOCK(mp);

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	UFS_LOCK(ump);
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	UFS_UNLOCK(ump);
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Create the snapshot file.
	 */
restart:
	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		vput(nd.ni_vp);
		error = EEXIST;
	}
	if (nd.ni_dvp->v_mount != mp)
		error = EXDEV;
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		return (error);
	}
	VATTR_NULL(&vat);
	vat.va_type = VREG;
	vat.va_mode = S_IRUSR;
	vat.va_vaflags |= VA_EXCLUSIVE;
	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
		wrtmp = NULL;
	if (wrtmp != mp)
		panic("ffs_snapshot: mount mismatch");
	vfs_rel(wrtmp);
	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &wrtmp,
		    V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
	VOP_UNLOCK(nd.ni_dvp, 0);
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vn_finished_write(wrtmp);
		vrele(nd.ni_dvp);
		return (error);
	}
	vp = nd.ni_vp;
	vp->v_vflag |= VV_SYSTEM;
	ip = VTOI(vp);
	devvp = ip->i_devvp;
	/*
	 * Allocate and copy the last block contents so as to be able
	 * to set size to that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
	if (error)
		goto out;
	ip->i_size = lblktosize(fs, (off_t)numblks);
	DIP_SET(ip, i_size, ip->i_size);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	error = readblock(vp, bp, numblks - 1);
	bawrite(bp);
	if (error != 0)
		goto out;
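	/*
	 * Worked example (illustrative figures only): on a filesystem
	 * with fs_bsize = 16384, fs_frag = 4, and fs_size = 262144
	 * fragments, numblks = howmany(262144, 4) = 65536, so the
	 * snapshot file is set to 65536 * 16384 bytes, exactly the
	 * size of the filesystem. Every logical block of the
	 * filesystem thus has a slot in the snapshot into which its
	 * before-image can be copied.
	 */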
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
		if (error)
			goto out;
		bawrite(ibp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
		if (cg % 10 == 0)
			ffs_syncvnode(vp, MNT_WAIT);
	}
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	len = howmany(fs->fs_ncg, NBBY);
	space = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
	UFS_LOCK(ump);
	fs->fs_active = space;
	UFS_UNLOCK(ump);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		error = cgaccount(cg, vp, nbp, 1);
		bawrite(nbp);
		if (cg % 10 == 0)
			ffs_syncvnode(vp, MNT_WAIT);
		if (error)
			goto out;
	}
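	/*
	 * fs_active is a bitmap with one bit per cylinder group. Each
	 * call to cgaccount() above set the group's bit (via
	 * ACTIVESET), and the allocation code is expected to clear a
	 * group's bit again whenever the group is modified. After the
	 * filesystem is suspended, only the groups whose bits have
	 * been cleared need to be recopied in the second pass below.
	 */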
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	DIP_SET(ip, i_flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
		goto out;
	VOP_UNLOCK(vp, 0);
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Rescind nice scheduling while running with the filesystem suspended.
	 */
	if (td->td_proc->p_nice > 0) {
		struct proc *p;

		p = td->td_proc;
		PROC_LOCK(p);
		saved_nice = p->p_nice;
		sched_nice(p, 0);
		PROC_UNLOCK(p);
	}
	/*
	 * Suspend operation on filesystem.
	 */
	for (;;) {
		vn_finished_write(wrtmp);
		if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
			vn_start_write(NULL, &wrtmp, V_WAIT);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			goto out;
		}
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (ip->i_effnlink == 0) {
		error = ENOENT;		/* Snapshot file unlinked */
		goto out1;
	}
	if (collectsnapstats)
		nanotime(&starttime);

	/* The last block might have changed. Copy it again to be sure. */
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
	if (error != 0)
		goto out1;
	error = readblock(vp, bp, numblks - 1);
	bp->b_flags |= B_VALIDSUSPWRT;
	bawrite(bp);
	if (error != 0)
		goto out1;
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
			continue;
		redo++;
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		error = cgaccount(cg, vp, nbp, 2);
		bawrite(nbp);
		if (error)
			goto out1;
	}
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
	    KERNCRED, &sbp);
	if (error) {
		brelse(sbp);
		sbp = NULL;
		goto out1;
	}
	loc = blkoff(fs, fs->fs_sblockloc);
	copy_fs = (struct fs *)(sbp->b_data + loc);
	bcopy(fs, copy_fs, fs->fs_sbsize);
	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
		copy_fs->fs_clean = 1;
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
		    len, KERNCRED, &bp)) != 0) {
			brelse(bp);
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		bcopy(bp->b_data, space, (u_int)len);
		space = (char *)space + len;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
	}
	if (fs->fs_contigsumsize > 0) {
		copy_fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/*
	 * We must check for active files that have been unlinked
	 * (e.g., with a zero link count). We have to expunge all
	 * trace of these files from the snapshot so that they are
	 * not reclaimed prematurely by fsck or unnecessarily dumped.
	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
	 * spec_strategy about writing on a suspended filesystem.
	 * Note that we skip unlinked snapshot files as they will
	 * be handled separately below.
	 *
	 * We also calculate the needed size for the snapshot list.
	 */
	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
	MNT_ILOCK(mp);
	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
loop:
	MNT_VNODE_FOREACH(xvp, mp, mvp) {
		VI_LOCK(xvp);
		MNT_IUNLOCK(mp);
		if ((xvp->v_iflag & VI_DOOMED) ||
		    (xvp->v_usecount == 0 &&
		     (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) ||
		    xvp->v_type == VNON ||
		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
			VI_UNLOCK(xvp);
			MNT_ILOCK(mp);
			continue;
		}
		/*
		 * We can skip parent directory vnode because it must have
		 * this snapshot file in it.
		 */
		if (xvp == nd.ni_dvp) {
			VI_UNLOCK(xvp);
			MNT_ILOCK(mp);
			continue;
		}
		vholdl(xvp);
		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) {
			MNT_ILOCK(mp);
			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
			vdrop(xvp);
			goto loop;
		}
		VI_LOCK(xvp);
		if (xvp->v_usecount == 0 &&
		    (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) {
			VI_UNLOCK(xvp);
			VOP_UNLOCK(xvp, 0);
			vdrop(xvp);
			MNT_ILOCK(mp);
			continue;
		}
		VI_UNLOCK(xvp);
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
		if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 &&
		    vat.va_nlink > 0) {
			VOP_UNLOCK(xvp, 0);
			vdrop(xvp);
			MNT_ILOCK(mp);
			continue;
		}
		xp = VTOI(xvp);
		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
			VOP_UNLOCK(xvp, 0);
			vdrop(xvp);
			MNT_ILOCK(mp);
			continue;
		}
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len != 0 && len < fs->fs_bsize) {
				ffs_blkfree(ump, copy_fs, vp,
				    DIP(xp, i_db[loc]), len, xp->i_number,
				    NULL);
				blkno = DIP(xp, i_db[loc]);
				DIP_SET(xp, i_db[loc], 0);
			}
		}
		snaplistsize += 1;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
			    BLK_NOCOPY, 1);
		else
			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
			    BLK_NOCOPY, 1);
		if (blkno)
			DIP_SET(xp, i_db[loc], blkno);
		if (!error)
			error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
			    xp->i_mode, NULL);
		VOP_UNLOCK(xvp, 0);
		vdrop(xvp);
		if (error) {
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			MNT_VNODE_FOREACH_ABORT(mp, mvp);
			goto out1;
		}
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	/*
	 * Erase the journal file from the snapshot.
	 */
	if (fs->fs_flags & FS_SUJ) {
		error = softdep_journal_lookup(mp, &xvp);
		if (error) {
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		xp = VTOI(xvp);
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
			    BLK_NOCOPY, 0);
		else
			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
			    BLK_NOCOPY, 0);
		vput(xvp);
	}
	/*
	 * Acquire a lock on the snapdata structure, creating it if necessary.
	 */
	sn = ffs_snapdata_acquire(devvp);
	/*
	 * Change vnode to use shared snapshot lock instead of the original
	 * private lock.
	 */
	vp->v_vnlock = &sn->sn_lock;
	lockmgr(&vp->v_lock, LK_RELEASE, NULL);
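	/*
	 * From here on the snapshot vnode shares sn_lock with every
	 * other snapshot on the device: acquiring the snapshot lock
	 * once suffices to lock all of them, which is what the
	 * block-free path in ffs_snapblkfree() below relies on.
	 */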
	xp = TAILQ_FIRST(&sn->sn_head);
	/*
	 * If this is the first snapshot on this filesystem, then we need
	 * to allocate the space for the list of preallocated snapshot blocks.
	 * This list will be refined below, but this preliminary one will
	 * keep us out of deadlock until the full one is ready.
	 */
	if (xp == NULL) {
		snapblklist = malloc(snaplistsize * sizeof(daddr_t),
		    M_UFSMNT, M_WAITOK);
		blkp = &snapblklist[1];
		*blkp++ = lblkno(fs, fs->fs_sblockloc);
		blkno = fragstoblks(fs, fs->fs_csaddr);
		for (cg = 0; cg < fs->fs_ncg; cg++) {
			if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
				break;
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		}
		len = howmany(fs->fs_cssize, fs->fs_bsize);
		for (loc = 0; loc < len; loc++)
			*blkp++ = blkno + loc;
		for (; cg < fs->fs_ncg; cg++)
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		snapblklist[0] = blkp - snapblklist;
		VI_LOCK(devvp);
		if (sn->sn_blklist != NULL)
			panic("ffs_snapshot: non-empty list");
		sn->sn_blklist = snapblklist;
		sn->sn_listsize = blkp - snapblklist;
		VI_UNLOCK(devvp);
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	VI_LOCK(devvp);
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %d already on list", ip->i_number);
	TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
out1:
	KASSERT((sn != NULL && sbp != NULL && error == 0) ||
	    (sn == NULL && sbp == NULL && error != 0),
	    ("email phk@ and mckusick@"));
	/*
	 * Resume operation on filesystem.
	 */
	vfs_write_resume(vp->v_mount);
	vn_start_write(NULL, &wrtmp, V_WAIT);
	if (collectsnapstats && starttime.tv_sec > 0) {
		nanotime(&endtime);
		timespecsub(&endtime, &starttime);
		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
	}
	if (sbp == NULL)
		goto out;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) {
		if (xp == ip)
			break;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
			    BLK_SNAP, 0);
		else
			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
			    BLK_SNAP, 0);
		if (error == 0 && xp->i_effnlink == 0) {
			error = ffs_freefile(ump, copy_fs, vp,
			    xp->i_number, xp->i_mode, NULL);
		}
		if (error) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	snapblklist = malloc(snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snapblklist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	if (ip->i_ump->um_fstype == UFS1)
		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1,
		    BLK_SNAP, 0);
	else
		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2,
		    BLK_SNAP, 0);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		free(snapblklist, M_UFSMNT);
		goto done;
	}
	if (snaplistsize < ip->i_snapblklist - snapblklist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snapblklist;
	snapblklist[0] = snaplistsize;
	ip->i_snapblklist = 0;
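	/*
	 * Layout of the block hint list appended to the snapshot file
	 * (as written below): snapblklist[0] holds the number of
	 * entries, including itself, and snapblklist[1..n-1] hold the
	 * logical block numbers already allocated to the snapshot.
	 * The list is read back by ffs_snapshot_mount() when the
	 * filesystem is remounted.
	 */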
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)snapblklist;
	aiov.iov_len = snaplistsize * sizeof(daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset = ip->i_size;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		fs->fs_snapinum[snaploc] = 0;
		free(snapblklist, M_UFSMNT);
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			fs->fs_snapinum[snaploc] = 0;
			free(snapblklist, M_UFSMNT);
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
	/*
	 * As this is the newest list, it is the most inclusive, so
	 * should replace the previous list.
	 */
	VI_LOCK(devvp);
	space = sn->sn_blklist;
	sn->sn_blklist = snapblklist;
	sn->sn_listsize = snaplistsize;
	VI_UNLOCK(devvp);
	if (space != NULL)
		free(space, M_UFSMNT);
	/*
	 * If another process is currently writing the buffer containing
	 * the inode for this snapshot then a deadlock can occur. Drop
	 * the snapshot lock until the buffer has been written.
	 */
	VREF(vp);	/* Protect against ffs_snapgone() */
	VOP_UNLOCK(vp, 0);
	(void) bread(ip->i_devvp,
	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
	    (int) fs->fs_bsize, NOCRED, &nbp);
	brelse(nbp);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if (ip->i_effnlink == 0)
		error = ENOENT;		/* Snapshot file unlinked */
	else
		vrele(vp);		/* Drop extra reference */
done:
	free(copy_fs->fs_csp, M_UFSMNT);
	bawrite(sbp);
out:
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (saved_nice > 0) {
		struct proc *p;

		p = td->td_proc;
		PROC_LOCK(p);
		sched_nice(p, saved_nice);
		PROC_UNLOCK(p);
	}
	UFS_LOCK(ump);
	if (fs->fs_active != 0) {
		free(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	UFS_UNLOCK(ump);
	MNT_ILOCK(mp);
	mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
	MNT_IUNLOCK(mp);
	if (error)
		(void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
	(void) ffs_syncvnode(vp, MNT_WAIT);
	if (error)
		vput(vp);
	else
		VOP_UNLOCK(vp, 0);
	vrele(nd.ni_dvp);
	vn_finished_write(wrtmp);
	process_deferred_inactive(mp);
	return (error);
}
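/*
 * For reference: ffs_snapshot() is normally reached from userland via
 * "mount -u -o snapshot" (see mksnap_ffs(8)), with snapfile naming the
 * snapshot file to be created.
 */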
/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;
	struct vnode *vp;
	struct buf *nbp;
	int passno;
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	UFS_LOCK(ip->i_ump);
	ACTIVESET(fs, cg);
	/*
	 * Recomputation of summary information might not have been performed
	 * at mount time. Sync up summary information for current cylinder
	 * group while data is in memory to ensure that result of background
	 * fsck is slightly more consistent.
	 */
	fs->fs_cs(fs, cg) = cgp->cg_cs;
	UFS_UNLOCK(ip->i_ump);
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	cgp = (struct cg *)nbp->b_data;
	bqrelse(bp);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cgbase(fs, cg) / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP_SET(ip, i_db[loc], BLK_NOCOPY);
			else if (passno == 2 &&
			    DIP(ip, i_db[loc]) == BLK_NOCOPY)
				DIP_SET(ip, i_db[loc], 0);
			else if (passno == 1 &&
			    DIP(ip, i_db[loc]) == BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error)
		return (error);
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			indiroff = 0;
		}
		if (ip->i_ump->um_fstype == UFS1) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)
			    (ibp->b_data))[indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)
			    (ibp->b_data))[indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}
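/*
 * Summary of the distinguished block pointer values used in snapshot
 * inodes by the code above and below: a zero pointer means the block
 * has not yet been copied and must be copied before the underlying
 * block is rewritten; BLK_NOCOPY means the block was free when the
 * snapshot was taken (or was allocated to the snapshot afterwards)
 * and never needs to be copied; BLK_SNAP means the block is claimed
 * by a snapshot and is passed between snapshots rather than copied
 * (see ffs_snapblkfree() below).
 */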
/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
	int clearmode;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
	} else {
		if (DOINGSOFTDEP(snapvp))
			softdep_prealloc(snapvp, MNT_WAIT);
		td->td_pflags |= TDP_COWINPROGRESS;
		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(snapvp, bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * or unlinked snapshots to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (clearmode || cancelip->i_effnlink == 0)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}
/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs1_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs1: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs1: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	free(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}
/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din1->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct_ufs1: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs1_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL);
	}
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
	int clearmode;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
	} else {
		if (DOINGSOFTDEP(snapvp))
			softdep_prealloc(snapvp, MNT_WAIT);
		td->td_pflags |= TDP_COWINPROGRESS;
		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(snapvp, bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (clearmode || cancelip->i_effnlink == 0)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}
/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs2_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs2: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs2: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	free(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}
/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din2->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct_ufs2: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum, NULL);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;
	struct snapdata *sn;
	struct ufsmount *ump;

	/*
	 * Find snapshot in incore list.
	 */
	xp = NULL;
	sn = ip->i_devvp->v_rdev->si_snapdata;
	if (sn != NULL)
		TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
			if (xp == ip)
				break;
	if (xp != NULL)
		vrele(ITOV(ip));
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
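	/*
	 * Example: if fs_snapinum[] held { 5, 9, 12, 0, ... } and
	 * snapshot inode 9 is going away, the loop below slides the
	 * later entries down to give { 5, 12, 0, 0, ... }, keeping
	 * the list dense.
	 */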
	fs = ip->i_fs;
	ump = ip->i_ump;
	UFS_LOCK(ump);
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	UFS_UNLOCK(ump);
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct buf *ibp;
	struct fs *fs;
	ufs2_daddr_t numblks, blkno, dblk;
	int error, loc, last;
	struct snapdata *sn;

	ip = VTOI(vp);
	fs = ip->i_fs;
	devvp = ip->i_devvp;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	VI_LOCK(devvp);
	if (ip->i_nextsnap.tqe_prev != 0) {
		sn = devvp->v_rdev->si_snapdata;
		TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		VI_UNLOCK(devvp);
		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
		KASSERT(vp->v_vnlock == &sn->sn_lock,
		    ("ffs_snapremove: lost lock mutation"));
		vp->v_vnlock = &vp->v_lock;
		VI_LOCK(devvp);
		lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
		try_free_snapdata(devvp);
	} else
		VI_UNLOCK(devvp);
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == 0)
			continue;
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP_SET(ip, i_db[blkno], 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
			    btodb(fs->fs_bsize));
			DIP_SET(ip, i_db[blkno], 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == 0)
					continue;
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc] = 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
				     fs->fs_bsize, ip->i_number))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc] = 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == 0)
				continue;
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
			     fs->fs_bsize, ip->i_number))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP_SET(ip, i_flags, ip->i_flags);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * The dirtied indirects must be written out before
	 * softdep_setup_freeblocks() is called. Otherwise indir_trunc()
	 * may find indirect pointers using the magic BLK_* values.
	 */
	if (DOINGSOFTDEP(vp))
		ffs_syncvnode(vp, MNT_WAIT);
#ifdef QUOTA
	/*
	 * Reenable disk quotas for ex-snapshot file.
	 */
	if (!getinoquota(ip))
		(void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
#endif
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because it
 * must always have been allocated from a BLK_NOCOPY location.
 */
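/*
 * Example of the claim rule described above: when a full-sized block at
 * logical block 100 is freed and some snapshot still maps it uncopied,
 * that snapshot simply records the block as its own; its pointer for
 * lbn 100 then equals blkstofrags(fs, 100), the self-referential
 * pattern that ffs_snapremove() uses to recognize claimed blocks. A
 * fragment can never be claimed, so each interested snapshot copies it
 * instead (the size != fs_bsize case below).
 */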
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;
	ufs2_daddr_t bno;
	long size;
	ino_t inum;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int indiroff = 0, error = 0, claimedblk = 0;
	struct snapdata *sn;

	lbn = fragstoblks(fs, bno);
retry:
	VI_LOCK(devvp);
	sn = devvp->v_rdev->si_snapdata;
	if (sn == NULL) {
		VI_UNLOCK(devvp);
		return (0);
	}
	if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
	    VI_MTX(devvp)) != 0)
		goto retry;
	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
		vp = ITOV(ip);
		if (DOINGSOFTDEP(vp))
			softdep_prealloc(vp, MNT_WAIT);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno = ((ufs1_daddr_t *)
				    (ibp->b_data))[indiroff];
			else
				blkno = ((ufs2_daddr_t *)
				    (ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (lbn < NDADDR) {
				DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %jd from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    (intmax_t)lbn, inum);
#endif
			if (lbn < NDADDR) {
				DIP_SET(ip, i_db[lbn], bno);
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", ip->i_number,
			    (intmax_t)lbn, "for inum", inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(vp, cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) ffs_syncvnode(vp, MNT_WAIT);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed.  Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snapdata *sn;
	struct vnode *vp;
	struct vnode *lastvp;
	struct inode *ip;
	struct uio auio;
	struct iovec aiov;
	void *snapblklist;
	char *reason;
	daddr_t snaplistsize;
	int error, snaploc, loc;

	/*
	 * XXX The following needs to be set before ffs_truncate or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	lastvp = NULL;
	sn = NULL;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
				reason = "non-snapshot";
			} else {
				reason = "old format snapshot";
				(void)ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
				(void)ffs_syncvnode(vp, MNT_WAIT);
			}
			printf("ffs_snapshot_mount: %s inode %d\n",
			    reason, fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		/*
		 * Acquire a lock on the snapdata structure, creating it if
		 * necessary.
		 */
		sn = ffs_snapdata_acquire(devvp);
		/*
		 * Change vnode to use shared snapshot lock instead of the
		 * original private lock.
		 */
		vp->v_vnlock = &sn->sn_lock;
		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp, 0);
		lastvp = vp;
	}
	vp = lastvp;
	/*
	 * No usable snapshots found.
	 */
	if (sn == NULL || vp == NULL)
		return;
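	/*
	 * On-disk layout assumed by the two reads below (derived from
	 * the offsets used here and from the search bounds in
	 * ffs_bp_snapblk()): the hints list is stored just past the
	 * last file system block of the snapshot file and begins with
	 * a size word, one daddr_t wide.  The first read fetches only
	 * that word; uio_offset is then backed up by
	 * sizeof(snaplistsize) so that the second read fetches the
	 * whole list with the size word as element zero, which is why
	 * the binary searches over sn_blklist start at index 1.
	 */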
	/*
	 * Allocate the space for the block hints list.  We always want to
	 * use the list from the newest snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)&snaplistsize;
	aiov.iov_len = sizeof(snaplistsize);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset =
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp, 0);
		return;
	}
	snapblklist = malloc(snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	auio.uio_iovcnt = 1;
	aiov.iov_base = snapblklist;
	aiov.iov_len = snaplistsize * sizeof (daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset -= sizeof(snaplistsize);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp, 0);
		free(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp, 0);
	VI_LOCK(devvp);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
	sn->sn_listsize = snaplistsize;
	sn->sn_blklist = (daddr_t *)snapblklist;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct snapdata *sn;
	struct inode *xp;
	struct vnode *vp;

	VI_LOCK(devvp);
	sn = devvp->v_rdev->si_snapdata;
	while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
		vp = ITOV(xp);
		TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
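		/*
		 * Reverse the lock mutation done when the snapshot was
		 * activated: with snaplk held (taken below while the
		 * interlock is dropped) and the vnode's private lock
		 * held, v_vnlock is pointed back at v_lock, so later
		 * lockers of this vnode no longer contend on the
		 * shared snapshot lock.
		 */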
		lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp));
		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
		KASSERT(vp->v_vnlock == &sn->sn_lock,
		    ("ffs_snapshot_unmount: lost lock mutation"));
		vp->v_vnlock = &vp->v_lock;
		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
		lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
		if (xp->i_effnlink > 0)
			vrele(vp);
		VI_LOCK(devvp);
		sn = devvp->v_rdev->si_snapdata;
	}
	try_free_snapdata(devvp);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
}

/*
 * Check whether the block underlying the buffer belongs to a snapshot,
 * in which case the buffer may only be locked after snaplk.  devvp must
 * be interlocked on entry and is left interlocked on exit.
 */
static int
ffs_bp_snapblk(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct snapdata *sn;
	struct fs *fs;
	ufs2_daddr_t lbn, *snapblklist;
	int lower, upper, mid;

	ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
	KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));
	sn = devvp->v_rdev->si_snapdata;
	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL)
		return (0);
	fs = TAILQ_FIRST(&sn->sn_head)->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = sn->sn_blklist;
	upper = sn->sn_listsize - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper)
		return (1);
	return (0);
}
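
/*
 * Example of the search above (hypothetical values): with
 * sn_blklist == { 5, 12, 34, 56, 78 } and sn_listsize == 5,
 * element 0 holds the list size and the candidate blocks are
 * entries 1..4, hence lower starts at 1 and upper at
 * sn_listsize - 1.  A lookup of lbn 34 terminates with
 * lower <= upper and returns 1; a lookup of lbn 40 drives
 * lower past upper and returns 0.
 */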

void
ffs_bdflush(bo, bp)
	struct bufobj *bo;
	struct buf *bp;
{
	struct thread *td;
	struct vnode *vp, *devvp;
	struct buf *nbp;
	int bp_bdskip;

	if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
		return;

	td = curthread;
	vp = bp->b_vp;
	devvp = bo->__bo_vnode;
	KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));

	VI_LOCK(devvp);
	bp_bdskip = ffs_bp_snapblk(devvp, bp);
	if (bp_bdskip)
		bdwriteskip++;
	VI_UNLOCK(devvp);
	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
		(void) VOP_FSYNC(vp, MNT_NOWAIT, td);
		altbufferflushes++;
	} else {
		BO_LOCK(bo);
		/*
		 * Try to find a buffer to flush.
		 */
		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
			    BUF_LOCK(nbp,
			    LK_EXCLUSIVE | LK_NOWAIT, NULL))
				continue;
			if (bp == nbp)
				panic("bdflush: found ourselves");
			BO_UNLOCK(bo);
			/*
			 * Don't call buf_countdeps() with the bo lock
			 * held.
			 */
			if (buf_countdeps(nbp, 0)) {
				BO_LOCK(bo);
				BUF_UNLOCK(nbp);
				continue;
			}
			if (bp_bdskip) {
				VI_LOCK(devvp);
				if (!ffs_bp_snapblk(vp, nbp)) {
					if (BO_MTX(bo) != VI_MTX(vp)) {
						VI_UNLOCK(devvp);
						BO_LOCK(bo);
					}
					BUF_UNLOCK(nbp);
					continue;
				}
				VI_UNLOCK(devvp);
			}
			if (nbp->b_flags & B_CLUSTEROK) {
				vfs_bio_awrite(nbp);
			} else {
				bremfree(nbp);
				bawrite(nbp);
			}
			dirtybufferflushes++;
			break;
		}
		if (nbp == NULL)
			BO_UNLOCK(bo);
	}
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct snapdata *sn;
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp = 0;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, indiroff, error = 0;
	int launched_async_io, prev_norunningbuf;
	long saved_runningbufspace;

	if (devvp != bp->b_vp && (VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0)
		return (0);		/* Update on a snapshot file */
	if (td->td_pflags & TDP_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	VI_LOCK(devvp);
	sn = devvp->v_rdev->si_snapdata;
	if (sn == NULL ||
	    TAILQ_EMPTY(&sn->sn_head)) {
		VI_UNLOCK(devvp);
		return (0);		/* No snapshot */
	}
	ip = TAILQ_FIRST(&sn->sn_head);
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = sn->sn_blklist;
	upper = sn->sn_listsize - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		VI_UNLOCK(devvp);
		return (0);
	}
	launched_async_io = 0;
	prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF;
	/*
	 * Since I/O on bp isn't yet in progress and it may be blocked
	 * for a long time waiting on snaplk, back it out of
	 * runningbufspace, possibly waking other threads waiting for space.
	 */
	saved_runningbufspace = bp->b_runningbufspace;
	if (saved_runningbufspace != 0)
		runningbufwakeup(bp);
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
	while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
	    VI_MTX(devvp)) != 0) {
		VI_LOCK(devvp);
		sn = devvp->v_rdev->si_snapdata;
		if (sn == NULL ||
		    TAILQ_EMPTY(&sn->sn_head)) {
			VI_UNLOCK(devvp);
			if (saved_runningbufspace != 0) {
				bp->b_runningbufspace = saved_runningbufspace;
				atomic_add_long(&runningbufspace,
				    bp->b_runningbufspace);
			}
			return (0);		/* Snapshot gone */
		}
	}
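	/*
	 * A note on the retry loop above: LK_SLEEPFAIL makes lockmgr()
	 * return failure instead of acquiring the lock whenever it had
	 * to sleep.  Each time around we therefore reload si_snapdata
	 * and recheck the snapshot list, since the set of snapshots
	 * (and thus the identity of sn) may have changed while we
	 * slept.
	 */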
	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
		vp = ITOV(ip);
		if (DOINGSOFTDEP(vp))
			softdep_prealloc(vp, MNT_WAIT);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called.  Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied.  We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef INVARIANTS
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy.  Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %jd for ",
			    ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd to blkno %jd\n",
			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block.  Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT);
			else
				launched_async_io = 1;
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(vp, cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT);
			else
				launched_async_io = 1;
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) ffs_syncvnode(vp, MNT_WAIT);
		else
			launched_async_io = 1;
	}
	lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
	td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) |
	    prev_norunningbuf;
	if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0)
		waitrunningbufspace();
	/*
	 * I/O on bp will now be started, so count it in runningbufspace.
	 */
	if (saved_runningbufspace != 0) {
		bp->b_runningbufspace = saved_runningbufspace;
		atomic_add_long(&runningbufspace, bp->b_runningbufspace);
	}
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 */
static int
readblock(vp, bp, lbn)
	struct vnode *vp;
	struct buf *bp;
	ufs2_daddr_t lbn;
{
	struct inode *ip = VTOI(vp);
	struct bio *bip;

	bip = g_alloc_bio();
	bip->bio_cmd = BIO_READ;
	bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	bip->bio_data = bp->b_data;
	bip->bio_length = bp->b_bcount;
	bip->bio_done = NULL;

	g_io_request(bip, ip->i_devvp->v_bufobj.bo_private);
	bp->b_error = biowait(bip, "snaprdb");
	g_destroy_bio(bip);
	return (bp->b_error);
}
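
/*
 * Note on readblock(): the read is issued directly to the GEOM
 * consumer hanging off the device vnode's bufobj (bo_private) and is
 * completed synchronously with biowait(), bypassing the buffer cache
 * for the device entirely; the data lands straight in bp->b_data.
 */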

#endif

/*
 * Process file deletes that were deferred by ufs_inactive() due to
 * the file system being suspended.  Transfer IN_LAZYACCESS into
 * IN_MODIFIED for vnodes that were accessed during suspension.
 */
void
process_deferred_inactive(struct mount *mp)
{
	struct vnode *vp, *mvp;
	struct inode *ip;
	struct thread *td;
	int error;

	td = curthread;
	(void) vn_start_secondary_write(NULL, &mp, V_WAIT);
	MNT_ILOCK(mp);
loop:
	MNT_VNODE_FOREACH(vp, mp, mvp) {
		VI_LOCK(vp);
		/*
		 * IN_LAZYACCESS is checked here without holding any
		 * vnode lock, but this flag is set only while holding
		 * vnode interlock.
		 */
		if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED) != 0 ||
		    ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 &&
		    ((vp->v_iflag & VI_OWEINACT) == 0 ||
		    vp->v_usecount > 0))) {
			VI_UNLOCK(vp);
			continue;
		}
		MNT_IUNLOCK(mp);
		vholdl(vp);
		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
		if (error != 0) {
			vdrop(vp);
			MNT_ILOCK(mp);
			if (error == ENOENT)
				continue;	/* vnode recycled */
			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
			goto loop;
		}
		ip = VTOI(vp);
		if ((ip->i_flag & IN_LAZYACCESS) != 0) {
			ip->i_flag &= ~IN_LAZYACCESS;
			ip->i_flag |= IN_MODIFIED;
		}
		VI_LOCK(vp);
		if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) {
			VI_UNLOCK(vp);
			VOP_UNLOCK(vp, 0);
			vdrop(vp);
			MNT_ILOCK(mp);
			continue;
		}

		VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
		    ("process_deferred_inactive: "
		    "recursed on VI_DOINGINACT"));
		vp->v_iflag |= VI_DOINGINACT;
		vp->v_iflag &= ~VI_OWEINACT;
		VI_UNLOCK(vp);
		(void) VOP_INACTIVE(vp, td);
		VI_LOCK(vp);
		VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
		    ("process_deferred_inactive: lost VI_DOINGINACT"));
		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
		    ("process_deferred_inactive: got VI_OWEINACT"));
		vp->v_iflag &= ~VI_DOINGINACT;
		VI_UNLOCK(vp);
		VOP_UNLOCK(vp, 0);
		vdrop(vp);
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	vn_finished_secondary_write(mp);
}

#ifndef NO_FFS_SNAPSHOT

static struct snapdata *
ffs_snapdata_alloc(void)
{
	struct snapdata *sn;

	/*
	 * Fetch a snapdata from the free list if there is one available.
	 */
	mtx_lock(&snapfree_lock);
	sn = LIST_FIRST(&snapfree);
	if (sn != NULL)
		LIST_REMOVE(sn, sn_link);
	mtx_unlock(&snapfree_lock);
	if (sn != NULL)
		return (sn);
	/*
	 * If there were no free snapdatas allocate one.
	 */
	sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
	TAILQ_INIT(&sn->sn_head);
	lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
	    LK_CANRECURSE | LK_NOSHARE);
	return (sn);
}
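
/*
 * Lifecycle summary (as implemented by the routines above and below):
 * the first snapshot on a device has its snapdata installed in
 * si_snapdata by ffs_snapdata_acquire(); when the last snapshot goes
 * away, try_free_snapdata() detaches the structure and returns it to
 * the snapfree list, from which ffs_snapdata_alloc() will recycle it.
 */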

/*
 * The snapdata is never returned to the malloc pool because we cannot
 * be certain that there are no threads sleeping on the snap lock.
 * Keeping the structures around permanently avoids costly
 * synchronization in ffs_lock().
 */
static void
ffs_snapdata_free(struct snapdata *sn)
{
	mtx_lock(&snapfree_lock);
	LIST_INSERT_HEAD(&snapfree, sn, sn_link);
	mtx_unlock(&snapfree_lock);
}

/* Try to free snapdata associated with devvp */
static void
try_free_snapdata(struct vnode *devvp)
{
	struct snapdata *sn;
	ufs2_daddr_t *snapblklist;

	ASSERT_VI_LOCKED(devvp, "try_free_snapdata");
	sn = devvp->v_rdev->si_snapdata;

	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL ||
	    (devvp->v_vflag & VV_COPYONWRITE) == 0) {
		VI_UNLOCK(devvp);
		return;
	}

	devvp->v_rdev->si_snapdata = NULL;
	devvp->v_vflag &= ~VV_COPYONWRITE;
	lockmgr(&sn->sn_lock, LK_DRAIN | LK_INTERLOCK, VI_MTX(devvp));
	snapblklist = sn->sn_blklist;
	sn->sn_blklist = NULL;
	sn->sn_listsize = 0;
	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
	if (snapblklist != NULL)
		free(snapblklist, M_UFSMNT);
	ffs_snapdata_free(sn);
}

static struct snapdata *
ffs_snapdata_acquire(struct vnode *devvp)
{
	struct snapdata *nsn;
	struct snapdata *sn;

	/*
	 * Allocate a free snapdata.  This is done before acquiring the
	 * devvp lock to avoid allocation while the devvp interlock is
	 * held.
	 */
	nsn = ffs_snapdata_alloc();
	/*
	 * If snapshots already exist on this filesystem grab a
	 * reference to the shared lock.  Otherwise this is the first
	 * snapshot on this filesystem and we need to use our
	 * pre-allocated snapdata.
	 */
	VI_LOCK(devvp);
	if (devvp->v_rdev->si_snapdata == NULL) {
		devvp->v_rdev->si_snapdata = nsn;
		nsn = NULL;
	}
	sn = devvp->v_rdev->si_snapdata;
	/*
	 * Acquire the snapshot lock.
	 */
	lockmgr(&sn->sn_lock,
	    LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(devvp));
	/*
	 * Free any unused snapdata.
	 */
	if (nsn != NULL)
		ffs_snapdata_free(nsn);

	return (sn);
}

#endif