/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#define KERNCRED thread0.td_ucred
#define DEBUG 1

static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *,
    ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int ffs_copyonwrite(struct vnode *, struct buf *);
static int readblock(struct buf *, ufs2_daddr_t);

/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the following flag allows this
 * crash persistence to be disabled.
 */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
static int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
	0, "");
#endif /* DEBUG */

/*
 * Create a snapshot file and initialize it for the filesystem.
 */
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
	int error, cg, snaploc;
	int i, size, len, loc;
	int flag = mp->mnt_flag;
	struct timespec starttime = {0, 0}, endtime;
	char saved_nice = 0;
	long redo = 0, snaplistsize = 0;
	int32_t *lp;
	void *space;
	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
	struct snaphead *snaphead;
	struct thread *td = curthread;
	struct inode *ip, *xp;
	struct buf *bp, *nbp, *ibp, *sbp = NULL;
	struct nameidata nd;
	struct mount *wrtmp;
	struct vattr vat;
	struct vnode *vp, *xvp, *nvp, *devvp;
	struct uio auio;
	struct iovec aiov;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Create the snapshot file.
	 */
restart:
	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		vput(nd.ni_vp);
		error = EEXIST;
	}
	if (nd.ni_dvp->v_mount != mp)
		error = EXDEV;
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		return (error);
	}
	VATTR_NULL(&vat);
	vat.va_type = VREG;
	vat.va_mode = S_IRUSR;
	vat.va_vaflags |= VA_EXCLUSIVE;
	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
		wrtmp = NULL;
	if (wrtmp != mp)
		panic("ffs_snapshot: mount mismatch");
	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &wrtmp,
		    V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
	vput(nd.ni_dvp);
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vn_finished_write(wrtmp);
		return (error);
	}
	vp = nd.ni_vp;
	ip = VTOI(vp);
	devvp = ip->i_devvp;
	/*
	 * Allocate and copy the last block contents so as to be able
	 * to set size to that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
	if (error)
		goto out;
	ip->i_size = lblktosize(fs, (off_t)numblks);
	DIP(ip, i_size) = ip->i_size;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if ((error = readblock(bp, numblks - 1)) != 0)
		goto out;
	bawrite(bp);
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
		if (error)
			goto out;
		bawrite(ibp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
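	 * The fs_active bitmap allocated just below records which groups
	 * have been copied: cgaccount() sets a group's bit, and the second
	 * pass after suspension revisits only those groups whose bit is no
	 * longer set.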
	 */
	len = howmany(fs->fs_ncg, NBBY);
	MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK);
	bzero(fs->fs_active, len);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		error = cgaccount(cg, vp, nbp, 1);
		bawrite(nbp);
		if (error)
			goto out;
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0)
		goto out;
	VOP_UNLOCK(vp, 0, td);
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Rescind nice scheduling while running with the filesystem suspended.
	 */
	if (td->td_proc->p_nice > 0) {
		PROC_LOCK(td->td_proc);
		mtx_lock_spin(&sched_lock);
		saved_nice = td->td_proc->p_nice;
		sched_nice(td->td_proc, 0);
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(td->td_proc);
	}
	/*
	 * Suspend operation on filesystem.
	 */
	for (;;) {
		vn_finished_write(wrtmp);
		if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
			vn_start_write(NULL, &wrtmp, V_WAIT);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			goto out;
		}
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if (collectsnapstats)
		nanotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
			continue;
		redo++;
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		error = cgaccount(cg, vp, nbp, 2);
		bawrite(nbp);
		if (error)
			goto out1;
	}
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
	    KERNCRED, &sbp);
	if (error) {
		brelse(sbp);
		sbp = NULL;
		goto out1;
	}
	loc = blkoff(fs, fs->fs_sblockloc);
	copy_fs = (struct fs *)(sbp->b_data + loc);
	bcopy(fs, copy_fs, fs->fs_sbsize);
	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
		copy_fs->fs_clean = 1;
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ?
	    0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
		    len, KERNCRED, &bp)) != 0) {
			brelse(bp);
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		bcopy(bp->b_data, space, (u_int)len);
		space = (char *)space + len;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
	}
	if (fs->fs_contigsumsize > 0) {
		copy_fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/*
	 * We must check for active files that have been unlinked
	 * (e.g., with a zero link count). We have to expunge all
	 * trace of these files from the snapshot so that they are
	 * not reclaimed prematurely by fsck or unnecessarily dumped.
	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
	 * spec_strategy about writing on a suspended filesystem.
	 * Note that we skip unlinked snapshot files as they will
	 * be handled separately below.
	 *
	 * We also calculate the needed size for the snapshot list.
	 */
	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
	MNT_ILOCK(mp);
loop:
	MNT_VNODE_FOREACH(xvp, mp, nvp) {
		VI_LOCK(xvp);
		MNT_IUNLOCK(mp);
		if ((xvp->v_iflag & VI_XLOCK) ||
		    xvp->v_usecount == 0 || xvp->v_type == VNON ||
		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
			VI_UNLOCK(xvp);
			MNT_ILOCK(mp);
			continue;
		}
		/*
		 * We can skip parent directory vnode because it must have
		 * this snapshot file in it.
		 */
		if (xvp == nd.ni_dvp) {
			VI_UNLOCK(xvp);
			MNT_ILOCK(mp);
			continue;
		}
		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) {
			MNT_ILOCK(mp);
			goto loop;
		}
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
		if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 &&
		    vat.va_nlink > 0) {
			VOP_UNLOCK(xvp, 0, td);
			MNT_ILOCK(mp);
			continue;
		}
		xp = VTOI(xvp);
		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
			VOP_UNLOCK(xvp, 0, td);
			MNT_ILOCK(mp);
			continue;
		}
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len < fs->fs_bsize) {
				ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]),
				    len, xp->i_number);
				blkno = DIP(xp, i_db[loc]);
				DIP(xp, i_db[loc]) = 0;
			}
		}
		snaplistsize += 1;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
			    BLK_NOCOPY);
		else
			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
			    BLK_NOCOPY);
		if (blkno)
			DIP(xp, i_db[loc]) = blkno;
		if (!error)
			error = ffs_freefile(copy_fs, vp, xp->i_number,
			    xp->i_mode);
		VOP_UNLOCK(xvp, 0, td);
		if (error) {
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	/*
	 * If there already exist snapshots on this filesystem, grab a
	 * reference to their shared lock. If this is the first snapshot
	 * on this filesystem, we need to allocate a lock for the snapshots
	 * to share. In either case, acquire the snapshot lock and give
	 * up our original private lock.
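	 * All snapshots on a filesystem share a single "snaplk" lock, so
	 * holding it locks every snapshot at once; ffs_copyonwrite below
	 * depends on this to avoid competing block allocations.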
	 */
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
		struct lock *lkp;

		lkp = ITOV(xp)->v_vnlock;
		VI_UNLOCK(devvp);
		VI_LOCK(vp);
		vp->v_vnlock = lkp;
	} else {
		struct lock *lkp;

		VI_UNLOCK(devvp);
		MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT,
		    M_WAITOK);
		lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
		    LK_CANRECURSE | LK_NOPAUSE);
		VI_LOCK(vp);
		vp->v_vnlock = lkp;
	}
	vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
	transferlockers(&vp->v_lock, vp->v_vnlock);
	lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
	/*
	 * If this is the first snapshot on this filesystem, then we need
	 * to allocate the space for the list of preallocated snapshot blocks.
	 * This list will be refined below, but this preliminary one will
	 * keep us out of deadlock until the full one is ready.
	 */
	if (xp == NULL) {
		MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
		    M_UFSMNT, M_WAITOK);
		blkp = &snapblklist[1];
		*blkp++ = lblkno(fs, fs->fs_sblockloc);
		blkno = fragstoblks(fs, fs->fs_csaddr);
		for (cg = 0; cg < fs->fs_ncg; cg++) {
			if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
				break;
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		}
		len = howmany(fs->fs_cssize, fs->fs_bsize);
		for (loc = 0; loc < len; loc++)
			*blkp++ = blkno + loc;
		for (; cg < fs->fs_ncg; cg++)
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		snapblklist[0] = blkp - snapblklist;
		VI_LOCK(devvp);
		if (devvp->v_rdev->si_snapblklist != NULL)
			panic("ffs_snapshot: non-empty list");
		devvp->v_rdev->si_snapblklist = snapblklist;
		devvp->v_rdev->si_snaplistsize = blkp - snapblklist;
		VI_UNLOCK(devvp);
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	VI_LOCK(devvp);
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %d already on list", ip->i_number);
	TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
	vp->v_vflag |= VV_SYSTEM;
out1:
	/*
	 * Resume operation on filesystem.
	 */
	vfs_write_resume(vp->v_mount);
	vn_start_write(NULL, &wrtmp, V_WAIT);
	if (collectsnapstats && starttime.tv_sec > 0) {
		nanotime(&endtime);
		timespecsub(&endtime, &starttime);
		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
	}
	if (sbp == NULL)
		goto out;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(xp, snaphead, i_nextsnap) {
		if (xp == ip)
			break;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
			    BLK_SNAP);
		else
			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
			    BLK_SNAP);
		if (error) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
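	 * The list carries its own length in element zero and is written
	 * to the end of the snapshot file, where ffs_snapshot_mount() can
	 * recover it.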
	 */
	MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snapblklist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	if (ip->i_ump->um_fstype == UFS1)
		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
	else
		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	if (snaplistsize < ip->i_snapblklist - snapblklist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snapblklist;
	snapblklist[0] = snaplistsize;
	ip->i_snapblklist = 0;
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)snapblklist;
	aiov.iov_len = snaplistsize * sizeof(daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset = ip->i_size;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			fs->fs_snapinum[snaploc] = 0;
			FREE(snapblklist, M_UFSMNT);
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
	/*
	 * As this is the newest list, it is the most inclusive, so
	 * should replace the previous list.
	 */
	VI_LOCK(devvp);
	space = devvp->v_rdev->si_snapblklist;
	devvp->v_rdev->si_snapblklist = snapblklist;
	devvp->v_rdev->si_snaplistsize = snaplistsize;
	VI_UNLOCK(devvp);
	if (space != NULL)
		FREE(space, M_UFSMNT);
done:
	free(copy_fs->fs_csp, M_UFSMNT);
	bawrite(sbp);
out:
	if (saved_nice > 0) {
		PROC_LOCK(td->td_proc);
		mtx_lock_spin(&sched_lock);
		sched_nice(td->td_proc, saved_nice);
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(td->td_proc);
	}
	if (fs->fs_active != 0) {
		FREE(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	mp->mnt_flag = flag;
	if (error)
		(void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	if (error)
		vput(vp);
	else
		VOP_UNLOCK(vp, 0, td);
	vn_finished_write(wrtmp);
	return (error);
}

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
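 * For example, a block found free in pass 1 is marked BLK_NOCOPY; if
 * pass 2 finds that same block now allocated, the BLK_NOCOPY entry is
 * reverted to zero so the block will be copied when next written.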
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;
	struct vnode *vp;
	struct buf *nbp;
	int passno;
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP(ip, i_db[loc]) = BLK_NOCOPY;
			else if (passno == 2 && DIP(ip, i_db[loc]) == BLK_NOCOPY)
				DIP(ip, i_db[loc]) = 0;
			else if (passno == 1 && DIP(ip, i_db[loc]) == BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error) {
		brelse(bp);
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error) {
				brelse(bp);
				return (error);
			}
			indiroff = 0;
		}
		if (ip->i_ump->um_fstype == UFS1) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	bqrelse(bp);
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
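 * Only the dinode layout and the width of the block pointers differ
 * between the UFS1 and UFS2 copies.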
 */
static int
expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
	} else {
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
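 * Each call handles one level of indirection; blksperindir is the
 * number of file blocks spanned by each pointer at this level.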
 */
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs1_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs1: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs1: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
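 * Each identified block is tagged in this snapshot's own block map
 * with the given expunge type (BLK_SNAP or BLK_NOCOPY).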
 */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din1->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct_ufs1: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs1_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
	} else {
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs2_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs2: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs2: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
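	 * Instead, getblk() is used and the physical block number is
	 * filled in by hand; readblock() then does the transfer if the
	 * buffer does not already hold valid data.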
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din2->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct_ufs2: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
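 * Each block is freed back to the filesystem image passed in and, when
 * a real logical block number is available, recorded in i_snapblklist.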
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	if (xp != NULL)
		vrele(ITOV(ip));
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct lock *lkp;
	struct buf *ibp;
	struct fs *fs;
	struct thread *td = curthread;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	devvp = ip->i_devvp;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		VI_LOCK(devvp);
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp), td);
		VI_LOCK(devvp);
		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL, td);
		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) != 0) {
			VI_UNLOCK(devvp);
		} else {
			snapblklist = devvp->v_rdev->si_snapblklist;
			devvp->v_rdev->si_snapblklist = 0;
			devvp->v_rdev->si_snaplistsize = 0;
			devvp->v_rdev->si_copyonwrite = 0;
			devvp->v_vflag &= ~VV_COPYONWRITE;
			lockmgr(lkp, LK_DRAIN | LK_INTERLOCK, VI_MTX(devvp), td);
			lockmgr(lkp, LK_RELEASE, NULL, td);
			lockdestroy(lkp);
			FREE(lkp, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
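	 * A claimed block is recognized by the identity
	 * dblk == blkstofrags(fs, blkno), i.e. its physical address equals
	 * its logical position (see the ffs_snapblkfree comment below).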
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP(ip, i_db[blkno]) = 0;
		else if ((dblk == blkstofrags(fs, blkno) &&
		    ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		    ip->i_number))) {
			DIP(ip, i_blocks) -= btodb(fs->fs_bsize);
			DIP(ip, i_db[blkno]) = 0;
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc] = 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				    ffs_snapblkfree(fs, ip->i_devvp, dblk,
				    fs->fs_bsize, ip->i_number))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc] = 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;
	ufs2_daddr_t bno;
	long size;
	ino_t inum;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;
	struct snaphead *snaphead;

	lbn = fragstoblks(fs, bno);
retry:
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			    VI_MTX(devvp), td) != 0)
				goto retry;
			snapshot_locked = 1;
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno = ((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno = ((ufs2_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
			    VI_MTX(devvp), td) != 0) {
				if (lbn >= NDADDR)
					bqrelse(ibp);
				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
				goto retry;
			}
			snapshot_locked = 1;
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		    LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
		    VI_MTX(devvp), td) != 0) {
			if (lbn >= NDADDR)
				bqrelse(ibp);
			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
			goto retry;
		}
		snapshot_locked = 1;
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %jd from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    (intmax_t)lbn, inum);
#endif
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = bno;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP(ip, i_blocks) += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", ip->i_number,
			    (intmax_t)lbn, "for inum", inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snaphead *snaphead;
	struct vnode *vp;
	struct inode *ip, *xp;
	struct uio auio;
	struct iovec aiov;
	void *snapblklist;
	char *reason;
	daddr_t snaplistsize;
	int error, snaploc, loc;

	/*
	 * XXX The following needs to be set before UFS_TRUNCATE or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	snaphead = &devvp->v_rdev->si_snapshots;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
				reason = "non-snapshot";
			} else {
				reason = "old format snapshot";
				(void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
				(void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			}
			printf("ffs_snapshot_mount: %s inode %d\n",
			    reason, fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		/*
		 * If there already exist snapshots on this filesystem, grab a
		 * reference to their shared lock. If this is the first snapshot
		 * on this filesystem, we need to allocate a lock for the
		 * snapshots to share. In either case, acquire the snapshot
		 * lock and give up our original private lock.
		 */
		VI_LOCK(devvp);
		if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
			struct lock *lkp;

			lkp = ITOV(xp)->v_vnlock;
			VI_UNLOCK(devvp);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		} else {
			struct lock *lkp;

			VI_UNLOCK(devvp);
			MALLOC(lkp, struct lock *, sizeof(struct lock),
			    M_UFSMNT, M_WAITOK);
			lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
			    LK_CANRECURSE | LK_NOPAUSE);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		}
		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
		transferlockers(&vp->v_lock, vp->v_vnlock);
		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp, 0, td);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL)
		return;
	/*
	 * Allocate the space for the block hints list. We always want to
	 * use the list from the newest snapshot.
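	 * The list was written at the end of the snapshot file with its
	 * length as the first element (see ffs_snapshot above), so the
	 * length word is read first and then the entire list.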
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)&snaplistsize;
	aiov.iov_len = sizeof(snaplistsize);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset =
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		return;
	}
	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	auio.uio_iovcnt = 1;
	aiov.iov_base = snapblklist;
	aiov.iov_len = snaplistsize * sizeof(daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset -= sizeof(snaplistsize);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		FREE(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp, 0, td);
	VI_LOCK(devvp);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
	devvp->v_rdev->si_snaplistsize = snaplistsize;
	devvp->v_rdev->si_snapblklist = (daddr_t *)snapblklist;
	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct snaphead *snaphead = &devvp->v_rdev->si_snapshots;
	struct lock *lkp = NULL;
	struct inode *xp;
	struct vnode *vp;

	VI_LOCK(devvp);
	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
		vp = ITOV(xp);
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_effnlink > 0) {
			VI_UNLOCK(devvp);
			vrele(vp);
			VI_LOCK(devvp);
		}
	}
	if (devvp->v_rdev->si_snapblklist != NULL) {
		FREE(devvp->v_rdev->si_snapblklist, M_UFSMNT);
		devvp->v_rdev->si_snapblklist = NULL;
		devvp->v_rdev->si_snaplistsize = 0;
	}
	if (lkp != NULL) {
		lockdestroy(lkp);
		FREE(lkp, M_UFSMNT);
	}
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
	devvp->v_rdev->si_copyonwrite = 0;
	devvp->v_vflag &= ~VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}
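
/*
 * The two routines above bracket the life of the copy-on-write hook.
 * While VV_COPYONWRITE is set on the device vnode, the block I/O path
 * is expected to call si_copyonwrite (ffs_copyonwrite below) before
 * letting a write to the device proceed; unmount clears the hook and
 * frees the hint list. A rough caller-side sketch follows (excluded
 * from compilation; the function name is hypothetical and details of
 * the real strategy routine may differ).
 */
#if 0
static void
example_write_strategy(struct buf *bp)
{
	struct vnode *devvp = bp->b_vp;

	/* Copy old contents into the snapshots before overwriting. */
	if (bp->b_iocmd == BIO_WRITE &&
	    (devvp->v_vflag & VV_COPYONWRITE) &&
	    devvp->v_rdev->si_copyonwrite != NULL)
		(*devvp->v_rdev->si_copyonwrite)(devvp, bp);
	/* ...then issue the write to the driver... */
}
#endif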

/*
 * Check whether the block about to be written needs to be copied
 * into the snapshots, copying it if necessary.
 */
static int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct snaphead *snaphead;
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp = 0;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;

	if (td->td_pflags & TDP_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	ip = TAILQ_FIRST(snaphead);
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = devvp->v_rdev->si_snapblklist;
	upper = devvp->v_rdev->si_snaplistsize - 1;
	lower = 1;		/* snapblklist[0] holds the list size */
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		VI_UNLOCK(devvp);
		return (0);
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
retry:
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0) {
				VI_LOCK(devvp);
				goto retry;
			}
			snapshot_locked = 1;
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno = ((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno = ((ufs2_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
		      VI_MTX(devvp), td) != 0) {
			VI_LOCK(devvp);
			goto retry;
		}
		snapshot_locked = 1;
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %jd for ",
			    ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd to blkno %jd\n",
			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
		}
#endif
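		/*
		 * Across the loop over snapshots, the old contents are
		 * read from disk at most once: the first snapshot that
		 * needs the block reads it into savedcbp; every later
		 * snapshot simply copies from that buffer.
		 */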
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 */
static int
readblock(bp, lbn)
	struct buf *bp;
	ufs2_daddr_t lbn;
{
	struct uio auio;
	struct iovec aiov;
	struct thread *td = curthread;
	struct inode *ip = VTOI(bp->b_vp);

	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}
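
/*
 * Worked example of the offset arithmetic in readblock() (illustrative
 * numbers, assuming a filesystem with 16384-byte blocks and 2048-byte
 * fragments, i.e. fs_frag == 8): blkstofrags() turns filesystem block
 * 1000 into fragment 8000, fsbtodb() turns that into the corresponding
 * DEV_BSIZE sector, and dbtob() yields byte offset 16384000 on the
 * device. The read is issued with physio() directly against the device
 * so that it bypasses the buffer cache and returns the block's current
 * on-disk contents before they are overwritten.
 */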