/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#define KERNCRED thread0.td_ucred
#define DEBUG 1

static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int ffs_copyonwrite(struct vnode *, struct buf *);
static int readblock(struct buf *, ufs2_daddr_t);
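/*
 * A note on conventions used throughout this file: the fullacct, snapacct,
 * and mapacct routines deliberately share one signature so that
 * expunge_ufs{1,2}() can take any of them as its acctfunc callback.
 * Block pointers in a snapshot may also hold the distinguished sentinels
 * BLK_NOCOPY ("never copy this block") and BLK_SNAP ("block claimed by a
 * snapshot"); these are reserved values, not the addresses of real
 * allocated data blocks, which is why code below can compare pointers
 * against them directly.
 */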
/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the following flag allows this
 * crash persistence to be disabled.
 */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
static int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
	0, "");
#endif /* DEBUG */
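/*
 * The knobs above are ordinary read-write sysctls, so they can be
 * toggled from userland at runtime, for example:
 *
 *	sysctl debug.dopersistence=1	# force synchronous snapshot copies
 *	sysctl debug.snapdebug=1	# verbose copy-on-write tracing
 *	sysctl debug.collectsnapstats=1	# report suspension times below
 */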
/*
 * Create a snapshot file and initialize it for the filesystem.
 */
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
	int error, cg, snaploc;
	int i, size, len, loc;
	int flag = mp->mnt_flag;
	struct timespec starttime = {0, 0}, endtime;
	char saved_nice = 0;
	long redo = 0, snaplistsize = 0;
	int32_t *lp;
	void *space;
	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
	struct snaphead *snaphead;
	struct thread *td = curthread;
	struct inode *ip, *xp;
	struct buf *bp, *nbp, *ibp, *sbp = NULL;
	struct nameidata nd;
	struct mount *wrtmp;
	struct vattr vat;
	struct vnode *vp, *xvp, *nvp, *devvp;
	struct uio auio;
	struct iovec aiov;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Create the snapshot file.
	 */
restart:
	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		vput(nd.ni_vp);
		error = EEXIST;
	}
	if (nd.ni_dvp->v_mount != mp)
		error = EXDEV;
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		return (error);
	}
	VATTR_NULL(&vat);
	vat.va_type = VREG;
	vat.va_mode = S_IRUSR;
	vat.va_vaflags |= VA_EXCLUSIVE;
	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
		wrtmp = NULL;
	if (wrtmp != mp)
		panic("ffs_snapshot: mount mismatch");
	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &wrtmp,
		    V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
	vput(nd.ni_dvp);
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vn_finished_write(wrtmp);
		return (error);
	}
	vp = nd.ni_vp;
	ip = VTOI(vp);
	devvp = ip->i_devvp;
	/*
	 * Allocate and copy the last block contents so as to be able
	 * to set size to that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
	if (error)
		goto out;
	ip->i_size = lblktosize(fs, (off_t)numblks);
	DIP(ip, i_size) = ip->i_size;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if ((error = readblock(bp, numblks - 1)) != 0)
		goto out;
	bawrite(bp);
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
		if (error)
			goto out;
		bawrite(ibp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
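	/*
	 * The fs_active bitmap allocated below holds one bit per cylinder
	 * group. cgaccount() sets a group's bit once its map has been
	 * copied, and the block allocator clears the bit again whenever it
	 * dirties that group, so the second pass after suspension can
	 * confine itself to groups whose bits are clear.
	 */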
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	len = howmany(fs->fs_ncg, NBBY);
	MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK);
	bzero(fs->fs_active, len);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		error = cgaccount(cg, vp, nbp, 1);
		bawrite(nbp);
		if (error)
			goto out;
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0)
		goto out;
	VOP_UNLOCK(vp, 0, td);
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Rescind nice scheduling while running with the filesystem suspended.
	 */
	if (td->td_ksegrp->kg_nice > 0) {
		PROC_LOCK(td->td_proc);
		mtx_lock_spin(&sched_lock);
		saved_nice = td->td_ksegrp->kg_nice;
		sched_nice(td->td_ksegrp, 0);
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(td->td_proc);
	}
	/*
	 * Suspend operation on filesystem.
	 */
	for (;;) {
		vn_finished_write(wrtmp);
		if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
			vn_start_write(NULL, &wrtmp, V_WAIT);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			goto out;
		}
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if (collectsnapstats)
		nanotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
			continue;
		redo++;
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		error = cgaccount(cg, vp, nbp, 2);
		bawrite(nbp);
		if (error)
			goto out1;
	}
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
	    KERNCRED, &sbp);
	if (error) {
		brelse(sbp);
		sbp = NULL;
		goto out1;
	}
	loc = blkoff(fs, fs->fs_sblockloc);
	copy_fs = (struct fs *)(sbp->b_data + loc);
	bcopy(fs, copy_fs, fs->fs_sbsize);
	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
		copy_fs->fs_clean = 1;
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
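	/*
	 * The summary information ends at an arbitrary fragment, but the
	 * copy written into the snapshot below is done in full-sized
	 * blocks. The fragments that round the summary area up to a block
	 * boundary are therefore filled in here directly from the device.
	 */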
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
		    len, KERNCRED, &bp)) != 0) {
			brelse(bp);
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		bcopy(bp->b_data, space, (u_int)len);
		space = (char *)space + len;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
	}
	if (fs->fs_contigsumsize > 0) {
		copy_fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/*
	 * We must check for active files that have been unlinked
	 * (e.g., with a zero link count). We have to expunge all
	 * trace of these files from the snapshot so that they are
	 * not reclaimed prematurely by fsck or unnecessarily dumped.
	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
	 * spec_strategy about writing on a suspended filesystem.
	 * Note that we skip unlinked snapshot files as they will
	 * be handled separately below.
	 *
	 * We also calculate the needed size for the snapshot list.
	 */
	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
	mtx_lock(&mntvnode_mtx);
loop:
	for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (xvp->v_mount != mp)
			goto loop;
		nvp = TAILQ_NEXT(xvp, v_nmntvnodes);
		mtx_unlock(&mntvnode_mtx);
		mp_fixme("Unlocked GETATTR.");
		if (vrefcnt(xvp) == 0 || xvp->v_type == VNON ||
		    (VTOI(xvp)->i_flags & SF_SNAPSHOT) ||
		    (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 &&
		    vat.va_nlink > 0)) {
			mtx_lock(&mntvnode_mtx);
			continue;
		}
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
		if (vn_lock(xvp, LK_EXCLUSIVE, td) != 0)
			goto loop;
		xp = VTOI(xvp);
		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
			VOP_UNLOCK(xvp, 0, td);
			continue;
		}
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len < fs->fs_bsize) {
				ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]),
				    len, xp->i_number);
				blkno = DIP(xp, i_db[loc]);
				DIP(xp, i_db[loc]) = 0;
			}
		}
		snaplistsize += 1;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
			    BLK_NOCOPY);
		else
			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
			    BLK_NOCOPY);
		if (blkno)
			DIP(xp, i_db[loc]) = blkno;
		if (!error)
			error = ffs_freefile(copy_fs, vp, xp->i_number,
			    xp->i_mode);
		VOP_UNLOCK(xvp, 0, td);
		if (error) {
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		mtx_lock(&mntvnode_mtx);
	}
	mtx_unlock(&mntvnode_mtx);
	/*
	 * If there already exist snapshots on this filesystem, grab a
	 * reference to their shared lock. If this is the first snapshot
	 * on this filesystem, we need to allocate a lock for the snapshots
	 * to share. In either case, acquire the snapshot lock and give
	 * up our original private lock.
	 */
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
		VI_LOCK(vp);
		vp->v_vnlock = ITOV(xp)->v_vnlock;
		VI_UNLOCK(devvp);
	} else {
		struct lock *lkp;

		VI_UNLOCK(devvp);
		MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT,
		    M_WAITOK);
		lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
		    LK_CANRECURSE | LK_NOPAUSE);
		VI_LOCK(vp);
		vp->v_vnlock = lkp;
	}
	vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
	transferlockers(&vp->v_lock, vp->v_vnlock);
	lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
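	/*
	 * From here on, locking this vnode locks every snapshot on the
	 * filesystem: they all share the single "snaplk" lock installed
	 * above. ffs_copyonwrite() depends on this, since holding the
	 * shared lock guarantees that no other process is allocating
	 * copy blocks in any snapshot at the same time.
	 */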
	/*
	 * If this is the first snapshot on this filesystem, then we need
	 * to allocate the space for the list of preallocated snapshot blocks.
	 * This list will be refined below, but this preliminary one will
	 * keep us out of deadlock until the full one is ready.
	 */
	if (xp == NULL) {
		MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
		    M_UFSMNT, M_WAITOK);
		blkp = &snapblklist[1];
		*blkp++ = lblkno(fs, fs->fs_sblockloc);
		blkno = fragstoblks(fs, fs->fs_csaddr);
		for (cg = 0; cg < fs->fs_ncg; cg++) {
			if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
				break;
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		}
		len = howmany(fs->fs_cssize, fs->fs_bsize);
		for (loc = 0; loc < len; loc++)
			*blkp++ = blkno + loc;
		for (; cg < fs->fs_ncg; cg++)
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		snapblklist[0] = blkp - snapblklist;
		VI_LOCK(devvp);
		if (devvp->v_rdev->si_snapblklist != NULL)
			panic("ffs_snapshot: non-empty list");
		devvp->v_rdev->si_snapblklist = snapblklist;
		devvp->v_rdev->si_snaplistsize = blkp - snapblklist;
		VI_UNLOCK(devvp);
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	VI_LOCK(devvp);
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %d already on list", ip->i_number);
	TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
	vp->v_vflag |= VV_SYSTEM;
out1:
	/*
	 * Resume operation on filesystem.
	 */
	vfs_write_resume(vp->v_mount);
	vn_start_write(NULL, &wrtmp, V_WAIT);
	if (collectsnapstats && starttime.tv_sec > 0) {
		nanotime(&endtime);
		timespecsub(&endtime, &starttime);
		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
	}
	if (sbp == NULL)
		goto out;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(xp, snaphead, i_nextsnap) {
		if (xp == ip)
			break;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
			    BLK_SNAP);
		else
			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
			    BLK_SNAP);
		if (error) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snapblklist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	if (ip->i_ump->um_fstype == UFS1)
		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
	else
		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	if (snaplistsize < ip->i_snapblklist - snapblklist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snapblklist;
	snapblklist[0] = snaplistsize;
	ip->i_snapblklist = 0;
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)snapblklist;
	aiov.iov_len = snaplistsize * sizeof(daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset = ip->i_size;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			fs->fs_snapinum[snaploc] = 0;
			FREE(snapblklist, M_UFSMNT);
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
	/*
	 * As this is the newest list, it is the most inclusive, so
	 * should replace the previous list.
	 */
	VI_LOCK(devvp);
	space = devvp->v_rdev->si_snapblklist;
	devvp->v_rdev->si_snapblklist = snapblklist;
	devvp->v_rdev->si_snaplistsize = snaplistsize;
	if (space != NULL)
		FREE(space, M_UFSMNT);
	VI_UNLOCK(devvp);
done:
	free(copy_fs->fs_csp, M_UFSMNT);
	bawrite(sbp);
out:
	if (saved_nice > 0) {
		PROC_LOCK(td->td_proc);
		mtx_lock_spin(&sched_lock);
		sched_nice(td->td_ksegrp, saved_nice);
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(td->td_proc);
	}
	if (fs->fs_active != 0) {
		FREE(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	mp->mnt_flag = flag;
	if (error)
		(void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	if (error)
		vput(vp);
	else
		VOP_UNLOCK(vp, 0, td);
	vn_finished_write(wrtmp);
	return (error);
}
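/*
 * Usage note: this routine is normally reached from the mount path when
 * userland requests a snapshot, for example with something like
 * "mount -u -o snapshot /fs/.snap/today /fs". The pathname above is
 * illustrative of common usage, not something this code depends on.
 */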
/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;
	struct vnode *vp;
	struct buf *nbp;
	int passno;
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP(ip, i_db[loc]) = BLK_NOCOPY;
			else if (passno == 2 &&
			    DIP(ip, i_db[loc]) == BLK_NOCOPY)
				DIP(ip, i_db[loc]) = 0;
			else if (passno == 1 &&
			    DIP(ip, i_db[loc]) == BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error) {
		brelse(bp);
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error) {
				brelse(bp);
				return (error);
			}
			indiroff = 0;
		}
		if (ip->i_ump->um_fstype == UFS1) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	bqrelse(bp);
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}
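/*
 * Example of the base/len arithmetic above, with illustrative geometry
 * (not taken from any particular filesystem): with 16K blocks and 2K
 * fragments, fs_frag == 8; if fs_fpg == 47768 fragments per group, then
 * cg 2 starts at block base = 2 * 47768 / 8 == 11942 and spans
 * len = howmany(47768, 8) == 5971 blocks, trimmed at the end of the
 * filesystem for the last group.
 */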
/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
	} else {
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}
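/*
 * The lblkno argument passed to the acctfunc callbacks above
 * distinguishes the two pointer arrays: the direct blocks go in with
 * their true starting logical block (0), while the indirect-pointer
 * array is passed with -1 so that mapacct_ufs{1,2}() knows not to
 * record those entries in i_snapblklist (its acctit test). UFS itself
 * names indirect blocks with negative logical block numbers, which is
 * why lbn starts at -NDADDR in the descent loop above.
 */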
/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs1_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}
/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din1->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs1_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
	} else {
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs2_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din2->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}
/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	if (xp != NULL)
		vrele(ITOV(ip));
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct lock *lkp;
	struct buf *ibp;
	struct fs *fs;
	struct thread *td = curthread;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	devvp = ip->i_devvp;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		VI_LOCK(devvp);
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp), td);
		VI_LOCK(devvp);
		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL, td);
		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) != 0) {
			VI_UNLOCK(devvp);
		} else {
			snapblklist = devvp->v_rdev->si_snapblklist;
			devvp->v_rdev->si_snapblklist = 0;
			devvp->v_rdev->si_snaplistsize = 0;
			devvp->v_rdev->si_copyonwrite = 0;
			devvp->v_vflag &= ~VV_COPYONWRITE;
			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
			lockmgr(lkp, LK_RELEASE, NULL, td);
			lockdestroy(lkp);
			FREE(lkp, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
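	/*
	 * In the loops below, a block pointer that satisfies
	 * dblk == blkstofrags(fs, blkno), i.e. whose physical address
	 * equals its own logical block number, is one this snapshot
	 * claimed from another snapshot; see the comment above
	 * ffs_snapblkfree() for why that property identifies claimed
	 * blocks. Such blocks are offered to the remaining snapshots
	 * via ffs_snapblkfree() rather than freed outright.
	 */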
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP(ip, i_db[blkno]) = 0;
		else if ((dblk == blkstofrags(fs, blkno) &&
		    ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		    ip->i_number))) {
			DIP(ip, i_blocks) -= btodb(fs->fs_bsize);
			DIP(ip, i_db[blkno]) = 0;
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc] = 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				    ffs_snapblkfree(fs, ip->i_devvp, dblk,
				    fs->fs_bsize, ip->i_number))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc] = 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;
	ufs2_daddr_t bno;
	long size;
	ino_t inum;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;
	struct snaphead *snaphead;

	lbn = fragstoblks(fs, bno);
retry:
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			    VI_MTX(devvp), td) != 0)
				goto retry;
			snapshot_locked = 1;
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno = ((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno = ((ufs2_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
			    VI_MTX(devvp), td) != 0) {
				if (lbn >= NDADDR)
					bqrelse(ibp);
				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
				goto retry;
			}
			snapshot_locked = 1;
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
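		/*
		 * The try-lock/retry dance here (and above) exists
		 * because we cannot sleep for the snapshot lock while
		 * holding the device interlock. We first try-lock with
		 * LK_NOWAIT; if that fails we wait for the lock with
		 * LK_SLEEPFAIL, which does not leave it held, and then
		 * rescan the snapshot list from the top.
		 */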
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		    LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
		    VI_MTX(devvp), td) != 0) {
			if (lbn >= NDADDR)
				bqrelse(ibp);
			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
			goto retry;
		}
		snapshot_locked = 1;
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %jd from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    (intmax_t)lbn, inum);
#endif
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = bno;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP(ip, i_blocks) += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", ip->i_number,
			    (intmax_t)lbn, "for inum", inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}
/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snaphead *snaphead;
	struct vnode *vp;
	struct inode *ip, *xp;
	struct uio auio;
	struct iovec aiov;
	void *snapblklist;
	char *reason;
	daddr_t snaplistsize;
	int error, snaploc, loc;

	/*
	 * XXX The following needs to be set before UFS_TRUNCATE or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	snaphead = &devvp->v_rdev->si_snapshots;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
				reason = "non-snapshot";
			} else {
				reason = "old format snapshot";
				(void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
				(void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			}
			printf("ffs_snapshot_mount: %s inode %d\n",
			    reason, fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		/*
		 * If there already exist snapshots on this filesystem, grab a
		 * reference to their shared lock. If this is the first snapshot
		 * on this filesystem, we need to allocate a lock for the
		 * snapshots to share. In either case, acquire the snapshot
		 * lock and give up our original private lock.
		 */
		VI_LOCK(devvp);
		if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
			VI_LOCK(vp);
			vp->v_vnlock = ITOV(xp)->v_vnlock;
			VI_UNLOCK(devvp);
		} else {
			struct lock *lkp;

			VI_UNLOCK(devvp);
			MALLOC(lkp, struct lock *, sizeof(struct lock),
			    M_UFSMNT, M_WAITOK);
			lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
			    LK_CANRECURSE | LK_NOPAUSE);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		}
		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
		transferlockers(&vp->v_lock, vp->v_vnlock);
		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp, 0, td);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL)
		return;
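	/*
	 * On disk, the hint list lives just past the filesystem-sized
	 * part of the newest snapshot file: a daddr_t count (written as
	 * element 0 of the list by ffs_snapshot above) followed by the
	 * block numbers themselves. We therefore read the count word
	 * first, size the allocation, then back the offset up and read
	 * the whole list, count included.
	 */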
	/*
	 * Allocate the space for the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)&snaplistsize;
	aiov.iov_len = sizeof(snaplistsize);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset =
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		return;
	}
	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	auio.uio_iovcnt = 1;
	aiov.iov_base = snapblklist;
	aiov.iov_len = snaplistsize * sizeof (daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset -= sizeof(snaplistsize);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		FREE(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp, 0, td);
	VI_LOCK(devvp);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
	devvp->v_rdev->si_snaplistsize = snaplistsize;
	devvp->v_rdev->si_snapblklist = (daddr_t *)snapblklist;
	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct snaphead *snaphead = &devvp->v_rdev->si_snapshots;
	struct lock *lkp = NULL;
	struct inode *xp;
	struct vnode *vp;

	VI_LOCK(devvp);
	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
		vp = ITOV(xp);
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_effnlink > 0) {
			VI_UNLOCK(devvp);
			vrele(vp);
			VI_LOCK(devvp);
		}
	}
	if (devvp->v_rdev->si_snapblklist != NULL) {
		FREE(devvp->v_rdev->si_snapblklist, M_UFSMNT);
		devvp->v_rdev->si_snapblklist = NULL;
		devvp->v_rdev->si_snaplistsize = 0;
	}
	if (lkp != NULL) {
		lockdestroy(lkp);
		FREE(lkp, M_UFSMNT);
	}
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
	devvp->v_rdev->si_copyonwrite = 0;
	devvp->v_vflag &= ~VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}
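
/*
 * ffs_copyonwrite() below is the hook installed above as
 * devvp->v_rdev->si_copyonwrite (with VV_COPYONWRITE set on the
 * device vnode), so the device write path is expected to call it
 * for each block about to be written to the filesystem's device.
 */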

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
static int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct snaphead *snaphead;
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp = 0;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;

	if (td->td_proc->p_flag & P_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	ip = TAILQ_FIRST(snaphead);
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = devvp->v_rdev->si_snapblklist;
	upper = devvp->v_rdev->si_snaplistsize - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		VI_UNLOCK(devvp);
		return (0);
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
retry:
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0) {
				VI_LOCK(devvp);
				goto retry;
			}
			snapshot_locked = 1;
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
		      VI_MTX(devvp), td) != 0) {
			VI_LOCK(devvp);
			goto retry;
		}
		snapshot_locked = 1;
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %jd for ",
			    ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd to blkno %jd\n",
			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
		}
#endif
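		/*
		 * The old contents are read from disk at most once per
		 * call: the first snapshot needing them fills cbp via
		 * readblock() and saves it in savedcbp; later snapshots
		 * in the list copy from savedcbp instead.
		 */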
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 */
static int
readblock(bp, lbn)
	struct buf *bp;
	ufs2_daddr_t lbn;
{
	struct uio auio;
	struct iovec aiov;
	struct thread *td = curthread;
	struct inode *ip = VTOI(bp->b_vp);

	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}