/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#define	KERNCRED	thread0.td_ucred
#define	DEBUG	1

static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int ffs_copyonwrite(struct vnode *, struct buf *);
static int readblock(struct buf *, ufs2_daddr_t);

/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the following flag allows this
 * crash persistence to be disabled.
 */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
static int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
	0, "");
#endif /* DEBUG */

/*
 * Create a snapshot file and initialize it for the filesystem.
 */
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
	int error, cg, snaploc;
	int i, size, len, loc;
	int flag = mp->mnt_flag;
	struct timespec starttime = {0, 0}, endtime;
	char saved_nice = 0;
	long redo = 0, snaplistsize = 0;
	int32_t *lp;
	void *space;
	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
	struct snaphead *snaphead;
	struct thread *td = curthread;
	struct inode *ip, *xp;
	struct buf *bp, *nbp, *ibp, *sbp = NULL;
	struct nameidata nd;
	struct mount *wrtmp;
	struct vattr vat;
	struct vnode *vp, *xvp, *nvp, *devvp;
	struct uio auio;
	struct iovec aiov;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
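	/*
	 * Outline of the work below: create the snapshot file and grow it
	 * to the size of the filesystem, preallocate every block that it
	 * will ever need, suspend the filesystem, touch up the few
	 * cylinder groups that changed while preallocating, expunge
	 * unlinked files and other snapshots from its view, record the
	 * list of preallocated blocks, and resume normal operation.
	 */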
	/*
	 * Create the snapshot file.
	 */
restart:
	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		vput(nd.ni_vp);
		error = EEXIST;
	}
	if (nd.ni_dvp->v_mount != mp)
		error = EXDEV;
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		return (error);
	}
	VATTR_NULL(&vat);
	vat.va_type = VREG;
	vat.va_mode = S_IRUSR;
	vat.va_vaflags |= VA_EXCLUSIVE;
	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
		wrtmp = NULL;
	if (wrtmp != mp)
		panic("ffs_snapshot: mount mismatch");
	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &wrtmp,
		    V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
	vput(nd.ni_dvp);
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vn_finished_write(wrtmp);
		return (error);
	}
	vp = nd.ni_vp;
	ip = VTOI(vp);
	devvp = ip->i_devvp;
	/*
	 * Allocate and copy the last block contents so as to be able
	 * to set size to that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
	if (error)
		goto out;
	ip->i_size = lblktosize(fs, (off_t)numblks);
	DIP(ip, i_size) = ip->i_size;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if ((error = readblock(bp, numblks - 1)) != 0)
		goto out;
	bawrite(bp);
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
		if (error)
			goto out;
		bawrite(ibp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
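	/*
	 * The fs_active field allocated below is a bitmap with one bit
	 * per cylinder group. cgaccount() sets a group's bit once its
	 * map has been copied; the allocator is expected to clear the
	 * bit again whenever the group is modified, so the "redo" pass
	 * after suspension need only recopy groups whose bits are clear.
	 */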
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	len = howmany(fs->fs_ncg, NBBY);
	MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK);
	bzero(fs->fs_active, len);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		error = cgaccount(cg, vp, nbp, 1);
		bawrite(nbp);
		if (error)
			goto out;
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0)
		goto out;
	VOP_UNLOCK(vp, 0, td);
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Rescind nice scheduling while running with the filesystem suspended.
	 */
	if (td->td_ksegrp->kg_nice > 0) {
		PROC_LOCK(td->td_proc);
		mtx_lock_spin(&sched_lock);
		saved_nice = td->td_ksegrp->kg_nice;
		sched_nice(td->td_ksegrp, 0);
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(td->td_proc);
	}
	/*
	 * Suspend operation on filesystem.
	 */
	for (;;) {
		vn_finished_write(wrtmp);
		if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
			vn_start_write(NULL, &wrtmp, V_WAIT);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			goto out;
		}
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if (collectsnapstats)
		nanotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
			continue;
		redo++;
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		error = cgaccount(cg, vp, nbp, 2);
		bawrite(nbp);
		if (error)
			goto out1;
	}
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
	    KERNCRED, &sbp);
	if (error) {
		brelse(sbp);
		sbp = NULL;
		goto out1;
	}
	loc = blkoff(fs, fs->fs_sblockloc);
	copy_fs = (struct fs *)(sbp->b_data + loc);
	bcopy(fs, copy_fs, fs->fs_sbsize);
	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
		copy_fs->fs_clean = 1;
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	(char *)space += fs->fs_cssize;
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
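	/*
	 * The cylinder summary need not end on a block boundary, so the
	 * remainder of its last block is filled in from the device below
	 * to give the snapshot's private copy whole blocks.
	 */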
	if (len > 0) {
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
		    len, KERNCRED, &bp)) != 0) {
			brelse(bp);
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		bcopy(bp->b_data, space, (u_int)len);
		(char *)space += len;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
	}
	if (fs->fs_contigsumsize > 0) {
		copy_fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/*
	 * We must check for active files that have been unlinked
	 * (e.g., with a zero link count). We have to expunge all
	 * trace of these files from the snapshot so that they are
	 * not reclaimed prematurely by fsck or unnecessarily dumped.
	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
	 * spec_strategy about writing on a suspended filesystem.
	 * Note that we skip unlinked snapshot files as they will
	 * be handled separately below.
	 *
	 * We also calculate the needed size for the snapshot list.
	 */
	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
	mtx_lock(&mntvnode_mtx);
loop:
	for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (xvp->v_mount != mp)
			goto loop;
		nvp = TAILQ_NEXT(xvp, v_nmntvnodes);
		mtx_unlock(&mntvnode_mtx);
		mp_fixme("Unlocked GETATTR.");
		if (vrefcnt(xvp) == 0 || xvp->v_type == VNON ||
		    (VTOI(xvp)->i_flags & SF_SNAPSHOT) ||
		    (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 &&
		    vat.va_nlink > 0)) {
			mtx_lock(&mntvnode_mtx);
			continue;
		}
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
		if (vn_lock(xvp, LK_EXCLUSIVE, td) != 0)
			goto loop;
		xp = VTOI(xvp);
		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
			VOP_UNLOCK(xvp, 0, td);
			continue;
		}
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len < fs->fs_bsize) {
				ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]),
				    len, xp->i_number);
				blkno = DIP(xp, i_db[loc]);
				DIP(xp, i_db[loc]) = 0;
			}
		}
		snaplistsize += 1;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
			    BLK_NOCOPY);
		else
			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
			    BLK_NOCOPY);
		if (blkno)
			DIP(xp, i_db[loc]) = blkno;
		if (!error)
			error = ffs_freefile(copy_fs, vp, xp->i_number,
			    xp->i_mode);
		VOP_UNLOCK(xvp, 0, td);
		if (error) {
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		mtx_lock(&mntvnode_mtx);
	}
	mtx_unlock(&mntvnode_mtx);
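	/*
	 * All snapshots on a filesystem share a single vnode lock; see
	 * ffs_copyonwrite() below, which relies on this to avoid
	 * competing with other processes when allocating copy blocks.
	 * The code that follows either joins the existing shared lock
	 * or creates it for the first snapshot.
	 */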
	/*
	 * If there already exist snapshots on this filesystem, grab a
	 * reference to their shared lock. If this is the first snapshot
	 * on this filesystem, we need to allocate a lock for the snapshots
	 * to share. In either case, acquire the snapshot lock and give
	 * up our original private lock.
	 */
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
		VI_LOCK(vp);
		vp->v_vnlock = ITOV(xp)->v_vnlock;
		VI_UNLOCK(devvp);
	} else {
		struct lock *lkp;

		VI_UNLOCK(devvp);
		MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT,
		    M_WAITOK);
		lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
		    LK_CANRECURSE | LK_NOPAUSE);
		VI_LOCK(vp);
		vp->v_vnlock = lkp;
	}
	vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
	transferlockers(&vp->v_lock, vp->v_vnlock);
	lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
	/*
	 * If this is the first snapshot on this filesystem, then we need
	 * to allocate the space for the list of preallocated snapshot blocks.
	 * This list will be refined below, but this preliminary one will
	 * keep us out of deadlock until the full one is ready.
	 */
	if (xp == NULL) {
		MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
		    M_UFSMNT, M_WAITOK);
		blkp = &snapblklist[1];
		*blkp++ = lblkno(fs, fs->fs_sblockloc);
		blkno = fragstoblks(fs, fs->fs_csaddr);
		for (cg = 0; cg < fs->fs_ncg; cg++) {
			if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
				break;
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		}
		len = howmany(fs->fs_cssize, fs->fs_bsize);
		for (loc = 0; loc < len; loc++)
			*blkp++ = blkno + loc;
		for (; cg < fs->fs_ncg; cg++)
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		snapblklist[0] = blkp - snapblklist;
		VI_LOCK(devvp);
		if (devvp->v_rdev->si_snapblklist != NULL)
			panic("ffs_snapshot: non-empty list");
		devvp->v_rdev->si_snapblklist = snapblklist;
		devvp->v_rdev->si_snaplistsize = blkp - snapblklist;
		VI_UNLOCK(devvp);
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	VI_LOCK(devvp);
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %d already on list", ip->i_number);
	TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
	vp->v_vflag |= VV_SYSTEM;
out1:
	/*
	 * Resume operation on filesystem.
	 */
	vfs_write_resume(vp->v_mount);
	vn_start_write(NULL, &wrtmp, V_WAIT);
	if (collectsnapstats && starttime.tv_sec > 0) {
		nanotime(&endtime);
		timespecsub(&endtime, &starttime);
		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
	}
	if (sbp == NULL)
		goto out;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(xp, snaphead, i_nextsnap) {
		if (xp == ip)
			break;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
			    BLK_SNAP);
		else
			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
			    BLK_SNAP);
		if (error) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
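	/*
	 * The device's hint list is still either the preliminary one
	 * built above (for a first snapshot) or the previous snapshot's
	 * list. The complete list for this snapshot is gathered by the
	 * mapacct pass below and then installed as the replacement.
	 */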
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
	 */
	MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snapblklist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	if (ip->i_ump->um_fstype == UFS1)
		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
	else
		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	if (snaplistsize < ip->i_snapblklist - snapblklist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snapblklist;
	snapblklist[0] = snaplistsize;
	ip->i_snapblklist = 0;
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)snapblklist;
	aiov.iov_len = snaplistsize * sizeof(daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset = ip->i_size;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			fs->fs_snapinum[snaploc] = 0;
			FREE(snapblklist, M_UFSMNT);
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
	/*
	 * As this is the newest list, it is the most inclusive, so
	 * should replace the previous list.
	 */
	VI_LOCK(devvp);
	space = devvp->v_rdev->si_snapblklist;
	devvp->v_rdev->si_snapblklist = snapblklist;
	devvp->v_rdev->si_snaplistsize = snaplistsize;
	if (space != NULL)
		FREE(space, M_UFSMNT);
	VI_UNLOCK(devvp);
done:
	free(copy_fs->fs_csp, M_UFSMNT);
	bawrite(sbp);
out:
	if (saved_nice > 0) {
		PROC_LOCK(td->td_proc);
		mtx_lock_spin(&sched_lock);
		sched_nice(td->td_ksegrp, saved_nice);
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(td->td_proc);
	}
	if (fs->fs_active != 0) {
		FREE(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	mp->mnt_flag = flag;
	if (error)
		(void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	if (error)
		vput(vp);
	else
		VOP_UNLOCK(vp, 0, td);
	vn_finished_write(wrtmp);
	return (error);
}
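
/*
 * cgaccount() below is called twice from ffs_snapshot(): with passno 1
 * for every cylinder group while the filesystem is still active, and
 * with passno 2, after suspension, for only those groups that changed
 * in the interim.
 */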

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;
	struct vnode *vp;
	struct buf *nbp;
	int passno;
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP(ip, i_db[loc]) = BLK_NOCOPY;
			else if (passno == 2 &&
			    DIP(ip, i_db[loc]) == BLK_NOCOPY)
				DIP(ip, i_db[loc]) = 0;
			else if (passno == 1 &&
			    DIP(ip, i_db[loc]) == BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error) {
		brelse(bp);
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error) {
				brelse(bp);
				return (error);
			}
			indiroff = 0;
		}
		if (ip->i_ump->um_fstype == UFS1) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	bqrelse(bp);
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}
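
/*
 * A note on the distinguished block values used below (defined in
 * fs.h): BLK_NOCOPY marks a logical block that was free when the
 * snapshot was taken and so never needs to be copied; BLK_SNAP marks
 * a block that is claimed by another snapshot, which likewise need
 * not be copied but must still be accounted for by fsck.
 */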

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
	} else {
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}
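
/*
 * A note on the bookkeeping in the loop above: UFS addresses indirect
 * blocks at negative logical block numbers, so lbn starts at -NDADDR
 * (the single indirect) and steps past each indirect tree in turn;
 * rlbn is the first data block covered by the current tree, and
 * blksperindir is the number of data blocks spanned by one pointer at
 * the current level. indiracct_ufs1() recurses with the same scheme.
 */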

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs1_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din1->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs1_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
	} else {
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs2_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din2->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	if (xp != NULL)
		vrele(ITOV(ip));
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct lock *lkp;
	struct buf *ibp;
	struct fs *fs;
	struct thread *td = curthread;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	devvp = ip->i_devvp;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		VI_LOCK(devvp);
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp), td);
		VI_LOCK(devvp);
		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL, td);
		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) != 0) {
			VI_UNLOCK(devvp);
		} else {
			snapblklist = devvp->v_rdev->si_snapblklist;
			devvp->v_rdev->si_snapblklist = 0;
			devvp->v_rdev->si_snaplistsize = 0;
			devvp->v_rdev->si_copyonwrite = 0;
			devvp->v_vflag &= ~VV_COPYONWRITE;
			lockmgr(lkp, LK_DRAIN | LK_INTERLOCK, VI_MTX(devvp), td);
			lockmgr(lkp, LK_RELEASE, NULL, td);
			lockdestroy(lkp);
			FREE(lkp, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
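	/*
	 * A block grabbed from the filesystem by ffs_snapblkfree() below
	 * is recognizable because its disk address equals
	 * blkstofrags(fs, lbn) for its own logical block number; the
	 * loops that follow use that identity to decide whether a claim
	 * must be passed on before the field is cleared.
	 */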
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP(ip, i_db[blkno]) = 0;
		else if ((dblk == blkstofrags(fs, blkno) &&
		    ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		    ip->i_number))) {
			DIP(ip, i_blocks) -= btodb(fs->fs_bsize);
			DIP(ip, i_db[blkno]) = 0;
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc] = 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				    ffs_snapblkfree(fs, ip->i_devvp, dblk,
				    fs->fs_bsize, ip->i_number))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc] = 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			    ffs_snapblkfree(fs, ip->i_devvp, dblk,
			    fs->fs_bsize, ip->i_number))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;
	ufs2_daddr_t bno;
	long size;
	ino_t inum;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;
	struct snaphead *snaphead;

	lbn = fragstoblks(fs, bno);
retry:
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			    VI_MTX(devvp), td) != 0)
				goto retry;
			snapshot_locked = 1;
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno = ((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno = ((ufs2_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			    LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
			    VI_MTX(devvp), td) != 0) {
				if (lbn >= NDADDR)
					bqrelse(ibp);
				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
				goto retry;
			}
			snapshot_locked = 1;
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		    LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
		    VI_MTX(devvp), td) != 0) {
			if (lbn >= NDADDR)
				bqrelse(ibp);
			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
			goto retry;
		}
		snapshot_locked = 1;
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %jd from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    (intmax_t)lbn, inum);
#endif
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = bno;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP(ip, i_blocks) += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", ip->i_number,
			    (intmax_t)lbn, "for inum", inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snaphead *snaphead;
	struct vnode *vp;
	struct inode *ip, *xp;
	struct uio auio;
	struct iovec aiov;
	void *snapblklist;
	char *reason;
	daddr_t snaplistsize;
	int error, snaploc, loc;

	/*
	 * XXX The following needs to be set before UFS_TRUNCATE or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	snaphead = &devvp->v_rdev->si_snapshots;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
				reason = "non-snapshot";
			} else {
				reason = "old format snapshot";
				(void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
				(void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			}
			printf("ffs_snapshot_mount: %s inode %d\n",
			    reason, fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		/*
		 * If there already exist snapshots on this filesystem, grab a
		 * reference to their shared lock. If this is the first snapshot
		 * on this filesystem, we need to allocate a lock for the
		 * snapshots to share. In either case, acquire the snapshot
		 * lock and give up our original private lock.
		 */
		VI_LOCK(devvp);
		if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
			VI_LOCK(vp);
			vp->v_vnlock = ITOV(xp)->v_vnlock;
			VI_UNLOCK(devvp);
		} else {
			struct lock *lkp;

			VI_UNLOCK(devvp);
			MALLOC(lkp, struct lock *, sizeof(struct lock),
			    M_UFSMNT, M_WAITOK);
			lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
			    LK_CANRECURSE | LK_NOPAUSE);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		}
		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
		transferlockers(&vp->v_lock, vp->v_vnlock);
		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp, 0, td);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL)
		return;
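	/*
	 * The hint list was appended to the snapshot by ffs_snapshot()
	 * just past the last filesystem block: element 0 is the entry
	 * count and the rest are logical block numbers. The count is
	 * read first, then the offset is backed up over it so that the
	 * whole list can be read in one pass.
	 */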
	/*
	 * Allocate the space for the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)&snaplistsize;
	aiov.iov_len = sizeof(snaplistsize);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset =
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		return;
	}
	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	auio.uio_iovcnt = 1;
	aiov.iov_base = snapblklist;
	aiov.iov_len = snaplistsize * sizeof(daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset -= sizeof(snaplistsize);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		FREE(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp, 0, td);
	VI_LOCK(devvp);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
	devvp->v_rdev->si_snaplistsize = snaplistsize;
	devvp->v_rdev->si_snapblklist = (daddr_t *)snapblklist;
	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct snaphead *snaphead = &devvp->v_rdev->si_snapshots;
	struct lock *lkp = NULL;
	struct inode *xp;
	struct vnode *vp;

	VI_LOCK(devvp);
	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
		vp = ITOV(xp);
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_effnlink > 0) {
			VI_UNLOCK(devvp);
			vrele(vp);
			VI_LOCK(devvp);
		}
	}
	if (devvp->v_rdev->si_snapblklist != NULL) {
		FREE(devvp->v_rdev->si_snapblklist, M_UFSMNT);
		devvp->v_rdev->si_snapblklist = NULL;
		devvp->v_rdev->si_snaplistsize = 0;
	}
	if (lkp != NULL) {
		lockdestroy(lkp);
		FREE(lkp, M_UFSMNT);
	}
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
	devvp->v_rdev->si_copyonwrite = 0;
	devvp->v_vflag &= ~VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}
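
/*
 * Illustrative note, not part of the original source: the hook that
 * ffs_snapshot_mount() installed in si_copyonwrite is invoked from
 * the device strategy path before a write is allowed to reach the
 * disk; conceptually the caller does something like the following
 * (a sketch of the intent, not a quotation of the actual caller):
 *
 *	if ((devvp->v_vflag & VV_COPYONWRITE) && bp->b_iocmd == BIO_WRITE)
 *		(*devvp->v_rdev->si_copyonwrite)(devvp, bp);
 */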

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
static int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct snaphead *snaphead;
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp = 0;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;

	if (td->td_proc->p_flag & P_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	ip = TAILQ_FIRST(snaphead);
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = devvp->v_rdev->si_snapblklist;
	upper = devvp->v_rdev->si_snaplistsize - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		VI_UNLOCK(devvp);
		return (0);
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
retry:
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0) {
				VI_LOCK(devvp);
				goto retry;
			}
			snapshot_locked = 1;
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
		      VI_MTX(devvp), td) != 0) {
			VI_LOCK(devvp);
			goto retry;
		}
		snapshot_locked = 1;
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %jd for ",
			    ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd to blkno %jd\n",
			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
		}
#endif
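		/*
		 * Illustrative note, not part of the original source:
		 * savedcbp makes this loop read the old contents from disk
		 * at most once.  With, say, three snapshots needing this
		 * block, the passes below cost one readblock() plus two
		 * bcopy()s, i.e. one disk read and three block writes
		 * rather than three reads and three writes.
		 */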
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 */
static int
readblock(bp, lbn)
	struct buf *bp;
	ufs2_daddr_t lbn;
{
	struct uio auio;
	struct iovec aiov;
	struct thread *td = curthread;
	struct inode *ip = VTOI(bp->b_vp);

	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}
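
/*
 * Illustrative note, not part of the original source: the lbn handed
 * to readblock() is interpreted as a filesystem block number; it is
 * converted to a device byte offset and read with physio(), bypassing
 * both the snapshot file's block map and the buffer cache.  The
 * callers in this file use it as:
 *
 *	error = readblock(cbp, lbn);
 *	if (error) {
 *		bzero(cbp->b_data, fs->fs_bsize);
 *		...
 *	}
 */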