1 /* 2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * Further information about snapshots can be obtained from: 5 * 6 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 7 * 1614 Oxford Street mckusick@mckusick.com 8 * Berkeley, CA 94709-1608 +1-510-843-9542 9 * USA 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 * 33 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 34 * $FreeBSD$ 35 */ 36 37 #include <sys/param.h> 38 #include <sys/kernel.h> 39 #include <sys/systm.h> 40 #include <sys/conf.h> 41 #include <sys/bio.h> 42 #include <sys/buf.h> 43 #include <sys/proc.h> 44 #include <sys/namei.h> 45 #include <sys/stat.h> 46 #include <sys/malloc.h> 47 #include <sys/mount.h> 48 #include <sys/resource.h> 49 #include <sys/resourcevar.h> 50 #include <sys/vnode.h> 51 52 #include <ufs/ufs/extattr.h> 53 #include <ufs/ufs/quota.h> 54 #include <ufs/ufs/ufsmount.h> 55 #include <ufs/ufs/inode.h> 56 #include <ufs/ufs/ufs_extern.h> 57 58 #include <ufs/ffs/fs.h> 59 #include <ufs/ffs/ffs_extern.h> 60 61 #define KERNCRED thread0.td_ucred 62 #define DEBUG 1 63 64 static int cgaccount(int, struct vnode *, struct buf *, int); 65 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 66 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 67 ufs_lbn_t, int), int); 68 static int indiracct_ufs1(struct vnode *, struct vnode *, int, 69 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 70 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 71 ufs_lbn_t, int), int); 72 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 73 struct fs *, ufs_lbn_t, int); 74 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 75 struct fs *, ufs_lbn_t, int); 76 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 77 struct fs *, ufs_lbn_t, int); 78 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 79 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 80 ufs_lbn_t, int), int); 81 static int indiracct_ufs2(struct vnode *, struct vnode *, int, 82 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 83 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 84 ufs_lbn_t, int), int); 85 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t 
*, ufs2_daddr_t *, 86 struct fs *, ufs_lbn_t, int); 87 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 88 struct fs *, ufs_lbn_t, int); 89 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 90 struct fs *, ufs_lbn_t, int); 91 static int ffs_copyonwrite(struct vnode *, struct buf *); 92 static int readblock(struct buf *, ufs2_daddr_t); 93 94 /* 95 * To ensure the consistency of snapshots across crashes, we must 96 * synchronously write out copied blocks before allowing the 97 * originals to be modified. Because of the rather severe speed 98 * penalty that this imposes, the following flag allows this 99 * crash persistence to be disabled. 100 */ 101 int dopersistence = 0; 102 103 #ifdef DEBUG 104 #include <sys/sysctl.h> 105 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); 106 int snapdebug = 0; 107 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); 108 int collectsnapstats = 0; 109 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 110 0, ""); 111 #endif /* DEBUG */ 112 113 /* 114 * Create a snapshot file and initialize it for the filesystem. 115 */ 116 int 117 ffs_snapshot(mp, snapfile) 118 struct mount *mp; 119 char *snapfile; 120 { 121 ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; 122 int error, cg, snaploc; 123 int i, size, len, loc; 124 int flag = mp->mnt_flag; 125 struct timespec starttime = {0, 0}, endtime; 126 char saved_nice = 0; 127 long redo = 0, snaplistsize = 0; 128 int32_t *lp; 129 void *space; 130 struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs; 131 struct snaphead *snaphead; 132 struct thread *td = curthread; 133 struct inode *ip, *xp; 134 struct buf *bp, *nbp, *ibp, *sbp = NULL; 135 struct nameidata nd; 136 struct mount *wrtmp; 137 struct vattr vat; 138 struct vnode *vp, *xvp, *nvp, *devvp; 139 struct uio auio; 140 struct iovec aiov; 141 142 /* 143 * Need to serialize access to snapshot code per filesystem. 
144 */ 145 /* 146 * Assign a snapshot slot in the superblock. 147 */ 148 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 149 if (fs->fs_snapinum[snaploc] == 0) 150 break; 151 if (snaploc == FSMAXSNAP) 152 return (ENOSPC); 153 /* 154 * Create the snapshot file. 155 */ 156 restart: 157 NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td); 158 if ((error = namei(&nd)) != 0) 159 return (error); 160 if (nd.ni_vp != NULL) { 161 vput(nd.ni_vp); 162 error = EEXIST; 163 } 164 if (nd.ni_dvp->v_mount != mp) 165 error = EXDEV; 166 if (error) { 167 NDFREE(&nd, NDF_ONLY_PNBUF); 168 if (nd.ni_dvp == nd.ni_vp) 169 vrele(nd.ni_dvp); 170 else 171 vput(nd.ni_dvp); 172 return (error); 173 } 174 VATTR_NULL(&vat); 175 vat.va_type = VREG; 176 vat.va_mode = S_IRUSR; 177 vat.va_vaflags |= VA_EXCLUSIVE; 178 if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) 179 wrtmp = NULL; 180 if (wrtmp != mp) 181 panic("ffs_snapshot: mount mismatch"); 182 if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { 183 NDFREE(&nd, NDF_ONLY_PNBUF); 184 vput(nd.ni_dvp); 185 if ((error = vn_start_write(NULL, &wrtmp, 186 V_XSLEEP | PCATCH)) != 0) 187 return (error); 188 goto restart; 189 } 190 VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE); 191 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); 192 vput(nd.ni_dvp); 193 if (error) { 194 NDFREE(&nd, NDF_ONLY_PNBUF); 195 vn_finished_write(wrtmp); 196 return (error); 197 } 198 vp = nd.ni_vp; 199 ip = VTOI(vp); 200 devvp = ip->i_devvp; 201 /* 202 * Allocate and copy the last block contents so as to be able 203 * to set size to that of the filesystem. 
204 */ 205 numblks = howmany(fs->fs_size, fs->fs_frag); 206 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 207 fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 208 if (error) 209 goto out; 210 ip->i_size = lblktosize(fs, (off_t)numblks); 211 DIP(ip, i_size) = ip->i_size; 212 ip->i_flag |= IN_CHANGE | IN_UPDATE; 213 if ((error = readblock(bp, numblks - 1)) != 0) 214 goto out; 215 bawrite(bp); 216 /* 217 * Preallocate critical data structures so that we can copy 218 * them in without further allocation after we suspend all 219 * operations on the filesystem. We would like to just release 220 * the allocated buffers without writing them since they will 221 * be filled in below once we are ready to go, but this upsets 222 * the soft update code, so we go ahead and write the new buffers. 223 * 224 * Allocate all indirect blocks and mark all of them as not 225 * needing to be copied. 226 */ 227 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 228 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 229 fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); 230 if (error) 231 goto out; 232 bawrite(ibp); 233 } 234 /* 235 * Allocate copies for the superblock and its summary information. 236 */ 237 error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 238 0, &nbp); 239 if (error) 240 goto out; 241 bawrite(nbp); 242 blkno = fragstoblks(fs, fs->fs_csaddr); 243 len = howmany(fs->fs_cssize, fs->fs_bsize); 244 for (loc = 0; loc < len; loc++) { 245 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 246 fs->fs_bsize, KERNCRED, 0, &nbp); 247 if (error) 248 goto out; 249 bawrite(nbp); 250 } 251 /* 252 * Allocate all cylinder group blocks. 253 */ 254 for (cg = 0; cg < fs->fs_ncg; cg++) { 255 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 256 fs->fs_bsize, KERNCRED, 0, &nbp); 257 if (error) 258 goto out; 259 bawrite(nbp); 260 } 261 /* 262 * Copy all the cylinder group maps. 
Although the 263 * filesystem is still active, we hope that only a few 264 * cylinder groups will change between now and when we 265 * suspend operations. Thus, we will be able to quickly 266 * touch up the few cylinder groups that changed during 267 * the suspension period. 268 */ 269 len = howmany(fs->fs_ncg, NBBY); 270 MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK); 271 bzero(fs->fs_active, len); 272 for (cg = 0; cg < fs->fs_ncg; cg++) { 273 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 274 fs->fs_bsize, KERNCRED, 0, &nbp); 275 if (error) 276 goto out; 277 error = cgaccount(cg, vp, nbp, 1); 278 bawrite(nbp); 279 if (error) 280 goto out; 281 } 282 /* 283 * Change inode to snapshot type file. 284 */ 285 ip->i_flags |= SF_SNAPSHOT; 286 DIP(ip, i_flags) = ip->i_flags; 287 ip->i_flag |= IN_CHANGE | IN_UPDATE; 288 /* 289 * Ensure that the snapshot is completely on disk. 290 * Since we have marked it as a snapshot it is safe to 291 * unlock it as no process will be allowed to write to it. 292 */ 293 if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0) 294 goto out; 295 VOP_UNLOCK(vp, 0, td); 296 /* 297 * All allocations are done, so we can now snapshot the system. 298 * 299 * Recind nice scheduling while running with the filesystem suspended. 300 */ 301 if (td->td_ksegrp->kg_nice > 0) { 302 saved_nice = td->td_ksegrp->kg_nice; 303 td->td_ksegrp->kg_nice = 0; 304 } 305 /* 306 * Suspend operation on filesystem. 307 */ 308 for (;;) { 309 vn_finished_write(wrtmp); 310 if ((error = vfs_write_suspend(vp->v_mount)) != 0) { 311 vn_start_write(NULL, &wrtmp, V_WAIT); 312 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 313 goto out; 314 } 315 if (mp->mnt_kern_flag & MNTK_SUSPENDED) 316 break; 317 vn_start_write(NULL, &wrtmp, V_WAIT); 318 } 319 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 320 if (collectsnapstats) 321 nanotime(&starttime); 322 /* 323 * First, copy all the cylinder group maps that have changed. 
324 */ 325 for (cg = 0; cg < fs->fs_ncg; cg++) { 326 if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) 327 continue; 328 redo++; 329 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 330 fs->fs_bsize, KERNCRED, 0, &nbp); 331 if (error) 332 goto out1; 333 error = cgaccount(cg, vp, nbp, 2); 334 bawrite(nbp); 335 if (error) 336 goto out1; 337 } 338 /* 339 * Grab a copy of the superblock and its summary information. 340 * We delay writing it until the suspension is released below. 341 */ 342 error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, 343 KERNCRED, &sbp); 344 if (error) { 345 brelse(sbp); 346 sbp = NULL; 347 goto out1; 348 } 349 loc = blkoff(fs, fs->fs_sblockloc); 350 copy_fs = (struct fs *)(sbp->b_data + loc); 351 bcopy(fs, copy_fs, fs->fs_sbsize); 352 if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) 353 copy_fs->fs_clean = 1; 354 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 355 if (fs->fs_sbsize < size) 356 bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize); 357 size = blkroundup(fs, fs->fs_cssize); 358 if (fs->fs_contigsumsize > 0) 359 size += fs->fs_ncg * sizeof(int32_t); 360 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 361 copy_fs->fs_csp = space; 362 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 363 (char *)space += fs->fs_cssize; 364 loc = howmany(fs->fs_cssize, fs->fs_fsize); 365 i = fs->fs_frag - loc % fs->fs_frag; 366 len = (i == fs->fs_frag) ? 
0 : i * fs->fs_fsize; 367 if (len > 0) { 368 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 369 len, KERNCRED, &bp)) != 0) { 370 brelse(bp); 371 free(copy_fs->fs_csp, M_UFSMNT); 372 bawrite(sbp); 373 sbp = NULL; 374 goto out1; 375 } 376 bcopy(bp->b_data, space, (u_int)len); 377 (char *)space += len; 378 bp->b_flags |= B_INVAL | B_NOCACHE; 379 brelse(bp); 380 } 381 if (fs->fs_contigsumsize > 0) { 382 copy_fs->fs_maxcluster = lp = space; 383 for (i = 0; i < fs->fs_ncg; i++) 384 *lp++ = fs->fs_contigsumsize; 385 } 386 /* 387 * We must check for active files that have been unlinked 388 * (e.g., with a zero link count). We have to expunge all 389 * trace of these files from the snapshot so that they are 390 * not reclaimed prematurely by fsck or unnecessarily dumped. 391 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 392 * spec_strategy about writing on a suspended filesystem. 393 * Note that we skip unlinked snapshot files as they will 394 * be handled separately below. 395 * 396 * We also calculate the needed size for the snapshot list. 397 */ 398 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 399 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 400 mp->mnt_kern_flag &= ~MNTK_SUSPENDED; 401 mtx_lock(&mntvnode_mtx); 402 loop: 403 for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) { 404 /* 405 * Make sure this vnode wasn't reclaimed in getnewvnode(). 406 * Start over if it has (it won't be on the list anymore). 
407 */ 408 if (xvp->v_mount != mp) 409 goto loop; 410 nvp = TAILQ_NEXT(xvp, v_nmntvnodes); 411 mtx_unlock(&mntvnode_mtx); 412 mp_fixme("Unlocked GETATTR."); 413 if (vrefcnt(xvp) == 0 || xvp->v_type == VNON || 414 (VTOI(xvp)->i_flags & SF_SNAPSHOT) || 415 (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 && 416 vat.va_nlink > 0)) { 417 mtx_lock(&mntvnode_mtx); 418 continue; 419 } 420 if (snapdebug) 421 vprint("ffs_snapshot: busy vnode", xvp); 422 if (vn_lock(xvp, LK_EXCLUSIVE, td) != 0) 423 goto loop; 424 xp = VTOI(xvp); 425 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 426 VOP_UNLOCK(xvp, 0, td); 427 continue; 428 } 429 /* 430 * If there is a fragment, clear it here. 431 */ 432 blkno = 0; 433 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 434 if (loc < NDADDR) { 435 len = fragroundup(fs, blkoff(fs, xp->i_size)); 436 if (len < fs->fs_bsize) { 437 ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]), 438 len, xp->i_number); 439 blkno = DIP(xp, i_db[loc]); 440 DIP(xp, i_db[loc]) = 0; 441 } 442 } 443 snaplistsize += 1; 444 if (xp->i_ump->um_fstype == UFS1) 445 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 446 BLK_NOCOPY); 447 else 448 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 449 BLK_NOCOPY); 450 if (blkno) 451 DIP(xp, i_db[loc]) = blkno; 452 if (!error) 453 error = ffs_freefile(copy_fs, vp, xp->i_number, 454 xp->i_mode); 455 VOP_UNLOCK(xvp, 0, td); 456 if (error) { 457 free(copy_fs->fs_csp, M_UFSMNT); 458 bawrite(sbp); 459 sbp = NULL; 460 goto out1; 461 } 462 mtx_lock(&mntvnode_mtx); 463 } 464 mtx_unlock(&mntvnode_mtx); 465 /* 466 * If there already exist snapshots on this filesystem, grab a 467 * reference to their shared lock. If this is the first snapshot 468 * on this filesystem, we need to allocate a lock for the snapshots 469 * to share. In either case, acquire the snapshot lock and give 470 * up our original private lock. 
471 */ 472 VI_LOCK(devvp); 473 snaphead = &devvp->v_rdev->si_snapshots; 474 if ((xp = TAILQ_FIRST(snaphead)) != NULL) { 475 VI_LOCK(vp); 476 vp->v_vnlock = ITOV(xp)->v_vnlock; 477 VI_UNLOCK(devvp); 478 } else { 479 struct lock *lkp; 480 481 VI_UNLOCK(devvp); 482 MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT, 483 M_WAITOK); 484 lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT, 485 LK_CANRECURSE | LK_NOPAUSE); 486 VI_LOCK(vp); 487 vp->v_vnlock = lkp; 488 } 489 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td); 490 transferlockers(&vp->v_lock, vp->v_vnlock); 491 lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); 492 /* 493 * If this is the first snapshot on this filesystem, then we need 494 * to allocate the space for the list of preallocated snapshot blocks. 495 * This list will be refined below, but this preliminary one will 496 * keep us out of deadlock until the full one is ready. 497 */ 498 if (xp == NULL) { 499 MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), 500 M_UFSMNT, M_WAITOK); 501 blkp = &snapblklist[1]; 502 *blkp++ = lblkno(fs, fs->fs_sblockloc); 503 blkno = fragstoblks(fs, fs->fs_csaddr); 504 for (cg = 0; cg < fs->fs_ncg; cg++) { 505 if (fragstoblks(fs, cgtod(fs, cg) > blkno)) 506 break; 507 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 508 } 509 len = howmany(fs->fs_cssize, fs->fs_bsize); 510 for (loc = 0; loc < len; loc++) 511 *blkp++ = blkno + loc; 512 for (; cg < fs->fs_ncg; cg++) 513 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 514 snapblklist[0] = blkp - snapblklist; 515 VI_LOCK(devvp); 516 if (devvp->v_rdev->si_snapblklist != NULL) 517 panic("ffs_snapshot: non-empty list"); 518 devvp->v_rdev->si_snapblklist = snapblklist; 519 devvp->v_rdev->si_snaplistsize = blkp - snapblklist; 520 VI_UNLOCK(devvp); 521 } 522 /* 523 * Record snapshot inode. Since this is the newest snapshot, 524 * it must be placed at the end of the list. 
525 */ 526 VI_LOCK(devvp); 527 fs->fs_snapinum[snaploc] = ip->i_number; 528 if (ip->i_nextsnap.tqe_prev != 0) 529 panic("ffs_snapshot: %d already on list", ip->i_number); 530 TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap); 531 devvp->v_rdev->si_copyonwrite = ffs_copyonwrite; 532 devvp->v_vflag |= VV_COPYONWRITE; 533 VI_UNLOCK(devvp); 534 ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); 535 vp->v_vflag |= VV_SYSTEM; 536 out1: 537 /* 538 * Resume operation on filesystem. 539 */ 540 vfs_write_resume(vp->v_mount); 541 vn_start_write(NULL, &wrtmp, V_WAIT); 542 if (collectsnapstats && starttime.tv_sec > 0) { 543 nanotime(&endtime); 544 timespecsub(&endtime, &starttime); 545 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 546 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 547 endtime.tv_nsec / 1000000, redo, fs->fs_ncg); 548 } 549 if (sbp == NULL) 550 goto out; 551 /* 552 * Copy allocation information from all the snapshots in 553 * this snapshot and then expunge them from its view. 554 */ 555 snaphead = &devvp->v_rdev->si_snapshots; 556 TAILQ_FOREACH(xp, snaphead, i_nextsnap) { 557 if (xp == ip) 558 break; 559 if (xp->i_ump->um_fstype == UFS1) 560 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 561 BLK_SNAP); 562 else 563 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 564 BLK_SNAP); 565 if (error) { 566 fs->fs_snapinum[snaploc] = 0; 567 goto done; 568 } 569 } 570 /* 571 * Allocate space for the full list of preallocated snapshot blocks. 572 */ 573 MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), 574 M_UFSMNT, M_WAITOK); 575 ip->i_snapblklist = &snapblklist[1]; 576 /* 577 * Expunge the blocks used by the snapshots from the set of 578 * blocks marked as used in the snapshot bitmaps. Also, collect 579 * the list of allocated blocks in i_snapblklist. 
580 */ 581 if (ip->i_ump->um_fstype == UFS1) 582 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 583 else 584 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 585 if (error) { 586 fs->fs_snapinum[snaploc] = 0; 587 FREE(snapblklist, M_UFSMNT); 588 goto done; 589 } 590 if (snaplistsize < ip->i_snapblklist - snapblklist) 591 panic("ffs_snapshot: list too small"); 592 snaplistsize = ip->i_snapblklist - snapblklist; 593 snapblklist[0] = snaplistsize; 594 ip->i_snapblklist = 0; 595 /* 596 * Write out the list of allocated blocks to the end of the snapshot. 597 */ 598 auio.uio_iov = &aiov; 599 auio.uio_iovcnt = 1; 600 aiov.iov_base = (void *)snapblklist; 601 aiov.iov_len = snaplistsize * sizeof(daddr_t); 602 auio.uio_resid = aiov.iov_len;; 603 auio.uio_offset = ip->i_size; 604 auio.uio_segflg = UIO_SYSSPACE; 605 auio.uio_rw = UIO_WRITE; 606 auio.uio_td = td; 607 if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 608 fs->fs_snapinum[snaploc] = 0; 609 FREE(snapblklist, M_UFSMNT); 610 goto done; 611 } 612 /* 613 * Write the superblock and its summary information 614 * to the snapshot. 615 */ 616 blkno = fragstoblks(fs, fs->fs_csaddr); 617 len = howmany(fs->fs_cssize, fs->fs_bsize); 618 space = copy_fs->fs_csp; 619 for (loc = 0; loc < len; loc++) { 620 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); 621 if (error) { 622 brelse(nbp); 623 fs->fs_snapinum[snaploc] = 0; 624 FREE(snapblklist, M_UFSMNT); 625 goto done; 626 } 627 bcopy(space, nbp->b_data, fs->fs_bsize); 628 space = (char *)space + fs->fs_bsize; 629 bawrite(nbp); 630 } 631 /* 632 * As this is the newest list, it is the most inclusive, so 633 * should replace the previous list. 
634 */ 635 VI_LOCK(devvp); 636 space = devvp->v_rdev->si_snapblklist; 637 devvp->v_rdev->si_snapblklist = snapblklist; 638 devvp->v_rdev->si_snaplistsize = snaplistsize; 639 if (space != NULL) 640 FREE(space, M_UFSMNT); 641 VI_UNLOCK(devvp); 642 done: 643 free(copy_fs->fs_csp, M_UFSMNT); 644 bawrite(sbp); 645 out: 646 if (saved_nice > 0) 647 td->td_ksegrp->kg_nice = saved_nice; 648 if (fs->fs_active != 0) { 649 FREE(fs->fs_active, M_DEVBUF); 650 fs->fs_active = 0; 651 } 652 mp->mnt_flag = flag; 653 if (error) 654 (void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td); 655 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 656 if (error) 657 vput(vp); 658 else 659 VOP_UNLOCK(vp, 0, td); 660 vn_finished_write(wrtmp); 661 return (error); 662 } 663 664 /* 665 * Copy a cylinder group map. All the unallocated blocks are marked 666 * BLK_NOCOPY so that the snapshot knows that it need not copy them 667 * if they are later written. If passno is one, then this is a first 668 * pass, so only setting needs to be done. If passno is 2, then this 669 * is a revision to a previous pass which must be undone as the 670 * replacement pass is done. 
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;			/* cylinder group number to copy */
	struct vnode *vp;	/* snapshot vnode receiving the copy */
	struct buf *nbp;	/* snapshot buffer that gets the cg map */
	int passno;		/* 1 = initial pass, 2 = revision pass */
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/* Read the on-disk cylinder group and validate its magic. */
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	/* Mark this cg as captured so the post-suspension pass can skip it. */
	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	/* Zero the tail of the snapshot block beyond the cg map itself. */
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	/* First logical block of this cg within the snapshot file. */
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		/*
		 * Direct-block portion: free blocks in the cg become
		 * BLK_NOCOPY in the snapshot inode; pass 2 undoes marks
		 * for blocks that were allocated since pass 1.
		 */
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP(ip, i_db[loc]) = BLK_NOCOPY;
			else if (passno == 2 && DIP(ip, i_db[loc]) == BLK_NOCOPY)
				DIP(ip, i_db[loc]) = 0;
			else if (passno == 1 && DIP(ip, i_db[loc]) == BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	/* Fetch the indirect block covering the rest of this cg's range. */
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error) {
		brelse(bp);
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			/* Crossed into the next indirect block; swap buffers. */
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error) {
				brelse(bp);
				return (error);
			}
			indiroff = 0;
		}
		if (ip->i_ump->um_fstype == UFS1) {
			/* Same mark/unmark/panic logic as the direct blocks. */
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	bqrelse(bp);
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* the snapshot being built */
	struct inode *cancelip;	/* inode whose blocks are expunged */
	struct fs *fs;		/* superblock (possibly the suspended copy) */
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_NOCOPY or BLK_SNAP */
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
	} else {
		/*
		 * NOTE(review): P_COWINPROGRESS appears to suppress
		 * copy-on-write recursion while we allocate within the
		 * snapshot itself — confirm against ffs_copyonwrite().
		 */
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		/* Block already copied into the snapshot: just read it. */
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		/* Not yet copied: allocate it and fill from the device. */
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct blocks (lbn 0) ... */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	/* ... then the indirect pointers themselves (lbn -1 = "no lbn"). */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	/* Walk each level of indirection covering the remaining blocks. */
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* snapshot being built */
	struct vnode *cancelvp;	/* vnode whose indirect chain is walked */
	int level;		/* remaining levels of indirection */
	ufs1_daddr_t blkno;	/* disk address of this indirect block */
	ufs_lbn_t lbn;		/* (negative) lbn of this indirect block */
	ufs_lbn_t rlbn;		/* first data lbn covered by this block */
	ufs_lbn_t remblks;	/* data blocks remaining to account */
	ufs_lbn_t blksperindir;	/* data blocks per pointer at this level */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_NOCOPY or BLK_SNAP */
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/* Copy the pointers out so the buffer can be released early. */
	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;	/* [oldblkp, lastblkp) range */
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;	/* snapshot vnode whose map is updated */
	ufs1_daddr_t *oldblkp, *lastblkp;	/* [oldblkp, lastblkp) range */
	struct fs *fs;
	ufs_lbn_t lblkno;	/* unused here; kept for acctfunc signature */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		/* Skip holes and blocks already classified. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			/* Pointer lives in the snapshot inode itself. */
			blkp = &ip->i_din1->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/* Pointer lives in one of its indirect blocks. */
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;	/* range of block pointers */
	struct fs *fs;
	ufs_lbn_t lblkno;	/* -1 means "do not record in snapblklist" */
	int expungetype;
{
	ufs1_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		/* Record logical blocks the expunged snapshot claimed. */
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;		/* snapshot doing the expunging */
	struct inode *cancelip;		/* inode being expunged */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;		/* BLK_SNAP or BLK_NOCOPY */
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
1073 */ 1074 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1075 blkno = 0; 1076 if (lbn < NDADDR) { 1077 blkno = VTOI(snapvp)->i_din2->di_db[lbn]; 1078 } else { 1079 td->td_proc->p_flag |= P_COWINPROGRESS; 1080 error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 1081 fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 1082 td->td_proc->p_flag &= ~P_COWINPROGRESS; 1083 if (error) 1084 return (error); 1085 indiroff = (lbn - NDADDR) % NINDIR(fs); 1086 blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; 1087 bqrelse(bp); 1088 } 1089 if (blkno != 0) { 1090 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) 1091 return (error); 1092 } else { 1093 error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 1094 fs->fs_bsize, KERNCRED, 0, &bp); 1095 if (error) 1096 return (error); 1097 if ((error = readblock(bp, lbn)) != 0) 1098 return (error); 1099 } 1100 /* 1101 * Set a snapshot inode to be a zero length file, regular files 1102 * to be completely unallocated. 1103 */ 1104 dip = (struct ufs2_dinode *)bp->b_data + 1105 ino_to_fsbo(fs, cancelip->i_number); 1106 if (expungetype == BLK_NOCOPY) 1107 dip->di_mode = 0; 1108 dip->di_size = 0; 1109 dip->di_blocks = 0; 1110 dip->di_flags &= ~SF_SNAPSHOT; 1111 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); 1112 bdwrite(bp); 1113 /* 1114 * Now go through and expunge all the blocks in the file 1115 * using the function requested. 
 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct blocks first, then the indirect pointers themselves. */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	/* Walk each level of indirection that the file size requires. */
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;		/* snapshot doing the accounting */
	struct vnode *cancelvp;		/* vnode whose blocks are walked */
	int level;			/* level of indirection */
	ufs2_daddr_t blkno;		/* physical block of this indirect */
	ufs_lbn_t lbn;			/* logical block of this indirect */
	ufs_lbn_t rlbn;			/* first data lbn it maps */
	ufs_lbn_t remblks;		/* data blocks remaining */
	ufs_lbn_t blksperindir;		/* data blocks mapped per pointer */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	/* Only read from disk if the buffer holds no valid data yet. */
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/*
	 * Work on a private copy of the indirect block so the buffer
	 * can be released before the (possibly recursive) accounting.
	 */
	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;	/* range of block pointers */
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;			/* snapshot vnode being updated */
	ufs2_daddr_t *oldblkp, *lastblkp;	/* range of block pointers */
	struct fs *fs;
	ufs_lbn_t lblkno;			/* unused here; kept for acctfunc signature */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		/* Mark the snapshot's own mapping for this physical block. */
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din2->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;	/* range of block pointers */
	struct fs *fs;
	ufs_lbn_t lblkno;	/* -1 means "do not record in snapblklist" */
	int expungetype;
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		/* Record logical blocks the expunged snapshot claimed. */
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	if (xp != NULL)
		vrele(ITOV(ip));
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the remaining entries down over the freed slot. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;	/* snapshot vnode being removed */
{
	struct inode *ip;
	struct vnode *devvp;
	struct lock *lkp;
	struct buf *ibp;
	struct fs *fs;
	struct thread *td = curthread;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	devvp = ip->i_devvp;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		/* lockmgr with LK_INTERLOCK drops the interlock; retake it. */
		VI_LOCK(devvp);
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp), td);
		VI_LOCK(devvp);
		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		/* Give up the shared snapshot lock; revert to private lock. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL, td);
		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) != 0) {
			VI_UNLOCK(devvp);
		} else {
			/* Last snapshot gone: tear down COW machinery. */
			snapblklist = devvp->v_rdev->si_snapblklist;
			devvp->v_rdev->si_snapblklist = 0;
			devvp->v_rdev->si_snaplistsize = 0;
			devvp->v_rdev->si_copyonwrite = 0;
			devvp->v_vflag &= ~VV_COPYONWRITE;
			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
			lockmgr(lkp, LK_RELEASE, NULL, td);
			lockdestroy(lkp);
			FREE(lkp, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP(ip, i_db[blkno]) = 0;
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP(ip, i_blocks) -= btodb(fs->fs_bsize);
			DIP(ip, i_db[blkno]) = 0;
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		/*
		 * NOTE(review): fs_size is in fragments while blkno is in
		 * blocks here — confirm the intended bound for "last".
		 */
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
				     fs->fs_bsize, ip->i_number))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
			     fs->fs_bsize, ip->i_number))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed.
 * Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;	/* device vnode holding the snapshot list */
	ufs2_daddr_t bno;	/* block being freed */
	long size;		/* size of the block/fragment */
	ino_t inum;		/* inode the block belonged to */
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;
	struct snaphead *snaphead;

	lbn = fragstoblks(fs, bno);
retry:
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			/* Need the snapshot lock to probe indirect blocks. */
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0)
				goto retry;
			snapshot_locked = 1;
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			/* Try-lock; on failure wait for the lock, then retry. */
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
			      VI_MTX(devvp), td) != 0) {
				if (lbn >= NDADDR)
					bqrelse(ibp);
				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
				goto retry;
			}
			snapshot_locked = 1;
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
 */
		/* Try-lock; on failure wait for the lock, then retry. */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
		      VI_MTX(devvp), td) != 0) {
			if (lbn >= NDADDR)
				bqrelse(ibp);
			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
			goto retry;
		}
		snapshot_locked = 1;
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %jd from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    (intmax_t)lbn, inum);
#endif
			/* Full-size block: claim it directly for this snapshot. */
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = bno;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP(ip, i_blocks) += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", ip->i_number,
			    (intmax_t)lbn, "for inum", inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snaphead *snaphead;
	struct vnode *vp;
	struct inode *ip, *xp;
	struct uio auio;
	struct iovec aiov;
	void *snapblklist;
	char *reason;
	daddr_t snaplistsize;
	int error, snaploc, loc;

	/*
	 * XXX The following needs to be set before UFS_TRUNCATE or
	 * VOP_READ can be called.
 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	snaphead = &devvp->v_rdev->si_snapshots;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0){
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		/*
		 * Discard entries that are not snapshots, or that are
		 * old-format snapshots (identified by their size lacking
		 * the appended block-hints list).
		 */
		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
				reason = "non-snapshot";
			} else {
				reason = "old format snapshot";
				(void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
				(void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			}
			printf("ffs_snapshot_mount: %s inode %d\n",
			    reason, fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			/* Keep the superblock snapshot list dense. */
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		/*
		 * If there already exist snapshots on this filesystem, grab a
		 * reference to their shared lock. If this is the first snapshot
		 * on this filesystem, we need to allocate a lock for the
		 * snapshots to share. In either case, acquire the snapshot
		 * lock and give up our original private lock.
 */
		VI_LOCK(devvp);
		if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
			/* Share the lock already used by existing snapshots. */
			VI_LOCK(vp);
			vp->v_vnlock = ITOV(xp)->v_vnlock;
			VI_UNLOCK(devvp);
		} else {
			struct lock *lkp;

			VI_UNLOCK(devvp);
			MALLOC(lkp, struct lock *, sizeof(struct lock),
			    M_UFSMNT, M_WAITOK);
			lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
			    LK_CANRECURSE | LK_NOPAUSE);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		}
		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
		/* Move waiters from the private lock, then release it. */
		transferlockers(&vp->v_lock, vp->v_vnlock);
		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp, 0, td);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL)
		return;
	/*
	 * Allocate the space for the block hints list. We always want to
	 * use the list from the newest snapshot.
 */
	/* First read the list size stored just past the filesystem image. */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)&snaplistsize;
	aiov.iov_len = sizeof(snaplistsize);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset =
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		return;
	}
	/* Then read the hints list itself (size word precedes it). */
	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	auio.uio_iovcnt = 1;
	aiov.iov_base = snapblklist;
	aiov.iov_len = snaplistsize * sizeof (daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset -= sizeof(snaplistsize);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		FREE(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp, 0, td);
	VI_LOCK(devvp);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
	devvp->v_rdev->si_snaplistsize = snaplistsize;
	devvp->v_rdev->si_snapblklist = (daddr_t *)snapblklist;
	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct snaphead *snaphead = &devvp->v_rdev->si_snapshots;
	struct lock *lkp = NULL;
	struct inode *xp;
	struct vnode *vp;

	VI_LOCK(devvp);
	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
		vp = ITOV(xp);
		/* Revert each snapshot vnode to its private lock. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_effnlink > 0) {
			VI_UNLOCK(devvp);
			vrele(vp);
			VI_LOCK(devvp);
		}
	}
	if (devvp->v_rdev->si_snapblklist != NULL) {
		FREE(devvp->v_rdev->si_snapblklist, M_UFSMNT);
		devvp->v_rdev->si_snapblklist = NULL;
		devvp->v_rdev->si_snaplistsize = 0;
	}
	/* The shared snapshot lock (if any) is no longer referenced. */
	if (lkp != NULL) {
		lockdestroy(lkp);
		FREE(lkp, M_UFSMNT);
	}
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
	devvp->v_rdev->si_copyonwrite = 0;
	devvp->v_vflag &= ~VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
static int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;	/* device vnode holding the snapshot list */
	struct buf *bp;		/* buffer about to be written */
{
	struct snaphead *snaphead;
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp = 0;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;

	if (td->td_proc->p_flag & P_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
 */
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	ip = TAILQ_FIRST(snaphead);
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	/* Binary search of the sorted preallocated block-hints list. */
	snapblklist = devvp->v_rdev->si_snapblklist;
	upper = devvp->v_rdev->si_snaplistsize - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		/* Found in the hints list: no copy needed. */
		VI_UNLOCK(devvp);
		return (0);
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
retry:
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			/* Need the snapshot lock to probe indirect blocks. */
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0) {
				VI_LOCK(devvp);
				goto retry;
			}
			snapshot_locked = 1;
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
		      VI_MTX(devvp), td) != 0) {
			VI_LOCK(devvp);
			goto retry;
		}
		snapshot_locked = 1;
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %jd for ",
			    ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd to blkno %jd\n",
			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 */
static int
readblock(bp, lbn)
	struct buf *bp;		/* destination buffer */
	ufs2_daddr_t lbn;	/* logical block to read */
{
	struct uio auio;
	struct iovec aiov;
	struct thread *td = curthread;
	struct inode *ip = VTOI(bp->b_vp);

	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	/* Read directly from the underlying device via raw physio. */
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}