1 /* 2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * Further information about snapshots can be obtained from: 5 * 6 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 7 * 1614 Oxford Street mckusick@mckusick.com 8 * Berkeley, CA 94709-1608 +1-510-843-9542 9 * USA 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 34 * $FreeBSD$ 35 */ 36 37 #include <sys/param.h> 38 #include <sys/stdint.h> 39 #include <sys/kernel.h> 40 #include <sys/systm.h> 41 #include <sys/conf.h> 42 #include <sys/bio.h> 43 #include <sys/buf.h> 44 #include <sys/proc.h> 45 #include <sys/namei.h> 46 #include <sys/stat.h> 47 #include <sys/malloc.h> 48 #include <sys/mount.h> 49 #include <sys/resource.h> 50 #include <sys/resourcevar.h> 51 #include <sys/vnode.h> 52 53 #include <ufs/ufs/extattr.h> 54 #include <ufs/ufs/quota.h> 55 #include <ufs/ufs/ufsmount.h> 56 #include <ufs/ufs/inode.h> 57 #include <ufs/ufs/ufs_extern.h> 58 59 #include <ufs/ffs/fs.h> 60 #include <ufs/ffs/ffs_extern.h> 61 62 #define KERNCRED thread0.td_ucred 63 #define DEBUG 1 64 65 static int cgaccount(int, struct vnode *, struct buf *, int); 66 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 67 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 68 ufs_lbn_t, int), int); 69 static int indiracct_ufs1(struct vnode *, struct vnode *, int, 70 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 71 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 72 ufs_lbn_t, int), int); 73 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 74 struct fs *, ufs_lbn_t, int); 75 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 76 struct fs *, ufs_lbn_t, int); 77 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 78 struct fs *, ufs_lbn_t, int); 79 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 80 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 81 ufs_lbn_t, int), int); 82 static int indiracct_ufs2(struct vnode *, struct vnode *, int, 83 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 84 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 85 ufs_lbn_t, int), int); 86 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 87 struct fs *, ufs_lbn_t, int); 88 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 89 struct fs *, ufs_lbn_t, int); 90 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 91 struct fs *, ufs_lbn_t, int); 92 static int ffs_copyonwrite(struct vnode *, struct buf *); 93 static int readblock(struct buf *, ufs2_daddr_t); 94 95 /* 96 * To ensure the consistency of snapshots across crashes, we must 97 * synchronously write out copied blocks before allowing the 98 * originals to be modified. Because of the rather severe speed 99 * penalty that this imposes, the following flag allows this 100 * crash persistence to be disabled. 101 */ 102 int dopersistence = 0; 103 104 #ifdef DEBUG 105 #include <sys/sysctl.h> 106 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); 107 int snapdebug = 0; 108 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); 109 int collectsnapstats = 0; 110 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 111 0, ""); 112 #endif /* DEBUG */ 113 114 /* 115 * Create a snapshot file and initialize it for the filesystem. 116 */ 117 int 118 ffs_snapshot(mp, snapfile) 119 struct mount *mp; 120 char *snapfile; 121 { 122 ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; 123 int error, cg, snaploc; 124 int i, size, len, loc; 125 int flag = mp->mnt_flag; 126 struct timespec starttime = {0, 0}, endtime; 127 char saved_nice = 0; 128 long redo = 0, snaplistsize = 0; 129 int32_t *lp; 130 void *space; 131 struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs; 132 struct snaphead *snaphead; 133 struct thread *td = curthread; 134 struct inode *ip, *xp; 135 struct buf *bp, *nbp, *ibp, *sbp = NULL; 136 struct nameidata nd; 137 struct mount *wrtmp; 138 struct vattr vat; 139 struct vnode *vp, *xvp, *nvp, *devvp; 140 struct uio auio; 141 struct iovec aiov; 142 143 /* 144 * Need to serialize access to snapshot code per filesystem. 145 */ 146 /* 147 * Assign a snapshot slot in the superblock. 148 */ 149 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 150 if (fs->fs_snapinum[snaploc] == 0) 151 break; 152 if (snaploc == FSMAXSNAP) 153 return (ENOSPC); 154 /* 155 * Create the snapshot file. 156 */ 157 restart: 158 NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td); 159 if ((error = namei(&nd)) != 0) 160 return (error); 161 if (nd.ni_vp != NULL) { 162 vput(nd.ni_vp); 163 error = EEXIST; 164 } 165 if (nd.ni_dvp->v_mount != mp) 166 error = EXDEV; 167 if (error) { 168 NDFREE(&nd, NDF_ONLY_PNBUF); 169 if (nd.ni_dvp == nd.ni_vp) 170 vrele(nd.ni_dvp); 171 else 172 vput(nd.ni_dvp); 173 return (error); 174 } 175 VATTR_NULL(&vat); 176 vat.va_type = VREG; 177 vat.va_mode = S_IRUSR; 178 vat.va_vaflags |= VA_EXCLUSIVE; 179 if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) 180 wrtmp = NULL; 181 if (wrtmp != mp) 182 panic("ffs_snapshot: mount mismatch"); 183 if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { 184 NDFREE(&nd, NDF_ONLY_PNBUF); 185 vput(nd.ni_dvp); 186 if ((error = vn_start_write(NULL, &wrtmp, 187 V_XSLEEP | PCATCH)) != 0) 188 return (error); 189 goto restart; 190 } 191 VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE); 192 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); 193 vput(nd.ni_dvp); 194 if (error) { 195 NDFREE(&nd, NDF_ONLY_PNBUF); 196 vn_finished_write(wrtmp); 197 return (error); 198 } 199 vp = nd.ni_vp; 200 ip = VTOI(vp); 201 devvp = ip->i_devvp; 202 /* 203 * Allocate and copy the last block contents so as to be able 204 * to set size to that of the filesystem. 205 */ 206 numblks = howmany(fs->fs_size, fs->fs_frag); 207 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 208 fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 209 if (error) 210 goto out; 211 ip->i_size = lblktosize(fs, (off_t)numblks); 212 DIP(ip, i_size) = ip->i_size; 213 ip->i_flag |= IN_CHANGE | IN_UPDATE; 214 if ((error = readblock(bp, numblks - 1)) != 0) 215 goto out; 216 bawrite(bp); 217 /* 218 * Preallocate critical data structures so that we can copy 219 * them in without further allocation after we suspend all 220 * operations on the filesystem. We would like to just release 221 * the allocated buffers without writing them since they will 222 * be filled in below once we are ready to go, but this upsets 223 * the soft update code, so we go ahead and write the new buffers. 224 * 225 * Allocate all indirect blocks and mark all of them as not 226 * needing to be copied. 227 */ 228 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 229 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 230 fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); 231 if (error) 232 goto out; 233 bawrite(ibp); 234 } 235 /* 236 * Allocate copies for the superblock and its summary information. 237 */ 238 error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 239 0, &nbp); 240 if (error) 241 goto out; 242 bawrite(nbp); 243 blkno = fragstoblks(fs, fs->fs_csaddr); 244 len = howmany(fs->fs_cssize, fs->fs_bsize); 245 for (loc = 0; loc < len; loc++) { 246 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 247 fs->fs_bsize, KERNCRED, 0, &nbp); 248 if (error) 249 goto out; 250 bawrite(nbp); 251 } 252 /* 253 * Allocate all cylinder group blocks. 254 */ 255 for (cg = 0; cg < fs->fs_ncg; cg++) { 256 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 257 fs->fs_bsize, KERNCRED, 0, &nbp); 258 if (error) 259 goto out; 260 bawrite(nbp); 261 } 262 /* 263 * Copy all the cylinder group maps. Although the 264 * filesystem is still active, we hope that only a few 265 * cylinder groups will change between now and when we 266 * suspend operations. Thus, we will be able to quickly 267 * touch up the few cylinder groups that changed during 268 * the suspension period. 269 */ 270 len = howmany(fs->fs_ncg, NBBY); 271 MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK); 272 bzero(fs->fs_active, len); 273 for (cg = 0; cg < fs->fs_ncg; cg++) { 274 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 275 fs->fs_bsize, KERNCRED, 0, &nbp); 276 if (error) 277 goto out; 278 error = cgaccount(cg, vp, nbp, 1); 279 bawrite(nbp); 280 if (error) 281 goto out; 282 } 283 /* 284 * Change inode to snapshot type file. 285 */ 286 ip->i_flags |= SF_SNAPSHOT; 287 DIP(ip, i_flags) = ip->i_flags; 288 ip->i_flag |= IN_CHANGE | IN_UPDATE; 289 /* 290 * Ensure that the snapshot is completely on disk. 291 * Since we have marked it as a snapshot it is safe to 292 * unlock it as no process will be allowed to write to it. 293 */ 294 if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0) 295 goto out; 296 VOP_UNLOCK(vp, 0, td); 297 /* 298 * All allocations are done, so we can now snapshot the system. 299 * 300 * Recind nice scheduling while running with the filesystem suspended. 301 */ 302 if (td->td_ksegrp->kg_nice > 0) { 303 saved_nice = td->td_ksegrp->kg_nice; 304 td->td_ksegrp->kg_nice = 0; 305 } 306 /* 307 * Suspend operation on filesystem. 308 */ 309 for (;;) { 310 vn_finished_write(wrtmp); 311 if ((error = vfs_write_suspend(vp->v_mount)) != 0) { 312 vn_start_write(NULL, &wrtmp, V_WAIT); 313 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 314 goto out; 315 } 316 if (mp->mnt_kern_flag & MNTK_SUSPENDED) 317 break; 318 vn_start_write(NULL, &wrtmp, V_WAIT); 319 } 320 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 321 if (collectsnapstats) 322 nanotime(&starttime); 323 /* 324 * First, copy all the cylinder group maps that have changed. 325 */ 326 for (cg = 0; cg < fs->fs_ncg; cg++) { 327 if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) 328 continue; 329 redo++; 330 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 331 fs->fs_bsize, KERNCRED, 0, &nbp); 332 if (error) 333 goto out1; 334 error = cgaccount(cg, vp, nbp, 2); 335 bawrite(nbp); 336 if (error) 337 goto out1; 338 } 339 /* 340 * Grab a copy of the superblock and its summary information. 341 * We delay writing it until the suspension is released below. 342 */ 343 error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, 344 KERNCRED, &sbp); 345 if (error) { 346 brelse(sbp); 347 sbp = NULL; 348 goto out1; 349 } 350 loc = blkoff(fs, fs->fs_sblockloc); 351 copy_fs = (struct fs *)(sbp->b_data + loc); 352 bcopy(fs, copy_fs, fs->fs_sbsize); 353 if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) 354 copy_fs->fs_clean = 1; 355 if (fs->fs_sbsize < SBLOCKSIZE) 356 bzero(&sbp->b_data[loc + fs->fs_sbsize], 357 SBLOCKSIZE - fs->fs_sbsize); 358 size = blkroundup(fs, fs->fs_cssize); 359 if (fs->fs_contigsumsize > 0) 360 size += fs->fs_ncg * sizeof(int32_t); 361 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 362 copy_fs->fs_csp = space; 363 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 364 (char *)space += fs->fs_cssize; 365 loc = howmany(fs->fs_cssize, fs->fs_fsize); 366 i = fs->fs_frag - loc % fs->fs_frag; 367 len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; 368 if (len > 0) { 369 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 370 len, KERNCRED, &bp)) != 0) { 371 brelse(bp); 372 free(copy_fs->fs_csp, M_UFSMNT); 373 bawrite(sbp); 374 sbp = NULL; 375 goto out1; 376 } 377 bcopy(bp->b_data, space, (u_int)len); 378 (char *)space += len; 379 bp->b_flags |= B_INVAL | B_NOCACHE; 380 brelse(bp); 381 } 382 if (fs->fs_contigsumsize > 0) { 383 copy_fs->fs_maxcluster = lp = space; 384 for (i = 0; i < fs->fs_ncg; i++) 385 *lp++ = fs->fs_contigsumsize; 386 } 387 /* 388 * We must check for active files that have been unlinked 389 * (e.g., with a zero link count). We have to expunge all 390 * trace of these files from the snapshot so that they are 391 * not reclaimed prematurely by fsck or unnecessarily dumped. 392 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 393 * spec_strategy about writing on a suspended filesystem. 394 * Note that we skip unlinked snapshot files as they will 395 * be handled separately below. 396 * 397 * We also calculate the needed size for the snapshot list. 398 */ 399 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 400 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 401 mp->mnt_kern_flag &= ~MNTK_SUSPENDED; 402 mtx_lock(&mntvnode_mtx); 403 loop: 404 for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) { 405 /* 406 * Make sure this vnode wasn't reclaimed in getnewvnode(). 407 * Start over if it has (it won't be on the list anymore). 408 */ 409 if (xvp->v_mount != mp) 410 goto loop; 411 nvp = TAILQ_NEXT(xvp, v_nmntvnodes); 412 mtx_unlock(&mntvnode_mtx); 413 mp_fixme("Unlocked GETATTR."); 414 if (vrefcnt(xvp) == 0 || xvp->v_type == VNON || 415 (VTOI(xvp)->i_flags & SF_SNAPSHOT) || 416 (VOP_GETATTR(xvp, &vat, td->td_proc->p_ucred, td) == 0 && 417 vat.va_nlink > 0)) { 418 mtx_lock(&mntvnode_mtx); 419 continue; 420 } 421 if (snapdebug) 422 vprint("ffs_snapshot: busy vnode", xvp); 423 if (vn_lock(xvp, LK_EXCLUSIVE, td) != 0) 424 goto loop; 425 xp = VTOI(xvp); 426 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 427 VOP_UNLOCK(xvp, 0, td); 428 continue; 429 } 430 /* 431 * If there is a fragment, clear it here. 432 */ 433 blkno = 0; 434 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 435 if (loc < NDADDR) { 436 len = fragroundup(fs, blkoff(fs, xp->i_size)); 437 if (len < fs->fs_bsize) { 438 ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]), 439 len, xp->i_number); 440 blkno = DIP(xp, i_db[loc]); 441 DIP(xp, i_db[loc]) = 0; 442 } 443 } 444 snaplistsize += 1; 445 if (xp->i_ump->um_fstype == UFS1) 446 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 447 BLK_NOCOPY); 448 else 449 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 450 BLK_NOCOPY); 451 if (blkno) 452 DIP(xp, i_db[loc]) = blkno; 453 if (!error) 454 error = ffs_freefile(copy_fs, vp, xp->i_number, 455 xp->i_mode); 456 VOP_UNLOCK(xvp, 0, td); 457 if (error) { 458 free(copy_fs->fs_csp, M_UFSMNT); 459 bawrite(sbp); 460 sbp = NULL; 461 goto out1; 462 } 463 mtx_lock(&mntvnode_mtx); 464 } 465 mtx_unlock(&mntvnode_mtx); 466 /* 467 * If there already exist snapshots on this filesystem, grab a 468 * reference to their shared lock. If this is the first snapshot 469 * on this filesystem, we need to allocate a lock for the snapshots 470 * to share. In either case, acquire the snapshot lock and give 471 * up our original private lock. 472 */ 473 VI_LOCK(devvp); 474 snaphead = &devvp->v_rdev->si_snapshots; 475 if ((xp = TAILQ_FIRST(snaphead)) != NULL) { 476 VI_LOCK(vp); 477 vp->v_vnlock = ITOV(xp)->v_vnlock; 478 VI_UNLOCK(devvp); 479 } else { 480 struct lock *lkp; 481 482 VI_UNLOCK(devvp); 483 MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT, 484 M_WAITOK); 485 lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT, 486 LK_CANRECURSE | LK_NOPAUSE); 487 VI_LOCK(vp); 488 vp->v_vnlock = lkp; 489 } 490 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td); 491 transferlockers(&vp->v_lock, vp->v_vnlock); 492 lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); 493 /* 494 * If this is the first snapshot on this filesystem, then we need 495 * to allocate the space for the list of preallocated snapshot blocks. 496 * This list will be refined below, but this preliminary one will 497 * keep us out of deadlock until the full one is ready. 498 */ 499 if (xp == NULL) { 500 MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), 501 M_UFSMNT, M_WAITOK); 502 blkp = &snapblklist[1]; 503 *blkp++ = lblkno(fs, fs->fs_sblockloc); 504 blkno = fragstoblks(fs, fs->fs_csaddr); 505 for (cg = 0; cg < fs->fs_ncg; cg++) { 506 if (fragstoblks(fs, cgtod(fs, cg) > blkno)) 507 break; 508 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 509 } 510 len = howmany(fs->fs_cssize, fs->fs_bsize); 511 for (loc = 0; loc < len; loc++) 512 *blkp++ = blkno + loc; 513 for (; cg < fs->fs_ncg; cg++) 514 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 515 snapblklist[0] = blkp - snapblklist; 516 VI_LOCK(devvp); 517 if (devvp->v_rdev->si_snapblklist != NULL) 518 panic("ffs_snapshot: non-empty list"); 519 devvp->v_rdev->si_snapblklist = snapblklist; 520 devvp->v_rdev->si_snaplistsize = blkp - snapblklist; 521 VI_UNLOCK(devvp); 522 } 523 /* 524 * Record snapshot inode. Since this is the newest snapshot, 525 * it must be placed at the end of the list. 526 */ 527 VI_LOCK(devvp); 528 fs->fs_snapinum[snaploc] = ip->i_number; 529 if (ip->i_nextsnap.tqe_prev != 0) 530 panic("ffs_snapshot: %d already on list", ip->i_number); 531 TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap); 532 devvp->v_rdev->si_copyonwrite = ffs_copyonwrite; 533 devvp->v_vflag |= VV_COPYONWRITE; 534 VI_UNLOCK(devvp); 535 ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); 536 vp->v_vflag |= VV_SYSTEM; 537 out1: 538 /* 539 * Resume operation on filesystem. 540 */ 541 vfs_write_resume(vp->v_mount); 542 vn_start_write(NULL, &wrtmp, V_WAIT); 543 if (collectsnapstats && starttime.tv_sec > 0) { 544 nanotime(&endtime); 545 timespecsub(&endtime, &starttime); 546 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 547 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 548 endtime.tv_nsec / 1000000, redo, fs->fs_ncg); 549 } 550 if (sbp == NULL) 551 goto out; 552 /* 553 * Copy allocation information from all the snapshots in 554 * this snapshot and then expunge them from its view. 555 */ 556 snaphead = &devvp->v_rdev->si_snapshots; 557 TAILQ_FOREACH(xp, snaphead, i_nextsnap) { 558 if (xp == ip) 559 break; 560 if (xp->i_ump->um_fstype == UFS1) 561 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 562 BLK_SNAP); 563 else 564 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 565 BLK_SNAP); 566 if (error) { 567 fs->fs_snapinum[snaploc] = 0; 568 goto done; 569 } 570 } 571 /* 572 * Allocate space for the full list of preallocated snapshot blocks. 573 */ 574 MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), 575 M_UFSMNT, M_WAITOK); 576 ip->i_snapblklist = &snapblklist[1]; 577 /* 578 * Expunge the blocks used by the snapshots from the set of 579 * blocks marked as used in the snapshot bitmaps. Also, collect 580 * the list of allocated blocks in i_snapblklist. 581 */ 582 if (ip->i_ump->um_fstype == UFS1) 583 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 584 else 585 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 586 if (error) { 587 fs->fs_snapinum[snaploc] = 0; 588 FREE(snapblklist, M_UFSMNT); 589 goto done; 590 } 591 if (snaplistsize < ip->i_snapblklist - snapblklist) 592 panic("ffs_snapshot: list too small"); 593 snaplistsize = ip->i_snapblklist - snapblklist; 594 snapblklist[0] = snaplistsize; 595 ip->i_snapblklist = 0; 596 /* 597 * Write out the list of allocated blocks to the end of the snapshot. 598 */ 599 auio.uio_iov = &aiov; 600 auio.uio_iovcnt = 1; 601 aiov.iov_base = (void *)snapblklist; 602 aiov.iov_len = snaplistsize * sizeof(daddr_t); 603 auio.uio_resid = aiov.iov_len;; 604 auio.uio_offset = ip->i_size; 605 auio.uio_segflg = UIO_SYSSPACE; 606 auio.uio_rw = UIO_WRITE; 607 auio.uio_td = td; 608 if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 609 fs->fs_snapinum[snaploc] = 0; 610 FREE(snapblklist, M_UFSMNT); 611 goto done; 612 } 613 /* 614 * Write the superblock and its summary information 615 * to the snapshot. 616 */ 617 blkno = fragstoblks(fs, fs->fs_csaddr); 618 len = howmany(fs->fs_cssize, fs->fs_bsize); 619 space = copy_fs->fs_csp; 620 for (loc = 0; loc < len; loc++) { 621 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); 622 if (error) { 623 brelse(nbp); 624 fs->fs_snapinum[snaploc] = 0; 625 FREE(snapblklist, M_UFSMNT); 626 goto done; 627 } 628 bcopy(space, nbp->b_data, fs->fs_bsize); 629 space = (char *)space + fs->fs_bsize; 630 bawrite(nbp); 631 } 632 /* 633 * As this is the newest list, it is the most inclusive, so 634 * should replace the previous list. 635 */ 636 VI_LOCK(devvp); 637 space = devvp->v_rdev->si_snapblklist; 638 devvp->v_rdev->si_snapblklist = snapblklist; 639 devvp->v_rdev->si_snaplistsize = snaplistsize; 640 if (space != NULL) 641 FREE(space, M_UFSMNT); 642 VI_UNLOCK(devvp); 643 done: 644 free(copy_fs->fs_csp, M_UFSMNT); 645 bawrite(sbp); 646 out: 647 if (saved_nice > 0) 648 td->td_ksegrp->kg_nice = saved_nice; 649 if (fs->fs_active != 0) { 650 FREE(fs->fs_active, M_DEVBUF); 651 fs->fs_active = 0; 652 } 653 mp->mnt_flag = flag; 654 if (error) 655 (void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td); 656 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 657 if (error) 658 vput(vp); 659 else 660 VOP_UNLOCK(vp, 0, td); 661 vn_finished_write(wrtmp); 662 return (error); 663 } 664 665 /* 666 * Copy a cylinder group map. All the unallocated blocks are marked 667 * BLK_NOCOPY so that the snapshot knows that it need not copy them 668 * if they are later written. If passno is one, then this is a first 669 * pass, so only setting needs to be done. If passno is 2, then this 670 * is a revision to a previous pass which must be undone as the 671 * replacement pass is done. 672 */ 673 static int 674 cgaccount(cg, vp, nbp, passno) 675 int cg; 676 struct vnode *vp; 677 struct buf *nbp; 678 int passno; 679 { 680 struct buf *bp, *ibp; 681 struct inode *ip; 682 struct cg *cgp; 683 struct fs *fs; 684 ufs2_daddr_t base, numblks; 685 int error, len, loc, indiroff; 686 687 ip = VTOI(vp); 688 fs = ip->i_fs; 689 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 690 (int)fs->fs_cgsize, KERNCRED, &bp); 691 if (error) { 692 brelse(bp); 693 return (error); 694 } 695 cgp = (struct cg *)bp->b_data; 696 if (!cg_chkmagic(cgp)) { 697 brelse(bp); 698 return (EIO); 699 } 700 atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); 701 bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); 702 if (fs->fs_cgsize < fs->fs_bsize) 703 bzero(&nbp->b_data[fs->fs_cgsize], 704 fs->fs_bsize - fs->fs_cgsize); 705 if (passno == 2) 706 nbp->b_flags |= B_VALIDSUSPWRT; 707 numblks = howmany(fs->fs_size, fs->fs_frag); 708 len = howmany(fs->fs_fpg, fs->fs_frag); 709 base = cg * fs->fs_fpg / fs->fs_frag; 710 if (base + len >= numblks) 711 len = numblks - base - 1; 712 loc = 0; 713 if (base < NDADDR) { 714 for ( ; loc < NDADDR; loc++) { 715 if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 716 DIP(ip, i_db[loc]) = BLK_NOCOPY; 717 else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) 718 DIP(ip, i_db[loc]) = 0; 719 else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) 720 panic("ffs_snapshot: lost direct block"); 721 } 722 } 723 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), 724 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 725 if (error) { 726 brelse(bp); 727 return (error); 728 } 729 indiroff = (base + loc - NDADDR) % NINDIR(fs); 730 for ( ; loc < len; loc++, indiroff++) { 731 if (indiroff >= NINDIR(fs)) { 732 if (passno == 2) 733 ibp->b_flags |= B_VALIDSUSPWRT; 734 bawrite(ibp); 735 error = UFS_BALLOC(vp, 736 lblktosize(fs, (off_t)(base + loc)), 737 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 738 if (error) { 739 brelse(bp); 740 return (error); 741 } 742 indiroff = 0; 743 } 744 if (ip->i_ump->um_fstype == UFS1) { 745 if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 746 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 747 BLK_NOCOPY; 748 else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) 749 [indiroff] == BLK_NOCOPY) 750 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; 751 else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) 752 [indiroff] == BLK_NOCOPY) 753 panic("ffs_snapshot: lost indirect block"); 754 continue; 755 } 756 if (ffs_isblock(fs, cg_blksfree(cgp), loc)) 757 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; 758 else if (passno == 2 && 759 ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) 760 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; 761 else if (passno == 1 && 762 ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) 763 panic("ffs_snapshot: lost indirect block"); 764 } 765 bqrelse(bp); 766 if (passno == 2) 767 ibp->b_flags |= B_VALIDSUSPWRT; 768 bdwrite(ibp); 769 return (0); 770 } 771 772 /* 773 * Before expunging a snapshot inode, note all the 774 * blocks that it claims with BLK_SNAP so that fsck will 775 * be able to account for those blocks properly and so 776 * that this snapshot knows that it need not copy them 777 * if the other snapshot holding them is freed. This code 778 * is reproduced once each for UFS1 and UFS2. 779 */ 780 static int 781 expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype) 782 struct vnode *snapvp; 783 struct inode *cancelip; 784 struct fs *fs; 785 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 786 struct fs *, ufs_lbn_t, int); 787 int expungetype; 788 { 789 int i, error, indiroff; 790 ufs_lbn_t lbn, rlbn; 791 ufs2_daddr_t len, blkno, numblks, blksperindir; 792 struct ufs1_dinode *dip; 793 struct thread *td = curthread; 794 struct buf *bp; 795 796 /* 797 * Prepare to expunge the inode. If its inode block has not 798 * yet been copied, then allocate and fill the copy. 799 */ 800 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 801 blkno = 0; 802 if (lbn < NDADDR) { 803 blkno = VTOI(snapvp)->i_din1->di_db[lbn]; 804 } else { 805 td->td_proc->p_flag |= P_COWINPROGRESS; 806 error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 807 fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 808 td->td_proc->p_flag &= ~P_COWINPROGRESS; 809 if (error) 810 return (error); 811 indiroff = (lbn - NDADDR) % NINDIR(fs); 812 blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; 813 bqrelse(bp); 814 } 815 if (blkno != 0) { 816 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) 817 return (error); 818 } else { 819 error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 820 fs->fs_bsize, KERNCRED, 0, &bp); 821 if (error) 822 return (error); 823 if ((error = readblock(bp, lbn)) != 0) 824 return (error); 825 } 826 /* 827 * Set a snapshot inode to be a zero length file, regular files 828 * to be completely unallocated. 829 */ 830 dip = (struct ufs1_dinode *)bp->b_data + 831 ino_to_fsbo(fs, cancelip->i_number); 832 if (expungetype == BLK_NOCOPY) 833 dip->di_mode = 0; 834 dip->di_size = 0; 835 dip->di_blocks = 0; 836 dip->di_flags &= ~SF_SNAPSHOT; 837 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); 838 bdwrite(bp); 839 /* 840 * Now go through and expunge all the blocks in the file 841 * using the function requested. 842 */ 843 numblks = howmany(cancelip->i_size, fs->fs_bsize); 844 if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], 845 &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype))) 846 return (error); 847 if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], 848 &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype))) 849 return (error); 850 blksperindir = 1; 851 lbn = -NDADDR; 852 len = numblks - NDADDR; 853 rlbn = NDADDR; 854 for (i = 0; len > 0 && i < NIADDR; i++) { 855 error = indiracct_ufs1(snapvp, ITOV(cancelip), i, 856 cancelip->i_din1->di_ib[i], lbn, rlbn, len, 857 blksperindir, fs, acctfunc, expungetype); 858 if (error) 859 return (error); 860 blksperindir *= NINDIR(fs); 861 lbn -= blksperindir + 1; 862 len -= blksperindir; 863 rlbn += blksperindir; 864 } 865 return (0); 866 } 867 868 /* 869 * Descend an indirect block chain for vnode cancelvp accounting for all 870 * its indirect blocks in snapvp. 871 */ 872 static int 873 indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 874 blksperindir, fs, acctfunc, expungetype) 875 struct vnode *snapvp; 876 struct vnode *cancelvp; 877 int level; 878 ufs1_daddr_t blkno; 879 ufs_lbn_t lbn; 880 ufs_lbn_t rlbn; 881 ufs_lbn_t remblks; 882 ufs_lbn_t blksperindir; 883 struct fs *fs; 884 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 885 struct fs *, ufs_lbn_t, int); 886 int expungetype; 887 { 888 int error, num, i; 889 ufs_lbn_t subblksperindir; 890 struct indir indirs[NIADDR + 2]; 891 ufs1_daddr_t last, *bap; 892 struct buf *bp; 893 894 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 895 return (error); 896 if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2) 897 panic("indiracct: botched params"); 898 /* 899 * We have to expand bread here since it will deadlock looking 900 * up the block number for any blocks that are not in the cache. 901 */ 902 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 903 bp->b_blkno = fsbtodb(fs, blkno); 904 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 905 (error = readblock(bp, fragstoblks(fs, blkno)))) { 906 brelse(bp); 907 return (error); 908 } 909 /* 910 * Account for the block pointers in this indirect block. 911 */ 912 last = howmany(remblks, blksperindir); 913 if (last > NINDIR(fs)) 914 last = NINDIR(fs); 915 MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 916 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 917 bqrelse(bp); 918 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 919 level == 0 ? rlbn : -1, expungetype); 920 if (error || level == 0) 921 goto out; 922 /* 923 * Account for the block pointers in each of the indirect blocks 924 * in the levels below us. 925 */ 926 subblksperindir = blksperindir / NINDIR(fs); 927 for (lbn++, level--, i = 0; i < last; i++) { 928 error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, 929 rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); 930 if (error) 931 goto out; 932 rlbn += blksperindir; 933 lbn -= blksperindir; 934 remblks -= blksperindir; 935 } 936 out: 937 FREE(bap, M_DEVBUF); 938 return (error); 939 } 940 941 /* 942 * Do both snap accounting and map accounting. 943 */ 944 static int 945 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) 946 struct vnode *vp; 947 ufs1_daddr_t *oldblkp, *lastblkp; 948 struct fs *fs; 949 ufs_lbn_t lblkno; 950 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 951 { 952 int error; 953 954 if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 955 return (error); 956 return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 957 } 958 959 /* 960 * Identify a set of blocks allocated in a snapshot inode. 961 */ 962 static int 963 snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 964 struct vnode *vp; 965 ufs1_daddr_t *oldblkp, *lastblkp; 966 struct fs *fs; 967 ufs_lbn_t lblkno; 968 int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 969 { 970 struct inode *ip = VTOI(vp); 971 ufs1_daddr_t blkno, *blkp; 972 ufs_lbn_t lbn; 973 struct buf *ibp; 974 int error; 975 976 for ( ; oldblkp < lastblkp; oldblkp++) { 977 blkno = *oldblkp; 978 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 979 continue; 980 lbn = fragstoblks(fs, blkno); 981 if (lbn < NDADDR) { 982 blkp = &ip->i_din1->di_db[lbn]; 983 ip->i_flag |= IN_CHANGE | IN_UPDATE; 984 } else { 985 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 986 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 987 if (error) 988 return (error); 989 blkp = &((ufs1_daddr_t *)(ibp->b_data)) 990 [(lbn - NDADDR) % NINDIR(fs)]; 991 } 992 /* 993 * If we are expunging a snapshot vnode and we 994 * find a block marked BLK_NOCOPY, then it is 995 * one that has been allocated to this snapshot after 996 * we took our current snapshot and can be ignored. 997 */ 998 if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { 999 if (lbn >= NDADDR) 1000 brelse(ibp); 1001 } else { 1002 if (*blkp != 0) 1003 panic("snapacct: bad block"); 1004 *blkp = expungetype; 1005 if (lbn >= NDADDR) 1006 bdwrite(ibp); 1007 } 1008 } 1009 return (0); 1010 } 1011 1012 /* 1013 * Account for a set of blocks allocated in a snapshot inode. 1014 */ 1015 static int 1016 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1017 struct vnode *vp; 1018 ufs1_daddr_t *oldblkp, *lastblkp; 1019 struct fs *fs; 1020 ufs_lbn_t lblkno; 1021 int expungetype; 1022 { 1023 ufs1_daddr_t blkno; 1024 struct inode *ip; 1025 ino_t inum; 1026 int acctit; 1027 1028 ip = VTOI(vp); 1029 inum = ip->i_number; 1030 if (lblkno == -1) 1031 acctit = 0; 1032 else 1033 acctit = 1; 1034 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1035 blkno = *oldblkp; 1036 if (blkno == 0 || blkno == BLK_NOCOPY) 1037 continue; 1038 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1039 *ip->i_snapblklist++ = lblkno; 1040 if (blkno == BLK_SNAP) 1041 blkno = blkstofrags(fs, lblkno); 1042 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); 1043 } 1044 return (0); 1045 } 1046 1047 /* 1048 * Before expunging a snapshot inode, note all the 1049 * blocks that it claims with BLK_SNAP so that fsck will 1050 * be able to account for those blocks properly and so 1051 * that this snapshot knows that it need not copy them 1052 * if the other snapshot holding them is freed. This code 1053 * is reproduced once each for UFS1 and UFS2. 1054 */ 1055 static int 1056 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype) 1057 struct vnode *snapvp; 1058 struct inode *cancelip; 1059 struct fs *fs; 1060 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1061 struct fs *, ufs_lbn_t, int); 1062 int expungetype; 1063 { 1064 int i, error, indiroff; 1065 ufs_lbn_t lbn, rlbn; 1066 ufs2_daddr_t len, blkno, numblks, blksperindir; 1067 struct ufs2_dinode *dip; 1068 struct thread *td = curthread; 1069 struct buf *bp; 1070 1071 /* 1072 * Prepare to expunge the inode. If its inode block has not 1073 * yet been copied, then allocate and fill the copy. 1074 */ 1075 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1076 blkno = 0; 1077 if (lbn < NDADDR) { 1078 blkno = VTOI(snapvp)->i_din2->di_db[lbn]; 1079 } else { 1080 td->td_proc->p_flag |= P_COWINPROGRESS; 1081 error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 1082 fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 1083 td->td_proc->p_flag &= ~P_COWINPROGRESS; 1084 if (error) 1085 return (error); 1086 indiroff = (lbn - NDADDR) % NINDIR(fs); 1087 blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; 1088 bqrelse(bp); 1089 } 1090 if (blkno != 0) { 1091 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) 1092 return (error); 1093 } else { 1094 error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 1095 fs->fs_bsize, KERNCRED, 0, &bp); 1096 if (error) 1097 return (error); 1098 if ((error = readblock(bp, lbn)) != 0) 1099 return (error); 1100 } 1101 /* 1102 * Set a snapshot inode to be a zero length file, regular files 1103 * to be completely unallocated. 1104 */ 1105 dip = (struct ufs2_dinode *)bp->b_data + 1106 ino_to_fsbo(fs, cancelip->i_number); 1107 if (expungetype == BLK_NOCOPY) 1108 dip->di_mode = 0; 1109 dip->di_size = 0; 1110 dip->di_blocks = 0; 1111 dip->di_flags &= ~SF_SNAPSHOT; 1112 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); 1113 bdwrite(bp); 1114 /* 1115 * Now go through and expunge all the blocks in the file 1116 * using the function requested. 1117 */ 1118 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1119 if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], 1120 &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype))) 1121 return (error); 1122 if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], 1123 &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype))) 1124 return (error); 1125 blksperindir = 1; 1126 lbn = -NDADDR; 1127 len = numblks - NDADDR; 1128 rlbn = NDADDR; 1129 for (i = 0; len > 0 && i < NIADDR; i++) { 1130 error = indiracct_ufs2(snapvp, ITOV(cancelip), i, 1131 cancelip->i_din2->di_ib[i], lbn, rlbn, len, 1132 blksperindir, fs, acctfunc, expungetype); 1133 if (error) 1134 return (error); 1135 blksperindir *= NINDIR(fs); 1136 lbn -= blksperindir + 1; 1137 len -= blksperindir; 1138 rlbn += blksperindir; 1139 } 1140 return (0); 1141 } 1142 1143 /* 1144 * Descend an indirect block chain for vnode cancelvp accounting for all 1145 * its indirect blocks in snapvp. 1146 */ 1147 static int 1148 indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 1149 blksperindir, fs, acctfunc, expungetype) 1150 struct vnode *snapvp; 1151 struct vnode *cancelvp; 1152 int level; 1153 ufs2_daddr_t blkno; 1154 ufs_lbn_t lbn; 1155 ufs_lbn_t rlbn; 1156 ufs_lbn_t remblks; 1157 ufs_lbn_t blksperindir; 1158 struct fs *fs; 1159 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1160 struct fs *, ufs_lbn_t, int); 1161 int expungetype; 1162 { 1163 int error, num, i; 1164 ufs_lbn_t subblksperindir; 1165 struct indir indirs[NIADDR + 2]; 1166 ufs2_daddr_t last, *bap; 1167 struct buf *bp; 1168 1169 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1170 return (error); 1171 if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2) 1172 panic("indiracct: botched params"); 1173 /* 1174 * We have to expand bread here since it will deadlock looking 1175 * up the block number for any blocks that are not in the cache. 1176 */ 1177 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 1178 bp->b_blkno = fsbtodb(fs, blkno); 1179 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 1180 (error = readblock(bp, fragstoblks(fs, blkno)))) { 1181 brelse(bp); 1182 return (error); 1183 } 1184 /* 1185 * Account for the block pointers in this indirect block. 1186 */ 1187 last = howmany(remblks, blksperindir); 1188 if (last > NINDIR(fs)) 1189 last = NINDIR(fs); 1190 MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 1191 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 1192 bqrelse(bp); 1193 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1194 level == 0 ? rlbn : -1, expungetype); 1195 if (error || level == 0) 1196 goto out; 1197 /* 1198 * Account for the block pointers in each of the indirect blocks 1199 * in the levels below us. 1200 */ 1201 subblksperindir = blksperindir / NINDIR(fs); 1202 for (lbn++, level--, i = 0; i < last; i++) { 1203 error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, 1204 rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); 1205 if (error) 1206 goto out; 1207 rlbn += blksperindir; 1208 lbn -= blksperindir; 1209 remblks -= blksperindir; 1210 } 1211 out: 1212 FREE(bap, M_DEVBUF); 1213 return (error); 1214 } 1215 1216 /* 1217 * Do both snap accounting and map accounting. 1218 */ 1219 static int 1220 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) 1221 struct vnode *vp; 1222 ufs2_daddr_t *oldblkp, *lastblkp; 1223 struct fs *fs; 1224 ufs_lbn_t lblkno; 1225 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 1226 { 1227 int error; 1228 1229 if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 1230 return (error); 1231 return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 1232 } 1233 1234 /* 1235 * Identify a set of blocks allocated in a snapshot inode. 1236 */ 1237 static int 1238 snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1239 struct vnode *vp; 1240 ufs2_daddr_t *oldblkp, *lastblkp; 1241 struct fs *fs; 1242 ufs_lbn_t lblkno; 1243 int expungetype; /* BLK_SNAP or BLK_NOCOPY */ 1244 { 1245 struct inode *ip = VTOI(vp); 1246 ufs2_daddr_t blkno, *blkp; 1247 ufs_lbn_t lbn; 1248 struct buf *ibp; 1249 int error; 1250 1251 for ( ; oldblkp < lastblkp; oldblkp++) { 1252 blkno = *oldblkp; 1253 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 1254 continue; 1255 lbn = fragstoblks(fs, blkno); 1256 if (lbn < NDADDR) { 1257 blkp = &ip->i_din2->di_db[lbn]; 1258 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1259 } else { 1260 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1261 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 1262 if (error) 1263 return (error); 1264 blkp = &((ufs2_daddr_t *)(ibp->b_data)) 1265 [(lbn - NDADDR) % NINDIR(fs)]; 1266 } 1267 /* 1268 * If we are expunging a snapshot vnode and we 1269 * find a block marked BLK_NOCOPY, then it is 1270 * one that has been allocated to this snapshot after 1271 * we took our current snapshot and can be ignored. 1272 */ 1273 if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { 1274 if (lbn >= NDADDR) 1275 brelse(ibp); 1276 } else { 1277 if (*blkp != 0) 1278 panic("snapacct: bad block"); 1279 *blkp = expungetype; 1280 if (lbn >= NDADDR) 1281 bdwrite(ibp); 1282 } 1283 } 1284 return (0); 1285 } 1286 1287 /* 1288 * Account for a set of blocks allocated in a snapshot inode. 1289 */ 1290 static int 1291 mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1292 struct vnode *vp; 1293 ufs2_daddr_t *oldblkp, *lastblkp; 1294 struct fs *fs; 1295 ufs_lbn_t lblkno; 1296 int expungetype; 1297 { 1298 ufs2_daddr_t blkno; 1299 struct inode *ip; 1300 ino_t inum; 1301 int acctit; 1302 1303 ip = VTOI(vp); 1304 inum = ip->i_number; 1305 if (lblkno == -1) 1306 acctit = 0; 1307 else 1308 acctit = 1; 1309 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1310 blkno = *oldblkp; 1311 if (blkno == 0 || blkno == BLK_NOCOPY) 1312 continue; 1313 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1314 *ip->i_snapblklist++ = lblkno; 1315 if (blkno == BLK_SNAP) 1316 blkno = blkstofrags(fs, lblkno); 1317 ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum); 1318 } 1319 return (0); 1320 } 1321 1322 /* 1323 * Decrement extra reference on snapshot when last name is removed. 1324 * It will not be freed until the last open reference goes away. 1325 */ 1326 void 1327 ffs_snapgone(ip) 1328 struct inode *ip; 1329 { 1330 struct inode *xp; 1331 struct fs *fs; 1332 int snaploc; 1333 1334 /* 1335 * Find snapshot in incore list. 1336 */ 1337 TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap) 1338 if (xp == ip) 1339 break; 1340 if (xp != NULL) 1341 vrele(ITOV(ip)); 1342 else if (snapdebug) 1343 printf("ffs_snapgone: lost snapshot vnode %d\n", 1344 ip->i_number); 1345 /* 1346 * Delete snapshot inode from superblock. Keep list dense. 1347 */ 1348 fs = ip->i_fs; 1349 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 1350 if (fs->fs_snapinum[snaploc] == ip->i_number) 1351 break; 1352 if (snaploc < FSMAXSNAP) { 1353 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { 1354 if (fs->fs_snapinum[snaploc] == 0) 1355 break; 1356 fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; 1357 } 1358 fs->fs_snapinum[snaploc - 1] = 0; 1359 } 1360 } 1361 1362 /* 1363 * Prepare a snapshot file for being removed. 1364 */ 1365 void 1366 ffs_snapremove(vp) 1367 struct vnode *vp; 1368 { 1369 struct inode *ip; 1370 struct vnode *devvp; 1371 struct lock *lkp; 1372 struct buf *ibp; 1373 struct fs *fs; 1374 struct thread *td = curthread; 1375 ufs2_daddr_t numblks, blkno, dblk, *snapblklist; 1376 int error, loc, last; 1377 1378 ip = VTOI(vp); 1379 fs = ip->i_fs; 1380 devvp = ip->i_devvp; 1381 /* 1382 * If active, delete from incore list (this snapshot may 1383 * already have been in the process of being deleted, so 1384 * would not have been active). 1385 * 1386 * Clear copy-on-write flag if last snapshot. 1387 */ 1388 if (ip->i_nextsnap.tqe_prev != 0) { 1389 VI_LOCK(devvp); 1390 lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE, 1391 VI_MTX(devvp), td); 1392 VI_LOCK(devvp); 1393 TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap); 1394 ip->i_nextsnap.tqe_prev = 0; 1395 lkp = vp->v_vnlock; 1396 vp->v_vnlock = &vp->v_lock; 1397 lockmgr(lkp, LK_RELEASE, NULL, td); 1398 if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) != 0) { 1399 VI_UNLOCK(devvp); 1400 } else { 1401 snapblklist = devvp->v_rdev->si_snapblklist; 1402 devvp->v_rdev->si_snapblklist = 0; 1403 devvp->v_rdev->si_snaplistsize = 0; 1404 devvp->v_rdev->si_copyonwrite = 0; 1405 devvp->v_vflag &= ~VV_COPYONWRITE; 1406 lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td); 1407 lockmgr(lkp, LK_RELEASE, NULL, td); 1408 lockdestroy(lkp); 1409 FREE(lkp, M_UFSMNT); 1410 FREE(snapblklist, M_UFSMNT); 1411 } 1412 } 1413 /* 1414 * Clear all BLK_NOCOPY fields. Pass any block claims to other 1415 * snapshots that want them (see ffs_snapblkfree below). 1416 */ 1417 for (blkno = 1; blkno < NDADDR; blkno++) { 1418 dblk = DIP(ip, i_db[blkno]); 1419 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1420 DIP(ip, i_db[blkno]) = 0; 1421 else if ((dblk == blkstofrags(fs, blkno) && 1422 ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize, 1423 ip->i_number))) { 1424 DIP(ip, i_blocks) -= btodb(fs->fs_bsize); 1425 DIP(ip, i_db[blkno]) = 0; 1426 } 1427 } 1428 numblks = howmany(ip->i_size, fs->fs_bsize); 1429 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 1430 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 1431 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 1432 if (error) 1433 continue; 1434 if (fs->fs_size - blkno > NINDIR(fs)) 1435 last = NINDIR(fs); 1436 else 1437 last = fs->fs_size - blkno; 1438 for (loc = 0; loc < last; loc++) { 1439 if (ip->i_ump->um_fstype == UFS1) { 1440 dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc]; 1441 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1442 ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; 1443 else if ((dblk == blkstofrags(fs, blkno) && 1444 ffs_snapblkfree(fs, ip->i_devvp, dblk, 1445 fs->fs_bsize, ip->i_number))) { 1446 ip->i_din1->di_blocks -= 1447 btodb(fs->fs_bsize); 1448 ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; 1449 } 1450 continue; 1451 } 1452 dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc]; 1453 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 1454 ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; 1455 else if ((dblk == blkstofrags(fs, blkno) && 1456 ffs_snapblkfree(fs, ip->i_devvp, dblk, 1457 fs->fs_bsize, ip->i_number))) { 1458 ip->i_din2->di_blocks -= btodb(fs->fs_bsize); 1459 ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; 1460 } 1461 } 1462 bawrite(ibp); 1463 } 1464 /* 1465 * Clear snapshot flag and drop reference. 1466 */ 1467 ip->i_flags &= ~SF_SNAPSHOT; 1468 DIP(ip, i_flags) = ip->i_flags; 1469 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1470 } 1471 1472 /* 1473 * Notification that a block is being freed. Return zero if the free 1474 * should be allowed to proceed. Return non-zero if the snapshot file 1475 * wants to claim the block. The block will be claimed if it is an 1476 * uncopied part of one of the snapshots. It will be freed if it is 1477 * either a BLK_NOCOPY or has already been copied in all of the snapshots. 1478 * If a fragment is being freed, then all snapshots that care about 1479 * it must make a copy since a snapshot file can only claim full sized 1480 * blocks. Note that if more than one snapshot file maps the block, 1481 * we can pick one at random to claim it. Since none of the snapshots 1482 * can change, we are assurred that they will all see the same unmodified 1483 * image. When deleting a snapshot file (see ffs_snapremove above), we 1484 * must push any of these claimed blocks to one of the other snapshots 1485 * that maps it. These claimed blocks are easily identified as they will 1486 * have a block number equal to their logical block number within the 1487 * snapshot. A copied block can never have this property because they 1488 * must always have been allocated from a BLK_NOCOPY location. 1489 */ 1490 int 1491 ffs_snapblkfree(fs, devvp, bno, size, inum) 1492 struct fs *fs; 1493 struct vnode *devvp; 1494 ufs2_daddr_t bno; 1495 long size; 1496 ino_t inum; 1497 { 1498 struct buf *ibp, *cbp, *savedcbp = 0; 1499 struct thread *td = curthread; 1500 struct inode *ip; 1501 struct vnode *vp = NULL; 1502 ufs_lbn_t lbn; 1503 ufs2_daddr_t blkno; 1504 int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0; 1505 struct snaphead *snaphead; 1506 1507 lbn = fragstoblks(fs, bno); 1508 retry: 1509 VI_LOCK(devvp); 1510 snaphead = &devvp->v_rdev->si_snapshots; 1511 TAILQ_FOREACH(ip, snaphead, i_nextsnap) { 1512 vp = ITOV(ip); 1513 /* 1514 * Lookup block being written. 1515 */ 1516 if (lbn < NDADDR) { 1517 blkno = DIP(ip, i_db[lbn]); 1518 } else { 1519 if (snapshot_locked == 0 && 1520 lockmgr(vp->v_vnlock, 1521 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1522 VI_MTX(devvp), td) != 0) 1523 goto retry; 1524 snapshot_locked = 1; 1525 td->td_proc->p_flag |= P_COWINPROGRESS; 1526 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1527 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 1528 td->td_proc->p_flag &= ~P_COWINPROGRESS; 1529 if (error) 1530 break; 1531 indiroff = (lbn - NDADDR) % NINDIR(fs); 1532 if (ip->i_ump->um_fstype == UFS1) 1533 blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; 1534 else 1535 blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; 1536 } 1537 /* 1538 * Check to see if block needs to be copied. 1539 */ 1540 if (blkno == 0) { 1541 /* 1542 * A block that we map is being freed. If it has not 1543 * been claimed yet, we will claim or copy it (below). 1544 */ 1545 claimedblk = 1; 1546 } else if (blkno == BLK_SNAP) { 1547 /* 1548 * No previous snapshot claimed the block, 1549 * so it will be freed and become a BLK_NOCOPY 1550 * (don't care) for us. 1551 */ 1552 if (claimedblk) 1553 panic("snapblkfree: inconsistent block type"); 1554 if (snapshot_locked == 0 && 1555 lockmgr(vp->v_vnlock, 1556 LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, 1557 VI_MTX(devvp), td) != 0) { 1558 if (lbn >= NDADDR) 1559 bqrelse(ibp); 1560 vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td); 1561 goto retry; 1562 } 1563 snapshot_locked = 1; 1564 if (lbn < NDADDR) { 1565 DIP(ip, i_db[lbn]) = BLK_NOCOPY; 1566 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1567 } else if (ip->i_ump->um_fstype == UFS1) { 1568 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 1569 BLK_NOCOPY; 1570 bdwrite(ibp); 1571 } else { 1572 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 1573 BLK_NOCOPY; 1574 bdwrite(ibp); 1575 } 1576 continue; 1577 } else /* BLK_NOCOPY or default */ { 1578 /* 1579 * If the snapshot has already copied the block 1580 * (default), or does not care about the block, 1581 * it is not needed. 1582 */ 1583 if (lbn >= NDADDR) 1584 bqrelse(ibp); 1585 continue; 1586 } 1587 /* 1588 * If this is a full size block, we will just grab it 1589 * and assign it to the snapshot inode. Otherwise we 1590 * will proceed to copy it. See explanation for this 1591 * routine as to why only a single snapshot needs to 1592 * claim this block. 1593 */ 1594 if (snapshot_locked == 0 && 1595 lockmgr(vp->v_vnlock, 1596 LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT, 1597 VI_MTX(devvp), td) != 0) { 1598 if (lbn >= NDADDR) 1599 bqrelse(ibp); 1600 vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td); 1601 goto retry; 1602 } 1603 snapshot_locked = 1; 1604 if (size == fs->fs_bsize) { 1605 #ifdef DEBUG 1606 if (snapdebug) 1607 printf("%s %d lbn %jd from inum %d\n", 1608 "Grabonremove: snapino", ip->i_number, 1609 (intmax_t)lbn, inum); 1610 #endif 1611 if (lbn < NDADDR) { 1612 DIP(ip, i_db[lbn]) = bno; 1613 } else if (ip->i_ump->um_fstype == UFS1) { 1614 ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; 1615 bdwrite(ibp); 1616 } else { 1617 ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno; 1618 bdwrite(ibp); 1619 } 1620 DIP(ip, i_blocks) += btodb(size); 1621 ip->i_flag |= IN_CHANGE | IN_UPDATE; 1622 VOP_UNLOCK(vp, 0, td); 1623 return (1); 1624 } 1625 if (lbn >= NDADDR) 1626 bqrelse(ibp); 1627 /* 1628 * Allocate the block into which to do the copy. Note that this 1629 * allocation will never require any additional allocations for 1630 * the snapshot inode. 1631 */ 1632 td->td_proc->p_flag |= P_COWINPROGRESS; 1633 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1634 fs->fs_bsize, KERNCRED, 0, &cbp); 1635 td->td_proc->p_flag &= ~P_COWINPROGRESS; 1636 if (error) 1637 break; 1638 #ifdef DEBUG 1639 if (snapdebug) 1640 printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n", 1641 "Copyonremove: snapino ", ip->i_number, 1642 (intmax_t)lbn, "for inum", inum, size, 1643 (intmax_t)cbp->b_blkno); 1644 #endif 1645 /* 1646 * If we have already read the old block contents, then 1647 * simply copy them to the new block. Note that we need 1648 * to synchronously write snapshots that have not been 1649 * unlinked, and hence will be visible after a crash, 1650 * to ensure their integrity. 1651 */ 1652 if (savedcbp != 0) { 1653 bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); 1654 bawrite(cbp); 1655 if (dopersistence && ip->i_effnlink > 0) 1656 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 1657 continue; 1658 } 1659 /* 1660 * Otherwise, read the old block contents into the buffer. 1661 */ 1662 if ((error = readblock(cbp, lbn)) != 0) { 1663 bzero(cbp->b_data, fs->fs_bsize); 1664 bawrite(cbp); 1665 if (dopersistence && ip->i_effnlink > 0) 1666 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 1667 break; 1668 } 1669 savedcbp = cbp; 1670 } 1671 /* 1672 * Note that we need to synchronously write snapshots that 1673 * have not been unlinked, and hence will be visible after 1674 * a crash, to ensure their integrity. 1675 */ 1676 if (savedcbp) { 1677 vp = savedcbp->b_vp; 1678 bawrite(savedcbp); 1679 if (dopersistence && VTOI(vp)->i_effnlink > 0) 1680 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 1681 } 1682 /* 1683 * If we have been unable to allocate a block in which to do 1684 * the copy, then return non-zero so that the fragment will 1685 * not be freed. Although space will be lost, the snapshot 1686 * will stay consistent. 1687 */ 1688 if (snapshot_locked) 1689 VOP_UNLOCK(vp, 0, td); 1690 else 1691 VI_UNLOCK(devvp); 1692 return (error); 1693 } 1694 1695 /* 1696 * Associate snapshot files when mounting. 1697 */ 1698 void 1699 ffs_snapshot_mount(mp) 1700 struct mount *mp; 1701 { 1702 struct ufsmount *ump = VFSTOUFS(mp); 1703 struct vnode *devvp = ump->um_devvp; 1704 struct fs *fs = ump->um_fs; 1705 struct thread *td = curthread; 1706 struct snaphead *snaphead; 1707 struct vnode *vp; 1708 struct inode *ip, *xp; 1709 struct uio auio; 1710 struct iovec aiov; 1711 void *snapblklist; 1712 char *reason; 1713 daddr_t snaplistsize; 1714 int error, snaploc, loc; 1715 1716 /* 1717 * XXX The following needs to be set before UFS_TRUNCATE or 1718 * VOP_READ can be called. 1719 */ 1720 mp->mnt_stat.f_iosize = fs->fs_bsize; 1721 /* 1722 * Process each snapshot listed in the superblock. 1723 */ 1724 vp = NULL; 1725 snaphead = &devvp->v_rdev->si_snapshots; 1726 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 1727 if (fs->fs_snapinum[snaploc] == 0) 1728 break; 1729 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], 1730 LK_EXCLUSIVE, &vp)) != 0){ 1731 printf("ffs_snapshot_mount: vget failed %d\n", error); 1732 continue; 1733 } 1734 ip = VTOI(vp); 1735 if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size == 1736 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { 1737 if ((ip->i_flags & SF_SNAPSHOT) == 0) { 1738 reason = "non-snapshot"; 1739 } else { 1740 reason = "old format snapshot"; 1741 (void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td); 1742 (void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 1743 } 1744 printf("ffs_snapshot_mount: %s inode %d\n", 1745 reason, fs->fs_snapinum[snaploc]); 1746 vput(vp); 1747 vp = NULL; 1748 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 1749 if (fs->fs_snapinum[loc] == 0) 1750 break; 1751 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 1752 } 1753 fs->fs_snapinum[loc - 1] = 0; 1754 snaploc--; 1755 continue; 1756 } 1757 /* 1758 * If there already exist snapshots on this filesystem, grab a 1759 * reference to their shared lock. If this is the first snapshot 1760 * on this filesystem, we need to allocate a lock for the 1761 * snapshots to share. In either case, acquire the snapshot 1762 * lock and give up our original private lock. 1763 */ 1764 VI_LOCK(devvp); 1765 if ((xp = TAILQ_FIRST(snaphead)) != NULL) { 1766 VI_LOCK(vp); 1767 vp->v_vnlock = ITOV(xp)->v_vnlock; 1768 VI_UNLOCK(devvp); 1769 } else { 1770 struct lock *lkp; 1771 1772 VI_UNLOCK(devvp); 1773 MALLOC(lkp, struct lock *, sizeof(struct lock), 1774 M_UFSMNT, M_WAITOK); 1775 lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT, 1776 LK_CANRECURSE | LK_NOPAUSE); 1777 VI_LOCK(vp); 1778 vp->v_vnlock = lkp; 1779 } 1780 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td); 1781 transferlockers(&vp->v_lock, vp->v_vnlock); 1782 lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); 1783 /* 1784 * Link it onto the active snapshot list. 1785 */ 1786 VI_LOCK(devvp); 1787 if (ip->i_nextsnap.tqe_prev != 0) 1788 panic("ffs_snapshot_mount: %d already on list", 1789 ip->i_number); 1790 else 1791 TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap); 1792 vp->v_vflag |= VV_SYSTEM; 1793 VI_UNLOCK(devvp); 1794 VOP_UNLOCK(vp, 0, td); 1795 } 1796 /* 1797 * No usable snapshots found. 1798 */ 1799 if (vp == NULL) 1800 return; 1801 /* 1802 * Allocate the space for the block hints list. We always want to 1803 * use the list from the newest snapshot. 1804 */ 1805 auio.uio_iov = &aiov; 1806 auio.uio_iovcnt = 1; 1807 aiov.iov_base = (void *)&snaplistsize; 1808 aiov.iov_len = sizeof(snaplistsize); 1809 auio.uio_resid = aiov.iov_len; 1810 auio.uio_offset = 1811 lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)); 1812 auio.uio_segflg = UIO_SYSSPACE; 1813 auio.uio_rw = UIO_READ; 1814 auio.uio_td = td; 1815 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 1816 if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 1817 printf("ffs_snapshot_mount: read_1 failed %d\n", error); 1818 VOP_UNLOCK(vp, 0, td); 1819 return; 1820 } 1821 MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t), 1822 M_UFSMNT, M_WAITOK); 1823 auio.uio_iovcnt = 1; 1824 aiov.iov_base = snapblklist; 1825 aiov.iov_len = snaplistsize * sizeof (daddr_t); 1826 auio.uio_resid = aiov.iov_len; 1827 auio.uio_offset -= sizeof(snaplistsize); 1828 if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 1829 printf("ffs_snapshot_mount: read_2 failed %d\n", error); 1830 VOP_UNLOCK(vp, 0, td); 1831 FREE(snapblklist, M_UFSMNT); 1832 return; 1833 } 1834 VOP_UNLOCK(vp, 0, td); 1835 VI_LOCK(devvp); 1836 ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount"); 1837 devvp->v_rdev->si_snaplistsize = snaplistsize; 1838 devvp->v_rdev->si_snapblklist = (daddr_t *)snapblklist; 1839 devvp->v_rdev->si_copyonwrite = ffs_copyonwrite; 1840 devvp->v_vflag |= VV_COPYONWRITE; 1841 VI_UNLOCK(devvp); 1842 } 1843 1844 /* 1845 * Disassociate snapshot files when unmounting. 1846 */ 1847 void 1848 ffs_snapshot_unmount(mp) 1849 struct mount *mp; 1850 { 1851 struct vnode *devvp = VFSTOUFS(mp)->um_devvp; 1852 struct snaphead *snaphead = &devvp->v_rdev->si_snapshots; 1853 struct lock *lkp = NULL; 1854 struct inode *xp; 1855 struct vnode *vp; 1856 1857 VI_LOCK(devvp); 1858 while ((xp = TAILQ_FIRST(snaphead)) != 0) { 1859 vp = ITOV(xp); 1860 lkp = vp->v_vnlock; 1861 vp->v_vnlock = &vp->v_lock; 1862 TAILQ_REMOVE(snaphead, xp, i_nextsnap); 1863 xp->i_nextsnap.tqe_prev = 0; 1864 if (xp->i_effnlink > 0) { 1865 VI_UNLOCK(devvp); 1866 vrele(vp); 1867 VI_LOCK(devvp); 1868 } 1869 } 1870 if (devvp->v_rdev->si_snapblklist != NULL) { 1871 FREE(devvp->v_rdev->si_snapblklist, M_UFSMNT); 1872 devvp->v_rdev->si_snapblklist = NULL; 1873 devvp->v_rdev->si_snaplistsize = 0; 1874 } 1875 if (lkp != NULL) { 1876 lockdestroy(lkp); 1877 FREE(lkp, M_UFSMNT); 1878 } 1879 ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount"); 1880 devvp->v_rdev->si_copyonwrite = 0; 1881 devvp->v_vflag &= ~VV_COPYONWRITE; 1882 VI_UNLOCK(devvp); 1883 } 1884 1885 /* 1886 * Check for need to copy block that is about to be written, 1887 * copying the block if necessary. 1888 */ 1889 static int 1890 ffs_copyonwrite(devvp, bp) 1891 struct vnode *devvp; 1892 struct buf *bp; 1893 { 1894 struct snaphead *snaphead; 1895 struct buf *ibp, *cbp, *savedcbp = 0; 1896 struct thread *td = curthread; 1897 struct fs *fs; 1898 struct inode *ip; 1899 struct vnode *vp = 0; 1900 ufs2_daddr_t lbn, blkno, *snapblklist; 1901 int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0; 1902 1903 if (td->td_proc->p_flag & P_COWINPROGRESS) 1904 panic("ffs_copyonwrite: recursive call"); 1905 /* 1906 * First check to see if it is in the preallocated list. 1907 * By doing this check we avoid several potential deadlocks. 1908 */ 1909 VI_LOCK(devvp); 1910 snaphead = &devvp->v_rdev->si_snapshots; 1911 ip = TAILQ_FIRST(snaphead); 1912 fs = ip->i_fs; 1913 lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); 1914 snapblklist = devvp->v_rdev->si_snapblklist; 1915 upper = devvp->v_rdev->si_snaplistsize - 1; 1916 lower = 1; 1917 while (lower <= upper) { 1918 mid = (lower + upper) / 2; 1919 if (snapblklist[mid] == lbn) 1920 break; 1921 if (snapblklist[mid] < lbn) 1922 lower = mid + 1; 1923 else 1924 upper = mid - 1; 1925 } 1926 if (lower <= upper) { 1927 VI_UNLOCK(devvp); 1928 return (0); 1929 } 1930 /* 1931 * Not in the precomputed list, so check the snapshots. 1932 */ 1933 retry: 1934 TAILQ_FOREACH(ip, snaphead, i_nextsnap) { 1935 vp = ITOV(ip); 1936 /* 1937 * We ensure that everything of our own that needs to be 1938 * copied will be done at the time that ffs_snapshot is 1939 * called. Thus we can skip the check here which can 1940 * deadlock in doing the lookup in UFS_BALLOC. 1941 */ 1942 if (bp->b_vp == vp) 1943 continue; 1944 /* 1945 * Check to see if block needs to be copied. We do not have 1946 * to hold the snapshot lock while doing this lookup as it 1947 * will never require any additional allocations for the 1948 * snapshot inode. 1949 */ 1950 if (lbn < NDADDR) { 1951 blkno = DIP(ip, i_db[lbn]); 1952 } else { 1953 if (snapshot_locked == 0 && 1954 lockmgr(vp->v_vnlock, 1955 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1956 VI_MTX(devvp), td) != 0) { 1957 VI_LOCK(devvp); 1958 goto retry; 1959 } 1960 snapshot_locked = 1; 1961 td->td_proc->p_flag |= P_COWINPROGRESS; 1962 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 1963 fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); 1964 td->td_proc->p_flag &= ~P_COWINPROGRESS; 1965 if (error) 1966 break; 1967 indiroff = (lbn - NDADDR) % NINDIR(fs); 1968 if (ip->i_ump->um_fstype == UFS1) 1969 blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; 1970 else 1971 blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; 1972 bqrelse(ibp); 1973 } 1974 #ifdef DIAGNOSTIC 1975 if (blkno == BLK_SNAP && bp->b_lblkno >= 0) 1976 panic("ffs_copyonwrite: bad copy block"); 1977 #endif 1978 if (blkno != 0) 1979 continue; 1980 /* 1981 * Allocate the block into which to do the copy. Since 1982 * multiple processes may all try to copy the same block, 1983 * we have to recheck our need to do a copy if we sleep 1984 * waiting for the lock. 1985 * 1986 * Because all snapshots on a filesystem share a single 1987 * lock, we ensure that we will never be in competition 1988 * with another process to allocate a block. 1989 */ 1990 if (snapshot_locked == 0 && 1991 lockmgr(vp->v_vnlock, 1992 LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, 1993 VI_MTX(devvp), td) != 0) { 1994 VI_LOCK(devvp); 1995 goto retry; 1996 } 1997 snapshot_locked = 1; 1998 td->td_proc->p_flag |= P_COWINPROGRESS; 1999 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 2000 fs->fs_bsize, KERNCRED, 0, &cbp); 2001 td->td_proc->p_flag &= ~P_COWINPROGRESS; 2002 if (error) 2003 break; 2004 #ifdef DEBUG 2005 if (snapdebug) { 2006 printf("Copyonwrite: snapino %d lbn %jd for ", 2007 ip->i_number, (intmax_t)lbn); 2008 if (bp->b_vp == devvp) 2009 printf("fs metadata"); 2010 else 2011 printf("inum %d", VTOI(bp->b_vp)->i_number); 2012 printf(" lblkno %jd to blkno %jd\n", 2013 (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); 2014 } 2015 #endif 2016 /* 2017 * If we have already read the old block contents, then 2018 * simply copy them to the new block. Note that we need 2019 * to synchronously write snapshots that have not been 2020 * unlinked, and hence will be visible after a crash, 2021 * to ensure their integrity. 2022 */ 2023 if (savedcbp != 0) { 2024 bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); 2025 bawrite(cbp); 2026 if (dopersistence && ip->i_effnlink > 0) 2027 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 2028 continue; 2029 } 2030 /* 2031 * Otherwise, read the old block contents into the buffer. 2032 */ 2033 if ((error = readblock(cbp, lbn)) != 0) { 2034 bzero(cbp->b_data, fs->fs_bsize); 2035 bawrite(cbp); 2036 if (dopersistence && ip->i_effnlink > 0) 2037 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 2038 break; 2039 } 2040 savedcbp = cbp; 2041 } 2042 /* 2043 * Note that we need to synchronously write snapshots that 2044 * have not been unlinked, and hence will be visible after 2045 * a crash, to ensure their integrity. 2046 */ 2047 if (savedcbp) { 2048 vp = savedcbp->b_vp; 2049 bawrite(savedcbp); 2050 if (dopersistence && VTOI(vp)->i_effnlink > 0) 2051 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 2052 } 2053 if (snapshot_locked) 2054 VOP_UNLOCK(vp, 0, td); 2055 else 2056 VI_UNLOCK(devvp); 2057 return (error); 2058 } 2059 2060 /* 2061 * Read the specified block into the given buffer. 2062 * Much of this boiler-plate comes from bwrite(). 2063 */ 2064 static int 2065 readblock(bp, lbn) 2066 struct buf *bp; 2067 ufs2_daddr_t lbn; 2068 { 2069 struct uio auio; 2070 struct iovec aiov; 2071 struct thread *td = curthread; 2072 struct inode *ip = VTOI(bp->b_vp); 2073 2074 aiov.iov_base = bp->b_data; 2075 aiov.iov_len = bp->b_bcount; 2076 auio.uio_iov = &aiov; 2077 auio.uio_iovcnt = 1; 2078 auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn))); 2079 auio.uio_resid = bp->b_bcount; 2080 auio.uio_rw = UIO_READ; 2081 auio.uio_segflg = UIO_SYSSPACE; 2082 auio.uio_td = td; 2083 return (physio(ip->i_devvp->v_rdev, &auio, 0)); 2084 } 2085