1 /* 2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * Further information about snapshots can be obtained from: 5 * 6 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 7 * 1614 Oxford Street mckusick@mckusick.com 8 * Berkeley, CA 94709-1608 +1-510-843-9542 9 * USA 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 * 33 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 34 * $FreeBSD$ 35 */ 36 37 #include <sys/param.h> 38 #include <sys/stdint.h> 39 #include <sys/kernel.h> 40 #include <sys/systm.h> 41 #include <sys/conf.h> 42 #include <sys/bio.h> 43 #include <sys/buf.h> 44 #include <sys/proc.h> 45 #include <sys/namei.h> 46 #include <sys/stat.h> 47 #include <sys/malloc.h> 48 #include <sys/mount.h> 49 #include <sys/resource.h> 50 #include <sys/resourcevar.h> 51 #include <sys/vnode.h> 52 53 #include <ufs/ufs/extattr.h> 54 #include <ufs/ufs/quota.h> 55 #include <ufs/ufs/ufsmount.h> 56 #include <ufs/ufs/inode.h> 57 #include <ufs/ufs/ufs_extern.h> 58 59 #include <ufs/ffs/fs.h> 60 #include <ufs/ffs/ffs_extern.h> 61 62 #define KERNCRED thread0.td_ucred 63 #define DEBUG 1 64 65 static int cgaccount(int, struct vnode *, struct buf *, int); 66 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 67 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 68 ufs_lbn_t, int), int); 69 static int indiracct_ufs1(struct vnode *, struct vnode *, int, 70 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 71 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 72 ufs_lbn_t, int), int); 73 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 74 struct fs *, ufs_lbn_t, int); 75 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 76 struct fs *, ufs_lbn_t, int); 77 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 78 struct fs *, ufs_lbn_t, int); 79 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 80 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 81 ufs_lbn_t, int), int); 82 static int indiracct_ufs2(struct vnode *, struct vnode *, int, 83 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 84 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 85 ufs_lbn_t, int), int); 86 static int 
fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 87 struct fs *, ufs_lbn_t, int); 88 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 89 struct fs *, ufs_lbn_t, int); 90 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 91 struct fs *, ufs_lbn_t, int); 92 static int ffs_copyonwrite(struct vnode *, struct buf *); 93 static int readblock(struct buf *, ufs2_daddr_t); 94 95 /* 96 * To ensure the consistency of snapshots across crashes, we must 97 * synchronously write out copied blocks before allowing the 98 * originals to be modified. Because of the rather severe speed 99 * penalty that this imposes, the following flag allows this 100 * crash persistence to be disabled. 101 */ 102 int dopersistence = 0; 103 104 #ifdef DEBUG 105 #include <sys/sysctl.h> 106 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); 107 int snapdebug = 0; 108 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); 109 int collectsnapstats = 0; 110 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 111 0, ""); 112 #endif /* DEBUG */ 113 114 /* 115 * Create a snapshot file and initialize it for the filesystem. 
116 */ 117 int 118 ffs_snapshot(mp, snapfile) 119 struct mount *mp; 120 char *snapfile; 121 { 122 ufs2_daddr_t numblks, blkno; 123 int error, cg, snaploc; 124 int i, size, len, loc; 125 int flag = mp->mnt_flag; 126 struct timespec starttime = {0, 0}, endtime; 127 char saved_nice = 0; 128 long redo = 0; 129 int32_t *lp; 130 void *space; 131 daddr_t *listhd; 132 struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs; 133 struct snaphead *snaphead; 134 struct thread *td = curthread; 135 struct inode *ip, *xp; 136 struct buf *bp, *nbp, *ibp, *sbp = NULL; 137 struct nameidata nd; 138 struct mount *wrtmp; 139 struct vattr vat; 140 struct vnode *vp, *xvp, *nvp; 141 struct uio auio; 142 struct iovec aiov; 143 144 /* 145 * Need to serialize access to snapshot code per filesystem. 146 */ 147 /* 148 * Assign a snapshot slot in the superblock. 149 */ 150 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 151 if (fs->fs_snapinum[snaploc] == 0) 152 break; 153 if (snaploc == FSMAXSNAP) 154 return (ENOSPC); 155 /* 156 * Create the snapshot file. 
157 */ 158 restart: 159 NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td); 160 if ((error = namei(&nd)) != 0) 161 return (error); 162 if (nd.ni_vp != NULL) { 163 vput(nd.ni_vp); 164 error = EEXIST; 165 } 166 if (nd.ni_dvp->v_mount != mp) 167 error = EXDEV; 168 if (error) { 169 NDFREE(&nd, NDF_ONLY_PNBUF); 170 if (nd.ni_dvp == nd.ni_vp) 171 vrele(nd.ni_dvp); 172 else 173 vput(nd.ni_dvp); 174 return (error); 175 } 176 VATTR_NULL(&vat); 177 vat.va_type = VREG; 178 vat.va_mode = S_IRUSR; 179 vat.va_vaflags |= VA_EXCLUSIVE; 180 if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) 181 wrtmp = NULL; 182 if (wrtmp != mp) 183 panic("ffs_snapshot: mount mismatch"); 184 if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { 185 NDFREE(&nd, NDF_ONLY_PNBUF); 186 vput(nd.ni_dvp); 187 if ((error = vn_start_write(NULL, &wrtmp, 188 V_XSLEEP | PCATCH)) != 0) 189 return (error); 190 goto restart; 191 } 192 VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE); 193 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); 194 vput(nd.ni_dvp); 195 if (error) { 196 NDFREE(&nd, NDF_ONLY_PNBUF); 197 vn_finished_write(wrtmp); 198 return (error); 199 } 200 vp = nd.ni_vp; 201 ip = VTOI(vp); 202 /* 203 * Allocate and copy the last block contents so as to be able 204 * to set size to that of the filesystem. 205 */ 206 numblks = howmany(fs->fs_size, fs->fs_frag); 207 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 208 fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 209 if (error) 210 goto out; 211 ip->i_size = lblktosize(fs, (off_t)numblks); 212 DIP(ip, i_size) = ip->i_size; 213 ip->i_flag |= IN_CHANGE | IN_UPDATE; 214 if ((error = readblock(bp, numblks - 1)) != 0) 215 goto out; 216 bawrite(bp); 217 /* 218 * Preallocate critical data structures so that we can copy 219 * them in without further allocation after we suspend all 220 * operations on the filesystem. 
We would like to just release 221 * the allocated buffers without writing them since they will 222 * be filled in below once we are ready to go, but this upsets 223 * the soft update code, so we go ahead and write the new buffers. 224 * 225 * Allocate all indirect blocks and mark all of them as not 226 * needing to be copied. 227 */ 228 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 229 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 230 fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); 231 if (error) 232 goto out; 233 bdwrite(ibp); 234 } 235 /* 236 * Allocate copies for the superblock and its summary information. 237 */ 238 error = UFS_BALLOC(vp, lfragtosize(fs, fs->fs_sblockloc), 239 fs->fs_sbsize, KERNCRED, 0, &nbp); 240 if (error) 241 goto out; 242 bawrite(nbp); 243 blkno = fragstoblks(fs, fs->fs_csaddr); 244 len = howmany(fs->fs_cssize, fs->fs_bsize); 245 for (loc = 0; loc < len; loc++) { 246 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 247 fs->fs_bsize, KERNCRED, 0, &nbp); 248 if (error) 249 goto out; 250 bawrite(nbp); 251 } 252 /* 253 * Allocate all cylinder group blocks. 254 */ 255 for (cg = 0; cg < fs->fs_ncg; cg++) { 256 error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift, 257 fs->fs_bsize, KERNCRED, 0, &nbp); 258 if (error) 259 goto out; 260 bdwrite(nbp); 261 } 262 /* 263 * Copy all the cylinder group maps. Although the 264 * filesystem is still active, we hope that only a few 265 * cylinder groups will change between now and when we 266 * suspend operations. Thus, we will be able to quickly 267 * touch up the few cylinder groups that changed during 268 * the suspension period. 
269 */ 270 len = howmany(fs->fs_ncg, NBBY); 271 MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK); 272 bzero(fs->fs_active, len); 273 for (cg = 0; cg < fs->fs_ncg; cg++) { 274 error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize, 275 KERNCRED, &nbp); 276 if (error) { 277 brelse(nbp); 278 goto out; 279 } 280 error = cgaccount(cg, vp, nbp, 1); 281 bawrite(nbp); 282 if (error) 283 goto out; 284 } 285 /* 286 * Change inode to snapshot type file. 287 */ 288 ip->i_flags |= SF_SNAPSHOT; 289 DIP(ip, i_flags) = ip->i_flags; 290 ip->i_flag |= IN_CHANGE | IN_UPDATE; 291 /* 292 * Ensure that the snapshot is completely on disk. 293 */ 294 if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0) 295 goto out; 296 /* 297 * All allocations are done, so we can now snapshot the system. 298 * 299 * Recind nice scheduling while running with the filesystem suspended. 300 */ 301 if (td->td_ksegrp->kg_nice > 0) { 302 saved_nice = td->td_ksegrp->kg_nice; 303 td->td_ksegrp->kg_nice = 0; 304 } 305 /* 306 * Suspend operation on filesystem. 307 */ 308 for (;;) { 309 vn_finished_write(wrtmp); 310 if ((error = vfs_write_suspend(vp->v_mount)) != 0) { 311 vn_start_write(NULL, &wrtmp, V_WAIT); 312 goto out; 313 } 314 if (mp->mnt_kern_flag & MNTK_SUSPENDED) 315 break; 316 vn_start_write(NULL, &wrtmp, V_WAIT); 317 } 318 if (collectsnapstats) 319 nanotime(&starttime); 320 /* 321 * First, copy all the cylinder group maps that have changed. 322 */ 323 for (cg = 0; cg < fs->fs_ncg; cg++) { 324 if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) 325 continue; 326 redo++; 327 error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize, 328 KERNCRED, &nbp); 329 if (error) { 330 brelse(nbp); 331 goto out1; 332 } 333 error = cgaccount(cg, vp, nbp, 2); 334 bawrite(nbp); 335 if (error) 336 goto out1; 337 } 338 /* 339 * Grab a copy of the superblock and its summary information. 340 * We delay writing it until the suspension is released below. 
341 */ 342 error = bread(vp, fragstoblks(fs, fs->fs_sblockloc), fs->fs_bsize, 343 KERNCRED, &sbp); 344 if (error) { 345 brelse(sbp); 346 sbp = NULL; 347 goto out1; 348 } 349 loc = blkoff(fs, lfragtosize(fs, fs->fs_sblockloc)); 350 copy_fs = (struct fs *)(sbp->b_data + loc); 351 bcopy(fs, copy_fs, fs->fs_sbsize); 352 if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) 353 copy_fs->fs_clean = 1; 354 if (fs->fs_sbsize < SBLOCKSIZE) 355 bzero(&sbp->b_data[loc + fs->fs_sbsize], 356 SBLOCKSIZE - fs->fs_sbsize); 357 size = blkroundup(fs, fs->fs_cssize); 358 if (fs->fs_contigsumsize > 0) 359 size += fs->fs_ncg * sizeof(int32_t); 360 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 361 copy_fs->fs_csp = space; 362 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 363 (char *)space += fs->fs_cssize; 364 loc = howmany(fs->fs_cssize, fs->fs_fsize); 365 i = fs->fs_frag - loc % fs->fs_frag; 366 len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; 367 if (len > 0) { 368 if ((error = bread(ip->i_devvp, 369 fsbtodb(fs, fs->fs_csaddr + loc), 370 len, KERNCRED, &bp)) != 0) { 371 brelse(bp); 372 free(copy_fs->fs_csp, M_UFSMNT); 373 bawrite(sbp); 374 sbp = NULL; 375 goto out1; 376 } 377 bcopy(bp->b_data, space, (u_int)len); 378 (char *)space += len; 379 bp->b_flags |= B_INVAL | B_NOCACHE; 380 brelse(bp); 381 } 382 if (fs->fs_contigsumsize > 0) { 383 copy_fs->fs_maxcluster = lp = space; 384 for (i = 0; i < fs->fs_ncg; i++) 385 *lp++ = fs->fs_contigsumsize; 386 } 387 /* 388 * We must check for active files that have been unlinked 389 * (e.g., with a zero link count). We have to expunge all 390 * trace of these files from the snapshot so that they are 391 * not reclaimed prematurely by fsck or unnecessarily dumped. 392 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 393 * spec_strategy about writing on a suspended filesystem. 394 * Note that we skip unlinked snapshot files as they will 395 * be handled separately below. 
396 */ 397 mp->mnt_kern_flag &= ~MNTK_SUSPENDED; 398 mtx_lock(&mntvnode_mtx); 399 loop: 400 for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) { 401 /* 402 * Make sure this vnode wasn't reclaimed in getnewvnode(). 403 * Start over if it has (it won't be on the list anymore). 404 */ 405 if (xvp->v_mount != mp) 406 goto loop; 407 nvp = TAILQ_NEXT(xvp, v_nmntvnodes); 408 mtx_unlock(&mntvnode_mtx); 409 mp_fixme("Unlocked GETATTR."); 410 if (vrefcnt(xvp) == 0 || xvp->v_type == VNON || 411 (VTOI(xvp)->i_flags & SF_SNAPSHOT) || 412 (VOP_GETATTR(xvp, &vat, td->td_proc->p_ucred, td) == 0 && 413 vat.va_nlink > 0)) { 414 mtx_lock(&mntvnode_mtx); 415 continue; 416 } 417 if (snapdebug) 418 vprint("ffs_snapshot: busy vnode", xvp); 419 if (vn_lock(xvp, LK_EXCLUSIVE, td) != 0) 420 goto loop; 421 xp = VTOI(xvp); 422 /* 423 * If there is a fragment, clear it here. 424 */ 425 blkno = 0; 426 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 427 if (loc < NDADDR) { 428 len = fragroundup(fs, blkoff(fs, xp->i_size)); 429 if (len < fs->fs_bsize) { 430 ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]), 431 len, xp->i_number); 432 blkno = DIP(xp, i_db[loc]); 433 DIP(xp, i_db[loc]) = 0; 434 } 435 } 436 if (xp->i_ump->um_fstype == UFS1) 437 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 438 BLK_NOCOPY); 439 else 440 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 441 BLK_NOCOPY); 442 if (blkno) 443 DIP(xp, i_db[loc]) = blkno; 444 if (!error) 445 error = ffs_freefile(copy_fs, vp, xp->i_number, 446 xp->i_mode); 447 VOP_UNLOCK(xvp, 0, td); 448 if (error) { 449 free(copy_fs->fs_csp, M_UFSMNT); 450 bawrite(sbp); 451 sbp = NULL; 452 goto out1; 453 } 454 mtx_lock(&mntvnode_mtx); 455 } 456 mtx_unlock(&mntvnode_mtx); 457 /* 458 * If there already exist snapshots on this filesystem, grab a 459 * reference to their shared lock. If this is the first snapshot 460 * on this filesystem, we need to allocate a lock for the snapshots 461 * to share. 
In either case, acquire the snapshot lock and give 462 * up our original private lock. 463 */ 464 snaphead = &ip->i_devvp->v_rdev->si_snapshots; 465 if ((xp = TAILQ_FIRST(snaphead)) != NULL) { 466 VI_LOCK(vp); 467 vp->v_vnlock = ITOV(xp)->v_vnlock; 468 } else { 469 struct lock *lkp; 470 471 MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT, 472 M_WAITOK); 473 lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT, 474 LK_CANRECURSE | LK_NOPAUSE); 475 VI_LOCK(vp); 476 vp->v_vnlock = lkp; 477 } 478 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td); 479 VI_LOCK(vp); 480 lockmgr(&vp->v_lock, LK_INTERLOCK | LK_RELEASE, VI_MTX(vp), td); 481 /* 482 * Record snapshot inode. Since this is the newest snapshot, 483 * it must be placed at the end of the list. 484 */ 485 fs->fs_snapinum[snaploc] = ip->i_number; 486 if (ip->i_nextsnap.tqe_prev != 0) 487 panic("ffs_snapshot: %d already on list", ip->i_number); 488 ASSERT_VOP_LOCKED(ip->i_devvp, "ffs_snapshot devvp"); 489 TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap); 490 ip->i_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite; 491 ip->i_devvp->v_vflag |= VV_COPYONWRITE; 492 ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); 493 vp->v_vflag |= VV_SYSTEM; 494 out1: 495 /* 496 * Resume operation on filesystem. 497 */ 498 vfs_write_resume(vp->v_mount); 499 vn_start_write(NULL, &wrtmp, V_WAIT); 500 if (collectsnapstats && starttime.tv_sec > 0) { 501 nanotime(&endtime); 502 timespecsub(&endtime, &starttime); 503 printf("%s: suspended %d.%03ld sec, redo %ld of %d\n", 504 vp->v_mount->mnt_stat.f_mntonname, endtime.tv_sec, 505 endtime.tv_nsec / 1000000, redo, fs->fs_ncg); 506 } 507 if (sbp == NULL) 508 goto out; 509 /* 510 * Copy allocation information from all the snapshots in 511 * this snapshot and then expunge them from its view. 
512 */ 513 snaphead = &ip->i_devvp->v_rdev->si_snapshots; 514 TAILQ_FOREACH(xp, snaphead, i_nextsnap) { 515 if (xp == ip) 516 break; 517 if (xp->i_ump->um_fstype == UFS1) 518 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 519 BLK_SNAP); 520 else 521 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 522 BLK_SNAP); 523 if (error) { 524 fs->fs_snapinum[snaploc] = 0; 525 goto done; 526 } 527 } 528 /* 529 * Allocate the space for the list of preallocated snapshot blocks. 530 */ 531 ip->i_snaplistsize = fragstoblks(fs, dbtofsb(fs, DIP(ip,i_blocks))) + 1; 532 MALLOC(listhd, daddr_t *, ip->i_snaplistsize * sizeof(daddr_t), 533 M_UFSMNT, M_WAITOK); 534 ip->i_snapblklist = listhd; 535 *ip->i_snapblklist++ = ip->i_snaplistsize; 536 /* 537 * Expunge the blocks used by the snapshots from the set of 538 * blocks marked as used in the snapshot bitmaps. Also, collect 539 * the list of allocated blocks in i_snapblklist. 540 */ 541 if (ip->i_ump->um_fstype == UFS1) 542 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 543 else 544 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 545 if (error) { 546 fs->fs_snapinum[snaploc] = 0; 547 FREE(listhd, M_UFSMNT); 548 goto done; 549 } 550 /* 551 * Write out the list of allocated blocks to the end of the snapshot. 
552 */ 553 if (ip->i_snapblklist - listhd != ip->i_snaplistsize) 554 printf("Snaplist mismatch, got %jd should be %jd\n", 555 (intmax_t)(ip->i_snapblklist - listhd), 556 (intmax_t)ip->i_snaplistsize); 557 auio.uio_iov = &aiov; 558 auio.uio_iovcnt = 1; 559 aiov.iov_base = (void *)listhd; 560 aiov.iov_len = ip->i_snaplistsize * sizeof(daddr_t); 561 auio.uio_resid = aiov.iov_len;; 562 auio.uio_offset = ip->i_size; 563 auio.uio_segflg = UIO_SYSSPACE; 564 auio.uio_rw = UIO_WRITE; 565 auio.uio_td = td; 566 if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 567 fs->fs_snapinum[snaploc] = 0; 568 FREE(listhd, M_UFSMNT); 569 goto done; 570 } 571 ip->i_snapblklist = listhd; 572 /* 573 * Write the superblock and its summary information 574 * to the snapshot. 575 */ 576 blkno = fragstoblks(fs, fs->fs_csaddr); 577 len = howmany(fs->fs_cssize, fs->fs_bsize); 578 space = copy_fs->fs_csp; 579 for (loc = 0; loc < len; loc++) { 580 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); 581 if (error) { 582 brelse(nbp); 583 fs->fs_snapinum[snaploc] = 0; 584 FREE(listhd, M_UFSMNT); 585 ip->i_snapblklist = NULL; 586 goto done; 587 } 588 bcopy(space, nbp->b_data, fs->fs_bsize); 589 space = (char *)space + fs->fs_bsize; 590 bawrite(nbp); 591 } 592 done: 593 free(copy_fs->fs_csp, M_UFSMNT); 594 bawrite(sbp); 595 out: 596 if (saved_nice > 0) 597 td->td_ksegrp->kg_nice = saved_nice; 598 if (fs->fs_active != 0) { 599 FREE(fs->fs_active, M_DEVBUF); 600 fs->fs_active = 0; 601 } 602 mp->mnt_flag = flag; 603 if (error) 604 (void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td); 605 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 606 if (error) 607 vput(vp); 608 else 609 VOP_UNLOCK(vp, 0, td); 610 vn_finished_write(wrtmp); 611 return (error); 612 } 613 614 /* 615 * Copy a cylinder group map. All the unallocated blocks are marked 616 * BLK_NOCOPY so that the snapshot knows that it need not copy them 617 * if they are later written. 
 * If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;			/* cylinder group number to copy */
	struct vnode *vp;	/* snapshot vnode */
	struct buf *nbp;	/* destination buffer for the cg copy */
	int passno;		/* 1 = initial pass, 2 = touch-up pass */
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/* Read the on-disk cylinder group and sanity-check its magic. */
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	/* Mark this cg as captured so the post-suspend pass can skip it. */
	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	/*
	 * Mark free blocks covered by the snapshot's direct block
	 * pointers as BLK_NOCOPY; on pass 2, undo markings for blocks
	 * that are no longer free.
	 */
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP(ip, i_db[loc]) = BLK_NOCOPY;
			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
				DIP(ip, i_db[loc]) = 0;
			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	/*
	 * Walk the snapshot's indirect blocks (preallocated earlier)
	 * and apply the same marking to the remaining cg blocks.
	 */
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error) {
		brelse(bp);
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			/* Crossed into the next indirect block. */
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error) {
				brelse(bp);
				return (error);
			}
			indiroff = 0;
		}
		if (ip->i_ump->um_fstype == UFS1) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	bqrelse(bp);
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
728 */ 729 static int 730 expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype) 731 struct vnode *snapvp; 732 struct inode *cancelip; 733 struct fs *fs; 734 int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 735 struct fs *, ufs_lbn_t, int); 736 int expungetype; 737 { 738 int i, error, indiroff; 739 ufs_lbn_t lbn, rlbn; 740 ufs2_daddr_t len, blkno, numblks, blksperindir; 741 struct ufs1_dinode *dip; 742 struct thread *td = curthread; 743 struct buf *bp; 744 745 numblks = howmany(cancelip->i_size, fs->fs_bsize); 746 if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], 747 &cancelip->i_din1->di_ib[NIADDR], fs, 0, expungetype))) 748 return (error); 749 blksperindir = 1; 750 lbn = -NDADDR; 751 len = numblks - NDADDR; 752 rlbn = NDADDR; 753 for (i = 0; len > 0 && i < NIADDR; i++) { 754 error = indiracct_ufs1(snapvp, ITOV(cancelip), i, 755 cancelip->i_din1->di_ib[i], lbn, rlbn, len, 756 blksperindir, fs, acctfunc, expungetype); 757 if (error) 758 return (error); 759 blksperindir *= NINDIR(fs); 760 lbn -= blksperindir + 1; 761 len -= blksperindir; 762 rlbn += blksperindir; 763 } 764 /* 765 * Prepare to expunge the inode. If its inode block has not 766 * yet been copied, then allocate and fill the copy. 
767 */ 768 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 769 blkno = 0; 770 if (lbn < NDADDR) { 771 blkno = cancelip->i_din1->di_db[lbn]; 772 } else { 773 td->td_proc->p_flag |= P_COWINPROGRESS; 774 error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 775 fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 776 td->td_proc->p_flag &= ~P_COWINPROGRESS; 777 if (error) 778 return (error); 779 indiroff = (lbn - NDADDR) % NINDIR(fs); 780 blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; 781 bqrelse(bp); 782 } 783 error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 784 fs->fs_bsize, KERNCRED, 0, &bp); 785 if (error) 786 return (error); 787 if (blkno == 0 && (error = readblock(bp, lbn))) 788 return (error); 789 /* 790 * Set a snapshot inode to be a zero length file, regular files 791 * to be completely unallocated. 792 */ 793 dip = (struct ufs1_dinode *)bp->b_data + 794 ino_to_fsbo(fs, cancelip->i_number); 795 if (expungetype == BLK_NOCOPY) 796 dip->di_mode = 0; 797 dip->di_size = 0; 798 dip->di_blocks = 0; 799 dip->di_flags &= ~SF_SNAPSHOT; 800 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t)); 801 bdwrite(bp); 802 return (0); 803 } 804 805 /* 806 * Descend an indirect block chain for vnode cancelvp accounting for all 807 * its indirect blocks in snapvp. 
 */
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* the snapshot being built */
	struct vnode *cancelvp;	/* vnode whose indirect chain is scanned */
	int level;		/* remaining levels of indirection */
	ufs1_daddr_t blkno;	/* disk address of this indirect block */
	ufs_lbn_t lbn;		/* logical block number of the indirect */
	ufs_lbn_t rlbn;		/* first data lbn referenced from here */
	ufs_lbn_t remblks;	/* data blocks remaining to account for */
	ufs_lbn_t blksperindir;	/* data blocks per pointer at this level */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/* Copy the pointers out so the buffer can be released early. */
	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, rlbn, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;	/* snapshot vnode receiving the marks */
	ufs1_daddr_t *oldblkp, *lastblkp;	/* range of pointers to scan */
	struct fs *fs;
	ufs_lbn_t lblkno;	/* unused here; kept for acctfunc signature */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		/* Locate the snapshot's pointer slot covering this block. */
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din1->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;	/* range of pointers to scan */
	struct fs *fs;
	ufs_lbn_t lblkno;	/* logical block number of *oldblkp */
	int expungetype;
{
	ufs1_daddr_t blkno;
	struct inode *ip;
	ino_t inum;

	ip = VTOI(vp);
	inum = ip->i_number;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		/* Record preallocated snapshot blocks in i_snapblklist. */
		if (expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
985 */ 986 static int 987 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype) 988 struct vnode *snapvp; 989 struct inode *cancelip; 990 struct fs *fs; 991 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 992 struct fs *, ufs_lbn_t, int); 993 int expungetype; 994 { 995 int i, error, indiroff; 996 ufs_lbn_t lbn, rlbn; 997 ufs2_daddr_t len, blkno, numblks, blksperindir; 998 struct ufs2_dinode *dip; 999 struct thread *td = curthread; 1000 struct buf *bp; 1001 1002 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1003 if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], 1004 &cancelip->i_din2->di_ib[NIADDR], fs, 0, expungetype))) 1005 return (error); 1006 blksperindir = 1; 1007 lbn = -NDADDR; 1008 len = numblks - NDADDR; 1009 rlbn = NDADDR; 1010 for (i = 0; len > 0 && i < NIADDR; i++) { 1011 error = indiracct_ufs2(snapvp, ITOV(cancelip), i, 1012 cancelip->i_din2->di_ib[i], lbn, rlbn, len, 1013 blksperindir, fs, acctfunc, expungetype); 1014 if (error) 1015 return (error); 1016 blksperindir *= NINDIR(fs); 1017 lbn -= blksperindir + 1; 1018 len -= blksperindir; 1019 rlbn += blksperindir; 1020 } 1021 /* 1022 * Prepare to expunge the inode. If its inode block has not 1023 * yet been copied, then allocate and fill the copy. 
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = cancelip->i_din2->di_db[lbn];
	} else {
		/*
		 * NOTE(review): P_COWINPROGRESS is set around the
		 * metadata lookup, apparently to flag/suppress recursive
		 * copy-on-write during this allocation — confirm against
		 * ffs_balloc/ffs_copyonwrite.
		 */
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	/* Allocate (or find) the snapshot's copy of the inode block. */
	error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
	    fs->fs_bsize, KERNCRED, 0, &bp);
	if (error)
		return (error);
	/* Not yet copied: fill the copy from the device. */
	if (blkno == 0 && (error = readblock(bp, lbn)))
		return (error);
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	bdwrite(bp);
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* snapshot doing the accounting */
	struct vnode *cancelvp;	/* vnode whose indirect chain is walked */
	int level;		/* levels of indirection remaining */
	ufs2_daddr_t blkno;	/* disk address of this indirect block */
	ufs_lbn_t lbn;		/* metadata lbn of this indirect block */
	ufs_lbn_t rlbn;		/* first data lbn mapped by this block */
	ufs_lbn_t remblks;	/* data blocks remaining to account */
	ufs_lbn_t blksperindir;	/* data blocks mapped per pointer here */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	/* Sanity-check the caller-supplied geometry against ufs_getlbns. */
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 * Copy the pointers out so the buffer can be released before
	 * the (possibly allocating) accounting function runs.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, rlbn, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	/*
	 * lbn++/level-- step to the first child indirect block; each
	 * iteration then advances by one child's worth of data blocks
	 * (metadata lbns grow downward, hence lbn -=).
	 */
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 * Runs snapacct_ufs2 over the pointer range first; only if it
 * succeeds are the same blocks freed via mapacct_ufs2.
 */
static int
fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;	/* snapshot vnode receiving the claims */
	ufs2_daddr_t *oldblkp, *lastblkp; /* [oldblkp, lastblkp) range */
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		/* Skip holes and blocks already marked. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		/*
		 * Find the snapshot's own pointer slot for this block
		 * (direct slot, or slot within an indirect block fetched
		 * via a metadata-only BALLOC).
		 */
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din2->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;

	ip = VTOI(vp);
	inum = ip->i_number;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		/* Holes and not-yet-copied blocks hold no disk space. */
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		/* Record copied blocks in the preallocation hint list. */
		if (expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		/*
		 * A BLK_SNAP claim is stored as the block's own logical
		 * address; convert it back to a fragment number.
		 */
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	if (xp == 0)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	else
		vrele(ITOV(ip));
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the remaining entries down over the hole. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct lock *lkp;
	struct buf *ibp;
	struct fs *fs;
	struct thread *td = curthread;
	ufs2_daddr_t numblks, blkno, dblk;
	int error, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		/*
		 * Switch the vnode back to its private lock: acquire
		 * the private lock, repoint v_vnlock at it, then drop
		 * the shared snapshot lock.
		 */
		VI_LOCK(vp);
		lockmgr(&vp->v_lock, LK_INTERLOCK|LK_EXCLUSIVE, VI_MTX(vp), td);
		VI_LOCK(vp);
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_INTERLOCK | LK_RELEASE, VI_MTX(vp), td);
		devvp = ip->i_devvp;
		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		ASSERT_VOP_LOCKED(devvp, "ffs_snapremove devvp");
		/*
		 * Last snapshot gone: destroy the shared lock and
		 * disable copy-on-write on the device.
		 */
		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) == 0) {
			lockdestroy(lkp);
			FREE(lkp, M_UFSMNT);
			devvp->v_rdev->si_copyonwrite = 0;
			devvp->v_vflag &= ~VV_COPYONWRITE;
		}
	}
	/*
	 * Get rid of its hints list.
	 */
	if (ip->i_snapblklist != NULL) {
		FREE(ip->i_snapblklist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP(ip, i_db[blkno]) = 0;
		/*
		 * A self-claimed block (address equals its own logical
		 * block) is offered to the other snapshots; release our
		 * pointer only if one of them took it.
		 */
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP(ip, i_blocks) -= btodb(fs->fs_bsize);
			DIP(ip, i_db[blkno]) = 0;
		}
	}
	/* Repeat for every pointer held in the indirect blocks. */
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
				     fs->fs_bsize, ip->i_number))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
			     fs->fs_bsize, ip->i_number))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed.
Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;	/* device holding the snapshots */
	ufs2_daddr_t bno;	/* fragment address being freed */
	long size;		/* size of the free, in bytes */
	ino_t inum;		/* inode the block belonged to */
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int indiroff = 0, error = 0, claimedblk = 0;
	struct snaphead *snaphead;

	lbn = fragstoblks(fs, bno);
	snaphead = &devvp->v_rdev->si_snapshots;
	/* Give every active snapshot a chance to claim or copy it. */
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			VOP_UNLOCK(vp, 0, td);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			VOP_UNLOCK(vp, 0, td);
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it.
See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %jd from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    (intmax_t)lbn, inum);
#endif
			/* Store the freed block's address in our slot. */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = bno;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP(ip, i_blocks) += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			/* Claimed: caller must not free the block. */
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error) {
			VOP_UNLOCK(vp, 0, td);
			break;
		}
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", ip->i_number,
			    (intmax_t)lbn, "for inum", inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			break;
		}
		/* Keep the filled copy to seed the remaining snapshots. */
		VOP_UNLOCK(vp, 0, td);
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
		}
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snaphead *snaphead;
	struct vnode *vp;
	struct inode *ip, *xp;
	struct uio auio;
	struct iovec aiov;
	void *listhd;
	char *reason;
	int error, snaploc, loc;

	/*
	 * XXX The following needs to be set before UFS_TRUNCATE or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	snaphead = &ump->um_devvp->v_rdev->si_snapshots;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		/* The list is dense; the first zero entry ends it. */
		if (fs->fs_snapinum[snaploc] == 0)
			return;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0){
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		/*
		 * Reject entries that are not snapshots, or whose size
		 * lacks the trailing hint-list area (old format).
		 */
		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
				reason = "non-snapshot";
			} else {
				reason = "old format snapshot";
				(void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
				(void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			}
			printf("ffs_snapshot_mount: %s inode %d\n",
			    reason, fs->fs_snapinum[snaploc]);
			vput(vp);
			/* Drop the bad entry; keep the list dense. */
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		/*
		 * Allocate the space for the block hints list.
		 */
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		/*
		 * The hint-list length is stored just past the last
		 * filesystem block of the snapshot file.
		 */
		aiov.iov_base = (void *)&ip->i_snaplistsize;
		aiov.iov_len = sizeof(ip->i_snaplistsize);
		auio.uio_resid = aiov.iov_len;
		auio.uio_offset =
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_rw = UIO_READ;
		auio.uio_td = td;
		if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
			printf("ffs_snapshot_mount: read_1 failed %d\n", error);
			/*
			 * NOTE(review): vp is still locked and referenced
			 * here; this continue appears to leak the vnode
			 * reference and lock — confirm and add vput(vp).
			 */
			continue;
		}
		MALLOC(listhd, void *, ip->i_snaplistsize * sizeof(daddr_t),
		    M_UFSMNT, M_WAITOK);
		auio.uio_iovcnt = 1;
		aiov.iov_base = listhd;
		aiov.iov_len = ip->i_snaplistsize * sizeof (daddr_t);
		auio.uio_resid = aiov.iov_len;
		/* Re-read from the length word: the list includes it. */
		auio.uio_offset -= sizeof(ip->i_snaplistsize);
		if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
			printf("ffs_snapshot_mount: read_2 failed %d\n", error);
			FREE(listhd, M_UFSMNT);
			/* NOTE(review): same apparent vp leak as above. */
			continue;
		}
		ip->i_snapblklist = (daddr_t *)listhd;
		/*
		 * If there already exist snapshots on this filesystem, grab a
		 * reference to their shared lock. If this is the first snapshot
		 * on this filesystem, we need to allocate a lock for the
		 * snapshots to share. In either case, acquire the snapshot
		 * lock and give up our original private lock.
		 */
		if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
			VI_LOCK(vp);
			vp->v_vnlock = ITOV(xp)->v_vnlock;
		} else {
			struct lock *lkp;

			MALLOC(lkp, struct lock *, sizeof(struct lock),
			    M_UFSMNT, M_WAITOK);
			lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
			    LK_CANRECURSE | LK_NOPAUSE);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		}
		/* Take the shared lock, then release the private one. */
		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
		VI_LOCK(vp);
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_RELEASE, VI_MTX(vp), td);
		/*
		 * Link it onto the active snapshot list.
		 */
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		/* Enable copy-on-write interception on the device. */
		ump->um_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
		ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_snapshot_mount");
		ump->um_devvp->v_vflag |= VV_COPYONWRITE;
		/* Unlock but keep the reference; the snapshot stays live. */
		VOP_UNLOCK(vp, 0, td);
	}
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct snaphead *snaphead = &ump->um_devvp->v_rdev->si_snapshots;
	struct lock *lkp = NULL;
	struct inode *xp;
	struct vnode *vp;

	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
		vp = ITOV(xp);
		/* Restore the private vnode lock and unhook the inode. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
		if (xp->i_snapblklist != NULL) {
			FREE(xp->i_snapblklist, M_UFSMNT);
			xp->i_snapblklist = NULL;
		}
		xp->i_nextsnap.tqe_prev = 0;
		/* Drop the extra reference taken while mounted. */
		if (xp->i_effnlink > 0)
			vrele(vp);
	}
	/* All snapshots shared one lock; destroy it once. */
	if (lkp != NULL) {
		lockdestroy(lkp);
		FREE(lkp, M_UFSMNT);
	}
	ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_snapshot_unmount");
	ump->um_devvp->v_rdev->si_copyonwrite = 0;
	ump->um_devvp->v_vflag &= ~VV_COPYONWRITE;
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
static int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;	/* device being written */
	struct buf *bp;		/* buffer about to be written */
{
	struct snaphead *snaphead;
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp = 0;
	ufs2_daddr_t lbn, blkno;
	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;

	if (td->td_proc->p_flag & P_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	/*
	 * NOTE(review): assumes the snapshot list is non-empty; this
	 * handler is only installed while snapshots exist (see
	 * ffs_snapshot_mount/ffs_snapremove) — confirm no race window.
	 */
	snaphead = &devvp->v_rdev->si_snapshots;
	ip = TAILQ_FIRST(snaphead);
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
retry:
		/*
		 * First check to see if it is in the preallocated list.
		 * By doing this check we avoid several potential deadlocks.
		 *
		 * Binary search; starts at index 1 because index 0
		 * apparently holds the list length (see the hint-list
		 * read in ffs_snapshot_mount) — TODO confirm.
		 */
		lower = 1;
		upper = ip->i_snaplistsize - 1;
		while (lower <= upper) {
			mid = (lower + upper) / 2;
			if (ip->i_snapblklist[mid] == lbn)
				break;
			if (ip->i_snapblklist[mid] < lbn)
				lower = mid + 1;
			else
				upper = mid - 1;
		}
		/* Found in the hint list: already handled, skip. */
		if (lower <= upper)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Non-zero means already copied or deliberately skipped. */
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		if (snapshot_locked == 0 &&
		    vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td) != 0)
			goto retry;
		snapshot_locked = 1;
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %jd for ",
			    ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd to blkno %jd\n",
			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block.
Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		/* Keep the filled copy to seed the remaining snapshots. */
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 * Bypasses the buffer cache by issuing a raw physio read against the
 * underlying device at the address that logical block lbn maps to.
 */
static int
readblock(bp, lbn)
	struct buf *bp;		/* destination buffer (b_data/b_bcount) */
	ufs2_daddr_t lbn;	/* logical block number to read */
{
	struct uio auio;
	struct iovec aiov;
	struct thread *td = curthread;
	struct inode *ip = VTOI(bp->b_vp);

	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	/* Convert lbn -> fragment -> device block -> byte offset. */
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}