/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#define KERNCRED thread0.td_ucred
#define DEBUG 1

static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *,
    ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int ffs_copyonwrite(struct vnode *, struct buf *);
static int readblock(struct buf *, ufs2_daddr_t);

/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the following flag allows this
 * crash persistence to be disabled.
 */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
static int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
	0, "");
#endif /* DEBUG */

/*
 * Create a snapshot file and initialize it for the filesystem.
 */
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
	int error, cg, snaploc;
	int i, size, len, loc;
	int flag = mp->mnt_flag;
	struct timespec starttime = {0, 0}, endtime;
	char saved_nice = 0;
	long redo = 0, snaplistsize = 0;
	int32_t *lp;
	void *space;
	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
	struct snaphead *snaphead;
	struct thread *td = curthread;
	struct inode *ip, *xp;
	struct buf *bp, *nbp, *ibp, *sbp = NULL;
	struct nameidata nd;
	struct mount *wrtmp;
	struct vattr vat;
	struct vnode *vp, *xvp, *nvp, *devvp;
	struct uio auio;
	struct iovec aiov;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Create the snapshot file.
	 */
restart:
	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		vput(nd.ni_vp);
		error = EEXIST;
	}
	if (nd.ni_dvp->v_mount != mp)
		error = EXDEV;
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		return (error);
	}
	VATTR_NULL(&vat);
	vat.va_type = VREG;
	vat.va_mode = S_IRUSR;
	vat.va_vaflags |= VA_EXCLUSIVE;
	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
		wrtmp = NULL;
	if (wrtmp != mp)
		panic("ffs_snapshot: mount mismatch");
	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &wrtmp,
		    V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
	vput(nd.ni_dvp);
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vn_finished_write(wrtmp);
		return (error);
	}
	vp = nd.ni_vp;
	ip = VTOI(vp);
	devvp = ip->i_devvp;
	/*
	 * Allocate and copy the last block contents so as to be able
	 * to set size to that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
	if (error)
		goto out;
	ip->i_size = lblktosize(fs, (off_t)numblks);
	DIP(ip, i_size) = ip->i_size;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if ((error = readblock(bp, numblks - 1)) != 0)
		goto out;
	bawrite(bp);
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
		if (error)
			goto out;
		bawrite(ibp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
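	 * Changed groups are tracked in the fs_active bitmap that
	 * cgaccount() sets as it copies each group below.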
	 */
	len = howmany(fs->fs_ncg, NBBY);
	MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK);
	bzero(fs->fs_active, len);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		error = cgaccount(cg, vp, nbp, 1);
		bawrite(nbp);
		if (error)
			goto out;
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 * Since we have marked it as a snapshot it is safe to
	 * unlock it as no process will be allowed to write to it.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0)
		goto out;
	VOP_UNLOCK(vp, 0, td);
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Rescind nice scheduling while running with the filesystem suspended.
	 */
	if (td->td_ksegrp->kg_nice > 0) {
		PROC_LOCK(td->td_proc);
		mtx_lock_spin(&sched_lock);
		saved_nice = td->td_ksegrp->kg_nice;
		sched_nice(td->td_ksegrp, 0);
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(td->td_proc);
	}
	/*
	 * Suspend operation on filesystem.
	 */
	for (;;) {
		vn_finished_write(wrtmp);
		if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
			vn_start_write(NULL, &wrtmp, V_WAIT);
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			goto out;
		}
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if (collectsnapstats)
		nanotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
			continue;
		redo++;
		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		error = cgaccount(cg, vp, nbp, 2);
		bawrite(nbp);
		if (error)
			goto out1;
	}
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
	    KERNCRED, &sbp);
	if (error) {
		brelse(sbp);
		sbp = NULL;
		goto out1;
	}
	loc = blkoff(fs, fs->fs_sblockloc);
	copy_fs = (struct fs *)(sbp->b_data + loc);
	bcopy(fs, copy_fs, fs->fs_sbsize);
	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
		copy_fs->fs_clean = 1;
	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
	if (fs->fs_sbsize < size)
		bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ?
	    0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
		    len, KERNCRED, &bp)) != 0) {
			brelse(bp);
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		bcopy(bp->b_data, space, (u_int)len);
		space = (char *)space + len;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
	}
	if (fs->fs_contigsumsize > 0) {
		copy_fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/*
	 * We must check for active files that have been unlinked
	 * (e.g., with a zero link count). We have to expunge all
	 * trace of these files from the snapshot so that they are
	 * not reclaimed prematurely by fsck or unnecessarily dumped.
	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
	 * spec_strategy about writing on a suspended filesystem.
	 * Note that we skip unlinked snapshot files as they will
	 * be handled separately below.
	 *
	 * We also calculate the needed size for the snapshot list.
	 */
	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
	MNT_ILOCK(mp);
loop:
	for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (xvp->v_mount != mp)
			goto loop;
		nvp = TAILQ_NEXT(xvp, v_nmntvnodes);
		VI_LOCK(xvp);
		MNT_IUNLOCK(mp);
		if ((xvp->v_iflag & VI_XLOCK) ||
		    xvp->v_usecount == 0 || xvp->v_type == VNON ||
		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
			VI_UNLOCK(xvp);
			MNT_ILOCK(mp);
			continue;
		}
		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) {
			MNT_ILOCK(mp);
			goto loop;
		}
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
		if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 &&
		    vat.va_nlink > 0) {
			VOP_UNLOCK(xvp, 0, td);
			MNT_ILOCK(mp);
			continue;
		}
		xp = VTOI(xvp);
		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
			VOP_UNLOCK(xvp, 0, td);
			MNT_ILOCK(mp);
			continue;
		}
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len < fs->fs_bsize) {
				ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]),
				    len, xp->i_number);
				blkno = DIP(xp, i_db[loc]);
				DIP(xp, i_db[loc]) = 0;
			}
		}
		snaplistsize += 1;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
			    BLK_NOCOPY);
		else
			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
			    BLK_NOCOPY);
		if (blkno)
			DIP(xp, i_db[loc]) = blkno;
		if (!error)
			error = ffs_freefile(copy_fs, vp, xp->i_number,
			    xp->i_mode);
		VOP_UNLOCK(xvp, 0, td);
		if (error) {
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		MNT_ILOCK(mp);
	}
	MNT_IUNLOCK(mp);
	/*
	 * If there already exist snapshots on this filesystem, grab a
	 * reference to their shared lock. If this is the first snapshot
	 * on this filesystem, we need to allocate a lock for the snapshots
	 * to share. In either case, acquire the snapshot lock and give
	 * up our original private lock.
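	 * Once the lock is shared, locking any one snapshot vnode locks
	 * them all, so the copy-on-write code never competes with another
	 * process for a snapshot lock.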
	 */
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
		struct lock *lkp;

		lkp = ITOV(xp)->v_vnlock;
		VI_UNLOCK(devvp);
		VI_LOCK(vp);
		vp->v_vnlock = lkp;
	} else {
		struct lock *lkp;

		VI_UNLOCK(devvp);
		MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT,
		    M_WAITOK);
		lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
		    LK_CANRECURSE | LK_NOPAUSE);
		VI_LOCK(vp);
		vp->v_vnlock = lkp;
	}
	vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
	transferlockers(&vp->v_lock, vp->v_vnlock);
	lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
	/*
	 * If this is the first snapshot on this filesystem, then we need
	 * to allocate the space for the list of preallocated snapshot blocks.
	 * This list will be refined below, but this preliminary one will
	 * keep us out of deadlock until the full one is ready.
	 */
	if (xp == NULL) {
		MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
		    M_UFSMNT, M_WAITOK);
		blkp = &snapblklist[1];
		*blkp++ = lblkno(fs, fs->fs_sblockloc);
		blkno = fragstoblks(fs, fs->fs_csaddr);
		for (cg = 0; cg < fs->fs_ncg; cg++) {
			if (fragstoblks(fs, cgtod(fs, cg)) > blkno)
				break;
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		}
		len = howmany(fs->fs_cssize, fs->fs_bsize);
		for (loc = 0; loc < len; loc++)
			*blkp++ = blkno + loc;
		for (; cg < fs->fs_ncg; cg++)
			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
		snapblklist[0] = blkp - snapblklist;
		VI_LOCK(devvp);
		if (devvp->v_rdev->si_snapblklist != NULL)
			panic("ffs_snapshot: non-empty list");
		devvp->v_rdev->si_snapblklist = snapblklist;
		devvp->v_rdev->si_snaplistsize = blkp - snapblklist;
		VI_UNLOCK(devvp);
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	VI_LOCK(devvp);
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %d already on list", ip->i_number);
	TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
	vp->v_vflag |= VV_SYSTEM;
out1:
	/*
	 * Resume operation on filesystem.
	 */
	vfs_write_resume(vp->v_mount);
	vn_start_write(NULL, &wrtmp, V_WAIT);
	if (collectsnapstats && starttime.tv_sec > 0) {
		nanotime(&endtime);
		timespecsub(&endtime, &starttime);
		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
	}
	if (sbp == NULL)
		goto out;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(xp, snaphead, i_nextsnap) {
		if (xp == ip)
			break;
		if (xp->i_ump->um_fstype == UFS1)
			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
			    BLK_SNAP);
		else
			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
			    BLK_SNAP);
		if (error) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
	/*
	 * Allocate space for the full list of preallocated snapshot blocks.
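	 * Entry 0 of the list holds its length; the entries following it
	 * are the logical block numbers collected by mapacct_ufs{1,2}().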
	 */
	MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	ip->i_snapblklist = &snapblklist[1];
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps. Also, collect
	 * the list of allocated blocks in i_snapblklist.
	 */
	if (ip->i_ump->um_fstype == UFS1)
		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
	else
		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
	if (error) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	if (snaplistsize < ip->i_snapblklist - snapblklist)
		panic("ffs_snapshot: list too small");
	snaplistsize = ip->i_snapblklist - snapblklist;
	snapblklist[0] = snaplistsize;
	ip->i_snapblklist = 0;
	/*
	 * Write out the list of allocated blocks to the end of the snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)snapblklist;
	aiov.iov_len = snaplistsize * sizeof(daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset = ip->i_size;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		fs->fs_snapinum[snaploc] = 0;
		FREE(snapblklist, M_UFSMNT);
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			fs->fs_snapinum[snaploc] = 0;
			FREE(snapblklist, M_UFSMNT);
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
	/*
	 * As this is the newest list, it is the most inclusive, so
	 * should replace the previous list.
	 */
	VI_LOCK(devvp);
	space = devvp->v_rdev->si_snapblklist;
	devvp->v_rdev->si_snapblklist = snapblklist;
	devvp->v_rdev->si_snaplistsize = snaplistsize;
	VI_UNLOCK(devvp);
	if (space != NULL)
		FREE(space, M_UFSMNT);
done:
	free(copy_fs->fs_csp, M_UFSMNT);
	bawrite(sbp);
out:
	if (saved_nice > 0) {
		PROC_LOCK(td->td_proc);
		mtx_lock_spin(&sched_lock);
		sched_nice(td->td_ksegrp, saved_nice);
		mtx_unlock_spin(&sched_lock);
		PROC_UNLOCK(td->td_proc);
	}
	if (fs->fs_active != 0) {
		FREE(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	mp->mnt_flag = flag;
	if (error)
		(void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	if (error)
		vput(vp);
	else
		VOP_UNLOCK(vp, 0, td);
	vn_finished_write(wrtmp);
	return (error);
}

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is one, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
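 * Buffers filled during pass 2 are marked B_VALIDSUSPWRT so that they
 * may be written while the filesystem remains suspended.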
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;
	struct vnode *vp;
	struct buf *nbp;
	int passno;
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP(ip, i_db[loc]) = BLK_NOCOPY;
			else if (passno == 2 &&
			    DIP(ip, i_db[loc]) == BLK_NOCOPY)
				DIP(ip, i_db[loc]) = 0;
			else if (passno == 1 &&
			    DIP(ip, i_db[loc]) == BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error) {
		brelse(bp);
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error) {
				brelse(bp);
				return (error);
			}
			indiroff = 0;
		}
		if (ip->i_ump->um_fstype == UFS1) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	bqrelse(bp);
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
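 * The inode's copy within the snapshot is zeroed so that fsck sees
 * the expunged file as unallocated (or as zero length when the file
 * being expunged is itself a snapshot).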
 */
static int
expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
	} else {
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
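 * One level of the chain is handled per call; blksperindir is the
 * number of file blocks spanned by each pointer at the current level.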
 */
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs1_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs1: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs1: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
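 * Each block found is recorded in this snapshot's own block map by
 * storing expungetype (BLK_SNAP or BLK_NOCOPY) in the matching slot.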
 */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din1->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct_ufs1: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs1_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
	} else {
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		if ((error = readblock(bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs2_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs2: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs2: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
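	 * Instead, take the buffer with getblk, fill in b_blkno by hand,
	 * and read the contents with readblock only if they are not
	 * already valid.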
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din2->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct_ufs2: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
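 * Blocks marked BLK_SNAP belong to the snapshot itself and are freed
 * at their own logical address via the blkstofrags() conversion below.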
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	if (xp != NULL)
		vrele(ITOV(ip));
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct lock *lkp;
	struct buf *ibp;
	struct fs *fs;
	struct thread *td = curthread;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	devvp = ip->i_devvp;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		VI_LOCK(devvp);
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp), td);
		VI_LOCK(devvp);
		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL, td);
		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) != 0) {
			VI_UNLOCK(devvp);
		} else {
			snapblklist = devvp->v_rdev->si_snapblklist;
			devvp->v_rdev->si_snapblklist = 0;
			devvp->v_rdev->si_snaplistsize = 0;
			devvp->v_rdev->si_copyonwrite = 0;
			devvp->v_vflag &= ~VV_COPYONWRITE;
			lockmgr(lkp, LK_DRAIN | LK_INTERLOCK, VI_MTX(devvp), td);
			lockmgr(lkp, LK_RELEASE, NULL, td);
			lockdestroy(lkp);
			FREE(lkp, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP(ip, i_db[blkno]) = 0;
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP(ip, i_blocks) -= btodb(fs->fs_bsize);
			DIP(ip, i_db[blkno]) = 0;
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc] = 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
				     fs->fs_bsize, ip->i_number))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc] = 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
			     fs->fs_bsize, ip->i_number))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
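 * In other words, a claimed block satisfies
 * dblk == blkstofrags(fs, lbn), which is the test that ffs_snapremove
 * above applies.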
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;
	ufs2_daddr_t bno;
	long size;
	ino_t inum;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;
	struct snaphead *snaphead;

	lbn = fragstoblks(fs, bno);
retry:
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0)
				goto retry;
			snapshot_locked = 1;
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno = ((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno = ((ufs2_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
			      VI_MTX(devvp), td) != 0) {
				if (lbn >= NDADDR)
					bqrelse(ibp);
				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
				goto retry;
			}
			snapshot_locked = 1;
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
		      VI_MTX(devvp), td) != 0) {
			if (lbn >= NDADDR)
				bqrelse(ibp);
			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
			goto retry;
		}
		snapshot_locked = 1;
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %jd from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    (intmax_t)lbn, inum);
#endif
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = bno;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP(ip, i_blocks) += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", ip->i_number,
			    (intmax_t)lbn, "for inum", inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Associate snapshot files when mounting.
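 * Each snapshot inode recorded in the superblock's fs_snapinum table
 * is fetched with VFS_VGET and relinked onto the device's in-core
 * snapshot list.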
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snaphead *snaphead;
	struct vnode *vp;
	struct inode *ip, *xp;
	struct uio auio;
	struct iovec aiov;
	void *snapblklist;
	char *reason;
	daddr_t snaplistsize;
	int error, snaploc, loc;

	/*
	 * XXX The following needs to be set before UFS_TRUNCATE or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	snaphead = &devvp->v_rdev->si_snapshots;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
				reason = "non-snapshot";
			} else {
				reason = "old format snapshot";
				(void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
				(void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			}
			printf("ffs_snapshot_mount: %s inode %d\n",
			    reason, fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		/*
		 * If there already exist snapshots on this filesystem, grab a
		 * reference to their shared lock. If this is the first snapshot
		 * on this filesystem, we need to allocate a lock for the
		 * snapshots to share. In either case, acquire the snapshot
		 * lock and give up our original private lock.
		 */
		VI_LOCK(devvp);
		if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
			struct lock *lkp;

			lkp = ITOV(xp)->v_vnlock;
			VI_UNLOCK(devvp);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		} else {
			struct lock *lkp;

			VI_UNLOCK(devvp);
			MALLOC(lkp, struct lock *, sizeof(struct lock),
			    M_UFSMNT, M_WAITOK);
			lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
			    LK_CANRECURSE | LK_NOPAUSE);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		}
		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
		transferlockers(&vp->v_lock, vp->v_vnlock);
		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp, 0, td);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL)
		return;
	/*
	 * Allocate the space for the block hints list. We always want to
	 * use the list from the newest snapshot.
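	 * The list was appended past the last filesystem block when the
	 * snapshot was taken: the length word is read first, then the
	 * offset is backed up so the entire list, length included, is
	 * read in one pass.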
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)&snaplistsize;
	aiov.iov_len = sizeof(snaplistsize);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset =
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		return;
	}
	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	auio.uio_iovcnt = 1;
	aiov.iov_base = snapblklist;
	aiov.iov_len = snaplistsize * sizeof(daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset -= sizeof(snaplistsize);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		FREE(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp, 0, td);
	VI_LOCK(devvp);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
	devvp->v_rdev->si_snaplistsize = snaplistsize;
	devvp->v_rdev->si_snapblklist = (daddr_t *)snapblklist;
	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct snaphead *snaphead = &devvp->v_rdev->si_snapshots;
	struct lock *lkp = NULL;
	struct inode *xp;
	struct vnode *vp;

	VI_LOCK(devvp);
	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
		vp = ITOV(xp);
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_effnlink > 0) {
			VI_UNLOCK(devvp);
			vrele(vp);
			VI_LOCK(devvp);
		}
	}
	if (devvp->v_rdev->si_snapblklist != NULL) {
		FREE(devvp->v_rdev->si_snapblklist, M_UFSMNT);
		devvp->v_rdev->si_snapblklist = NULL;
		devvp->v_rdev->si_snaplistsize = 0;
	}
	if (lkp != NULL) {
		lockdestroy(lkp);
		FREE(lkp, M_UFSMNT);
	}
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
	devvp->v_rdev->si_copyonwrite = 0;
	devvp->v_vflag &= ~VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
static int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct snaphead *snaphead;
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp = 0;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;

	if (td->td_pflags & TDP_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
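	 * A hit in the sorted list means that the block needs no copying
	 * for any snapshot, so we can return below holding nothing more
	 * than the device interlock, never taking a snapshot vnode lock.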
	 */
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	ip = TAILQ_FIRST(snaphead);
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	snapblklist = devvp->v_rdev->si_snapblklist;
	upper = devvp->v_rdev->si_snaplistsize - 1;
	lower = 1;		/* snapblklist[0] holds the list length */
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		VI_UNLOCK(devvp);
		return (0);
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
retry:
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0) {
				VI_LOCK(devvp);
				goto retry;
			}
			snapshot_locked = 1;
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno = ((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno = ((ufs2_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
		      VI_MTX(devvp), td) != 0) {
			VI_LOCK(devvp);
			goto retry;
		}
		snapshot_locked = 1;
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %jd for ",
			    ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd to blkno %jd\n",
			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block.
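		 * (The first block read in is remembered in savedcbp below,
		 * so each remaining snapshot can copy from it without
		 * rereading the device.)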
		 * Note that we need to synchronously write snapshots
		 * that have not been unlinked, and hence will be visible
		 * after a crash, to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Note that "lbn" here is a filesystem block address, not a logical
 * block within the file backing bp; it is converted directly to a
 * device byte offset below.
 * Much of this boiler-plate comes from bwrite().
 */
static int
readblock(bp, lbn)
	struct buf *bp;
	ufs2_daddr_t lbn;
{
	struct uio auio;
	struct iovec aiov;
	struct thread *td = curthread;
	struct inode *ip = VTOI(bp->b_vp);

	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}
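
/*
 * Illustrative sketch, not part of this file's interface: the
 * si_copyonwrite hook installed by ffs_snapshot_mount() is what ties
 * snapshots into the write path. The buffer layer is expected to call
 * it before letting a write overwrite the old contents of a device
 * block, roughly along these lines (the exact call site lives in the
 * buffer cache code, and this is only one plausible shape of it):
 *
 *	if ((devvp->v_vflag & VV_COPYONWRITE) &&
 *	    (error = (*devvp->v_rdev->si_copyonwrite)(devvp, bp)) != 0)
 *		return (error);
 *
 * A zero return from ffs_copyonwrite() means that any needed copies
 * have been made and the write may proceed.
 */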