1 /* 2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * Further information about snapshots can be obtained from: 5 * 6 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 7 * 1614 Oxford Street mckusick@mckusick.com 8 * Berkeley, CA 94709-1608 +1-510-843-9542 9 * USA 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 * 33 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 34 */ 35 36 #include <sys/cdefs.h> 37 __FBSDID("$FreeBSD$"); 38 39 #include <sys/param.h> 40 #include <sys/kernel.h> 41 #include <sys/systm.h> 42 #include <sys/conf.h> 43 #include <sys/bio.h> 44 #include <sys/buf.h> 45 #include <sys/proc.h> 46 #include <sys/namei.h> 47 #include <sys/sched.h> 48 #include <sys/stat.h> 49 #include <sys/malloc.h> 50 #include <sys/mount.h> 51 #include <sys/resource.h> 52 #include <sys/resourcevar.h> 53 #include <sys/vnode.h> 54 55 #include <ufs/ufs/extattr.h> 56 #include <ufs/ufs/quota.h> 57 #include <ufs/ufs/ufsmount.h> 58 #include <ufs/ufs/inode.h> 59 #include <ufs/ufs/ufs_extern.h> 60 61 #include <ufs/ffs/fs.h> 62 #include <ufs/ffs/ffs_extern.h> 63 64 #define KERNCRED thread0.td_ucred 65 #define DEBUG 1 66 67 static int cgaccount(int, struct vnode *, struct buf *, int); 68 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 69 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 70 ufs_lbn_t, int), int); 71 static int indiracct_ufs1(struct vnode *, struct vnode *, int, 72 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 73 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 74 ufs_lbn_t, int), int); 75 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 76 struct fs *, ufs_lbn_t, int); 77 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 78 struct fs *, ufs_lbn_t, int); 79 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 80 struct fs *, ufs_lbn_t, int); 81 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 82 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 83 ufs_lbn_t, int), int); 84 static int indiracct_ufs2(struct vnode *, struct vnode *, int, 85 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 86 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 87 ufs_lbn_t, 
int), int); 88 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 89 struct fs *, ufs_lbn_t, int); 90 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 91 struct fs *, ufs_lbn_t, int); 92 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 93 struct fs *, ufs_lbn_t, int); 94 static int ffs_copyonwrite(struct vnode *, struct buf *); 95 static int readblock(struct buf *, ufs2_daddr_t); 96 97 /* 98 * To ensure the consistency of snapshots across crashes, we must 99 * synchronously write out copied blocks before allowing the 100 * originals to be modified. Because of the rather severe speed 101 * penalty that this imposes, the following flag allows this 102 * crash persistence to be disabled. 103 */ 104 int dopersistence = 0; 105 106 #ifdef DEBUG 107 #include <sys/sysctl.h> 108 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); 109 static int snapdebug = 0; 110 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); 111 int collectsnapstats = 0; 112 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 113 0, ""); 114 #endif /* DEBUG */ 115 116 /* 117 * Create a snapshot file and initialize it for the filesystem. 
118 */ 119 int 120 ffs_snapshot(mp, snapfile) 121 struct mount *mp; 122 char *snapfile; 123 { 124 ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; 125 int error, cg, snaploc; 126 int i, size, len, loc; 127 int flag = mp->mnt_flag; 128 struct timespec starttime = {0, 0}, endtime; 129 char saved_nice = 0; 130 long redo = 0, snaplistsize = 0; 131 int32_t *lp; 132 void *space; 133 struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs; 134 struct snaphead *snaphead; 135 struct thread *td = curthread; 136 struct inode *ip, *xp; 137 struct buf *bp, *nbp, *ibp, *sbp = NULL; 138 struct nameidata nd; 139 struct mount *wrtmp; 140 struct vattr vat; 141 struct vnode *vp, *xvp, *nvp, *devvp; 142 struct uio auio; 143 struct iovec aiov; 144 145 /* 146 * Need to serialize access to snapshot code per filesystem. 147 */ 148 /* 149 * Assign a snapshot slot in the superblock. 150 */ 151 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 152 if (fs->fs_snapinum[snaploc] == 0) 153 break; 154 if (snaploc == FSMAXSNAP) 155 return (ENOSPC); 156 /* 157 * Create the snapshot file. 
158 */ 159 restart: 160 NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td); 161 if ((error = namei(&nd)) != 0) 162 return (error); 163 if (nd.ni_vp != NULL) { 164 vput(nd.ni_vp); 165 error = EEXIST; 166 } 167 if (nd.ni_dvp->v_mount != mp) 168 error = EXDEV; 169 if (error) { 170 NDFREE(&nd, NDF_ONLY_PNBUF); 171 if (nd.ni_dvp == nd.ni_vp) 172 vrele(nd.ni_dvp); 173 else 174 vput(nd.ni_dvp); 175 return (error); 176 } 177 VATTR_NULL(&vat); 178 vat.va_type = VREG; 179 vat.va_mode = S_IRUSR; 180 vat.va_vaflags |= VA_EXCLUSIVE; 181 if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) 182 wrtmp = NULL; 183 if (wrtmp != mp) 184 panic("ffs_snapshot: mount mismatch"); 185 if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { 186 NDFREE(&nd, NDF_ONLY_PNBUF); 187 vput(nd.ni_dvp); 188 if ((error = vn_start_write(NULL, &wrtmp, 189 V_XSLEEP | PCATCH)) != 0) 190 return (error); 191 goto restart; 192 } 193 VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE); 194 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); 195 vput(nd.ni_dvp); 196 if (error) { 197 NDFREE(&nd, NDF_ONLY_PNBUF); 198 vn_finished_write(wrtmp); 199 return (error); 200 } 201 vp = nd.ni_vp; 202 ip = VTOI(vp); 203 devvp = ip->i_devvp; 204 /* 205 * Allocate and copy the last block contents so as to be able 206 * to set size to that of the filesystem. 207 */ 208 numblks = howmany(fs->fs_size, fs->fs_frag); 209 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 210 fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 211 if (error) 212 goto out; 213 ip->i_size = lblktosize(fs, (off_t)numblks); 214 DIP(ip, i_size) = ip->i_size; 215 ip->i_flag |= IN_CHANGE | IN_UPDATE; 216 if ((error = readblock(bp, numblks - 1)) != 0) 217 goto out; 218 bawrite(bp); 219 /* 220 * Preallocate critical data structures so that we can copy 221 * them in without further allocation after we suspend all 222 * operations on the filesystem. 
We would like to just release 223 * the allocated buffers without writing them since they will 224 * be filled in below once we are ready to go, but this upsets 225 * the soft update code, so we go ahead and write the new buffers. 226 * 227 * Allocate all indirect blocks and mark all of them as not 228 * needing to be copied. 229 */ 230 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 231 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 232 fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); 233 if (error) 234 goto out; 235 bawrite(ibp); 236 } 237 /* 238 * Allocate copies for the superblock and its summary information. 239 */ 240 error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 241 0, &nbp); 242 if (error) 243 goto out; 244 bawrite(nbp); 245 blkno = fragstoblks(fs, fs->fs_csaddr); 246 len = howmany(fs->fs_cssize, fs->fs_bsize); 247 for (loc = 0; loc < len; loc++) { 248 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 249 fs->fs_bsize, KERNCRED, 0, &nbp); 250 if (error) 251 goto out; 252 bawrite(nbp); 253 } 254 /* 255 * Allocate all cylinder group blocks. 256 */ 257 for (cg = 0; cg < fs->fs_ncg; cg++) { 258 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 259 fs->fs_bsize, KERNCRED, 0, &nbp); 260 if (error) 261 goto out; 262 bawrite(nbp); 263 } 264 /* 265 * Copy all the cylinder group maps. Although the 266 * filesystem is still active, we hope that only a few 267 * cylinder groups will change between now and when we 268 * suspend operations. Thus, we will be able to quickly 269 * touch up the few cylinder groups that changed during 270 * the suspension period. 
271 */ 272 len = howmany(fs->fs_ncg, NBBY); 273 MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK); 274 bzero(fs->fs_active, len); 275 for (cg = 0; cg < fs->fs_ncg; cg++) { 276 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 277 fs->fs_bsize, KERNCRED, 0, &nbp); 278 if (error) 279 goto out; 280 error = cgaccount(cg, vp, nbp, 1); 281 bawrite(nbp); 282 if (error) 283 goto out; 284 } 285 /* 286 * Change inode to snapshot type file. 287 */ 288 ip->i_flags |= SF_SNAPSHOT; 289 DIP(ip, i_flags) = ip->i_flags; 290 ip->i_flag |= IN_CHANGE | IN_UPDATE; 291 /* 292 * Ensure that the snapshot is completely on disk. 293 * Since we have marked it as a snapshot it is safe to 294 * unlock it as no process will be allowed to write to it. 295 */ 296 if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0) 297 goto out; 298 VOP_UNLOCK(vp, 0, td); 299 /* 300 * All allocations are done, so we can now snapshot the system. 301 * 302 * Recind nice scheduling while running with the filesystem suspended. 303 */ 304 if (td->td_proc->p_nice > 0) { 305 PROC_LOCK(td->td_proc); 306 mtx_lock_spin(&sched_lock); 307 saved_nice = td->td_proc->p_nice; 308 sched_nice(td->td_proc, 0); 309 mtx_unlock_spin(&sched_lock); 310 PROC_UNLOCK(td->td_proc); 311 } 312 /* 313 * Suspend operation on filesystem. 314 */ 315 for (;;) { 316 vn_finished_write(wrtmp); 317 if ((error = vfs_write_suspend(vp->v_mount)) != 0) { 318 vn_start_write(NULL, &wrtmp, V_WAIT); 319 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 320 goto out; 321 } 322 if (mp->mnt_kern_flag & MNTK_SUSPENDED) 323 break; 324 vn_start_write(NULL, &wrtmp, V_WAIT); 325 } 326 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 327 if (collectsnapstats) 328 nanotime(&starttime); 329 /* 330 * First, copy all the cylinder group maps that have changed. 
331 */ 332 for (cg = 0; cg < fs->fs_ncg; cg++) { 333 if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) 334 continue; 335 redo++; 336 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 337 fs->fs_bsize, KERNCRED, 0, &nbp); 338 if (error) 339 goto out1; 340 error = cgaccount(cg, vp, nbp, 2); 341 bawrite(nbp); 342 if (error) 343 goto out1; 344 } 345 /* 346 * Grab a copy of the superblock and its summary information. 347 * We delay writing it until the suspension is released below. 348 */ 349 error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, 350 KERNCRED, &sbp); 351 if (error) { 352 brelse(sbp); 353 sbp = NULL; 354 goto out1; 355 } 356 loc = blkoff(fs, fs->fs_sblockloc); 357 copy_fs = (struct fs *)(sbp->b_data + loc); 358 bcopy(fs, copy_fs, fs->fs_sbsize); 359 if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) 360 copy_fs->fs_clean = 1; 361 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 362 if (fs->fs_sbsize < size) 363 bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize); 364 size = blkroundup(fs, fs->fs_cssize); 365 if (fs->fs_contigsumsize > 0) 366 size += fs->fs_ncg * sizeof(int32_t); 367 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 368 copy_fs->fs_csp = space; 369 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 370 space = (char *)space + fs->fs_cssize; 371 loc = howmany(fs->fs_cssize, fs->fs_fsize); 372 i = fs->fs_frag - loc % fs->fs_frag; 373 len = (i == fs->fs_frag) ? 
0 : i * fs->fs_fsize; 374 if (len > 0) { 375 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 376 len, KERNCRED, &bp)) != 0) { 377 brelse(bp); 378 free(copy_fs->fs_csp, M_UFSMNT); 379 bawrite(sbp); 380 sbp = NULL; 381 goto out1; 382 } 383 bcopy(bp->b_data, space, (u_int)len); 384 space = (char *)space + len; 385 bp->b_flags |= B_INVAL | B_NOCACHE; 386 brelse(bp); 387 } 388 if (fs->fs_contigsumsize > 0) { 389 copy_fs->fs_maxcluster = lp = space; 390 for (i = 0; i < fs->fs_ncg; i++) 391 *lp++ = fs->fs_contigsumsize; 392 } 393 /* 394 * We must check for active files that have been unlinked 395 * (e.g., with a zero link count). We have to expunge all 396 * trace of these files from the snapshot so that they are 397 * not reclaimed prematurely by fsck or unnecessarily dumped. 398 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 399 * spec_strategy about writing on a suspended filesystem. 400 * Note that we skip unlinked snapshot files as they will 401 * be handled separately below. 402 * 403 * We also calculate the needed size for the snapshot list. 404 */ 405 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 406 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 407 mp->mnt_kern_flag &= ~MNTK_SUSPENDED; 408 MNT_ILOCK(mp); 409 loop: 410 for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) { 411 /* 412 * Make sure this vnode wasn't reclaimed in getnewvnode(). 413 * Start over if it has (it won't be on the list anymore). 414 */ 415 if (xvp->v_mount != mp) 416 goto loop; 417 nvp = TAILQ_NEXT(xvp, v_nmntvnodes); 418 VI_LOCK(xvp); 419 MNT_IUNLOCK(mp); 420 if ((xvp->v_iflag & VI_XLOCK) || 421 xvp->v_usecount == 0 || xvp->v_type == VNON || 422 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 423 VI_UNLOCK(xvp); 424 MNT_ILOCK(mp); 425 continue; 426 } 427 /* 428 * We can skip parent directory vnode because it must have 429 * this snapshot file in it. 
430 */ 431 if (xvp == nd.ni_dvp) { 432 VI_UNLOCK(xvp); 433 MNT_ILOCK(mp); 434 continue; 435 } 436 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) { 437 MNT_ILOCK(mp); 438 goto loop; 439 } 440 if (snapdebug) 441 vprint("ffs_snapshot: busy vnode", xvp); 442 if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 && 443 vat.va_nlink > 0) { 444 VOP_UNLOCK(xvp, 0, td); 445 MNT_ILOCK(mp); 446 continue; 447 } 448 xp = VTOI(xvp); 449 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 450 VOP_UNLOCK(xvp, 0, td); 451 MNT_ILOCK(mp); 452 continue; 453 } 454 /* 455 * If there is a fragment, clear it here. 456 */ 457 blkno = 0; 458 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 459 if (loc < NDADDR) { 460 len = fragroundup(fs, blkoff(fs, xp->i_size)); 461 if (len < fs->fs_bsize) { 462 ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]), 463 len, xp->i_number); 464 blkno = DIP(xp, i_db[loc]); 465 DIP(xp, i_db[loc]) = 0; 466 } 467 } 468 snaplistsize += 1; 469 if (xp->i_ump->um_fstype == UFS1) 470 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 471 BLK_NOCOPY); 472 else 473 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 474 BLK_NOCOPY); 475 if (blkno) 476 DIP(xp, i_db[loc]) = blkno; 477 if (!error) 478 error = ffs_freefile(copy_fs, vp, xp->i_number, 479 xp->i_mode); 480 VOP_UNLOCK(xvp, 0, td); 481 if (error) { 482 free(copy_fs->fs_csp, M_UFSMNT); 483 bawrite(sbp); 484 sbp = NULL; 485 goto out1; 486 } 487 MNT_ILOCK(mp); 488 } 489 MNT_IUNLOCK(mp); 490 /* 491 * If there already exist snapshots on this filesystem, grab a 492 * reference to their shared lock. If this is the first snapshot 493 * on this filesystem, we need to allocate a lock for the snapshots 494 * to share. In either case, acquire the snapshot lock and give 495 * up our original private lock. 
496 */ 497 VI_LOCK(devvp); 498 snaphead = &devvp->v_rdev->si_snapshots; 499 if ((xp = TAILQ_FIRST(snaphead)) != NULL) { 500 struct lock *lkp; 501 502 lkp = ITOV(xp)->v_vnlock; 503 VI_UNLOCK(devvp); 504 VI_LOCK(vp); 505 vp->v_vnlock = lkp; 506 } else { 507 struct lock *lkp; 508 509 VI_UNLOCK(devvp); 510 MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT, 511 M_WAITOK); 512 lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT, 513 LK_CANRECURSE | LK_NOPAUSE); 514 VI_LOCK(vp); 515 vp->v_vnlock = lkp; 516 } 517 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td); 518 transferlockers(&vp->v_lock, vp->v_vnlock); 519 lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); 520 /* 521 * If this is the first snapshot on this filesystem, then we need 522 * to allocate the space for the list of preallocated snapshot blocks. 523 * This list will be refined below, but this preliminary one will 524 * keep us out of deadlock until the full one is ready. 525 */ 526 if (xp == NULL) { 527 MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), 528 M_UFSMNT, M_WAITOK); 529 blkp = &snapblklist[1]; 530 *blkp++ = lblkno(fs, fs->fs_sblockloc); 531 blkno = fragstoblks(fs, fs->fs_csaddr); 532 for (cg = 0; cg < fs->fs_ncg; cg++) { 533 if (fragstoblks(fs, cgtod(fs, cg) > blkno)) 534 break; 535 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 536 } 537 len = howmany(fs->fs_cssize, fs->fs_bsize); 538 for (loc = 0; loc < len; loc++) 539 *blkp++ = blkno + loc; 540 for (; cg < fs->fs_ncg; cg++) 541 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 542 snapblklist[0] = blkp - snapblklist; 543 VI_LOCK(devvp); 544 if (devvp->v_rdev->si_snapblklist != NULL) 545 panic("ffs_snapshot: non-empty list"); 546 devvp->v_rdev->si_snapblklist = snapblklist; 547 devvp->v_rdev->si_snaplistsize = blkp - snapblklist; 548 VI_UNLOCK(devvp); 549 } 550 /* 551 * Record snapshot inode. Since this is the newest snapshot, 552 * it must be placed at the end of the list. 
553 */ 554 VI_LOCK(devvp); 555 fs->fs_snapinum[snaploc] = ip->i_number; 556 if (ip->i_nextsnap.tqe_prev != 0) 557 panic("ffs_snapshot: %d already on list", ip->i_number); 558 TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap); 559 devvp->v_rdev->si_copyonwrite = ffs_copyonwrite; 560 devvp->v_vflag |= VV_COPYONWRITE; 561 VI_UNLOCK(devvp); 562 ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); 563 vp->v_vflag |= VV_SYSTEM; 564 out1: 565 /* 566 * Resume operation on filesystem. 567 */ 568 vfs_write_resume(vp->v_mount); 569 vn_start_write(NULL, &wrtmp, V_WAIT); 570 if (collectsnapstats && starttime.tv_sec > 0) { 571 nanotime(&endtime); 572 timespecsub(&endtime, &starttime); 573 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 574 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 575 endtime.tv_nsec / 1000000, redo, fs->fs_ncg); 576 } 577 if (sbp == NULL) 578 goto out; 579 /* 580 * Copy allocation information from all the snapshots in 581 * this snapshot and then expunge them from its view. 582 */ 583 snaphead = &devvp->v_rdev->si_snapshots; 584 TAILQ_FOREACH(xp, snaphead, i_nextsnap) { 585 if (xp == ip) 586 break; 587 if (xp->i_ump->um_fstype == UFS1) 588 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 589 BLK_SNAP); 590 else 591 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 592 BLK_SNAP); 593 if (error) { 594 fs->fs_snapinum[snaploc] = 0; 595 goto done; 596 } 597 } 598 /* 599 * Allocate space for the full list of preallocated snapshot blocks. 600 */ 601 MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), 602 M_UFSMNT, M_WAITOK); 603 ip->i_snapblklist = &snapblklist[1]; 604 /* 605 * Expunge the blocks used by the snapshots from the set of 606 * blocks marked as used in the snapshot bitmaps. Also, collect 607 * the list of allocated blocks in i_snapblklist. 
608 */ 609 if (ip->i_ump->um_fstype == UFS1) 610 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 611 else 612 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 613 if (error) { 614 fs->fs_snapinum[snaploc] = 0; 615 FREE(snapblklist, M_UFSMNT); 616 goto done; 617 } 618 if (snaplistsize < ip->i_snapblklist - snapblklist) 619 panic("ffs_snapshot: list too small"); 620 snaplistsize = ip->i_snapblklist - snapblklist; 621 snapblklist[0] = snaplistsize; 622 ip->i_snapblklist = 0; 623 /* 624 * Write out the list of allocated blocks to the end of the snapshot. 625 */ 626 auio.uio_iov = &aiov; 627 auio.uio_iovcnt = 1; 628 aiov.iov_base = (void *)snapblklist; 629 aiov.iov_len = snaplistsize * sizeof(daddr_t); 630 auio.uio_resid = aiov.iov_len;; 631 auio.uio_offset = ip->i_size; 632 auio.uio_segflg = UIO_SYSSPACE; 633 auio.uio_rw = UIO_WRITE; 634 auio.uio_td = td; 635 if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 636 fs->fs_snapinum[snaploc] = 0; 637 FREE(snapblklist, M_UFSMNT); 638 goto done; 639 } 640 /* 641 * Write the superblock and its summary information 642 * to the snapshot. 643 */ 644 blkno = fragstoblks(fs, fs->fs_csaddr); 645 len = howmany(fs->fs_cssize, fs->fs_bsize); 646 space = copy_fs->fs_csp; 647 for (loc = 0; loc < len; loc++) { 648 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); 649 if (error) { 650 brelse(nbp); 651 fs->fs_snapinum[snaploc] = 0; 652 FREE(snapblklist, M_UFSMNT); 653 goto done; 654 } 655 bcopy(space, nbp->b_data, fs->fs_bsize); 656 space = (char *)space + fs->fs_bsize; 657 bawrite(nbp); 658 } 659 /* 660 * As this is the newest list, it is the most inclusive, so 661 * should replace the previous list. 
662 */ 663 VI_LOCK(devvp); 664 space = devvp->v_rdev->si_snapblklist; 665 devvp->v_rdev->si_snapblklist = snapblklist; 666 devvp->v_rdev->si_snaplistsize = snaplistsize; 667 VI_UNLOCK(devvp); 668 if (space != NULL) 669 FREE(space, M_UFSMNT); 670 done: 671 free(copy_fs->fs_csp, M_UFSMNT); 672 bawrite(sbp); 673 out: 674 if (saved_nice > 0) { 675 PROC_LOCK(td->td_proc); 676 mtx_lock_spin(&sched_lock); 677 sched_nice(td->td_proc, saved_nice); 678 mtx_unlock_spin(&sched_lock); 679 PROC_UNLOCK(td->td_proc); 680 } 681 if (fs->fs_active != 0) { 682 FREE(fs->fs_active, M_DEVBUF); 683 fs->fs_active = 0; 684 } 685 mp->mnt_flag = flag; 686 if (error) 687 (void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td); 688 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td); 689 if (error) 690 vput(vp); 691 else 692 VOP_UNLOCK(vp, 0, td); 693 vn_finished_write(wrtmp); 694 return (error); 695 } 696 697 /* 698 * Copy a cylinder group map. All the unallocated blocks are marked 699 * BLK_NOCOPY so that the snapshot knows that it need not copy them 700 * if they are later written. If passno is one, then this is a first 701 * pass, so only setting needs to be done. If passno is 2, then this 702 * is a revision to a previous pass which must be undone as the 703 * replacement pass is done. 
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;			/* cylinder group number to copy */
	struct vnode *vp;	/* snapshot vnode */
	struct buf *nbp;	/* destination buffer (snapshot's copy of cg) */
	int passno;		/* 1 = initial pass, 2 = revision pass */
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/* Read the on-disk cylinder group and validate its magic. */
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	/* Mark this cg as copied in fs_active so pass 2 can skip it. */
	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	/* Zero the tail of the block beyond the cg structure. */
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	/* First logical block of this cylinder group in the snapshot. */
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	/*
	 * Walk the cg's free-block bitmap: free blocks become BLK_NOCOPY
	 * in the snapshot's block pointers.  Direct blocks first.
	 */
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP(ip, i_db[loc]) = BLK_NOCOPY;
			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
				DIP(ip, i_db[loc]) = 0;
			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	/* Then the indirect-mapped range, one indirect buffer at a time. */
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error) {
		brelse(bp);
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			/* Crossed into the next indirect block. */
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error) {
				brelse(bp);
				return (error);
			}
			indiroff = 0;
		}
		if (ip->i_ump->um_fstype == UFS1) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	bqrelse(bp);
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* the snapshot being built */
	struct inode *cancelip;	/* inode whose blocks are to be expunged */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
	} else {
		/*
		 * TDP_COWINPROGRESS guards against recursing back into
		 * the copy-on-write path while allocating here.
		 */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		/* Inode block was already copied; just read the copy. */
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		/* Not yet copied: allocate and fill from the device. */
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		/* NOTE(review): bp is not released on this error path
		 * or on bread failure above — verify against buf(9). */
		if ((error = readblock(bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct block pointers: logical blocks 0..NDADDR-1. */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	/* Indirect block pointers themselves (lbn -1 = "not data"). */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	/* Descend each level of indirection that the file size reaches. */
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;		/* remaining levels of indirection */
	ufs1_daddr_t blkno;	/* disk address of this indirect block */
	ufs_lbn_t lbn;		/* logical block number of indirect block */
	ufs_lbn_t rlbn;		/* first data lbn reachable through it */
	ufs_lbn_t remblks;	/* data blocks remaining in the file */
	ufs_lbn_t blksperindir;	/* data blocks spanned per pointer */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		/*
		 * An unallocated indirect is only legal when expunging
		 * with BLK_NOCOPY (sparse file); otherwise it is lost.
		 */
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs1: missing indir");
	}
	/* Cross-check the caller's lbn against the bmap chain. */
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs1: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/* Copy the pointers out so the buffer can be released early. */
	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	/* Run snap accounting first; map accounting only if it succeeds. */
	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;			/* snapshot vnode being updated */
	ufs1_daddr_t *oldblkp, *lastblkp;	/* [oldblkp, lastblkp) pointer range */
	struct fs *fs;
	ufs_lbn_t lblkno;			/* unused here; kept for acctfunc signature */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		/* Skip holes and already-classified pseudo-blocks. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		/*
		 * Locate the pointer in the snapshot inode that maps
		 * this block: either a direct pointer in the dinode or
		 * a slot in an indirect block fetched via UFS_BALLOC.
		 */
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din1->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct_ufs1: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;	/* [oldblkp, lastblkp) pointer range */
	struct fs *fs;
	ufs_lbn_t lblkno;	/* logical block of *oldblkp; -1 for indirects */
	int expungetype;
{
	ufs1_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	/* lblkno == -1 marks the indirect-pointer pass: no hint-list entries. */
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		/* Record preallocated blocks in the snapshot hint list. */
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		/*
		 * A BLK_SNAP claim means the snapshot holds the block at
		 * its own logical location; free that physical block.
		 */
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;	/* snapshot doing the expunging */
	struct inode *cancelip;	/* inode being expunged from snapvp's view */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
1111 */ 1112 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1113 blkno = 0; 1114 if (lbn < NDADDR) { 1115 blkno = VTOI(snapvp)->i_din2->di_db[lbn]; 1116 } else { 1117 td->td_pflags |= TDP_COWINPROGRESS; 1118 error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 1119 fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 1120 td->td_pflags &= ~TDP_COWINPROGRESS; 1121 if (error) 1122 return (error); 1123 indiroff = (lbn - NDADDR) % NINDIR(fs); 1124 blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; 1125 bqrelse(bp); 1126 } 1127 if (blkno != 0) { 1128 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) 1129 return (error); 1130 } else { 1131 error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn), 1132 fs->fs_bsize, KERNCRED, 0, &bp); 1133 if (error) 1134 return (error); 1135 if ((error = readblock(bp, lbn)) != 0) 1136 return (error); 1137 } 1138 /* 1139 * Set a snapshot inode to be a zero length file, regular files 1140 * to be completely unallocated. 1141 */ 1142 dip = (struct ufs2_dinode *)bp->b_data + 1143 ino_to_fsbo(fs, cancelip->i_number); 1144 if (expungetype == BLK_NOCOPY) 1145 dip->di_mode = 0; 1146 dip->di_size = 0; 1147 dip->di_blocks = 0; 1148 dip->di_flags &= ~SF_SNAPSHOT; 1149 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); 1150 bdwrite(bp); 1151 /* 1152 * Now go through and expunge all the blocks in the file 1153 * using the function requested. 
 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Account for the direct block pointers (rlbn 0..NDADDR-1). */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	/* The indirect pointers themselves are passed with lbn == -1. */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	/*
	 * Walk each level of indirection.  lbn tracks the (negative)
	 * logical block number of the indirect block itself while rlbn
	 * tracks the first data block it maps.
	 */
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.  UFS2 twin of indiracct_ufs1.
 */
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;		/* levels of indirection remaining */
	ufs2_daddr_t blkno;	/* disk address of this indirect block */
	ufs_lbn_t lbn;		/* logical block number of indirect block */
	ufs_lbn_t rlbn;		/* first data block mapped by this indir */
	ufs_lbn_t remblks;	/* data blocks remaining to account */
	ufs_lbn_t blksperindir;	/* data blocks mapped per pointer */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		/* An unallocated indirect is only legal when expunging. */
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs2: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs2: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 * Copy them out first so the buffer can be released before
	 * the (possibly sleeping) accounting calls run.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;			/* snapshot vnode being updated */
	ufs2_daddr_t *oldblkp, *lastblkp;	/* [oldblkp, lastblkp) pointer range */
	struct fs *fs;
	ufs_lbn_t lblkno;			/* unused here; kept for acctfunc signature */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		/* Skip holes and already-classified pseudo-blocks. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		/*
		 * Locate the pointer in the snapshot inode that maps
		 * this block: either a direct pointer in the dinode or
		 * a slot in an indirect block fetched via UFS_BALLOC.
		 */
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din2->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct_ufs2: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;	/* [oldblkp, lastblkp) pointer range */
	struct fs *fs;
	ufs_lbn_t lblkno;	/* logical block of *oldblkp; -1 for indirects */
	int expungetype;
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	/* lblkno == -1 marks the indirect-pointer pass: no hint-list entries. */
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		/* Record preallocated blocks in the snapshot hint list. */
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		/*
		 * A BLK_SNAP claim means the snapshot holds the block at
		 * its own logical location; free that physical block.
		 */
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	if (xp != NULL)
		vrele(ITOV(ip));
#ifdef DEBUG
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
#endif
	/*
	 * Delete snapshot inode from superblock. Keep list dense:
	 * slide the remaining entries down over the vacated slot.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct lock *lkp;
	struct buf *ibp;
	struct fs *fs;
	struct thread *td = curthread;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	devvp = ip->i_devvp;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		VI_LOCK(devvp);
		/*
		 * LK_INTERLOCK drops the devvp interlock inside lockmgr;
		 * it is then re-taken to protect the snapshot list.
		 */
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp), td);
		VI_LOCK(devvp);
		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		/* Detach from the shared snapshot lock, back to private. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL, td);
		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) != 0) {
			VI_UNLOCK(devvp);
		} else {
			/*
			 * Last snapshot gone: tear down copy-on-write
			 * state and free the shared lock and hint list.
			 */
			snapblklist = devvp->v_rdev->si_snapblklist;
			devvp->v_rdev->si_snapblklist = 0;
			devvp->v_rdev->si_snaplistsize = 0;
			devvp->v_rdev->si_copyonwrite = 0;
			devvp->v_vflag &= ~VV_COPYONWRITE;
			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
			lockmgr(lkp, LK_RELEASE, NULL, td);
			lockdestroy(lkp);
			FREE(lkp, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP(ip, i_db[blkno]) = 0;
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			/* Block was claimed by another snapshot. */
			DIP(ip, i_blocks) -= btodb(fs->fs_bsize);
			DIP(ip, i_db[blkno]) = 0;
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			/* Same logic as the direct-block loop, per format. */
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
				     fs->fs_bsize, ip->i_number))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
			     fs->fs_bsize, ip->i_number))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed.
 * Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assurred that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;	/* device vnode whose snapshot list we scan */
	ufs2_daddr_t bno;	/* physical block being freed */
	long size;		/* size of the free; may be a fragment */
	ino_t inum;		/* inode owning the block (for debugging) */
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;
	struct snaphead *snaphead;

	lbn = fragstoblks(fs, bno);
retry:
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			/* Need the snapshot lock before touching metadata. */
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0)
				goto retry;
			snapshot_locked = 1;
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
			      VI_MTX(devvp), td) != 0) {
				if (lbn >= NDADDR)
					bqrelse(ibp);
				/* Wait for the lock, then rescan from the top. */
				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
				goto retry;
			}
			snapshot_locked = 1;
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
		      VI_MTX(devvp), td) != 0) {
			if (lbn >= NDADDR)
				bqrelse(ibp);
			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
			goto retry;
		}
		snapshot_locked = 1;
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %jd from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    (intmax_t)lbn, inum);
#endif
			/* Claim the block: point the snapshot at it. */
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = bno;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP(ip, i_blocks) += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			/* Non-zero: caller must not free the block. */
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", ip->i_number,
			    (intmax_t)lbn, "for inum", inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snaphead *snaphead;
	struct vnode *vp;
	struct inode *ip, *xp;
	struct uio auio;
	struct iovec aiov;
	void *snapblklist;
	char *reason;
	daddr_t snaplistsize;
	int error, snaploc, loc;

	/*
	 * XXX The following needs to be set before UFS_TRUNCATE or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	snaphead = &devvp->v_rdev->si_snapshots;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0){
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		/*
		 * Reject inodes that are not snapshots and old-format
		 * snapshots (recognized by lacking the trailing block
		 * that holds the hint list); drop them from the list.
		 */
		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
				reason = "non-snapshot";
			} else {
				reason = "old format snapshot";
				(void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
				(void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			}
			printf("ffs_snapshot_mount: %s inode %d\n",
			    reason, fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			/* Keep the superblock snapshot list dense. */
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		/*
		 * If there already exist snapshots on this filesystem, grab a
		 * reference to their shared lock. If this is the first snapshot
		 * on this filesystem, we need to allocate a lock for the
		 * snapshots to share. In either case, acquire the snapshot
		 * lock and give up our original private lock.
		 */
		VI_LOCK(devvp);
		if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
			struct lock *lkp;

			lkp = ITOV(xp)->v_vnlock;
			VI_UNLOCK(devvp);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		} else {
			struct lock *lkp;

			VI_UNLOCK(devvp);
			MALLOC(lkp, struct lock *, sizeof(struct lock),
			    M_UFSMNT, M_WAITOK);
			lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
			    LK_CANRECURSE | LK_NOPAUSE);
			VI_LOCK(vp);
			vp->v_vnlock = lkp;
		}
		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
		/* Move waiters from the private lock to the shared one. */
		transferlockers(&vp->v_lock, vp->v_vnlock);
		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp, 0, td);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL)
		return;
	/*
	 * Allocate the space for the block hints list. We always want to
	 * use the list from the newest snapshot.  The list is stored past
	 * the end of the snapshot's data blocks; the first word read is
	 * its length.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)&snaplistsize;
	aiov.iov_len = sizeof(snaplistsize);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset =
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		return;
	}
	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	auio.uio_iovcnt = 1;
	aiov.iov_base = snapblklist;
	aiov.iov_len = snaplistsize * sizeof (daddr_t);
	auio.uio_resid = aiov.iov_len;
	/* Re-read starting at the length word: the list includes it. */
	auio.uio_offset -= sizeof(snaplistsize);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		FREE(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp, 0, td);
	VI_LOCK(devvp);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
	devvp->v_rdev->si_snaplistsize = snaplistsize;
	devvp->v_rdev->si_snapblklist = (daddr_t *)snapblklist;
	/* Arm copy-on-write interception for writes to the device. */
	devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct snaphead *snaphead = &devvp->v_rdev->si_snapshots;
	struct lock *lkp = NULL;
	struct inode *xp;
	struct vnode *vp;

	VI_LOCK(devvp);
	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
		vp = ITOV(xp);
		/* Detach each snapshot from the shared lock. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		/* Drop the list's reference (only if still linked). */
		if (xp->i_effnlink > 0) {
			VI_UNLOCK(devvp);
			vrele(vp);
			VI_LOCK(devvp);
		}
	}
	if (devvp->v_rdev->si_snapblklist != NULL) {
		FREE(devvp->v_rdev->si_snapblklist, M_UFSMNT);
		devvp->v_rdev->si_snapblklist = NULL;
		devvp->v_rdev->si_snaplistsize = 0;
	}
	/* All snapshots shared one lock; free it once. */
	if (lkp != NULL) {
		lockdestroy(lkp);
		FREE(lkp, M_UFSMNT);
	}
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
	devvp->v_rdev->si_copyonwrite = 0;
	devvp->v_vflag &= ~VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
static int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;	/* device being written */
	struct buf *bp;		/* buffer describing the pending write */
{
	struct snaphead *snaphead;
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp = 0;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;

	if (td->td_pflags & TDP_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	VI_LOCK(devvp);
	snaphead = &devvp->v_rdev->si_snapshots;
	ip = TAILQ_FIRST(snaphead);
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	/*
	 * Binary search the sorted hint list; entry 0 holds the list
	 * length, so the search starts at index 1.
	 */
	snapblklist = devvp->v_rdev->si_snapblklist;
	upper = devvp->v_rdev->si_snaplistsize - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		/* Found: block was preallocated, no copy needed. */
		VI_UNLOCK(devvp);
		return (0);
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
retry:
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0) {
				VI_LOCK(devvp);
				goto retry;
			}
			snapshot_locked = 1;
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Non-zero means already copied or deliberately ignored. */
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
		      VI_MTX(devvp), td) != 0) {
			VI_LOCK(devvp);
			goto retry;
		}
		snapshot_locked = 1;
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %jd for ",
			    ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd to blkno %jd\n",
			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 */
static int
readblock(bp, lbn)
	struct buf *bp;		/* destination buffer (b_data, b_bcount) */
	ufs2_daddr_t lbn;	/* logical block to read from the device */
{
	struct uio auio;
	struct iovec aiov;
	struct thread *td = curthread;
	struct inode *ip = VTOI(bp->b_vp);

	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	/* Translate the filesystem block to a byte offset on the device. */
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}