1 /*- 2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * Further information about snapshots can be obtained from: 5 * 6 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 7 * 1614 Oxford Street mckusick@mckusick.com 8 * Berkeley, CA 94709-1608 +1-510-843-9542 9 * USA 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 * 33 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 34 */ 35 36 #include <sys/cdefs.h> 37 __FBSDID("$FreeBSD$"); 38 39 #include <sys/param.h> 40 #include <sys/kernel.h> 41 #include <sys/systm.h> 42 #include <sys/conf.h> 43 #include <sys/bio.h> 44 #include <sys/buf.h> 45 #include <sys/proc.h> 46 #include <sys/namei.h> 47 #include <sys/sched.h> 48 #include <sys/stat.h> 49 #include <sys/malloc.h> 50 #include <sys/mount.h> 51 #include <sys/resource.h> 52 #include <sys/resourcevar.h> 53 #include <sys/vnode.h> 54 55 #include <geom/geom.h> 56 57 #include <ufs/ufs/extattr.h> 58 #include <ufs/ufs/quota.h> 59 #include <ufs/ufs/ufsmount.h> 60 #include <ufs/ufs/inode.h> 61 #include <ufs/ufs/ufs_extern.h> 62 63 #include <ufs/ffs/fs.h> 64 #include <ufs/ffs/ffs_extern.h> 65 66 #define KERNCRED thread0.td_ucred 67 #define DEBUG 1 68 69 TAILQ_HEAD(snaphead, inode); 70 71 struct snapdata { 72 struct snaphead sn_head; 73 daddr_t sn_listsize; 74 daddr_t *sn_blklist; 75 struct lock sn_lock; 76 }; 77 78 static int cgaccount(int, struct vnode *, struct buf *, int); 79 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, 80 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 81 ufs_lbn_t, int), int); 82 static int indiracct_ufs1(struct vnode *, struct vnode *, int, 83 ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 84 int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, 85 ufs_lbn_t, int), int); 86 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 87 struct fs *, ufs_lbn_t, int); 88 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 89 struct fs *, ufs_lbn_t, int); 90 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, 91 struct fs *, ufs_lbn_t, int); 92 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, 93 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 94 ufs_lbn_t, int), int); 95 static int indiracct_ufs2(struct 
vnode *, struct vnode *, int, 96 ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, 97 int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, 98 ufs_lbn_t, int), int); 99 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 100 struct fs *, ufs_lbn_t, int); 101 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 102 struct fs *, ufs_lbn_t, int); 103 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 104 struct fs *, ufs_lbn_t, int); 105 static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t); 106 107 /* 108 * To ensure the consistency of snapshots across crashes, we must 109 * synchronously write out copied blocks before allowing the 110 * originals to be modified. Because of the rather severe speed 111 * penalty that this imposes, the following flag allows this 112 * crash persistence to be disabled. 113 */ 114 int dopersistence = 0; 115 116 #ifdef DEBUG 117 #include <sys/sysctl.h> 118 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); 119 static int snapdebug = 0; 120 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); 121 int collectsnapstats = 0; 122 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 123 0, ""); 124 #endif /* DEBUG */ 125 126 /* 127 * Create a snapshot file and initialize it for the filesystem. 
128 */ 129 int 130 ffs_snapshot(mp, snapfile) 131 struct mount *mp; 132 char *snapfile; 133 { 134 ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; 135 int error, cg, snaploc; 136 int i, size, len, loc; 137 int flag = mp->mnt_flag; 138 struct timespec starttime = {0, 0}, endtime; 139 char saved_nice = 0; 140 long redo = 0, snaplistsize = 0; 141 int32_t *lp; 142 void *space; 143 struct fs *copy_fs = NULL, *fs; 144 struct thread *td = curthread; 145 struct inode *ip, *xp; 146 struct buf *bp, *nbp, *ibp, *sbp = NULL; 147 struct nameidata nd; 148 struct mount *wrtmp; 149 struct vattr vat; 150 struct vnode *vp, *xvp, *nvp, *devvp; 151 struct uio auio; 152 struct iovec aiov; 153 struct snapdata *sn; 154 struct ufsmount *ump; 155 156 ump = VFSTOUFS(mp); 157 fs = ump->um_fs; 158 /* 159 * XXX: make sure we don't go to out1 before we setup sn 160 */ 161 sn = (void *)0xdeadbeef; 162 163 /* 164 * Need to serialize access to snapshot code per filesystem. 165 */ 166 /* 167 * Assign a snapshot slot in the superblock. 168 */ 169 UFS_LOCK(ump); 170 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 171 if (fs->fs_snapinum[snaploc] == 0) 172 break; 173 UFS_UNLOCK(ump); 174 if (snaploc == FSMAXSNAP) 175 return (ENOSPC); 176 /* 177 * Create the snapshot file. 
178 */ 179 restart: 180 NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile, td); 181 if ((error = namei(&nd)) != 0) 182 return (error); 183 if (nd.ni_vp != NULL) { 184 vput(nd.ni_vp); 185 error = EEXIST; 186 } 187 if (nd.ni_dvp->v_mount != mp) 188 error = EXDEV; 189 if (error) { 190 NDFREE(&nd, NDF_ONLY_PNBUF); 191 if (nd.ni_dvp == nd.ni_vp) 192 vrele(nd.ni_dvp); 193 else 194 vput(nd.ni_dvp); 195 return (error); 196 } 197 VATTR_NULL(&vat); 198 vat.va_type = VREG; 199 vat.va_mode = S_IRUSR; 200 vat.va_vaflags |= VA_EXCLUSIVE; 201 if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) 202 wrtmp = NULL; 203 if (wrtmp != mp) 204 panic("ffs_snapshot: mount mismatch"); 205 if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { 206 NDFREE(&nd, NDF_ONLY_PNBUF); 207 vput(nd.ni_dvp); 208 if ((error = vn_start_write(NULL, &wrtmp, 209 V_XSLEEP | PCATCH)) != 0) 210 return (error); 211 goto restart; 212 } 213 VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE); 214 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); 215 vput(nd.ni_dvp); 216 if (error) { 217 NDFREE(&nd, NDF_ONLY_PNBUF); 218 vn_finished_write(wrtmp); 219 return (error); 220 } 221 vp = nd.ni_vp; 222 ip = VTOI(vp); 223 devvp = ip->i_devvp; 224 /* 225 * Allocate and copy the last block contents so as to be able 226 * to set size to that of the filesystem. 227 */ 228 numblks = howmany(fs->fs_size, fs->fs_frag); 229 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 230 fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); 231 if (error) 232 goto out; 233 ip->i_size = lblktosize(fs, (off_t)numblks); 234 DIP_SET(ip, i_size, ip->i_size); 235 ip->i_flag |= IN_CHANGE | IN_UPDATE; 236 if ((error = readblock(vp, bp, numblks - 1)) != 0) 237 goto out; 238 bawrite(bp); 239 /* 240 * Preallocate critical data structures so that we can copy 241 * them in without further allocation after we suspend all 242 * operations on the filesystem. 
We would like to just release 243 * the allocated buffers without writing them since they will 244 * be filled in below once we are ready to go, but this upsets 245 * the soft update code, so we go ahead and write the new buffers. 246 * 247 * Allocate all indirect blocks and mark all of them as not 248 * needing to be copied. 249 */ 250 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 251 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 252 fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); 253 if (error) 254 goto out; 255 bawrite(ibp); 256 } 257 /* 258 * Allocate copies for the superblock and its summary information. 259 */ 260 error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 261 0, &nbp); 262 if (error) 263 goto out; 264 bawrite(nbp); 265 blkno = fragstoblks(fs, fs->fs_csaddr); 266 len = howmany(fs->fs_cssize, fs->fs_bsize); 267 for (loc = 0; loc < len; loc++) { 268 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 269 fs->fs_bsize, KERNCRED, 0, &nbp); 270 if (error) 271 goto out; 272 bawrite(nbp); 273 } 274 /* 275 * Allocate all cylinder group blocks. 276 */ 277 for (cg = 0; cg < fs->fs_ncg; cg++) { 278 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 279 fs->fs_bsize, KERNCRED, 0, &nbp); 280 if (error) 281 goto out; 282 bawrite(nbp); 283 } 284 /* 285 * Copy all the cylinder group maps. Although the 286 * filesystem is still active, we hope that only a few 287 * cylinder groups will change between now and when we 288 * suspend operations. Thus, we will be able to quickly 289 * touch up the few cylinder groups that changed during 290 * the suspension period. 
291 */ 292 len = howmany(fs->fs_ncg, NBBY); 293 MALLOC(space, void *, len, M_DEVBUF, M_WAITOK); 294 bzero(space, len); 295 UFS_LOCK(ump); 296 fs->fs_active = space; 297 UFS_UNLOCK(ump); 298 for (cg = 0; cg < fs->fs_ncg; cg++) { 299 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 300 fs->fs_bsize, KERNCRED, 0, &nbp); 301 if (error) 302 goto out; 303 error = cgaccount(cg, vp, nbp, 1); 304 bawrite(nbp); 305 if (error) 306 goto out; 307 } 308 /* 309 * Change inode to snapshot type file. 310 */ 311 ip->i_flags |= SF_SNAPSHOT; 312 DIP_SET(ip, i_flags, ip->i_flags); 313 ip->i_flag |= IN_CHANGE | IN_UPDATE; 314 /* 315 * Ensure that the snapshot is completely on disk. 316 * Since we have marked it as a snapshot it is safe to 317 * unlock it as no process will be allowed to write to it. 318 */ 319 if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) 320 goto out; 321 VOP_UNLOCK(vp, 0, td); 322 /* 323 * All allocations are done, so we can now snapshot the system. 324 * 325 * Recind nice scheduling while running with the filesystem suspended. 326 */ 327 if (td->td_proc->p_nice > 0) { 328 PROC_LOCK(td->td_proc); 329 mtx_lock_spin(&sched_lock); 330 saved_nice = td->td_proc->p_nice; 331 sched_nice(td->td_proc, 0); 332 mtx_unlock_spin(&sched_lock); 333 PROC_UNLOCK(td->td_proc); 334 } 335 /* 336 * Suspend operation on filesystem. 337 */ 338 for (;;) { 339 vn_finished_write(wrtmp); 340 if ((error = vfs_write_suspend(vp->v_mount)) != 0) { 341 vn_start_write(NULL, &wrtmp, V_WAIT); 342 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 343 goto out; 344 } 345 if (mp->mnt_kern_flag & MNTK_SUSPENDED) 346 break; 347 vn_start_write(NULL, &wrtmp, V_WAIT); 348 } 349 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); 350 if (collectsnapstats) 351 nanotime(&starttime); 352 /* 353 * First, copy all the cylinder group maps that have changed. 
354 */ 355 for (cg = 0; cg < fs->fs_ncg; cg++) { 356 if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) 357 continue; 358 redo++; 359 error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), 360 fs->fs_bsize, KERNCRED, 0, &nbp); 361 if (error) 362 goto out1; 363 error = cgaccount(cg, vp, nbp, 2); 364 bawrite(nbp); 365 if (error) 366 goto out1; 367 } 368 /* 369 * Grab a copy of the superblock and its summary information. 370 * We delay writing it until the suspension is released below. 371 */ 372 error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, 373 KERNCRED, &sbp); 374 if (error) { 375 brelse(sbp); 376 sbp = NULL; 377 goto out1; 378 } 379 loc = blkoff(fs, fs->fs_sblockloc); 380 copy_fs = (struct fs *)(sbp->b_data + loc); 381 bcopy(fs, copy_fs, fs->fs_sbsize); 382 if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) 383 copy_fs->fs_clean = 1; 384 size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; 385 if (fs->fs_sbsize < size) 386 bzero(&sbp->b_data[loc + fs->fs_sbsize], size - fs->fs_sbsize); 387 size = blkroundup(fs, fs->fs_cssize); 388 if (fs->fs_contigsumsize > 0) 389 size += fs->fs_ncg * sizeof(int32_t); 390 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 391 copy_fs->fs_csp = space; 392 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 393 space = (char *)space + fs->fs_cssize; 394 loc = howmany(fs->fs_cssize, fs->fs_fsize); 395 i = fs->fs_frag - loc % fs->fs_frag; 396 len = (i == fs->fs_frag) ? 
0 : i * fs->fs_fsize; 397 if (len > 0) { 398 if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), 399 len, KERNCRED, &bp)) != 0) { 400 brelse(bp); 401 free(copy_fs->fs_csp, M_UFSMNT); 402 bawrite(sbp); 403 sbp = NULL; 404 goto out1; 405 } 406 bcopy(bp->b_data, space, (u_int)len); 407 space = (char *)space + len; 408 bp->b_flags |= B_INVAL | B_NOCACHE; 409 brelse(bp); 410 } 411 if (fs->fs_contigsumsize > 0) { 412 copy_fs->fs_maxcluster = lp = space; 413 for (i = 0; i < fs->fs_ncg; i++) 414 *lp++ = fs->fs_contigsumsize; 415 } 416 /* 417 * We must check for active files that have been unlinked 418 * (e.g., with a zero link count). We have to expunge all 419 * trace of these files from the snapshot so that they are 420 * not reclaimed prematurely by fsck or unnecessarily dumped. 421 * We turn off the MNTK_SUSPENDED flag to avoid a panic from 422 * spec_strategy about writing on a suspended filesystem. 423 * Note that we skip unlinked snapshot files as they will 424 * be handled separately below. 425 * 426 * We also calculate the needed size for the snapshot list. 427 */ 428 snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + 429 FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; 430 MNT_ILOCK(mp); 431 mp->mnt_kern_flag &= ~MNTK_SUSPENDED; 432 loop: 433 MNT_VNODE_FOREACH(xvp, mp, nvp) { 434 VI_LOCK(xvp); 435 MNT_IUNLOCK(mp); 436 if ((xvp->v_iflag & VI_XLOCK) || 437 xvp->v_usecount == 0 || xvp->v_type == VNON || 438 (VTOI(xvp)->i_flags & SF_SNAPSHOT)) { 439 VI_UNLOCK(xvp); 440 MNT_ILOCK(mp); 441 continue; 442 } 443 /* 444 * We can skip parent directory vnode because it must have 445 * this snapshot file in it. 
446 */ 447 if (xvp == nd.ni_dvp) { 448 VI_UNLOCK(xvp); 449 MNT_ILOCK(mp); 450 continue; 451 } 452 if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0) { 453 MNT_ILOCK(mp); 454 goto loop; 455 } 456 if (snapdebug) 457 vprint("ffs_snapshot: busy vnode", xvp); 458 if (VOP_GETATTR(xvp, &vat, td->td_ucred, td) == 0 && 459 vat.va_nlink > 0) { 460 VOP_UNLOCK(xvp, 0, td); 461 MNT_ILOCK(mp); 462 continue; 463 } 464 xp = VTOI(xvp); 465 if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { 466 VOP_UNLOCK(xvp, 0, td); 467 MNT_ILOCK(mp); 468 continue; 469 } 470 /* 471 * If there is a fragment, clear it here. 472 */ 473 blkno = 0; 474 loc = howmany(xp->i_size, fs->fs_bsize) - 1; 475 if (loc < NDADDR) { 476 len = fragroundup(fs, blkoff(fs, xp->i_size)); 477 if (len < fs->fs_bsize) { 478 ffs_blkfree(ump, copy_fs, vp, 479 DIP(xp, i_db[loc]), len, xp->i_number); 480 blkno = DIP(xp, i_db[loc]); 481 DIP_SET(xp, i_db[loc], 0); 482 } 483 } 484 snaplistsize += 1; 485 if (xp->i_ump->um_fstype == UFS1) 486 error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, 487 BLK_NOCOPY); 488 else 489 error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, 490 BLK_NOCOPY); 491 if (blkno) 492 DIP_SET(xp, i_db[loc], blkno); 493 if (!error) 494 error = ffs_freefile(ump, copy_fs, vp, xp->i_number, 495 xp->i_mode); 496 VOP_UNLOCK(xvp, 0, td); 497 if (error) { 498 free(copy_fs->fs_csp, M_UFSMNT); 499 bawrite(sbp); 500 sbp = NULL; 501 goto out1; 502 } 503 MNT_ILOCK(mp); 504 } 505 MNT_IUNLOCK(mp); 506 /* 507 * If there already exist snapshots on this filesystem, grab a 508 * reference to their shared lock. If this is the first snapshot 509 * on this filesystem, we need to allocate a lock for the snapshots 510 * to share. In either case, acquire the snapshot lock and give 511 * up our original private lock. 
512 */ 513 VI_LOCK(devvp); 514 sn = devvp->v_rdev->si_snapdata; 515 if (sn != NULL) { 516 xp = TAILQ_FIRST(&sn->sn_head); 517 VI_UNLOCK(devvp); 518 VI_LOCK(vp); 519 vp->v_vnlock = &sn->sn_lock; 520 } else { 521 VI_UNLOCK(devvp); 522 sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); 523 TAILQ_INIT(&sn->sn_head); 524 lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, 525 LK_CANRECURSE | LK_NOPAUSE); 526 VI_LOCK(vp); 527 vp->v_vnlock = &sn->sn_lock; 528 devvp->v_rdev->si_snapdata = sn; 529 xp = NULL; 530 } 531 vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td); 532 transferlockers(&vp->v_lock, vp->v_vnlock); 533 lockmgr(&vp->v_lock, LK_RELEASE, NULL, td); 534 /* 535 * If this is the first snapshot on this filesystem, then we need 536 * to allocate the space for the list of preallocated snapshot blocks. 537 * This list will be refined below, but this preliminary one will 538 * keep us out of deadlock until the full one is ready. 539 */ 540 if (xp == NULL) { 541 MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), 542 M_UFSMNT, M_WAITOK); 543 blkp = &snapblklist[1]; 544 *blkp++ = lblkno(fs, fs->fs_sblockloc); 545 blkno = fragstoblks(fs, fs->fs_csaddr); 546 for (cg = 0; cg < fs->fs_ncg; cg++) { 547 if (fragstoblks(fs, cgtod(fs, cg) > blkno)) 548 break; 549 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 550 } 551 len = howmany(fs->fs_cssize, fs->fs_bsize); 552 for (loc = 0; loc < len; loc++) 553 *blkp++ = blkno + loc; 554 for (; cg < fs->fs_ncg; cg++) 555 *blkp++ = fragstoblks(fs, cgtod(fs, cg)); 556 snapblklist[0] = blkp - snapblklist; 557 VI_LOCK(devvp); 558 if (sn->sn_blklist != NULL) 559 panic("ffs_snapshot: non-empty list"); 560 sn->sn_blklist = snapblklist; 561 sn->sn_listsize = blkp - snapblklist; 562 VI_UNLOCK(devvp); 563 } 564 /* 565 * Record snapshot inode. Since this is the newest snapshot, 566 * it must be placed at the end of the list. 
567 */ 568 VI_LOCK(devvp); 569 fs->fs_snapinum[snaploc] = ip->i_number; 570 if (ip->i_nextsnap.tqe_prev != 0) 571 panic("ffs_snapshot: %d already on list", ip->i_number); 572 TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); 573 devvp->v_vflag |= VV_COPYONWRITE; 574 VI_UNLOCK(devvp); 575 ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); 576 vp->v_vflag |= VV_SYSTEM; 577 out1: 578 KASSERT(sn != (void *)0xdeadbeef, ("email phk@ and mckusick@")); 579 /* 580 * Resume operation on filesystem. 581 */ 582 vfs_write_resume(vp->v_mount); 583 vn_start_write(NULL, &wrtmp, V_WAIT); 584 if (collectsnapstats && starttime.tv_sec > 0) { 585 nanotime(&endtime); 586 timespecsub(&endtime, &starttime); 587 printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", 588 vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, 589 endtime.tv_nsec / 1000000, redo, fs->fs_ncg); 590 } 591 if (sbp == NULL) 592 goto out; 593 /* 594 * Copy allocation information from all the snapshots in 595 * this snapshot and then expunge them from its view. 596 */ 597 TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { 598 if (xp == ip) 599 break; 600 if (xp->i_ump->um_fstype == UFS1) 601 error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, 602 BLK_SNAP); 603 else 604 error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, 605 BLK_SNAP); 606 if (error) { 607 fs->fs_snapinum[snaploc] = 0; 608 goto done; 609 } 610 } 611 /* 612 * Allocate space for the full list of preallocated snapshot blocks. 613 */ 614 MALLOC(snapblklist, daddr_t *, snaplistsize * sizeof(daddr_t), 615 M_UFSMNT, M_WAITOK); 616 ip->i_snapblklist = &snapblklist[1]; 617 /* 618 * Expunge the blocks used by the snapshots from the set of 619 * blocks marked as used in the snapshot bitmaps. Also, collect 620 * the list of allocated blocks in i_snapblklist. 
621 */ 622 if (ip->i_ump->um_fstype == UFS1) 623 error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP); 624 else 625 error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP); 626 if (error) { 627 fs->fs_snapinum[snaploc] = 0; 628 FREE(snapblklist, M_UFSMNT); 629 goto done; 630 } 631 if (snaplistsize < ip->i_snapblklist - snapblklist) 632 panic("ffs_snapshot: list too small"); 633 snaplistsize = ip->i_snapblklist - snapblklist; 634 snapblklist[0] = snaplistsize; 635 ip->i_snapblklist = 0; 636 /* 637 * Write out the list of allocated blocks to the end of the snapshot. 638 */ 639 auio.uio_iov = &aiov; 640 auio.uio_iovcnt = 1; 641 aiov.iov_base = (void *)snapblklist; 642 aiov.iov_len = snaplistsize * sizeof(daddr_t); 643 auio.uio_resid = aiov.iov_len;; 644 auio.uio_offset = ip->i_size; 645 auio.uio_segflg = UIO_SYSSPACE; 646 auio.uio_rw = UIO_WRITE; 647 auio.uio_td = td; 648 if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { 649 fs->fs_snapinum[snaploc] = 0; 650 FREE(snapblklist, M_UFSMNT); 651 goto done; 652 } 653 /* 654 * Write the superblock and its summary information 655 * to the snapshot. 656 */ 657 blkno = fragstoblks(fs, fs->fs_csaddr); 658 len = howmany(fs->fs_cssize, fs->fs_bsize); 659 space = copy_fs->fs_csp; 660 for (loc = 0; loc < len; loc++) { 661 error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); 662 if (error) { 663 brelse(nbp); 664 fs->fs_snapinum[snaploc] = 0; 665 FREE(snapblklist, M_UFSMNT); 666 goto done; 667 } 668 bcopy(space, nbp->b_data, fs->fs_bsize); 669 space = (char *)space + fs->fs_bsize; 670 bawrite(nbp); 671 } 672 /* 673 * As this is the newest list, it is the most inclusive, so 674 * should replace the previous list. 
675 */ 676 VI_LOCK(devvp); 677 space = sn->sn_blklist; 678 sn->sn_blklist = snapblklist; 679 sn->sn_listsize = snaplistsize; 680 VI_UNLOCK(devvp); 681 if (space != NULL) 682 FREE(space, M_UFSMNT); 683 done: 684 FREE(copy_fs->fs_csp, M_UFSMNT); 685 bawrite(sbp); 686 out: 687 if (saved_nice > 0) { 688 PROC_LOCK(td->td_proc); 689 mtx_lock_spin(&sched_lock); 690 sched_nice(td->td_proc, saved_nice); 691 mtx_unlock_spin(&sched_lock); 692 PROC_UNLOCK(td->td_proc); 693 } 694 UFS_LOCK(ump); 695 if (fs->fs_active != 0) { 696 FREE(fs->fs_active, M_DEVBUF); 697 fs->fs_active = 0; 698 } 699 UFS_UNLOCK(ump); 700 mp->mnt_flag = flag; 701 if (error) 702 (void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td); 703 (void) ffs_syncvnode(vp, MNT_WAIT); 704 if (error) 705 vput(vp); 706 else 707 VOP_UNLOCK(vp, 0, td); 708 vn_finished_write(wrtmp); 709 return (error); 710 } 711 712 /* 713 * Copy a cylinder group map. All the unallocated blocks are marked 714 * BLK_NOCOPY so that the snapshot knows that it need not copy them 715 * if they are later written. If passno is one, then this is a first 716 * pass, so only setting needs to be done. If passno is 2, then this 717 * is a revision to a previous pass which must be undone as the 718 * replacement pass is done. 
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;
	struct vnode *vp;
	struct buf *nbp;
	int passno;
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/* Read the on-disk cylinder group descriptor into bp. */
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	UFS_LOCK(ip->i_ump);
	/* Mark this cg as copied so the post-suspend pass can skip it. */
	ACTIVECLEAR(fs, cg);
	UFS_UNLOCK(ip->i_ump);
	/* Copy the cg map into the snapshot's buffer, zero-padding the tail. */
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cgbase(fs, cg) / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	/*
	 * Walk the cg's blocks. Free blocks that fall in the snapshot's
	 * direct-block range are marked BLK_NOCOPY directly in the inode.
	 */
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP_SET(ip, i_db[loc], BLK_NOCOPY);
			else if (passno == 2 && DIP(ip, i_db[loc]) == BLK_NOCOPY)
				DIP_SET(ip, i_db[loc], 0);
			else if (passno == 1 && DIP(ip, i_db[loc]) == BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	/* Remaining blocks live behind indirect blocks of the snapshot file. */
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error) {
		brelse(bp);
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			/* Crossed into the next indirect block; flush and fetch it. */
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error) {
				brelse(bp);
				return (error);
			}
			indiroff = 0;
		}
		if (ip->i_ump->um_fstype == UFS1) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	bqrelse(bp);
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed. This code
 * is reproduced once each for UFS1 and UFS2.
 */
static int
expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs1_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
	} else {
		/* TDP_COWINPROGRESS keeps the balloc from recursing into COW. */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	if (blkno != 0) {
		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
			return (error);
	} else {
		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &bp);
		if (error)
			return (error);
		/*
		 * NOTE(review): bp is not released on this error path —
		 * looks like a possible buf leak; confirm against later
		 * revisions of this file.
		 */
		if ((error = readblock(snapvp, bp, lbn)) != 0)
			return (error);
	}
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs1_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
	bdwrite(bp);
	/*
	 * Now go through and expunge all the blocks in the file
	 * using the function requested.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	/* Direct blocks first (lbn arg 0), then the indirect pointers (-1). */
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
		return (error);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs1_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	if (blkno == 0) {
		if (expungetype == BLK_NOCOPY)
			return (0);
		panic("indiracct_ufs1: missing indir");
	}
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
		panic("indiracct_ufs1: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/* Copy the pointers out so the buf can be released before recursing. */
	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
	    level == 0 ? rlbn : -1, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
		return (error);
	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;		/* snapshot vnode being updated */
	ufs1_daddr_t *oldblkp, *lastblkp; /* [oldblkp, lastblkp) pointer range */
	struct fs *fs;
	ufs_lbn_t lblkno;		/* unused here; kept for acctfunc ABI */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		/* Skip holes and already-classified blocks. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din1->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/* Find (or create) the indirect block slot. */
			error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct_ufs1: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
1068 */ 1069 static int 1070 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) 1071 struct vnode *vp; 1072 ufs1_daddr_t *oldblkp, *lastblkp; 1073 struct fs *fs; 1074 ufs_lbn_t lblkno; 1075 int expungetype; 1076 { 1077 ufs1_daddr_t blkno; 1078 struct inode *ip; 1079 ino_t inum; 1080 int acctit; 1081 1082 ip = VTOI(vp); 1083 inum = ip->i_number; 1084 if (lblkno == -1) 1085 acctit = 0; 1086 else 1087 acctit = 1; 1088 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 1089 blkno = *oldblkp; 1090 if (blkno == 0 || blkno == BLK_NOCOPY) 1091 continue; 1092 if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) 1093 *ip->i_snapblklist++ = lblkno; 1094 if (blkno == BLK_SNAP) 1095 blkno = blkstofrags(fs, lblkno); 1096 ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum); 1097 } 1098 return (0); 1099 } 1100 1101 /* 1102 * Before expunging a snapshot inode, note all the 1103 * blocks that it claims with BLK_SNAP so that fsck will 1104 * be able to account for those blocks properly and so 1105 * that this snapshot knows that it need not copy them 1106 * if the other snapshot holding them is freed. This code 1107 * is reproduced once each for UFS1 and UFS2. 1108 */ 1109 static int 1110 expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype) 1111 struct vnode *snapvp; 1112 struct inode *cancelip; 1113 struct fs *fs; 1114 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1115 struct fs *, ufs_lbn_t, int); 1116 int expungetype; 1117 { 1118 int i, error, indiroff; 1119 ufs_lbn_t lbn, rlbn; 1120 ufs2_daddr_t len, blkno, numblks, blksperindir; 1121 struct ufs2_dinode *dip; 1122 struct thread *td = curthread; 1123 struct buf *bp; 1124 1125 /* 1126 * Prepare to expunge the inode. If its inode block has not 1127 * yet been copied, then allocate and fill the copy. 
1128 */ 1129 lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); 1130 blkno = 0; 1131 if (lbn < NDADDR) { 1132 blkno = VTOI(snapvp)->i_din2->di_db[lbn]; 1133 } else { 1134 td->td_pflags |= TDP_COWINPROGRESS; 1135 error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), 1136 fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); 1137 td->td_pflags &= ~TDP_COWINPROGRESS; 1138 if (error) 1139 return (error); 1140 indiroff = (lbn - NDADDR) % NINDIR(fs); 1141 blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; 1142 bqrelse(bp); 1143 } 1144 if (blkno != 0) { 1145 if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) 1146 return (error); 1147 } else { 1148 error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), 1149 fs->fs_bsize, KERNCRED, 0, &bp); 1150 if (error) 1151 return (error); 1152 if ((error = readblock(snapvp, bp, lbn)) != 0) 1153 return (error); 1154 } 1155 /* 1156 * Set a snapshot inode to be a zero length file, regular files 1157 * to be completely unallocated. 1158 */ 1159 dip = (struct ufs2_dinode *)bp->b_data + 1160 ino_to_fsbo(fs, cancelip->i_number); 1161 if (expungetype == BLK_NOCOPY) 1162 dip->di_mode = 0; 1163 dip->di_size = 0; 1164 dip->di_blocks = 0; 1165 dip->di_flags &= ~SF_SNAPSHOT; 1166 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t)); 1167 bdwrite(bp); 1168 /* 1169 * Now go through and expunge all the blocks in the file 1170 * using the function requested. 
1171 */ 1172 numblks = howmany(cancelip->i_size, fs->fs_bsize); 1173 if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], 1174 &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype))) 1175 return (error); 1176 if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], 1177 &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype))) 1178 return (error); 1179 blksperindir = 1; 1180 lbn = -NDADDR; 1181 len = numblks - NDADDR; 1182 rlbn = NDADDR; 1183 for (i = 0; len > 0 && i < NIADDR; i++) { 1184 error = indiracct_ufs2(snapvp, ITOV(cancelip), i, 1185 cancelip->i_din2->di_ib[i], lbn, rlbn, len, 1186 blksperindir, fs, acctfunc, expungetype); 1187 if (error) 1188 return (error); 1189 blksperindir *= NINDIR(fs); 1190 lbn -= blksperindir + 1; 1191 len -= blksperindir; 1192 rlbn += blksperindir; 1193 } 1194 return (0); 1195 } 1196 1197 /* 1198 * Descend an indirect block chain for vnode cancelvp accounting for all 1199 * its indirect blocks in snapvp. 1200 */ 1201 static int 1202 indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, 1203 blksperindir, fs, acctfunc, expungetype) 1204 struct vnode *snapvp; 1205 struct vnode *cancelvp; 1206 int level; 1207 ufs2_daddr_t blkno; 1208 ufs_lbn_t lbn; 1209 ufs_lbn_t rlbn; 1210 ufs_lbn_t remblks; 1211 ufs_lbn_t blksperindir; 1212 struct fs *fs; 1213 int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, 1214 struct fs *, ufs_lbn_t, int); 1215 int expungetype; 1216 { 1217 int error, num, i; 1218 ufs_lbn_t subblksperindir; 1219 struct indir indirs[NIADDR + 2]; 1220 ufs2_daddr_t last, *bap; 1221 struct buf *bp; 1222 1223 if (blkno == 0) { 1224 if (expungetype == BLK_NOCOPY) 1225 return (0); 1226 panic("indiracct_ufs2: missing indir"); 1227 } 1228 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 1229 return (error); 1230 if (lbn != indirs[num - 1 - level].in_lbn || num < 2) 1231 panic("indiracct_ufs2: botched params"); 1232 /* 1233 * We have to expand bread here since it will deadlock looking 
1234 * up the block number for any blocks that are not in the cache. 1235 */ 1236 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); 1237 bp->b_blkno = fsbtodb(fs, blkno); 1238 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 1239 (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { 1240 brelse(bp); 1241 return (error); 1242 } 1243 /* 1244 * Account for the block pointers in this indirect block. 1245 */ 1246 last = howmany(remblks, blksperindir); 1247 if (last > NINDIR(fs)) 1248 last = NINDIR(fs); 1249 MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 1250 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 1251 bqrelse(bp); 1252 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, 1253 level == 0 ? rlbn : -1, expungetype); 1254 if (error || level == 0) 1255 goto out; 1256 /* 1257 * Account for the block pointers in each of the indirect blocks 1258 * in the levels below us. 1259 */ 1260 subblksperindir = blksperindir / NINDIR(fs); 1261 for (lbn++, level--, i = 0; i < last; i++) { 1262 error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, 1263 rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); 1264 if (error) 1265 goto out; 1266 rlbn += blksperindir; 1267 lbn -= blksperindir; 1268 remblks -= blksperindir; 1269 } 1270 out: 1271 FREE(bap, M_DEVBUF); 1272 return (error); 1273 } 1274 1275 /* 1276 * Do both snap accounting and map accounting. 1277 */ 1278 static int 1279 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) 1280 struct vnode *vp; 1281 ufs2_daddr_t *oldblkp, *lastblkp; 1282 struct fs *fs; 1283 ufs_lbn_t lblkno; 1284 int exptype; /* BLK_SNAP or BLK_NOCOPY */ 1285 { 1286 int error; 1287 1288 if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) 1289 return (error); 1290 return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); 1291 } 1292 1293 /* 1294 * Identify a set of blocks allocated in a snapshot inode. 
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;		/* snapshot vnode being updated */
	ufs2_daddr_t *oldblkp, *lastblkp; /* [oldblkp, lastblkp) pointer range */
	struct fs *fs;
	ufs_lbn_t lblkno;		/* unused here; kept for acctfunc ABI */
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		/* Skip holes and already-classified blocks. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din2->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/* Find (or create) the indirect block slot. */
			error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct_ufs2: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp; /* [oldblkp, lastblkp) pointer range */
	struct fs *fs;
	ufs_lbn_t lblkno;	/* logical block of *oldblkp; -1 for metadata */
	int expungetype;
{
	ufs2_daddr_t blkno;
	struct inode *ip;
	ino_t inum;
	int acctit;

	ip = VTOI(vp);
	inum = ip->i_number;
	/* Metadata pointer runs (lblkno == -1) are not list-recorded. */
	if (lblkno == -1)
		acctit = 0;
	else
		acctit = 1;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
			*ip->i_snapblklist++ = lblkno;
		/* A claimed block is freed at its own logical address. */
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;
	struct snapdata *sn;
	struct ufsmount *ump;

	/*
	 * Find snapshot in incore list.
	 */
	xp = NULL;
	sn = ip->i_devvp->v_rdev->si_snapdata;
	if (sn != NULL)
		TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
			if (xp == ip)
				break;
	if (xp != NULL)
		vrele(ITOV(ip));
	else if (snapdebug)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	ump = ip->i_ump;
	UFS_LOCK(ump);
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift the remaining entries down over our slot. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	UFS_UNLOCK(ump);
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct lock *lkp;
	struct buf *ibp;
	struct fs *fs;
	struct thread *td = curthread;
	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
	int error, loc, last;
	struct snapdata *sn;

	ip = VTOI(vp);
	fs = ip->i_fs;
	devvp = ip->i_devvp;
	sn = devvp->v_rdev->si_snapdata;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		VI_LOCK(devvp);
		/* LK_INTERLOCK drops the dev interlock; reacquire it. */
		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_EXCLUSIVE,
		    VI_MTX(devvp), td);
		VI_LOCK(devvp);
		TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		/* Detach from the shared snapshot lock, back to our own. */
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_RELEASE, NULL, td);
		if (TAILQ_FIRST(&sn->sn_head) != 0) {
			VI_UNLOCK(devvp);
		} else {
			/* Last snapshot: tear down the shared state. */
			snapblklist = sn->sn_blklist;
			sn->sn_blklist = 0;
			sn->sn_listsize = 0;
			devvp->v_rdev->si_snapdata = NULL;
			devvp->v_vflag &= ~VV_COPYONWRITE;
			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
			lockmgr(lkp, LK_RELEASE, NULL, td);
			lockdestroy(lkp);
			free(sn, M_UFSMNT);
			FREE(snapblklist, M_UFSMNT);
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP_SET(ip, i_db[blkno], 0);
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
			    btodb(fs->fs_bsize));
			DIP_SET(ip, i_db[blkno], 0);
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
 * A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;	/* device vnode holding the snapshot list */
	ufs2_daddr_t bno;	/* fragment address being freed */
	long size;		/* size of the free (frag or full block) */
	ino_t inum;		/* inode releasing the block */
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp = NULL;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int indiroff = 0, snapshot_locked = 0, error = 0, claimedblk = 0;
	struct snapdata *sn;

	lbn = fragstoblks(fs, bno);
retry:
	VI_LOCK(devvp);
	sn = devvp->v_rdev->si_snapdata;
	if (sn == NULL) {
		VI_UNLOCK(devvp);
		return (0);
	}
	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0)
				goto retry;
			snapshot_locked = 1;
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
			      VI_MTX(devvp), td) != 0) {
				if (lbn >= NDADDR)
					bqrelse(ibp);
				vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
				goto retry;
			}
			snapshot_locked = 1;
			if (lbn < NDADDR) {
				DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_NOWAIT,
		      VI_MTX(devvp), td) != 0) {
			if (lbn >= NDADDR)
				bqrelse(ibp);
			vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td);
			goto retry;
		}
		snapshot_locked = 1;
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %jd from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    (intmax_t)lbn, inum);
#endif
			if (lbn < NDADDR) {
				DIP_SET(ip, i_db[lbn], bno);
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			/* Non-zero: we claimed the block; caller must not free. */
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", ip->i_number,
			    (intmax_t)lbn, "for inum", inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(vp, cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) ffs_syncvnode(vp, MNT_WAIT);
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct vnode *devvp = ump->um_devvp;
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snapdata *sn;
	struct vnode *vp;
	struct inode *ip;
	struct uio auio;
	struct iovec aiov;
	void *snapblklist;
	char *reason;
	daddr_t snaplistsize;
	int error, snaploc, loc;

	/*
	 * XXX The following needs to be set before ffs_truncate or
	 * VOP_READ can be called.
	 */
	mp->mnt_stat.f_iosize = fs->fs_bsize;
	/*
	 * Process each snapshot listed in the superblock.
	 */
	vp = NULL;
	sn = devvp->v_rdev->si_snapdata;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			break;
		if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0){
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
				reason = "non-snapshot";
			} else {
				reason = "old format snapshot";
				(void)ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
				(void)ffs_syncvnode(vp, MNT_WAIT);
			}
			printf("ffs_snapshot_mount: %s inode %d\n",
			    reason, fs->fs_snapinum[snaploc]);
			vput(vp);
			vp = NULL;
			/* Drop the bad entry and keep the list dense. */
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		/*
		 * If there already exist snapshots on this filesystem, grab a
		 * reference to their shared lock. If this is the first snapshot
		 * on this filesystem, we need to allocate a lock for the
		 * snapshots to share. In either case, acquire the snapshot
		 * lock and give up our original private lock.
		 */
		VI_LOCK(devvp);
		if (sn != NULL) {

			VI_UNLOCK(devvp);
			VI_LOCK(vp);
			vp->v_vnlock = &sn->sn_lock;
		} else {
			VI_UNLOCK(devvp);
			sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
			TAILQ_INIT(&sn->sn_head);
			lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
			    LK_CANRECURSE | LK_NOPAUSE);
			VI_LOCK(vp);
			vp->v_vnlock = &sn->sn_lock;
			devvp->v_rdev->si_snapdata = sn;
		}
		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
		transferlockers(&vp->v_lock, vp->v_vnlock);
		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
		/*
		 * Link it onto the active snapshot list.
		 */
		VI_LOCK(devvp);
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
		vp->v_vflag |= VV_SYSTEM;
		VI_UNLOCK(devvp);
		VOP_UNLOCK(vp, 0, td);
	}
	/*
	 * No usable snapshots found.
	 */
	if (vp == NULL)
		return;
	/*
	 * Allocate the space for the block hints list. We always want to
	 * use the list from the newest snapshot.
	 */
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	aiov.iov_base = (void *)&snaplistsize;
	aiov.iov_len = sizeof(snaplistsize);
	auio.uio_resid = aiov.iov_len;
	/* The hint list lives just past the last data block of the snap. */
	auio.uio_offset =
	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = td;
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		return;
	}
	MALLOC(snapblklist, void *, snaplistsize * sizeof(daddr_t),
	    M_UFSMNT, M_WAITOK);
	auio.uio_iovcnt = 1;
	aiov.iov_base = snapblklist;
	aiov.iov_len = snaplistsize * sizeof (daddr_t);
	auio.uio_resid = aiov.iov_len;
	auio.uio_offset -= sizeof(snaplistsize);
	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
		VOP_UNLOCK(vp, 0, td);
		FREE(snapblklist, M_UFSMNT);
		return;
	}
	VOP_UNLOCK(vp, 0, td);
	VI_LOCK(devvp);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
	sn->sn_listsize = snaplistsize;
	sn->sn_blklist = (daddr_t *)snapblklist;
	devvp->v_vflag |= VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
	struct snapdata *sn;
	struct inode *xp;
	struct vnode *vp;

	/*
	 * NOTE(review): sn is dereferenced without a NULL check;
	 * presumably this is only called when snapshots exist — confirm
	 * against the caller in ffs_unmount.
	 */
	sn = devvp->v_rdev->si_snapdata;
	VI_LOCK(devvp);
	while ((xp = TAILQ_FIRST(&sn->sn_head)) != 0) {
		vp = ITOV(xp);
		vp->v_vnlock = &vp->v_lock;
		TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_effnlink > 0) {
			VI_UNLOCK(devvp);
			vrele(vp);
			VI_LOCK(devvp);
		}
	}
	if (sn->sn_blklist != NULL) {
		FREE(sn->sn_blklist, M_UFSMNT);
		sn->sn_blklist = NULL;
		sn->sn_listsize = 0;
	}
	lockdestroy(&sn->sn_lock);
	free(sn, M_UFSMNT);
	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
	devvp->v_rdev->si_snapdata = NULL;
	devvp->v_vflag &= ~VV_COPYONWRITE;
	VI_UNLOCK(devvp);
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct snapdata *sn;
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp = 0;
	ufs2_daddr_t lbn, blkno, *snapblklist;
	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;

	if (td->td_pflags & TDP_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	/*
	 * First check to see if it is in the preallocated list.
	 * By doing this check we avoid several potential deadlocks.
	 */
	VI_LOCK(devvp);
	sn = devvp->v_rdev->si_snapdata;
	ip = TAILQ_FIRST(&sn->sn_head);
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	/* Binary search of the sorted per-device block hint list. */
	snapblklist = sn->sn_blklist;
	upper = sn->sn_listsize - 1;
	lower = 1;
	while (lower <= upper) {
		mid = (lower + upper) / 2;
		if (snapblklist[mid] == lbn)
			break;
		if (snapblklist[mid] < lbn)
			lower = mid + 1;
		else
			upper = mid - 1;
	}
	if (lower <= upper) {
		/* Found: the block is preallocated; no copy needed. */
		VI_UNLOCK(devvp);
		return (0);
	}
	/*
	 * Not in the precomputed list, so check the snapshots.
	 */
retry:
	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			if (snapshot_locked == 0 &&
			    lockmgr(vp->v_vnlock,
			      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
			      VI_MTX(devvp), td) != 0) {
				VI_LOCK(devvp);
				goto retry;
			}
			snapshot_locked = 1;
			td->td_pflags |= TDP_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_pflags &= ~TDP_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		if (snapshot_locked == 0 &&
		    lockmgr(vp->v_vnlock,
		      LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
		      VI_MTX(devvp), td) != 0) {
			VI_LOCK(devvp);
			goto retry;
		}
		snapshot_locked = 1;
		td->td_pflags |= TDP_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_pflags &= ~TDP_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %jd for ",
			    ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd to blkno %jd\n",
			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(vp, cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) ffs_syncvnode(vp, MNT_WAIT);
			break;
		}
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) ffs_syncvnode(vp, MNT_WAIT);
	}
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	else
		VI_UNLOCK(devvp);
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 */
static int
readblock(vp, bp, lbn)
	struct vnode *vp;
	struct buf *bp;		/* destination buffer; b_bcount bytes read */
	ufs2_daddr_t lbn;	/* logical block to read */
{
	struct inode *ip = VTOI(vp);
	struct bio *bip;

	bip = g_alloc_bio();
	bip->bio_cmd = BIO_READ;
	bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	bip->bio_data = bp->b_data;
	bip->bio_length = bp->b_bcount;

	g_io_request(bip, ip->i_devvp->v_bufobj.bo_private);

	/*
	 * Poll for completion at hz/10; msleep with a NULL mutex here
	 * cannot rely on a wakeup race-free handoff, hence the timeout.
	 */
	do
		msleep(bip, NULL, PRIBIO, "snaprdb", hz/10);
	while (!(bip->bio_flags & BIO_DONE));
	bp->b_error = bip->bio_error;
	g_destroy_bio(bip);
	return (bp->b_error);
}