/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#define KERNCRED proc0.p_ucred
#define DEBUG 1

static int cgaccount __P((int, struct vnode *, struct buf *, int));
static int expunge __P((struct vnode *, struct inode *, struct fs *,
    int (*) __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *, struct fs *,
    ufs_daddr_t, int)), int));
static int indiracct __P((struct vnode *, struct vnode *, int, ufs_daddr_t,
    int, int, int, int, struct fs *, int (*) __P((struct vnode *,
    ufs_daddr_t *, ufs_daddr_t *, struct fs *, ufs_daddr_t, int)), int));
static int fullacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
    struct fs *, ufs_daddr_t, int));
static int snapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
    struct fs *, ufs_daddr_t, int));
static int mapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
    struct fs *, ufs_daddr_t, int));
static int ffs_copyonwrite __P((struct vnode *, struct buf *));
static int readblock __P((struct buf *, daddr_t));

/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the following flag allows this
 * crash persistence to be disabled.
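 *
 * When the DEBUG sysctl below is compiled in, the policy can be
 * changed at run time with, for example:
 *
 *	sysctl debug.dopersistence=1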
 */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
	0, "");
#endif /* DEBUG */

/*
 * Create a snapshot file and initialize it for the filesystem.
 */
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	ufs_daddr_t blkno;
	int error, cg, snaploc, numblks;
	int i, size, len, loc;
	int flag = mp->mnt_flag;
	struct timespec starttime = {0, 0}, endtime;
	char saved_nice = 0;
	long redo = 0;
	int32_t *lp;
	void *space;
	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
	struct snaphead *snaphead;
	struct thread *td = curthread;
	struct inode *ip, *xp;
	struct buf *bp, *nbp, *ibp, *sbp = NULL;
	struct nameidata nd;
	struct mount *wrtmp;
	struct vattr vat;
	struct vnode *vp, *xvp, *nvp;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Create the snapshot file.
	 */
restart:
	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		vput(nd.ni_vp);
		error = EEXIST;
	}
	if (nd.ni_dvp->v_mount != mp)
		error = EXDEV;
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		return (error);
	}
	VATTR_NULL(&vat);
	vat.va_type = VREG;
	vat.va_mode = S_IRUSR;
	vat.va_vaflags |= VA_EXCLUSIVE;
	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
		wrtmp = NULL;
	if (wrtmp != mp)
		panic("ffs_snapshot: mount mismatch");
	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &wrtmp,
		    V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
	vput(nd.ni_dvp);
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vn_finished_write(wrtmp);
		return (error);
	}
	vp = nd.ni_vp;
	ip = VTOI(vp);
	/*
	 * Allocate and copy the last block contents so as to be able
	 * to set size to that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, B_CLRBUF, &bp);
	if (error)
		goto out;
	ip->i_size = lblktosize(fs, (off_t)numblks);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if ((error = readblock(bp, numblks - 1)) != 0)
		goto out;
	bawrite(bp);
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
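	 *
	 * Only logical blocks at or beyond NDADDR need indirect blocks,
	 * so the loop below starts at NDADDR and advances NINDIR(fs)
	 * logical blocks per iteration, allocating one indirect block's
	 * worth of mappings at a time.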
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, td->td_proc->p_ucred, B_METAONLY, &ibp);
		if (error)
			goto out;
		bdwrite(ibp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = UFS_BALLOC(vp, (off_t)(SBOFF), SBSIZE, KERNCRED, 0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bdwrite(nbp);
	}
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	len = howmany(fs->fs_ncg, NBBY);
	MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK);
	bzero(fs->fs_active, len);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
		    KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			goto out;
		}
		error = cgaccount(cg, vp, nbp, 1);
		bawrite(nbp);
		if (error)
			goto out;
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0)
		goto out;
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Rescind nice scheduling while running with the filesystem suspended.
	 */
	if (td->td_ksegrp->kg_nice > 0) {
		saved_nice = td->td_ksegrp->kg_nice;
		td->td_ksegrp->kg_nice = 0;
	}
	/*
	 * Suspend operation on filesystem.
	 */
	for (;;) {
		vn_finished_write(wrtmp);
		vfs_write_suspend(vp->v_mount);
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	if (collectsnapstats)
		nanotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
			continue;
		redo++;
		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
		    KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			goto out1;
		}
		error = cgaccount(cg, vp, nbp, 2);
		bawrite(nbp);
		if (error)
			goto out1;
	}
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
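	 *
	 * The in-buffer copy (copy_fs) is patched up below: it is
	 * marked clean when the filesystem itself is clean, and it is
	 * given a private copy of the cylinder group summary
	 * information, so the snapshot sees a superblock consistent
	 * with its frozen image.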
	 */
	error = bread(vp, lblkno(fs, SBOFF), fs->fs_bsize, KERNCRED, &sbp);
	if (error) {
		brelse(sbp);
		sbp = NULL;
		goto out1;
	}
	copy_fs = (struct fs *)(sbp->b_data + blkoff(fs, SBOFF));
	bcopy(fs, copy_fs, fs->fs_sbsize);
	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
		copy_fs->fs_clean = 1;
	if (fs->fs_sbsize < SBSIZE)
		bzero(&sbp->b_data[blkoff(fs, SBOFF) + fs->fs_sbsize],
		    SBSIZE - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	space = (char *)space + fs->fs_cssize;
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(ip->i_devvp,
		    fsbtodb(fs, fs->fs_csaddr + loc),
		    len, KERNCRED, &bp)) != 0) {
			brelse(bp);
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		bcopy(bp->b_data, space, (u_int)len);
		space = (char *)space + len;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
	}
	if (fs->fs_contigsumsize > 0) {
		copy_fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/*
	 * We must check for active files that have been unlinked
	 * (e.g., with a zero link count). We have to expunge all
	 * trace of these files from the snapshot so that they are
	 * not reclaimed prematurely by fsck or unnecessarily dumped.
	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
	 * spec_strategy about writing on a suspended filesystem.
	 */
	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
	mtx_lock(&mntvnode_mtx);
loop:
	for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (xvp->v_mount != mp)
			goto loop;
		nvp = TAILQ_NEXT(xvp, v_nmntvnodes);
		mtx_unlock(&mntvnode_mtx);
		mtx_lock(&xvp->v_interlock);
		if (xvp->v_usecount == 0 || xvp->v_type == VNON ||
		    (VOP_GETATTR(xvp, &vat, td->td_proc->p_ucred, td) == 0 &&
		    vat.va_nlink > 0)) {
			mtx_unlock(&xvp->v_interlock);
			mtx_lock(&mntvnode_mtx);
			continue;
		}
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0)
			goto loop;
		xp = VTOI(xvp);
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len < fs->fs_bsize) {
				ffs_blkfree(copy_fs, vp, xp->i_db[loc], len,
				    xp->i_number);
				blkno = xp->i_db[loc];
				xp->i_db[loc] = 0;
			}
		}
		error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
		if (blkno)
			xp->i_db[loc] = blkno;
		if (!error)
			error = ffs_freefile(copy_fs, vp, xp->i_number,
			    xp->i_mode);
		VOP_UNLOCK(xvp, 0, td);
		if (error) {
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		mtx_lock(&mntvnode_mtx);
	}
	mtx_unlock(&mntvnode_mtx);
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
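	 *
	 * Linking the inode onto the device's si_snapshots list and
	 * installing the si_copyonwrite hook below is what activates
	 * copy-on-write handling for this device.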
	 */
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %d already on list", ip->i_number);
	snaphead = &ip->i_devvp->v_rdev->si_snapshots;
	TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
	ip->i_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	ip->i_devvp->v_flag |= VCOPYONWRITE;
	vp->v_flag |= VSYSTEM;
out1:
	/*
	 * Resume operation on filesystem.
	 */
	vfs_write_resume(vp->v_mount);
	if (saved_nice > 0)
		td->td_ksegrp->kg_nice = saved_nice;
	vn_start_write(NULL, &wrtmp, V_WAIT);
	if (collectsnapstats && starttime.tv_sec > 0) {
		nanotime(&endtime);
		timespecsub(&endtime, &starttime);
		printf("%s: suspended %d.%03ld sec, redo %ld of %d\n",
		    vp->v_mount->mnt_stat.f_mntonname, endtime.tv_sec,
		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
	}
	if (sbp == NULL)
		goto out;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	snaphead = &ip->i_devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(xp, snaphead, i_nextsnap) {
		if (xp == ip)
			break;
		if ((error = expunge(vp, xp, fs, snapacct, BLK_SNAP)) != 0) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps.
	 */
	if ((error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP)) != 0) {
		fs->fs_snapinum[snaploc] = 0;
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
done:
	free(copy_fs->fs_csp, M_UFSMNT);
	bawrite(sbp);
out:
	if (fs->fs_active != 0) {
		FREE(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	mp->mnt_flag = flag;
	if (error)
		(void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	if (error)
		vput(vp);
	else
		VOP_UNLOCK(vp, 0, td);
	vn_finished_write(wrtmp);
	return (error);
}

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is 1, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
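 *
 * For example, a block that was free during pass 1 (and so marked
 * BLK_NOCOPY) but allocated before the pass 2 revision must have its
 * entry reset to zero so that the new contents will be copied on
 * write; conversely, finding an entry already set to BLK_NOCOPY
 * during pass 1 should be impossible, hence the "lost block" panics
 * below.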
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;
	struct vnode *vp;
	struct buf *nbp;
	int passno;
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	int error, numblks, base, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				ip->i_db[loc] = BLK_NOCOPY;
			else if (passno == 2 && ip->i_db[loc] == BLK_NOCOPY)
				ip->i_db[loc] = 0;
			else if (passno == 1 && ip->i_db[loc] == BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
	if (error) {
		brelse(bp);
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error) {
				brelse(bp);
				return (error);
			}
			indiroff = 0;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			((ufs_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	bqrelse(bp);
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed.
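 *
 * The same routine, invoked with fullacct and BLK_NOCOPY from
 * ffs_snapshot above, is also what expunges unlinked-but-open
 * files from the snapshot's view.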
 */
static int
expunge(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc) __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
	    struct fs *, ufs_daddr_t, int));
	int expungetype;
{
	int i, len, error, numblks, blksperindir;
	ufs_daddr_t lbn, rlbn, blkno, indiroff;
	struct thread *td = curthread;
	struct dinode *dip;
	struct buf *bp;

	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_db[0],
	    &cancelip->i_ib[NIADDR], fs, 0, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct(snapvp, ITOV(cancelip), i, cancelip->i_ib[i],
		    lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = cancelip->i_db[lbn];
	} else {
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &bp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
	    fs->fs_bsize, KERNCRED, 0, &bp);
	if (error)
		return (error);
	if (blkno == 0 && (error = readblock(bp, lbn)))
		return (error);
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t));
	bdwrite(bp);
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs,
    acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs_daddr_t blkno;
	int lbn;
	int rlbn;
	int remblks;
	int blksperindir;
	struct fs *fs;
	int (*acctfunc) __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
	    struct fs *, ufs_daddr_t, int));
	int expungetype;
{
	int subblksperindir, error, last, num, i;
	struct indir indirs[NIADDR + 2];
	ufs_daddr_t *bap;
	struct buf *bp;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
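	 * Instead, getblk is used and the translated disk address is
	 * filled in by hand so that readblock need not do a lookup.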
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, rlbn, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_daddr_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)))
		return (error);
	return (mapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_daddr_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs_daddr_t lbn, blkno, *blkp;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
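 * Blocks recorded as BLK_SNAP reside at their logical address within
 * the snapshot, so they are freed at blkstofrags(fs, lblkno) below.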
 */
static int
mapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_daddr_t lblkno;
	int expungetype;
{
	ufs_daddr_t blkno;
	ino_t inum;

	inum = VTOI(vp)->i_number;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	if (xp == 0)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	else
		vrele(ITOV(ip));
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct buf *ibp;
	struct fs *fs;
	ufs_daddr_t blkno, dblk;
	int error, numblks, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		devvp = ip->i_devvp;
		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) == 0) {
			devvp->v_rdev->si_copyonwrite = 0;
			devvp->v_flag &= ~VCOPYONWRITE;
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
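	 * A claimed block is recognized by its physical address being
	 * equal to its logical address, dblk == blkstofrags(fs, blkno),
	 * which is the test applied below.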
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = ip->i_db[blkno];
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			ip->i_db[blkno] = 0;
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			ip->i_blocks -= btodb(fs->fs_bsize);
			ip->i_db[blkno] = 0;
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if ((last = fs->fs_size - blkno) > NINDIR(fs))
			last = NINDIR(fs);
		for (loc = 0; loc < last; loc++) {
			dblk = ((ufs_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
			     fs->fs_bsize, ip->i_number))) {
				ip->i_blocks -= btodb(fs->fs_bsize);
				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. Copied blocks can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;
	ufs_daddr_t bno;
	long size;
	ino_t inum;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff = 0, error = 0, claimedblk = 0;
	struct snaphead *snaphead;

	lbn = fragstoblks(fs, bno);
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = ip->i_db[lbn];
		} else {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			VOP_UNLOCK(vp, 0, td);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
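		 * The switch below distinguishes three cases: a block
		 * already copied or not cared about (default or
		 * BLK_NOCOPY) needs nothing; a block held by an older
		 * snapshot (BLK_SNAP) becomes BLK_NOCOPY (don't care)
		 * when it is freed; and an uncopied block (zero) is
		 * claimed or copied below.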
		 */
		switch (blkno) {
		/*
		 * If the snapshot has already copied the block (default),
		 * or does not care about the block, it is not needed.
		 */
		default:
		case BLK_NOCOPY:
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		/*
		 * No previous snapshot claimed the block, so it will be
		 * freed and become a BLK_NOCOPY (don't care) for us.
		 */
		case BLK_SNAP:
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			if (lbn < NDADDR) {
				ip->i_db[lbn] = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				((ufs_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			VOP_UNLOCK(vp, 0, td);
			continue;
		/*
		 * A block that we map is being freed. If it has not been
		 * claimed yet, we will claim or copy it (below).
		 */
		case 0:
			claimedblk = 1;
			break;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %d from inum %d\n",
				    "Grabonremove: snapino", ip->i_number, lbn,
				    inum);
#endif
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			if (lbn < NDADDR) {
				ip->i_db[lbn] = bno;
			} else {
				((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			ip->i_blocks += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error) {
			VOP_UNLOCK(vp, 0, td);
			break;
		}
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %d for inum %d size %ld to blkno %d\n",
			    "Copyonremove: snapino ", ip->i_number, lbn,
			    inum, size, cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
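		 * Only the first snapshot needing a copy actually reads
		 * the device; any later snapshots in this loop copy from
		 * savedcbp instead.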
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			break;
		}
		VOP_UNLOCK(vp, 0, td);
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
		}
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snaphead *snaphead;
	struct vnode *vp;
	struct inode *ip;
	int error, snaploc, loc;

	snaphead = &ump->um_devvp->v_rdev->si_snapshots;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			return;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
		vp->v_flag |= VSYSTEM;
		ump->um_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
		ump->um_devvp->v_flag |= VCOPYONWRITE;
		VOP_UNLOCK(vp, 0, td);
	}
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct snaphead *snaphead = &ump->um_devvp->v_rdev->si_snapshots;
	struct inode *xp;

	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_effnlink > 0)
			vrele(ITOV(xp));
	}
	ump->um_devvp->v_rdev->si_copyonwrite = 0;
	ump->um_devvp->v_flag &= ~VCOPYONWRITE;
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
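 *
 * This routine is installed as the device's si_copyonwrite hook by
 * ffs_snapshot and ffs_snapshot_mount above, so it is run from the
 * device strategy code for writes to the underlying device while
 * snapshots exist.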
 */
static int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff, error = 0;

	fs = TAILQ_FIRST(&devvp->v_rdev->si_snapshots)->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	if (td->td_proc->p_flag & P_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	TAILQ_FOREACH(ip, &devvp->v_rdev->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We have to
		 * be able to do the UFS_BALLOC without blocking, otherwise
		 * we may get in a deadlock with another process also
		 * trying to allocate. If we find ourselves unable to
		 * get the buffer lock, we unlock the snapshot vnode,
		 * sleep briefly, and try again.
		 */
retry:
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		if (lbn < NDADDR) {
			blkno = ip->i_db[lbn];
		} else {
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			if (error) {
				VOP_UNLOCK(vp, 0, td);
				if (error != EWOULDBLOCK)
					break;
				tsleep(vp, td->td_ksegrp->kg_user_pri, "nap", 1);
				goto retry;
			}
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0) {
			VOP_UNLOCK(vp, 0, td);
			continue;
		}
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error) {
			VOP_UNLOCK(vp, 0, td);
			if (error != EWOULDBLOCK)
				break;
			tsleep(vp, td->td_ksegrp->kg_user_pri, "nap", 1);
			goto retry;
		}
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %d for ",
			    ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %d to blkno %d\n", bp->b_lblkno,
			    cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			break;
		}
		savedcbp = cbp;
		VOP_UNLOCK(vp, 0, td);
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
		}
	}
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 */
static int
readblock(bp, lbn)
	struct buf *bp;
	daddr_t lbn;
{
	struct uio auio;
	struct iovec aiov;
	struct thread *td = curthread;
	struct inode *ip = VTOI(bp->b_vp);

	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}