/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/disklabel.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#define KERNCRED thread0.td_ucred
#define DEBUG 1

static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs_daddr_t *, ufs_daddr_t *, struct fs *,
    ufs_daddr_t, int), int);
static int indiracct(struct vnode *, struct vnode *, int, ufs_daddr_t,
    int, int, int, int, struct fs *, int (*)(struct vnode *,
    ufs_daddr_t *, ufs_daddr_t *, struct fs *, ufs_daddr_t, int), int);
static int fullacct(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
    struct fs *, ufs_daddr_t, int);
static int snapacct(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
    struct fs *, ufs_daddr_t, int);
static int mapacct(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
    struct fs *, ufs_daddr_t, int);
static int ffs_copyonwrite(struct vnode *, struct buf *);
static int readblock(struct buf *, daddr_t);

/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the following flag allows this
 * crash persistence to be disabled.
 */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
	0, "");
#endif /* DEBUG */

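/*
 * Illustrative sketch (not part of this file, not compiled): the three
 * knobs above are plain read-write integer sysctls under the "debug"
 * tree, so a userland tool can read or flip them with sysctlbyname(3).
 * This assumes a kernel built with DEBUG defined; the names mirror the
 * SYSCTL_INT declarations above.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int newval = 1, oldval;
	size_t oldlen = sizeof(oldval);

	/* Fetch the current setting and enable crash persistence. */
	if (sysctlbyname("debug.dopersistence", &oldval, &oldlen,
	    &newval, sizeof(newval)) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("debug.dopersistence: %d -> %d\n", oldval, newval);
	return (0);
}
#endif
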
/*
 * Create a snapshot file and initialize it for the filesystem.
 */
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	ufs_daddr_t blkno;
	int error, cg, snaploc, numblks;
	int i, size, len, loc;
	int flag = mp->mnt_flag;
	struct timespec starttime = {0, 0}, endtime;
	char saved_nice = 0;
	long redo = 0;
	int32_t *lp;
	void *space;
	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
	struct snaphead *snaphead;
	struct thread *td = curthread;
	struct inode *ip, *xp;
	struct buf *bp, *nbp, *ibp, *sbp = NULL;
	struct nameidata nd;
	struct mount *wrtmp;
	struct vattr vat;
	struct vnode *vp, *xvp, *nvp;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Create the snapshot file.
	 */
restart:
	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		vput(nd.ni_vp);
		error = EEXIST;
	}
	if (nd.ni_dvp->v_mount != mp)
		error = EXDEV;
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		return (error);
	}
	VATTR_NULL(&vat);
	vat.va_type = VREG;
	vat.va_mode = S_IRUSR;
	vat.va_vaflags |= VA_EXCLUSIVE;
	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
		wrtmp = NULL;
	if (wrtmp != mp)
		panic("ffs_snapshot: mount mismatch");
	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &wrtmp,
		    V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
	vput(nd.ni_dvp);
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vn_finished_write(wrtmp);
		return (error);
	}
	vp = nd.ni_vp;
	ip = VTOI(vp);
	/*
	 * Allocate and copy the last block contents so as to be able
	 * to set size to that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, B_CLRBUF, &bp);
	if (error)
		goto out;
	ip->i_size = lblktosize(fs, (off_t)numblks);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if ((error = readblock(bp, numblks - 1)) != 0)
		goto out;
	bawrite(bp);
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, td->td_ucred, B_METAONLY, &ibp);
		if (error)
			goto out;
		bdwrite(ibp);
	}
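	/*
	 * Worked example for the loop above (illustrative figures, not
	 * taken from any particular filesystem): with an 8K block size
	 * and 4-byte ufs_daddr_t pointers, NINDIR(fs) is 8192 / 4 == 2048,
	 * so the loop visits logical blocks NDADDR, NDADDR + 2048,
	 * NDADDR + 4096, ... allocating exactly one indirect block per
	 * stride. B_METAONLY asks ffs_balloc for the indirect block
	 * itself rather than the data block it maps.
	 */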
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = UFS_BALLOC(vp, (off_t)(SBOFF), SBSIZE, KERNCRED, 0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bdwrite(nbp);
	}
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	len = howmany(fs->fs_ncg, NBBY);
	MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK);
	bzero(fs->fs_active, len);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
		    KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			goto out;
		}
		error = cgaccount(cg, vp, nbp, 1);
		bawrite(nbp);
		if (error)
			goto out;
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0)
		goto out;
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Rescind nice scheduling while running with the filesystem suspended.
	 */
	if (td->td_ksegrp->kg_nice > 0) {
		saved_nice = td->td_ksegrp->kg_nice;
		td->td_ksegrp->kg_nice = 0;
	}
	/*
	 * Suspend operation on filesystem.
	 */
	for (;;) {
		vn_finished_write(wrtmp);
		vfs_write_suspend(vp->v_mount);
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	if (collectsnapstats)
		nanotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
			continue;
		redo++;
		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
		    KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			goto out1;
		}
		error = cgaccount(cg, vp, nbp, 2);
		bawrite(nbp);
		if (error)
			goto out1;
	}
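	/*
	 * The fs_active map consulted above is one bit per cylinder
	 * group: cgaccount() sets a group's bit (via atomic_set_int)
	 * whenever it copies that group, so the loop above re-copies
	 * only the groups whose bit is still clear. A minimal sketch
	 * of the accessors, assuming the conventional word/bit split
	 * (the real macros live in <ufs/ffs/fs.h>):
	 *
	 *	#define ACTIVECGNUM(fs, cg) \
	 *		((fs)->fs_active[(cg) / (NBBY * sizeof(int))])
	 *	#define ACTIVECGOFF(cg) \
	 *		(1 << ((cg) % (NBBY * sizeof(int))))
	 */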
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	error = bread(vp, lblkno(fs, SBOFF), fs->fs_bsize, KERNCRED, &sbp);
	if (error) {
		brelse(sbp);
		sbp = NULL;
		goto out1;
	}
	copy_fs = (struct fs *)(sbp->b_data + blkoff(fs, SBOFF));
	bcopy(fs, copy_fs, fs->fs_sbsize);
	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
		copy_fs->fs_clean = 1;
	if (fs->fs_sbsize < SBSIZE)
		bzero(&sbp->b_data[blkoff(fs, SBOFF) + fs->fs_sbsize],
		    SBSIZE - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	(char *)space += fs->fs_cssize;
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(ip->i_devvp,
		    fsbtodb(fs, fs->fs_csaddr + loc),
		    len, KERNCRED, &bp)) != 0) {
			brelse(bp);
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		bcopy(bp->b_data, space, (u_int)len);
		(char *)space += len;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
	}
	if (fs->fs_contigsumsize > 0) {
		copy_fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/*
	 * We must check for active files that have been unlinked
	 * (e.g., with a zero link count). We have to expunge all
	 * trace of these files from the snapshot so that they are
	 * not reclaimed prematurely by fsck or unnecessarily dumped.
	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
	 * spec_strategy about writing on a suspended filesystem.
	 */
	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
	mtx_lock(&mntvnode_mtx);
loop:
	for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (xvp->v_mount != mp)
			goto loop;
		nvp = TAILQ_NEXT(xvp, v_nmntvnodes);
		mtx_unlock(&mntvnode_mtx);
		mtx_lock(&xvp->v_interlock);
		if (xvp->v_usecount == 0 || xvp->v_type == VNON ||
		    (VOP_GETATTR(xvp, &vat, td->td_proc->p_ucred, td) == 0 &&
		    vat.va_nlink > 0)) {
			mtx_unlock(&xvp->v_interlock);
			mtx_lock(&mntvnode_mtx);
			continue;
		}
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0)
			goto loop;
		xp = VTOI(xvp);
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len < fs->fs_bsize) {
				ffs_blkfree(copy_fs, vp, xp->i_db[loc], len,
				    xp->i_number);
				blkno = xp->i_db[loc];
				xp->i_db[loc] = 0;
			}
		}
		error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
		if (blkno)
			xp->i_db[loc] = blkno;
		if (!error)
			error = ffs_freefile(copy_fs, vp, xp->i_number,
			    xp->i_mode);
		VOP_UNLOCK(xvp, 0, td);
		if (error) {
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		mtx_lock(&mntvnode_mtx);
	}
	mtx_unlock(&mntvnode_mtx);
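	/*
	 * Worked example for the fragment handling in the loop above
	 * (illustrative numbers): with fs_bsize 16384 and fs_fsize 2048,
	 * an unlinked file of i_size 20000 gives
	 * loc = howmany(20000, 16384) - 1 = 1, blkoff(fs, 20000) = 3616,
	 * and fragroundup() rounds that to 4096. Since 4096 < fs_bsize,
	 * only the two live fragments are freed. The direct pointer is
	 * hidden before the expunge() call and restored afterward
	 * because mapacct() below frees only full-sized blocks and would
	 * otherwise misaccount the fragment.
	 */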
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %d already on list", ip->i_number);
	snaphead = &ip->i_devvp->v_rdev->si_snapshots;
	TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
	ip->i_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	ip->i_devvp->v_flag |= VCOPYONWRITE;
	vp->v_flag |= VSYSTEM;
out1:
	/*
	 * Resume operation on filesystem.
	 */
	vfs_write_resume(vp->v_mount);
	if (saved_nice > 0)
		td->td_ksegrp->kg_nice = saved_nice;
	vn_start_write(NULL, &wrtmp, V_WAIT);
	if (collectsnapstats && starttime.tv_sec > 0) {
		nanotime(&endtime);
		timespecsub(&endtime, &starttime);
		printf("%s: suspended %d.%03ld sec, redo %ld of %d\n",
		    vp->v_mount->mnt_stat.f_mntonname, endtime.tv_sec,
		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
	}
	if (sbp == NULL)
		goto out;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	snaphead = &ip->i_devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(xp, snaphead, i_nextsnap) {
		if (xp == ip)
			break;
		if ((error = expunge(vp, xp, fs, snapacct, BLK_SNAP)) != 0) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps.
	 */
	if ((error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP)) != 0) {
		fs->fs_snapinum[snaploc] = 0;
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
done:
	free(copy_fs->fs_csp, M_UFSMNT);
	bawrite(sbp);
out:
	if (fs->fs_active != 0) {
		FREE(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	mp->mnt_flag = flag;
	if (error)
		(void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	if (error)
		vput(vp);
	else
		VOP_UNLOCK(vp, 0, td);
	vn_finished_write(wrtmp);
	return (error);
}

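/*
 * Illustrative usage (an assumption about the calling path, not code
 * from this file): ffs_snapshot() is reached through the mount update
 * path when a snapshot is requested, so creating a snapshot from
 * userland looks roughly like
 *
 *	mount -u -o snapshot /var/snapshot/snap1 /var
 *
 * with the snapshot file name arriving here as "snapfile".
 */
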
/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is 1, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;
	struct vnode *vp;
	struct buf *nbp;
	int passno;
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	int error, numblks, base, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
	    (int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				ip->i_db[loc] = BLK_NOCOPY;
			else if (passno == 2 && ip->i_db[loc] == BLK_NOCOPY)
				ip->i_db[loc] = 0;
			else if (passno == 1 && ip->i_db[loc] == BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
	if (error) {
		brelse(bp);
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error) {
				brelse(bp);
				return (error);
			}
			indiroff = 0;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			((ufs_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	bqrelse(bp);
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}

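/*
 * Worked example for cgaccount()'s indirect bookkeeping (illustrative
 * figures): with NDADDR == 12 and NINDIR(fs) == 2048, the snapshot
 * pointer for cg-relative logical block base + loc lives at slot
 * (base + loc - 12) % 2048 of an indirect block. Whenever indiroff
 * wraps past NINDIR(fs), the loop above writes out the finished
 * indirect block and asks ffs_balloc for the next one with B_METAONLY.
 */
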
/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed.
 */
static int
expunge(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
	    struct fs *, ufs_daddr_t, int);
	int expungetype;
{
	int i, len, error, numblks, blksperindir;
	ufs_daddr_t lbn, rlbn, blkno, indiroff;
	struct thread *td = curthread;
	struct dinode *dip;
	struct buf *bp;

	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_db[0],
	    &cancelip->i_ib[NIADDR], fs, 0, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct(snapvp, ITOV(cancelip), i, cancelip->i_ib[i],
		    lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = cancelip->i_db[lbn];
	} else {
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &bp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
	    fs->fs_bsize, KERNCRED, 0, &bp);
	if (error)
		return (error);
	if (blkno == 0 && (error = readblock(bp, lbn)))
		return (error);
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t));
	bdwrite(bp);
	return (0);
}

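/*
 * A note on the first acctfunc call in expunge() (an observation about
 * layout, not new mechanism): passing &cancelip->i_db[0] through
 * &cancelip->i_ib[NIADDR] as one range works because the direct
 * pointer array (i_db) and the indirect pointer array (i_ib) sit back
 * to back in the on-disk dinode, so with the standard UFS1 constants
 * (NDADDR == 12, NIADDR == 3) a single pass covers all fifteen
 * top-level block pointers.
 */
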
/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs,
    acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs_daddr_t blkno;
	int lbn;
	int rlbn;
	int remblks;
	int blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
	    struct fs *, ufs_daddr_t, int);
	int expungetype;
{
	int subblksperindir, error, last, num, i;
	struct indir indirs[NIADDR + 2];
	ufs_daddr_t *bap;
	struct buf *bp;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, rlbn, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_daddr_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)))
		return (error);
	return (mapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_daddr_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs_daddr_t lbn, blkno, *blkp;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

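/*
 * Summary of the special block-pointer values used by the accounting
 * routines above and below (distilled from the comments in this file):
 *
 *	0		not yet copied; the first overwrite after the
 *			snapshot triggers a copy or a claim
 *	BLK_NOCOPY	never copy: the block was free when the snapshot
 *			was taken, or was allocated only afterward
 *	BLK_SNAP	claimed by a snapshot and noted for fsck; if the
 *			claiming snapshot is freed, no copy is needed here
 *
 * Any other value is the disk address of a block the snapshot has
 * already copied or claimed.
 */
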
/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_daddr_t lblkno;
	int expungetype;
{
	ufs_daddr_t blkno;
	ino_t inum;

	inum = VTOI(vp)->i_number;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	if (xp == 0)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	else
		vrele(ITOV(ip));
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

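/*
 * Worked example for the compaction above (illustrative inode numbers):
 * with fs_snapinum = { 5, 9, 12, 0, ... } and snapshot inode 9 going
 * away, the inner loop shifts each following entry down one slot and
 * zeroes the tail, leaving { 5, 12, 0, 0, ... } so that the first zero
 * still marks the end of the list.
 */
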
/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct buf *ibp;
	struct fs *fs;
	ufs_daddr_t blkno, dblk;
	int error, numblks, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		devvp = ip->i_devvp;
		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) == 0) {
			devvp->v_rdev->si_copyonwrite = 0;
			devvp->v_flag &= ~VCOPYONWRITE;
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = ip->i_db[blkno];
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			ip->i_db[blkno] = 0;
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			ip->i_blocks -= btodb(fs->fs_bsize);
			ip->i_db[blkno] = 0;
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if ((last = fs->fs_size - blkno) > NINDIR(fs))
			last = NINDIR(fs);
		for (loc = 0; loc < last; loc++) {
			dblk = ((ufs_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
			     fs->fs_bsize, ip->i_number))) {
				ip->i_blocks -= btodb(fs->fs_bsize);
				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

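/*
 * Worked example for the claim test in ffs_snapremove() above
 * (illustrative figures): with fs_frag == 8, a block claimed at
 * logical block 7 has i_db[7] == blkstofrags(fs, 7) == 56, that is,
 * its disk address equals its own logical position in the snapshot.
 * A copied block can never satisfy this test (see the comment before
 * ffs_snapblkfree below), which is what lets the claim be handed off
 * safely to another snapshot.
 */
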
/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because it
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;
	ufs_daddr_t bno;
	long size;
	ino_t inum;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff = 0, error = 0, claimedblk = 0;
	struct snaphead *snaphead;

	lbn = fragstoblks(fs, bno);
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = ip->i_db[lbn];
		} else {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			VOP_UNLOCK(vp, 0, td);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		switch (blkno) {
		/*
		 * If the snapshot has already copied the block (default),
		 * or does not care about the block, it is not needed.
		 */
		default:
		case BLK_NOCOPY:
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		/*
		 * No previous snapshot claimed the block, so it will be
		 * freed and become a BLK_NOCOPY (don't care) for us.
		 */
		case BLK_SNAP:
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			if (lbn < NDADDR) {
				ip->i_db[lbn] = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				((ufs_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			VOP_UNLOCK(vp, 0, td);
			continue;
		/*
		 * A block that we map is being freed. If it has not been
		 * claimed yet, we will claim or copy it (below).
		 */
		case 0:
			claimedblk = 1;
			break;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %d from inum %d\n",
				    "Grabonremove: snapino", ip->i_number, lbn,
				    inum);
#endif
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			if (lbn < NDADDR) {
				ip->i_db[lbn] = bno;
			} else {
				((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			ip->i_blocks += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error) {
			VOP_UNLOCK(vp, 0, td);
			break;
		}
#ifdef DEBUG
		if (snapdebug)
			printf(
"Copyonremove: snapino %lu lbn %ld for inum %lu size %ld to blkno %lld\n",
			    (unsigned long)ip->i_number, (long)lbn,
			    (unsigned long)inum, size, (long long)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			break;
		}
		VOP_UNLOCK(vp, 0, td);
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
		}
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snaphead *snaphead;
	struct vnode *vp;
	struct inode *ip;
	int error, snaploc, loc;

	snaphead = &ump->um_devvp->v_rdev->si_snapshots;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			return;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
		vp->v_flag |= VSYSTEM;
		ump->um_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
		ump->um_devvp->v_flag |= VCOPYONWRITE;
		VOP_UNLOCK(vp, 0, td);
	}
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct snaphead *snaphead = &ump->um_devvp->v_rdev->si_snapshots;
	struct inode *xp;

	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_effnlink > 0)
			vrele(ITOV(xp));
	}
	ump->um_devvp->v_rdev->si_copyonwrite = 0;
	ump->um_devvp->v_flag &= ~VCOPYONWRITE;
}

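/*
 * Sketch of how the si_copyonwrite hook registered above is assumed
 * to be consumed by the device write path (that code lives in the
 * specfs layer, not in this file, and its exact form may differ):
 */
#if 0
	/* In the device strategy routine, before queueing a write: */
	if ((vp->v_flag & VCOPYONWRITE) && bp->b_iocmd == BIO_WRITE &&
	    (error = (*vp->v_rdev->si_copyonwrite)(vp, bp)) != 0) {
		bp->b_error = error;
		bp->b_ioflags |= BIO_ERROR;
		biodone(bp);
		return;
	}
#endif
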
/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
static int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff, error = 0;

	fs = TAILQ_FIRST(&devvp->v_rdev->si_snapshots)->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	if (td->td_proc->p_flag & P_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	TAILQ_FOREACH(ip, &devvp->v_rdev->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We have to
		 * be able to do the UFS_BALLOC without blocking, otherwise
		 * we may get in a deadlock with another process also
		 * trying to allocate. If we find ourselves unable to
		 * get the buffer lock, we unlock the snapshot vnode,
		 * sleep briefly, and try again.
		 */
retry:
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		if (lbn < NDADDR) {
			blkno = ip->i_db[lbn];
		} else {
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			if (error) {
				VOP_UNLOCK(vp, 0, td);
				if (error != EWOULDBLOCK)
					break;
				tsleep(vp, td->td_ksegrp->kg_user_pri, "nap", 1);
				goto retry;
			}
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0) {
			VOP_UNLOCK(vp, 0, td);
			continue;
		}
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error) {
			VOP_UNLOCK(vp, 0, td);
			if (error != EWOULDBLOCK)
				break;
			tsleep(vp, td->td_ksegrp->kg_user_pri, "nap", 1);
			goto retry;
		}
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %d for ",
			    ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %lld to blkno %lld\n",
			    (long long)bp->b_lblkno, (long long)cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			break;
		}
		savedcbp = cbp;
		VOP_UNLOCK(vp, 0, td);
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
		}
	}
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 */
static int
readblock(bp, lbn)
	struct buf *bp;
	daddr_t lbn;
{
	struct uio auio;
	struct iovec aiov;
	struct thread *td = curthread;
	struct inode *ip = VTOI(bp->b_vp);

	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}
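
/*
 * Worked example for the uio_offset computation above (illustrative
 * figures): with fs_fsize 2048 and fs_frag 8, blkstofrags() turns
 * logical block 5 into fragment 40, fsbtodb() scales fragments to
 * 512-byte disk blocks (a factor of 4 here), giving 160, and dbtob()
 * scales that to byte offset 81920 == 5 * fs_bsize. Reading through
 * physio() goes straight to the device and bypasses the buffer cache,
 * which is what lets callers such as indiracct() avoid the bread()
 * deadlock noted earlier in this file.
 */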