1 /* 2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * Further information about snapshots can be obtained from: 5 * 6 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 7 * 1614 Oxford Street mckusick@mckusick.com 8 * Berkeley, CA 94709-1608 +1-510-843-9542 9 * USA 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
32 * 33 * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 34 * $FreeBSD$ 35 */ 36 37 #include <sys/param.h> 38 #include <sys/systm.h> 39 #include <sys/conf.h> 40 #include <sys/bio.h> 41 #include <sys/buf.h> 42 #include <sys/proc.h> 43 #include <sys/namei.h> 44 #include <sys/stat.h> 45 #include <sys/malloc.h> 46 #include <sys/mount.h> 47 #include <sys/resource.h> 48 #include <sys/resourcevar.h> 49 #include <sys/vnode.h> 50 51 #include <ufs/ufs/extattr.h> 52 #include <ufs/ufs/quota.h> 53 #include <ufs/ufs/ufsmount.h> 54 #include <ufs/ufs/inode.h> 55 #include <ufs/ufs/ufs_extern.h> 56 57 #include <ufs/ffs/fs.h> 58 #include <ufs/ffs/ffs_extern.h> 59 60 #define KERNCRED proc0.p_ucred 61 #define DEBUG 1 62 63 static int expunge __P((struct vnode *, struct inode *, struct fs *, 64 int (*) __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *, struct fs *, 65 ufs_daddr_t)))); 66 static int indiracct __P((struct vnode *, struct vnode *, int, ufs_daddr_t, 67 int, int, int, int, struct fs *, int (*) __P((struct vnode *, 68 ufs_daddr_t *, ufs_daddr_t *, struct fs *, ufs_daddr_t)))); 69 static int snapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *, 70 struct fs *, ufs_daddr_t)); 71 static int mapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *, 72 struct fs *, ufs_daddr_t)); 73 static int ffs_copyonwrite __P((struct vnode *, struct buf *)); 74 static int readblock __P((struct buf *, daddr_t)); 75 76 /* 77 * To ensure the consistency of snapshots across crashes, we must 78 * synchronously write out copied blocks before allowing the 79 * originals to be modified. Because of the rather severe speed 80 * penalty that this imposes, the following flag allows this 81 * crash persistence to be disabled. 
82 */ 83 int dopersistence = 0; 84 85 #ifdef DEBUG 86 #include <sys/sysctl.h> 87 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); 88 int snapdebug = 0; 89 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); 90 #endif /* DEBUG */ 91 92 /* 93 * Create a snapshot file and initialize it for the filesystem. 94 */ 95 int 96 ffs_snapshot(mp, snapfile) 97 struct mount *mp; 98 char *snapfile; 99 { 100 ufs_daddr_t blkno, inoblks[FSMAXSNAP]; 101 int error, cg, snaploc, indiroff, numblks; 102 int i, size, base, len, loc, inoblkcnt; 103 int flag = mp->mnt_flag; 104 int32_t *lp; 105 void *space; 106 struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs; 107 struct snaphead *snaphead; 108 struct proc *p = CURPROC; 109 struct inode *ip, *xp; 110 struct buf *bp, *nbp, *ibp, *sbp = NULL; 111 struct nameidata nd; 112 struct mount *wrtmp; 113 struct vattr vat; 114 struct vnode *vp; 115 struct cg *cgp; 116 117 /* 118 * Need to serialize access to snapshot code per filesystem. 119 */ 120 /* 121 * Assign a snapshot slot in the superblock. 122 */ 123 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 124 if (fs->fs_snapinum[snaploc] == 0) 125 break; 126 if (snaploc == FSMAXSNAP) 127 return (ENOSPC); 128 /* 129 * Create the snapshot file. 
130 */ 131 restart: 132 NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, p); 133 if ((error = namei(&nd)) != 0) 134 return (error); 135 if (nd.ni_vp != NULL) { 136 vput(nd.ni_vp); 137 error = EEXIST; 138 } 139 if (nd.ni_dvp->v_mount != mp) 140 error = EXDEV; 141 if (error) { 142 NDFREE(&nd, NDF_ONLY_PNBUF); 143 if (nd.ni_dvp == nd.ni_vp) 144 vrele(nd.ni_dvp); 145 else 146 vput(nd.ni_dvp); 147 return (error); 148 } 149 VATTR_NULL(&vat); 150 vat.va_type = VREG; 151 vat.va_mode = S_IRUSR; 152 vat.va_vaflags |= VA_EXCLUSIVE; 153 if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) 154 wrtmp = NULL; 155 if (wrtmp != mp) 156 panic("ffs_snapshot: mount mismatch"); 157 if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { 158 NDFREE(&nd, NDF_ONLY_PNBUF); 159 vput(nd.ni_dvp); 160 if ((error = vn_start_write(NULL, &wrtmp, 161 V_XSLEEP | PCATCH)) != 0) 162 return (error); 163 goto restart; 164 } 165 VOP_LEASE(nd.ni_dvp, p, KERNCRED, LEASE_WRITE); 166 error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); 167 vput(nd.ni_dvp); 168 if (error) { 169 NDFREE(&nd, NDF_ONLY_PNBUF); 170 vn_finished_write(wrtmp); 171 return (error); 172 } 173 vp = nd.ni_vp; 174 ip = VTOI(vp); 175 /* 176 * Allocate and copy the last block contents so as to be able 177 * to set size to that of the filesystem. 178 */ 179 numblks = howmany(fs->fs_size, fs->fs_frag); 180 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), 181 fs->fs_bsize, KERNCRED, B_CLRBUF, &bp); 182 if (error) 183 goto out; 184 ip->i_size = lblktosize(fs, (off_t)numblks); 185 ip->i_flag |= IN_CHANGE | IN_UPDATE; 186 if ((error = readblock(bp, numblks - 1)) != 0) 187 goto out; 188 bawrite(bp); 189 /* 190 * Preallocate critical data structures so that we can copy 191 * them in without further allocation after we suspend all 192 * operations on the filesystem. 
We would like to just release 193 * the allocated buffers without writing them since they will 194 * be filled in below once we are ready to go, but this upsets 195 * the soft update code, so we go ahead and write the new buffers. 196 * 197 * Allocate all indirect blocks and mark all of them as not 198 * needing to be copied. 199 */ 200 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 201 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 202 fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp); 203 if (error) 204 goto out; 205 bdwrite(ibp); 206 } 207 /* 208 * Allocate shadow blocks to copy all of the other snapshot inodes 209 * so that we will be able to expunge them from this snapshot. Also 210 * include a copy of ourselves so that we do not deadlock trying 211 * to copyonwrite ourselves when VOP_FSYNC'ing below. 212 */ 213 fs->fs_snapinum[snaploc] = ip->i_number; 214 for (loc = snaploc, inoblkcnt = 0; loc >= 0; loc--) { 215 blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc])); 216 fs->fs_snapinum[snaploc] = 0; 217 for (i = 0; i < inoblkcnt; i++) 218 if (inoblks[i] == blkno) 219 break; 220 if (i == inoblkcnt) { 221 inoblks[inoblkcnt++] = blkno; 222 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 223 fs->fs_bsize, KERNCRED, 0, &nbp); 224 if (error) 225 goto out; 226 bawrite(nbp); 227 } 228 } 229 /* 230 * Allocate all cylinder group blocks. 231 */ 232 for (cg = 0; cg < fs->fs_ncg; cg++) { 233 error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift, 234 fs->fs_bsize, KERNCRED, 0, &nbp); 235 if (error) 236 goto out; 237 bawrite(nbp); 238 } 239 /* 240 * Allocate copies for the superblock and its summary information. 
241 */ 242 error = UFS_BALLOC(vp, (off_t)(SBOFF), SBSIZE, KERNCRED, 0, &nbp); 243 if (error) 244 goto out; 245 bawrite(nbp); 246 blkno = fragstoblks(fs, fs->fs_csaddr); 247 len = howmany(fs->fs_cssize, fs->fs_bsize); 248 for (loc = 0; loc < len; loc++) { 249 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), 250 fs->fs_bsize, KERNCRED, 0, &nbp); 251 if (error) 252 goto out; 253 bawrite(nbp); 254 } 255 /* 256 * Change inode to snapshot type file. 257 */ 258 ip->i_flags |= SF_SNAPSHOT; 259 ip->i_flag |= IN_CHANGE | IN_UPDATE; 260 /* 261 * Ensure that the snapshot is completely on disk. 262 */ 263 if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p)) != 0) 264 goto out; 265 /* 266 * All allocations are done, so we can now snapshot the system. 267 * 268 * Suspend operation on filesystem. 269 */ 270 for (;;) { 271 vn_finished_write(wrtmp); 272 vfs_write_suspend(vp->v_mount); 273 if (mp->mnt_kern_flag & MNTK_SUSPENDED) 274 break; 275 vn_start_write(NULL, &wrtmp, V_WAIT); 276 } 277 /* 278 * First, copy all the cylinder group maps. All the unallocated 279 * blocks are marked BLK_NOCOPY so that the snapshot knows that 280 * it need not copy them if they are later written. 
281 */ 282 len = howmany(fs->fs_fpg, fs->fs_frag); 283 for (cg = 0; cg < fs->fs_ncg; cg++) { 284 error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), 285 (int)fs->fs_cgsize, KERNCRED, &bp); 286 if (error) { 287 brelse(bp); 288 goto out1; 289 } 290 cgp = (struct cg *)bp->b_data; 291 if (!cg_chkmagic(cgp)) { 292 brelse(bp); 293 error = EIO; 294 goto out1; 295 } 296 error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize, 297 KERNCRED, &nbp); 298 if (error) { 299 brelse(bp); 300 brelse(nbp); 301 goto out1; 302 } 303 bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); 304 if (fs->fs_cgsize < fs->fs_bsize) 305 bzero(&nbp->b_data[fs->fs_cgsize], 306 fs->fs_bsize - fs->fs_cgsize); 307 nbp->b_flags |= B_VALIDSUSPWRT; 308 bawrite(nbp); 309 base = cg * fs->fs_fpg / fs->fs_frag; 310 if (base + len >= numblks) 311 len = numblks - base - 1; 312 loc = 0; 313 if (base < NDADDR) { 314 for ( ; loc < NDADDR; loc++) { 315 if (!ffs_isblock(fs, cg_blksfree(cgp), loc)) 316 continue; 317 ip->i_db[loc] = BLK_NOCOPY; 318 } 319 } 320 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), 321 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 322 if (error) { 323 brelse(bp); 324 goto out1; 325 } 326 indiroff = (base + loc - NDADDR) % NINDIR(fs); 327 for ( ; loc < len; loc++, indiroff++) { 328 if (indiroff >= NINDIR(fs)) { 329 ibp->b_flags |= B_VALIDSUSPWRT; 330 bawrite(ibp); 331 error = UFS_BALLOC(vp, 332 lblktosize(fs, (off_t)(base + loc)), 333 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 334 if (error) { 335 brelse(bp); 336 goto out1; 337 } 338 indiroff = 0; 339 } 340 if (!ffs_isblock(fs, cg_blksfree(cgp), loc)) 341 continue; 342 if (((ufs_daddr_t *)(ibp->b_data))[indiroff] != 0) 343 panic("ffs_snapshot: lost block"); 344 ((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; 345 } 346 bqrelse(bp); 347 ibp->b_flags |= B_VALIDSUSPWRT; 348 bdwrite(ibp); 349 } 350 /* 351 * Copy the shadow blocks for the snapshot inodes so that 352 * the copies can can be expunged. 
353 */ 354 for (loc = 0; loc < inoblkcnt; loc++) { 355 error = bread(vp, inoblks[loc], fs->fs_bsize, KERNCRED, &nbp); 356 if (error) 357 goto out1; 358 readblock(nbp, inoblks[loc]); 359 nbp->b_flags |= B_VALIDSUSPWRT; 360 bdwrite(nbp); 361 } 362 /* 363 * Copy allocation information from all the snapshots in 364 * this snapshot and then expunge them from its view. 365 */ 366 snaphead = &ip->i_devvp->v_rdev->si_snapshots; 367 TAILQ_FOREACH(xp, snaphead, i_nextsnap) 368 if ((error = expunge(vp, xp, fs, snapacct)) != 0) 369 goto out1; 370 /* 371 * Grab a copy of the superblock and its summary information. 372 * We delay writing it until the suspension is released below. 373 */ 374 error = bread(vp, lblkno(fs, SBOFF), fs->fs_bsize, KERNCRED, &sbp); 375 if (error) 376 goto out1; 377 copy_fs = (struct fs *)(sbp->b_data + blkoff(fs, SBOFF)); 378 bcopy(fs, copy_fs, fs->fs_sbsize); 379 if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) 380 copy_fs->fs_clean = 1; 381 if (fs->fs_sbsize < SBSIZE) 382 bzero(&sbp->b_data[blkoff(fs, SBOFF) + fs->fs_sbsize], 383 SBSIZE - fs->fs_sbsize); 384 size = blkroundup(fs, fs->fs_cssize); 385 if (fs->fs_contigsumsize > 0) 386 size += fs->fs_ncg * sizeof(int32_t); 387 space = malloc((u_long)size, M_UFSMNT, M_WAITOK); 388 copy_fs->fs_csp = space; 389 bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); 390 (char *)space += fs->fs_cssize; 391 loc = howmany(fs->fs_cssize, fs->fs_fsize); 392 i = fs->fs_frag - loc % fs->fs_frag; 393 len = (i == fs->fs_frag) ? 
0 : i * fs->fs_fsize; 394 if (len > 0) { 395 if ((error = bread(ip->i_devvp, 396 fsbtodb(fs, fs->fs_csaddr + loc), 397 len, KERNCRED, &bp)) != 0) { 398 free(copy_fs->fs_csp, M_UFSMNT); 399 goto out1; 400 } 401 bcopy(bp->b_data, space, (u_int)len); 402 (char *)space += len; 403 bp->b_flags |= B_INVAL | B_NOCACHE; 404 brelse(bp); 405 } 406 if (fs->fs_contigsumsize > 0) { 407 copy_fs->fs_maxcluster = lp = space; 408 for (i = 0; i < fs->fs_ncg; i++) 409 *lp++ = fs->fs_contigsumsize; 410 } 411 /* 412 * Record snapshot inode. Since this is the newest snapshot, 413 * it must be placed at the end of the list. 414 */ 415 fs->fs_snapinum[snaploc] = ip->i_number; 416 if (ip->i_nextsnap.tqe_prev != 0) 417 panic("ffs_snapshot: %d already on list", ip->i_number); 418 TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap); 419 ip->i_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite; 420 ip->i_devvp->v_flag |= VCOPYONWRITE; 421 vp->v_flag |= VSYSTEM; 422 /* 423 * Resume operation on filesystem. 424 */ 425 out1: 426 vfs_write_resume(vp->v_mount); 427 vn_start_write(NULL, &wrtmp, V_WAIT); 428 if (sbp != NULL) { 429 /* 430 * Expunge the blocks used by the snapshots from the set of 431 * blocks marked as used in the snapshot bitmaps. 432 */ 433 if ((error = expunge(vp, VTOI(vp), copy_fs, mapacct)) != 0) { 434 vref(vp); 435 ffs_snapgone(VTOI(vp)); 436 free(copy_fs->fs_csp, M_UFSMNT); 437 bawrite(sbp); 438 goto out; 439 } 440 /* 441 * Write the superblock and its summary information 442 * to the snapshot. 
443 */ 444 blkno = fragstoblks(fs, fs->fs_csaddr); 445 len = howmany(fs->fs_cssize, fs->fs_bsize); 446 space = copy_fs->fs_csp; 447 for (loc = 0; loc < len; loc++) { 448 error = bread(vp, blkno + loc, fs->fs_bsize, 449 KERNCRED, &nbp); 450 if (error) { 451 vref(vp); 452 ffs_snapgone(VTOI(vp)); 453 free(copy_fs->fs_csp, M_UFSMNT); 454 bawrite(sbp); 455 goto out; 456 } 457 bcopy(space, nbp->b_data, fs->fs_bsize); 458 space = (char *)space + fs->fs_bsize; 459 bawrite(nbp); 460 } 461 free(copy_fs->fs_csp, M_UFSMNT); 462 bawrite(sbp); 463 } 464 out: 465 mp->mnt_flag = flag; 466 if (error) 467 (void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, p); 468 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p); 469 if (error) 470 vput(vp); 471 else 472 VOP_UNLOCK(vp, 0, p); 473 vn_finished_write(wrtmp); 474 return (error); 475 } 476 477 /* 478 * Before expunging a snapshot inode, note all the 479 * blocks that it claims with BLK_SNAP so that fsck will 480 * be able to account for those blocks properly and so 481 * that this snapshot knows that it need not copy them 482 * if the other snapshot holding them is freed. 
483 */ 484 static int 485 expunge(vp, xp, fs, acctfunc) 486 struct vnode *vp; 487 struct inode *xp; 488 struct fs *fs; 489 int (*acctfunc) __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *, 490 struct fs *, ufs_daddr_t)); 491 { 492 int i, len, error, numblks, blksperindir; 493 ufs_daddr_t lbn, rlbn, blkno; 494 struct dinode *dip; 495 struct buf *bp; 496 497 if ((error = (*acctfunc)(vp, &xp->i_db[0], &xp->i_ib[NIADDR], fs, 0))) 498 return (error); 499 numblks = howmany(fs->fs_size, fs->fs_frag); 500 blksperindir = 1; 501 lbn = -NDADDR; 502 len = numblks - NDADDR; 503 rlbn = NDADDR; 504 for (i = 0; len > 0 && i < NIADDR; i++) { 505 error = indiracct(vp, ITOV(xp), i, xp->i_ib[i], lbn, 506 rlbn, len, blksperindir, fs, acctfunc); 507 if (error) 508 return (error); 509 blksperindir *= NINDIR(fs); 510 lbn -= blksperindir + 1; 511 len -= blksperindir; 512 rlbn += blksperindir; 513 } 514 /* 515 * Set copied snapshot inode to be a zero length file. 516 */ 517 blkno = fragstoblks(fs, ino_to_fsba(fs, xp->i_number)); 518 if ((error = bread(vp, blkno, fs->fs_bsize, KERNCRED, &bp)) != 0) 519 return (error); 520 dip = (struct dinode *)bp->b_data + 521 ino_to_fsbo(fs, xp->i_number); 522 dip->di_size = 0; 523 dip->di_blocks = 0; 524 dip->di_flags &= ~SF_SNAPSHOT; 525 bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t)); 526 bp->b_flags |= B_VALIDSUSPWRT; 527 bdwrite(bp); 528 return (0); 529 } 530 531 /* 532 * Descend an indirect block chain for vnode cancelvp accounting for all 533 * its indirect blocks in snapvp. 
534 */ 535 static int 536 indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, 537 acctfunc) 538 struct vnode *snapvp; 539 struct vnode *cancelvp; 540 int level; 541 ufs_daddr_t blkno; 542 int lbn; 543 int rlbn; 544 int remblks; 545 int blksperindir; 546 struct fs *fs; 547 int (*acctfunc) __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *, 548 struct fs *, ufs_daddr_t)); 549 { 550 int subblksperindir, error, last, num, i; 551 struct indir indirs[NIADDR + 2]; 552 ufs_daddr_t *bap; 553 struct buf *bp; 554 555 if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) 556 return (error); 557 if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2) 558 panic("indiracct: botched params"); 559 /* 560 * We have to expand bread here since it will deadlock looking 561 * up the block number for any blocks that are not in the cache. 562 */ 563 bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0); 564 bp->b_blkno = fsbtodb(fs, blkno); 565 if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && 566 (error = readblock(bp, fragstoblks(fs, blkno)))) { 567 brelse(bp); 568 return (error); 569 } 570 /* 571 * Account for the block pointers in this indirect block. 572 */ 573 last = howmany(remblks, blksperindir); 574 if (last > NINDIR(fs)) 575 last = NINDIR(fs); 576 MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK); 577 bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); 578 bqrelse(bp); 579 error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, rlbn); 580 if (error || level == 0) 581 goto out; 582 /* 583 * Account for the block pointers in each of the indirect blocks 584 * in the levels below us. 
585 */ 586 subblksperindir = blksperindir / NINDIR(fs); 587 for (lbn++, level--, i = 0; i < last; i++) { 588 error = indiracct(snapvp, cancelvp, level, bap[i], lbn, 589 rlbn, remblks, subblksperindir, fs, acctfunc); 590 if (error) 591 goto out; 592 rlbn += blksperindir; 593 lbn -= blksperindir; 594 remblks -= blksperindir; 595 } 596 out: 597 FREE(bap, M_DEVBUF); 598 return (error); 599 } 600 601 /* 602 * Account for a set of blocks allocated in a snapshot inode. 603 */ 604 static int 605 snapacct(vp, oldblkp, lastblkp, fs, lblkno) 606 struct vnode *vp; 607 ufs_daddr_t *oldblkp, *lastblkp; 608 struct fs *fs; 609 ufs_daddr_t lblkno; 610 { 611 struct inode *ip = VTOI(vp); 612 ufs_daddr_t lbn, blkno, *blkp; 613 struct buf *ibp; 614 int error; 615 616 for ( ; oldblkp < lastblkp; oldblkp++) { 617 blkno = *oldblkp; 618 if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) 619 continue; 620 lbn = fragstoblks(fs, blkno); 621 if (lbn < NDADDR) { 622 blkp = &ip->i_db[lbn]; 623 ip->i_flag |= IN_CHANGE | IN_UPDATE; 624 } else { 625 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 626 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 627 if (error) 628 return (error); 629 blkp = &((ufs_daddr_t *)(ibp->b_data)) 630 [(lbn - NDADDR) % NINDIR(fs)]; 631 } 632 if (*blkp != 0) 633 panic("snapacct: bad block"); 634 *blkp = BLK_SNAP; 635 if (lbn >= NDADDR) { 636 ibp->b_flags |= B_VALIDSUSPWRT; 637 bdwrite(ibp); 638 } 639 } 640 return (0); 641 } 642 643 /* 644 * Account for a set of blocks allocated in a snapshot inode. 
645 */ 646 static int 647 mapacct(vp, oldblkp, lastblkp, fs, lblkno) 648 struct vnode *vp; 649 ufs_daddr_t *oldblkp, *lastblkp; 650 struct fs *fs; 651 ufs_daddr_t lblkno; 652 { 653 struct inode *ip = VTOI(vp); 654 ufs_daddr_t blkno, cgblkno, fragno; 655 struct buf *bp; 656 struct cg *cgp; 657 char *blksfree; 658 int i, cg, error; 659 660 for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { 661 blkno = *oldblkp; 662 if (blkno == 0 || blkno == BLK_NOCOPY) 663 continue; 664 if (blkno == BLK_SNAP) 665 blkno = blkstofrags(fs, lblkno); 666 cg = dtog(fs, blkno); 667 cgblkno = fragstoblks(fs, cgtod(fs, cg)); 668 if ((error = bread(vp, cgblkno, fs->fs_bsize, KERNCRED, &bp))) 669 return (error); 670 cgp = (struct cg *)bp->b_data; 671 if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) { 672 if (!cg_chkmagic(cgp)) 673 printf("mapacct: bad magic 0x%x\n", 674 cgp->cg_magic); 675 else 676 printf("%s: mismatched cg %d != cg_cgx %d\n", 677 "mapacct", cg, cgp->cg_cgx); 678 brelse(bp); 679 return (EIO); 680 } 681 cgp->cg_time = time_second; 682 cgblkno = dtogd(fs, blkno); 683 blksfree = cg_blksfree(cgp); 684 fragno = fragstoblks(fs, cgblkno); 685 if (!ffs_isfreeblock(fs, blksfree, fragno)) { 686 printf("dev = %s, block = %ld, fs = %s\n", 687 devtoname(ip->i_dev), (long)blkno, fs->fs_fsmnt); 688 panic("mapacct: freeing free block"); 689 } 690 ffs_setblock(fs, blksfree, fragno); 691 ffs_clusteracct(fs, cgp, fragno, 1); 692 cgp->cg_cs.cs_nbfree++; 693 fs->fs_cstotal.cs_nbfree++; 694 fs->fs_cs(fs, cg).cs_nbfree++; 695 i = cbtocylno(fs, cgblkno); 696 cg_blks(fs, cgp, i)[cbtorpos(fs, cgblkno)]++; 697 cg_blktot(cgp)[i]++; 698 fs->fs_fmod = 1; 699 bdwrite(bp); 700 } 701 return (0); 702 } 703 704 /* 705 * Decrement extra reference on snapshot when last name is removed. 706 * It will not be freed until the last open reference goes away. 
707 */ 708 void 709 ffs_snapgone(ip) 710 struct inode *ip; 711 { 712 struct inode *xp; 713 struct fs *fs; 714 int snaploc; 715 716 /* 717 * Find snapshot in incore list. 718 */ 719 TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap) 720 if (xp == ip) 721 break; 722 if (xp == 0) 723 printf("ffs_snapgone: lost snapshot vnode %d\n", 724 ip->i_number); 725 else 726 vrele(ITOV(ip)); 727 /* 728 * Delete snapshot inode from superblock. Keep list dense. 729 */ 730 fs = ip->i_fs; 731 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) 732 if (fs->fs_snapinum[snaploc] == ip->i_number) 733 break; 734 if (snaploc < FSMAXSNAP) { 735 for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { 736 if (fs->fs_snapinum[snaploc] == 0) 737 break; 738 fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; 739 } 740 fs->fs_snapinum[snaploc - 1] = 0; 741 } 742 } 743 744 /* 745 * Prepare a snapshot file for being removed. 746 */ 747 void 748 ffs_snapremove(vp) 749 struct vnode *vp; 750 { 751 struct inode *ip; 752 struct vnode *devvp; 753 struct buf *ibp; 754 struct fs *fs; 755 ufs_daddr_t blkno, dblk; 756 int error, numblks, loc, last; 757 758 ip = VTOI(vp); 759 fs = ip->i_fs; 760 /* 761 * If active, delete from incore list (this snapshot may 762 * already have been in the process of being deleted, so 763 * would not have been active). 764 * 765 * Clear copy-on-write flag if last snapshot. 766 */ 767 if (ip->i_nextsnap.tqe_prev != 0) { 768 devvp = ip->i_devvp; 769 TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap); 770 ip->i_nextsnap.tqe_prev = 0; 771 if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) == 0) { 772 devvp->v_rdev->si_copyonwrite = 0; 773 devvp->v_flag &= ~VCOPYONWRITE; 774 } 775 } 776 /* 777 * Clear all BLK_NOCOPY fields. Pass any block claims to other 778 * snapshots that want them (see ffs_snapblkfree below). 
779 */ 780 for (blkno = 1; blkno < NDADDR; blkno++) { 781 dblk = ip->i_db[blkno]; 782 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 783 ip->i_db[blkno] = 0; 784 else if ((dblk == blkstofrags(fs, blkno) && 785 ffs_snapblkfree(ip, dblk, fs->fs_bsize))) { 786 ip->i_blocks -= btodb(fs->fs_bsize); 787 ip->i_db[blkno] = 0; 788 } 789 } 790 numblks = howmany(ip->i_size, fs->fs_bsize); 791 for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) { 792 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), 793 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 794 if (error) 795 continue; 796 if ((last = fs->fs_size - blkno) > NINDIR(fs)) 797 last = NINDIR(fs); 798 for (loc = 0; loc < last; loc++) { 799 dblk = ((ufs_daddr_t *)(ibp->b_data))[loc]; 800 if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) 801 ((ufs_daddr_t *)(ibp->b_data))[loc] = 0; 802 else if ((dblk == blkstofrags(fs, blkno) && 803 ffs_snapblkfree(ip, dblk, fs->fs_bsize))) { 804 ip->i_blocks -= btodb(fs->fs_bsize); 805 ((ufs_daddr_t *)(ibp->b_data))[loc] = 0; 806 } 807 } 808 bawrite(ibp); 809 } 810 /* 811 * Clear snapshot flag and drop reference. 812 */ 813 ip->i_flags &= ~SF_SNAPSHOT; 814 ip->i_flag |= IN_CHANGE | IN_UPDATE; 815 } 816 817 /* 818 * Notification that a block is being freed. Return zero if the free 819 * should be allowed to proceed. Return non-zero if the snapshot file 820 * wants to claim the block. The block will be claimed if it is an 821 * uncopied part of one of the snapshots. It will be freed if it is 822 * either a BLK_NOCOPY or has already been copied in all of the snapshots. 823 * If a fragment is being freed, then all snapshots that care about 824 * it must make a copy since a snapshot file can only claim full sized 825 * blocks. Note that if more than one snapshot file maps the block, 826 * we can pick one at random to claim it. Since none of the snapshots 827 * can change, we are assurred that they will all see the same unmodified 828 * image. 
When deleting a snapshot file (see ffs_snapremove above), we 829 * must push any of these claimed blocks to one of the other snapshots 830 * that maps it. These claimed blocks are easily identified as they will 831 * have a block number equal to their logical block number within the 832 * snapshot. A copied block can never have this property because they 833 * must always have been allocated from a BLK_NOCOPY location. 834 */ 835 int 836 ffs_snapblkfree(freeip, bno, size) 837 struct inode *freeip; 838 ufs_daddr_t bno; 839 long size; 840 { 841 struct buf *ibp, *cbp, *savedcbp = 0; 842 struct fs *fs = freeip->i_fs; 843 struct proc *p = CURPROC; 844 struct inode *ip; 845 struct vnode *vp; 846 ufs_daddr_t lbn, blkno; 847 int indiroff = 0, error = 0, claimedblk = 0; 848 struct snaphead *snaphead; 849 850 lbn = fragstoblks(fs, bno); 851 snaphead = &freeip->i_devvp->v_rdev->si_snapshots; 852 TAILQ_FOREACH(ip, snaphead, i_nextsnap) { 853 vp = ITOV(ip); 854 /* 855 * Lookup block being written. 856 */ 857 if (lbn < NDADDR) { 858 blkno = ip->i_db[lbn]; 859 } else { 860 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 861 p->p_flag |= P_COWINPROGRESS; 862 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 863 fs->fs_bsize, KERNCRED, B_METAONLY, &ibp); 864 p->p_flag &= ~P_COWINPROGRESS; 865 VOP_UNLOCK(vp, 0, p); 866 if (error) 867 break; 868 indiroff = (lbn - NDADDR) % NINDIR(fs); 869 blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff]; 870 } 871 /* 872 * Check to see if block needs to be copied. 873 */ 874 switch (blkno) { 875 /* 876 * If the snapshot has already copied the block (default), 877 * or does not care about the block, it is not needed. 878 */ 879 default: 880 case BLK_NOCOPY: 881 if (lbn >= NDADDR) 882 bqrelse(ibp); 883 continue; 884 /* 885 * No previous snapshot claimed the block, so it will be 886 * freed and become a BLK_NOCOPY (don't care) for us. 
887 */ 888 case BLK_SNAP: 889 if (claimedblk) 890 panic("snapblkfree: inconsistent block type"); 891 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 892 if (lbn < NDADDR) { 893 ip->i_db[lbn] = BLK_NOCOPY; 894 ip->i_flag |= IN_CHANGE | IN_UPDATE; 895 } else { 896 ((ufs_daddr_t *)(ibp->b_data))[indiroff] = 897 BLK_NOCOPY; 898 bdwrite(ibp); 899 } 900 VOP_UNLOCK(vp, 0, p); 901 continue; 902 /* 903 * A block that we map is being freed. If it has not been 904 * claimed yet, we will claim or copy it (below). 905 */ 906 case 0: 907 claimedblk = 1; 908 break; 909 } 910 /* 911 * If this is a full size block, we will just grab it 912 * and assign it to the snapshot inode. Otherwise we 913 * will proceed to copy it. See explanation for this 914 * routine as to why only a single snapshot needs to 915 * claim this block. 916 */ 917 if (size == fs->fs_bsize) { 918 #ifdef DEBUG 919 if (snapdebug) 920 printf("%s %d lbn %d from inum %d\n", 921 "Grabonremove: snapino", ip->i_number, lbn, 922 freeip->i_number); 923 #endif 924 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 925 if (lbn < NDADDR) { 926 ip->i_db[lbn] = bno; 927 } else { 928 ((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno; 929 bdwrite(ibp); 930 } 931 ip->i_blocks += btodb(size); 932 ip->i_flag |= IN_CHANGE | IN_UPDATE; 933 VOP_UNLOCK(vp, 0, p); 934 return (1); 935 } 936 if (lbn >= NDADDR) 937 bqrelse(ibp); 938 /* 939 * Allocate the block into which to do the copy. Note that this 940 * allocation will never require any additional allocations for 941 * the snapshot inode. 
942 */ 943 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 944 p->p_flag |= P_COWINPROGRESS; 945 error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), 946 fs->fs_bsize, KERNCRED, 0, &cbp); 947 p->p_flag &= ~P_COWINPROGRESS; 948 if (error) { 949 VOP_UNLOCK(vp, 0, p); 950 break; 951 } 952 #ifdef DEBUG 953 if (snapdebug) 954 printf("%s%d lbn %d for inum %d size %ld to blkno %d\n", 955 "Copyonremove: snapino ", ip->i_number, lbn, 956 freeip->i_number, size, cbp->b_blkno); 957 #endif 958 /* 959 * If we have already read the old block contents, then 960 * simply copy them to the new block. Note that we need 961 * to synchronously write snapshots that have not been 962 * unlinked, and hence will be visible after a crash, 963 * to ensure their integrity. 964 */ 965 if (savedcbp != 0) { 966 bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); 967 bawrite(cbp); 968 if (dopersistence && ip->i_effnlink > 0) 969 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p); 970 VOP_UNLOCK(vp, 0, p); 971 continue; 972 } 973 /* 974 * Otherwise, read the old block contents into the buffer. 975 */ 976 if ((error = readblock(cbp, lbn)) != 0) { 977 bzero(cbp->b_data, fs->fs_bsize); 978 bawrite(cbp); 979 if (dopersistence && ip->i_effnlink > 0) 980 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p); 981 VOP_UNLOCK(vp, 0, p); 982 break; 983 } 984 VOP_UNLOCK(vp, 0, p); 985 savedcbp = cbp; 986 } 987 /* 988 * Note that we need to synchronously write snapshots that 989 * have not been unlinked, and hence will be visible after 990 * a crash, to ensure their integrity. 991 */ 992 if (savedcbp) { 993 vp = savedcbp->b_vp; 994 bawrite(savedcbp); 995 if (dopersistence && VTOI(vp)->i_effnlink > 0) { 996 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); 997 (void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p); 998 VOP_UNLOCK(vp, 0, p); 999 } 1000 } 1001 /* 1002 * If we have been unable to allocate a block in which to do 1003 * the copy, then return non-zero so that the fragment will 1004 * not be freed. 
Although space will be lost, the snapshot 1005 * will stay consistent. 1006 */ 1007 return (error); 1008 } 1009 1010 /* 1011 * Associate snapshot files when mounting. 1012 */ 1013 void 1014 ffs_snapshot_mount(mp) 1015 struct mount *mp; 1016 { 1017 struct ufsmount *ump = VFSTOUFS(mp); 1018 struct fs *fs = ump->um_fs; 1019 struct proc *p = CURPROC; 1020 struct snaphead *snaphead; 1021 struct vnode *vp; 1022 struct inode *ip; 1023 int error, snaploc, loc; 1024 1025 snaphead = &ump->um_devvp->v_rdev->si_snapshots; 1026 for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { 1027 if (fs->fs_snapinum[snaploc] == 0) 1028 return; 1029 if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], &vp)) != 0){ 1030 printf("ffs_snapshot_mount: vget failed %d\n", error); 1031 continue; 1032 } 1033 ip = VTOI(vp); 1034 if ((ip->i_flags & SF_SNAPSHOT) == 0) { 1035 printf("ffs_snapshot_mount: non-snapshot inode %d\n", 1036 fs->fs_snapinum[snaploc]); 1037 vput(vp); 1038 for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { 1039 if (fs->fs_snapinum[loc] == 0) 1040 break; 1041 fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; 1042 } 1043 fs->fs_snapinum[loc - 1] = 0; 1044 snaploc--; 1045 continue; 1046 } 1047 if (ip->i_nextsnap.tqe_prev != 0) 1048 panic("ffs_snapshot_mount: %d already on list", 1049 ip->i_number); 1050 else 1051 TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap); 1052 vp->v_flag |= VSYSTEM; 1053 ump->um_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite; 1054 ump->um_devvp->v_flag |= VCOPYONWRITE; 1055 VOP_UNLOCK(vp, 0, p); 1056 } 1057 } 1058 1059 /* 1060 * Disassociate snapshot files when unmounting. 
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct snaphead *snaphead = &ump->um_devvp->v_rdev->si_snapshots;
	struct inode *xp;

	/*
	 * Unlink every snapshot inode from the device's list.  Only
	 * snapshots that are still linked in the filesystem
	 * (i_effnlink > 0) hold a vnode reference here that must be
	 * released; unlinked ones presumably dropped theirs when the
	 * last name went away -- NOTE(review): confirm against the
	 * snapshot removal path, which is outside this view.
	 */
	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_effnlink > 0)
			vrele(ITOV(xp));
	}
	/* No snapshots remain: disable copy-on-write on the device. */
	ump->um_devvp->v_rdev->si_copyonwrite = 0;
	ump->um_devvp->v_flag &= ~VCOPYONWRITE;
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 *
 * Called with the buffer (bp) destined for the device vnode (devvp).
 * For each active snapshot on the device: if the snapshot has not yet
 * claimed the logical block being overwritten (its block pointer is
 * still 0), allocate a snapshot block and copy the old contents into
 * it before the write proceeds.  The old contents are read only once
 * (into savedcbp) and then copied to each remaining snapshot.
 * Returns 0 on success or an errno from allocation/read failure.
 */
static int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct proc *p = CURPROC;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff, error = 0;

	fs = TAILQ_FIRST(&devvp->v_rdev->si_snapshots)->i_fs;
	/* Convert the device block number to a filesystem logical block. */
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	/*
	 * P_COWINPROGRESS is set around every UFS_BALLOC below precisely
	 * so that a re-entry here (via the allocation writing metadata)
	 * is caught as a bug rather than deadlocking.
	 */
	if (p->p_flag & P_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	TAILQ_FOREACH(ip, &devvp->v_rdev->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We have to
		 * be able to do the UFS_BALLOC without blocking, otherwise
		 * we may get in a deadlock with another process also
		 * trying to allocate. If we find ourselves unable to
		 * get the buffer lock, we unlock the snapshot vnode,
		 * sleep briefly, and try again.
		 */
retry:
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
		if (lbn < NDADDR) {
			/* Direct block: pointer is right in the inode. */
			blkno = ip->i_db[lbn];
		} else {
			/*
			 * Indirect block: fetch (possibly allocating) the
			 * indirect block non-blocking and read the entry.
			 */
			p->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp);
			p->p_flag &= ~P_COWINPROGRESS;
			if (error) {
				VOP_UNLOCK(vp, 0, p);
				if (error != EWOULDBLOCK)
					break;
				/* Buffer was busy; nap and retry. */
				tsleep(vp, p->p_pri.pri_user, "nap", 1);
				goto retry;
			}
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Nonzero pointer: this snapshot already holds the block. */
		if (blkno != 0) {
			VOP_UNLOCK(vp, 0, p);
			continue;
		}
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		p->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
		p->p_flag &= ~P_COWINPROGRESS;
		if (error) {
			VOP_UNLOCK(vp, 0, p);
			if (error != EWOULDBLOCK)
				break;
			tsleep(vp, p->p_pri.pri_user, "nap", 1);
			goto retry;
		}
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %d for ",
			    ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %d to blkno %d\n", bp->b_lblkno,
			    cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
			VOP_UNLOCK(vp, 0, p);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 * On a read error the snapshot block is zeroed so it is
		 * at least deterministic, and the error is returned.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
			VOP_UNLOCK(vp, 0, p);
			break;
		}
		/*
		 * Hold the first copy for the remaining snapshots; it is
		 * written out after the loop.
		 */
		savedcbp = cbp;
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
			VOP_UNLOCK(vp, 0, p);
		}
	}
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 *
 * bp is a buffer belonging to a snapshot vnode; lbn is the logical
 * block to read.  The read goes straight to the underlying device via
 * physio(), bypassing the buffer cache, so it sees the on-disk
 * contents rather than any pending cached copy.
 */
static int
readblock(bp, lbn)
	struct buf *bp;
	daddr_t lbn;
{
	struct uio auio;
	struct iovec aiov;
	struct proc *p = CURPROC;
	struct inode *ip = VTOI(bp->b_vp);

	/* Describe a single-segment kernel-space read into bp's data. */
	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	/* Byte offset on the device of logical block lbn. */
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_procp = p;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}