1 /* 2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * Further information about snapshots can be obtained from: 5 * 6 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 7 * 1614 Oxford Street mckusick@mckusick.com 8 * Berkeley, CA 94709-1608 +1-510-843-9542 9 * USA 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
 *
 * @(#)ffs_snapshot.c	8.10 (McKusick)	7/11/00
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#define KERNCRED proc0.p_ucred
#define CURPROC curproc
#define DEBUG

static int indiracct __P((struct vnode *, struct vnode *, int, ufs_daddr_t,
    int, int, int, int));
static int snapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *));
static int readblock __P((struct buf *, daddr_t));

#ifdef DEBUG
#include <sys/sysctl.h>
/* debug.snapdebug sysctl: when nonzero, log snapshot copy/grab activity. */
int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
#endif /* DEBUG */

/*
 * Create a snapshot file and initialize it for the filesystem.
 *
 * mp       - mount point of the filesystem to be snapshotted
 * snapfile - user-space pathname at which the snapshot file is created
 *
 * Returns 0 on success, else an errno: ENOSPC when all FSMAXSNAP
 * superblock snapshot slots are in use, EEXIST when snapfile already
 * exists, EXDEV when snapfile does not lie on mp, or an error from
 * namei/VOP_CREATE/VOP_BALLOC/bread/VOP_FSYNC.
 */
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	ufs_daddr_t rlbn;
	ufs_daddr_t lbn, blkno, copyblkno, inoblks[FSMAXSNAP];
	int error, cg, snaploc, indiroff, numblks;
	int i, size, base, len, loc, inoblkcnt;
	int blksperindir, flag = mp->mnt_flag;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct proc *p = CURPROC;
	struct inode *devip, *ip, *xp;
	struct buf *bp, *nbp, *ibp;
	struct vnode *vp, *devvp;
	struct nameidata nd;
	struct mount *wrtmp;
	struct dinode *dip;
	struct vattr vat;
	struct cg *cgp;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 *
	 * NOTE(review): no lock is actually taken here; serialization
	 * appears to be unimplemented at this point — confirm.
	 */
	/*
	 * Assign a snapshot slot in the superblock: the first zero
	 * entry in fs_snapinum[].
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Create the snapshot file.
	 */
restart:
	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, p);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		vput(nd.ni_vp);
		error = EEXIST;
	}
	if (nd.ni_dvp->v_mount != mp)
		error = EXDEV;
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		return (error);
	}
	VATTR_NULL(&vat);
	vat.va_type = VREG;
	vat.va_mode = S_IRUSR;
	vat.va_vaflags |= VA_EXCLUSIVE;
	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
		wrtmp = NULL;
	if (wrtmp != mp)
		panic("ffs_snapshot: mount mismatch");
	/* Get a write reference without sleeping; retry the whole
	 * lookup if the filesystem is currently suspended. */
	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &wrtmp,
		    V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VOP_LEASE(nd.ni_dvp, p, KERNCRED, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
	vput(nd.ni_dvp);
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vn_finished_write(wrtmp);
		return (error);
	}
	vp = nd.ni_vp;
	ip = VTOI(vp);
	devvp = ip->i_devvp;
	devip = VTOI(devvp);
	/*
	 * Allocate and copy the last block contents so as to be able
	 * to set size to that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, B_CLRBUF, &bp);
	if (error)
		goto out;
	ip->i_size = lblktosize(fs, (off_t)numblks);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if ((error = readblock(bp, numblks - 1)) != 0)
		goto out;
	bawrite(bp);
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks. Also allocate shadow copies
	 * for each of the indirect blocks.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
		if (error)
			goto out;
		/* The shadow copy lives at the logical block matching the
		 * indirect block's physical location. */
		copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno));
		bdwrite(ibp);
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno),
		    fs->fs_bsize, p->p_ucred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate shadow blocks to copy all of the other snapshot inodes
	 * so that we will be able to expunge them from this snapshot.
	 * inoblks[] records each distinct inode block only once.
	 */
	for (loc = 0, inoblkcnt = 0; loc < snaploc; loc++) {
		blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc]));
		for (i = 0; i < inoblkcnt; i++)
			if (inoblks[i] == blkno)
				break;
		if (i == inoblkcnt) {
			inoblks[inoblkcnt++] = blkno;
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
			    fs->fs_bsize, KERNCRED, 0, &nbp);
			if (error)
				goto out;
			bawrite(nbp);
		}
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = VOP_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_IMMUTABLE | SF_SNAPSHOT;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p)) != 0)
		goto out;
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Suspend operation on filesystem. Loop until MNTK_SUSPENDED is
	 * observed: each iteration drops our write ref, requests
	 * suspension, and re-acquires the ref if suspension failed.
	 */
	for (;;) {
		vn_finished_write(wrtmp);
		vfs_write_suspend(vp->v_mount);
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	/*
	 * First, copy all the cylinder group maps. All the unallocated
	 * blocks are marked BLK_NOCOPY so that the snapshot knows that
	 * it need not copy them if they are later written.
	 */
	len = howmany(fs->fs_fpg, fs->fs_frag);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		    (int)fs->fs_cgsize, KERNCRED, &bp);
		if (error) {
			brelse(bp);
			goto out1;
		}
		cgp = (struct cg *)bp->b_data;
		if (!cg_chkmagic(cgp)) {
			brelse(bp);
			error = EIO;
			goto out1;
		}
		/* Copy the cg map into the snapshot's preallocated block,
		 * zero-padding the tail if the map is smaller than a block. */
		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
		    KERNCRED, &nbp);
		if (error) {
			brelse(bp);
			brelse(nbp);
			goto out1;
		}
		bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
		if (fs->fs_cgsize < fs->fs_bsize)
			bzero(&nbp->b_data[fs->fs_cgsize],
			    fs->fs_bsize - fs->fs_cgsize);
		bawrite(nbp);
		/* Mark every free block in this cg as BLK_NOCOPY in the
		 * snapshot's block map: direct pointers first, ... */
		base = cg * fs->fs_fpg / fs->fs_frag;
		if (base + len > numblks)
			len = numblks - base;
		loc = 0;
		if (base < NDADDR) {
			for ( ; loc < NDADDR; loc++) {
				if (!ffs_isblock(fs, cg_blksfree(cgp), loc))
					continue;
				ip->i_db[loc] = BLK_NOCOPY;
			}
		}
		/* ... then indirect entries, advancing ibp/indiroff across
		 * indirect-block boundaries as needed. */
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error) {
			brelse(bp);
			goto out1;
		}
		indiroff = (base + loc - NDADDR) % NINDIR(fs);
		for ( ; loc < len; loc++, indiroff++) {
			if (indiroff >= NINDIR(fs)) {
				bawrite(ibp);
				error = VOP_BALLOC(vp,
				    lblktosize(fs, (off_t)(base + loc)),
				    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
				if (error) {
					brelse(bp);
					goto out1;
				}
				indiroff = 0;
			}
			if (!ffs_isblock(fs, cg_blksfree(cgp), loc))
				continue;
			((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		}
		brelse(bp);
		bdwrite(ibp);
	}
	/*
	 * Snapshot the superblock and its summary information.
	 * The copy is marked clean (fs_clean = 1).
	 */
	error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out1;
	bcopy(fs, nbp->b_data, fs->fs_sbsize);
	((struct fs *)(nbp->b_data))->fs_clean = 1;
	if (fs->fs_sbsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_sbsize],
		    fs->fs_bsize - fs->fs_sbsize);
	bawrite(nbp);
	/* Copy the csum summary area; the final (partial) block is first
	 * filled from disk so the tail beyond fs_cssize is preserved. */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize) - 1;
	size = fs->fs_bsize;
	for (loc = 0; loc <= len; loc++) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		if (loc == len) {
			readblock(nbp, blkno + loc);
			size = fs->fs_cssize % fs->fs_bsize;
		}
		bcopy(fs->fs_csp[loc], nbp->b_data, size);
		bawrite(nbp);
	}
	/*
	 * Copy the shadow blocks for the snapshot inodes so that
	 * the copies can can be expunged.
	 */
	for (loc = 0; loc < inoblkcnt; loc++) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)inoblks[loc]),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		readblock(nbp, inoblks[loc]);
		bdwrite(nbp);
	}
	/*
	 * Copy allocation information from other snapshots and then
	 * expunge them from the view of the current snapshot.
	 */
	for (xp = devip->i_copyonwrite; xp; xp = xp->i_copyonwrite) {
		/*
		 * Before expunging a snapshot inode, note all the
		 * blocks that it claims with BLK_SNAP so that fsck will
		 * be able to account for those blocks properly and so
		 * that this snapshot knows that it need not copy them
		 * if the other snapshot holding them is freed.
		 *
		 * First the direct pointers (i_db/i_ib arrays), then
		 * each level of indirection via indiracct().
		 */
		if ((error = snapacct(vp, &xp->i_db[0],
		    &xp->i_ib[NIADDR])) != 0)
			goto out1;
		blksperindir = 1;
		lbn = -NDADDR;
		len = numblks - NDADDR;
		rlbn = NDADDR;
		for (i = 0; len > 0 && i < NIADDR; i++) {
			error = indiracct(vp, ITOV(xp), i, xp->i_ib[i], lbn,
			    rlbn, len, blksperindir);
			if (error)
				goto out1;
			blksperindir *= NINDIR(fs);
			lbn -= blksperindir + 1;
			len -= blksperindir;
			rlbn += blksperindir;
		}
		/*
		 * Set copied snapshot inode to be a zero length file.
		 */
		blkno = fragstoblks(fs, ino_to_fsba(fs, xp->i_number));
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		dip = (struct dinode *)nbp->b_data +
		    ino_to_fsbo(fs, xp->i_number);
		dip->di_size = 0;
		dip->di_blocks = 0;
		dip->di_flags &= ~(SF_IMMUTABLE | SF_SNAPSHOT);
		bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t));
		bdwrite(nbp);
	}
	/*
	 * Copy all indirect blocks to their shadows (allocated above)
	 * to avoid deadlock in ffs_copyonwrite.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
		if (error)
			goto out1;
		copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno));
		brelse(ibp);
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno),
		    fs->fs_bsize, p->p_ucred, 0, &nbp);
		if (error)
			goto out1;
		/* Re-fetch the indirect block; NOTE(review): presumably the
		 * earlier brelse avoids holding two buffers across the
		 * shadow-block BALLOC — confirm. */
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
		if (error) {
			brelse(nbp);
			goto out1;
		}
		bcopy(ibp->b_data, nbp->b_data, fs->fs_bsize);
		brelse(ibp);
		bawrite(nbp);
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_copyonwrite != 0)
		panic("ffs_snapshot: %d already on list", ip->i_number);
	if (devip->i_copyonwrite == 0) {
		devvp->v_flag |= VCOPYONWRITE;
		devip->i_copyonwrite = ip;
	} else {
		for (xp = devip->i_copyonwrite; xp->i_copyonwrite != 0; )
			xp = xp->i_copyonwrite;
		xp->i_copyonwrite = ip;
	}
	vp->v_flag |= VSYSTEM;
	/*
	 * Resume operation on filesystem.
	 */
out1:
	vfs_write_resume(vp->v_mount);
	vn_start_write(NULL, &wrtmp, V_WAIT);
out:
	mp->mnt_flag = flag;
	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
	if (error)
		vput(vp);
	else
		VOP_UNLOCK(vp, 0, p);
	vn_finished_write(wrtmp);
	return (error);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 *
 * snapvp       - snapshot vnode receiving the accounting (via snapacct)
 * cancelvp     - vnode whose indirect chain is being scanned
 * level        - remaining levels of indirection below this block
 * blkno        - filesystem block number of this indirect block
 * lbn          - (negative) logical block number of this indirect block
 * rlbn         - first data logical block covered by this indirect block
 * remblks      - data blocks remaining to be accounted for
 * blksperindir - data blocks mapped by each entry at this level
 *
 * Returns 0 on success or an errno from ufs_getlbns, readblock,
 * snapacct, or a recursive call.
 */
static int
indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs_daddr_t blkno;
	int lbn;
	int rlbn;
	int remblks;
	int blksperindir;
{
	int subblksperindir, error, last, num, i;
	struct indir indirs[NIADDR + 2];
	ufs_daddr_t *bap;
	struct buf *bp;
	struct fs *fs;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	fs = VTOI(cancelvp)->i_fs;
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	if (snapvp != cancelvp) {
		bap = (ufs_daddr_t *)bp->b_data;
	} else {
		/* When scanning our own vnode, work on a private copy so
		 * the buffer can be released before snapacct dirties the
		 * snapshot's own indirect blocks. */
		MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
		bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
		brelse(bp);
	}
	error = snapacct(snapvp, &bap[0], &bap[last]);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	if (snapvp != cancelvp)
		brelse(bp);
	else
		FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 *
 * Walks the block pointers in [oldblkp, lastblkp) and, for every
 * pointer that is allocated (not 0, BLK_NOCOPY, or BLK_SNAP), marks
 * the corresponding slot in snapshot vp's block map as BLK_SNAP.
 * Panics if that slot is already nonzero. Returns 0 on success or an
 * errno from VOP_BALLOC.
 */
static int
snapacct(vp, oldblkp, lastblkp)
	struct vnode *vp;
	ufs_daddr_t *oldblkp, *lastblkp;
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	ufs_daddr_t lbn, blkno, *blkp;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		if (*blkp != 0)
			panic("snapacct: bad block");
		*blkp = BLK_SNAP;
		if (lbn >= NDADDR)
			bdwrite(ibp);
	}
	return (0);
}

/*
 * Prepare a snapshot file for being removed.
 *
 * Removes the snapshot inode for vp from the superblock slot list and
 * the in-core copy-on-write list, hands its claimed blocks to other
 * snapshots (via ffs_snapblkfree), clears the snapshot flags, and
 * drops the list's vnode reference.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip, *xp;
	struct vnode *devvp;
	struct buf *ibp;
	struct fs *fs;
	ufs_daddr_t blkno, dblk;
	int error, snaploc, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	/*
	 * Delete from incore list.
	 * Clear copy-on-write flag if last snapshot.
	 */
	devvp = ip->i_devvp;
	for (xp = VTOI(devvp); xp; xp = xp->i_copyonwrite) {
		if (xp->i_copyonwrite != ip)
			continue;
		xp->i_copyonwrite = ip->i_copyonwrite;
		ip->i_copyonwrite = 0;
		break;
	}
	if (xp == 0) {
		printf("ffs_snapremove: lost snapshot vnode %d\n",
		    ip->i_number);
		/* NOTE(review): presumably this vref balances the vrele at
		 * the end since no list reference was found — confirm. */
		vref(vp);
	}
	if (VTOI(devvp)->i_copyonwrite == 0)
		devvp->v_flag &= ~VCOPYONWRITE;
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 *
	 * A claimed block is recognized by dblk == blkstofrags(fs, blkno),
	 * i.e. its physical address equals its logical position.
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = ip->i_db[blkno];
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP ||
		    (dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(ip, dblk, fs->fs_bsize)))
			ip->i_db[blkno] = 0;
	}
	for (blkno = NDADDR; blkno < fs->fs_size; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if ((last = fs->fs_size - blkno) > NINDIR(fs))
			last = NINDIR(fs);
		for (loc = 0; loc < last; loc++) {
			dblk = ((ufs_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP ||
			    (dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(ip, dblk, fs->fs_bsize)))
				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~(SF_IMMUTABLE | SF_SNAPSHOT);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	vrele(vp);
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assurred that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it.
 * These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 *
 * freeip - inode in which the block is being freed
 * bno    - filesystem fragment number of the block being freed
 * size   - size in bytes of the block or fragment being freed
 */
int
ffs_snapblkfree(freeip, bno, size)
	struct inode *freeip;
	ufs_daddr_t bno;
	long size;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct fs *fs = freeip->i_fs;
	struct proc *p = CURPROC;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff = 0, error = 0, claimedblk = 0;

	lbn = fragstoblks(fs, bno);
	/* Walk every snapshot on this device's copy-on-write list. */
	for (ip = VTOI(freeip->i_devvp)->i_copyonwrite; ip;
	    ip = ip->i_copyonwrite) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = ip->i_db[lbn];
		} else {
			/* P_COWINPROGRESS suppresses recursive copy-on-write
			 * while we do the indirect-block lookup. */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			p->p_flag |= P_COWINPROGRESS;
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			p->p_flag &= ~P_COWINPROGRESS;
			VOP_UNLOCK(vp, 0, p);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		switch (blkno) {
		/*
		 * If the snapshot has already copied the block (default),
		 * or does not care about the block, it is not needed.
		 */
		default:
		case BLK_NOCOPY:
			if (lbn >= NDADDR)
				brelse(ibp);
			continue;
		/*
		 * No previous snapshot claimed the block, so it will be
		 * freed and become a BLK_NOCOPY (don't care) for us.
		 */
		case BLK_SNAP:
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			if (lbn < NDADDR) {
				ip->i_db[lbn] = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				((ufs_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			VOP_UNLOCK(vp, 0, p);
			continue;
		/*
		 * A block that we map is being freed. If it has not been
		 * claimed yet, we will claim or copy it (below).
		 */
		case 0:
			claimedblk = 1;
			break;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %d from inum %d\n",
				    "Grabonremove: snapino", ip->i_number, lbn,
				    freeip->i_number);
#endif
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			if (lbn < NDADDR) {
				ip->i_db[lbn] = bno;
			} else {
				((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			ip->i_blocks += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, p);
			/* Nonzero: the snapshot claims the block; caller
			 * must not free it. */
			return (1);
		}
		if (lbn >= NDADDR)
			brelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
		p->p_flag |= P_COWINPROGRESS;
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		p->p_flag &= ~P_COWINPROGRESS;
		VOP_UNLOCK(vp, 0, p);
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %d for inum %d size %ld to blkno %d\n",
			    "Copyonremove: snapino ", ip->i_number, lbn,
			    freeip->i_number, size, cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0)
			break;
		savedcbp = cbp;
	}
	if (savedcbp)
		bawrite(savedcbp);
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	return (error);
}

/*
 * Associate snapshot files when mounting.
 *
 * For each inode number recorded in fs_snapinum[], fetches the vnode,
 * validates its SF_SNAPSHOT flag, and appends it to the device vnode's
 * in-core copy-on-write list. Stale (non-snapshot) entries are removed
 * from the superblock list, keeping it dense.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct fs *fs = ump->um_fs;
	struct proc *p = CURPROC;
	struct inode *ip, **listtailp;
	struct vnode *vp;
	int error, snaploc, loc;

	listtailp = &VTOI(ump->um_devvp)->i_copyonwrite;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		/* The list is dense: a zero entry ends it. */
		if (fs->fs_snapinum[snaploc] == 0)
			return;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], &vp)) != 0){
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
			/* Not actually a snapshot: drop it and compact
			 * the superblock list over this slot. */
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		if (ip->i_copyonwrite != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		*listtailp = ip;
		listtailp = &ip->i_copyonwrite;
		vp->v_flag |= VSYSTEM;
		VOP_UNLOCK(vp, 0, p);
		ump->um_devvp->v_flag |= VCOPYONWRITE;
	}
}

/*
 * Disassociate snapshot files when unmounting.
 *
 * Unlinks every snapshot inode from the device vnode's copy-on-write
 * list, releasing the vnode reference held by the list, and clears
 * the device's VCOPYONWRITE flag.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct inode *devip = VTOI(ump->um_devvp);
	struct inode *xp;

	while ((xp = devip->i_copyonwrite) != 0) {
		devip->i_copyonwrite = xp->i_copyonwrite;
		xp->i_copyonwrite = 0;
		vrele(ITOV(xp));
	}
	ump->um_devvp->v_flag &= ~VCOPYONWRITE;
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 *
 * ap->a_vp is the device vnode whose copy-on-write list is consulted;
 * ap->a_bp is the buffer about to be written. For every snapshot that
 * has not yet copied the target block (map entry == 0), the old
 * contents are read once and copied into each such snapshot.
 * Returns 0 on success or an errno from VOP_BALLOC/readblock.
 */
int
ffs_copyonwrite(ap)
	struct vop_copyonwrite_args /* {
		struct vnode *a_vp;
		struct buf *a_bp;
	} */ *ap;
{
	struct buf *ibp, *cbp, *savedcbp = 0, *bp = ap->a_bp;
	struct fs *fs = VTOI(bp->b_vp)->i_fs;
	struct proc *p = CURPROC;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff, error = 0;

	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	/* P_COWINPROGRESS is set around VOP_BALLOC calls below; entering
	 * here with it set would mean an unexpected recursion. */
	if (p->p_flag & P_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	for (ip = VTOI(ap->a_vp)->i_copyonwrite; ip; ip = ip->i_copyonwrite) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in VOP_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied.
		 */
		if (lbn < NDADDR) {
			blkno = ip->i_db[lbn];
		} else {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			p->p_flag |= P_COWINPROGRESS;
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			p->p_flag &= ~P_COWINPROGRESS;
			VOP_UNLOCK(vp, 0, p);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
			brelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Nonzero map entry: already copied, claimed, or marked
		 * BLK_NOCOPY — nothing to do for this snapshot. */
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
		p->p_flag |= P_COWINPROGRESS;
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		p->p_flag &= ~P_COWINPROGRESS;
		VOP_UNLOCK(vp, 0, p);
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %d for ",
			    ip->i_number, lbn);
			if (bp->b_vp == ap->a_vp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %d to blkno %d\n", bp->b_lblkno,
			    cbp->b_blkno);
		}
#endif
		if (error)
			break;
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0)
			break;
		savedcbp = cbp;
	}
	if (savedcbp)
		bawrite(savedcbp);
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 *
 * bp  - buffer whose data area receives the block contents
 * lbn - logical block number within the filesystem to read
 *
 * Performs a synchronous raw read from the underlying device via
 * physio(); returns 0 or the errno from physio().
 */
static int
readblock(bp, lbn)
	struct buf *bp;
	daddr_t lbn;
{
	struct uio auio;
	struct iovec aiov;
	struct proc *p = CURPROC;
	struct inode *ip = VTOI(bp->b_vp);

	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_procp = p;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}