1 /* 2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * Further information about snapshots can be obtained from: 5 * 6 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 7 * 1614 Oxford Street mckusick@mckusick.com 8 * Berkeley, CA 94709-1608 +1-510-843-9542 9 * USA 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
 *
 * @(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

#define KERNCRED proc0.p_ucred
#define CURPROC curproc
#define DEBUG	/* unconditionally enables the snapdebug sysctl and printfs below */

static int indiracct __P((struct vnode *, struct vnode *, int, ufs_daddr_t,
    int, int, int, int));
static int snapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *));
static int readblock __P((struct buf *, daddr_t));

#ifdef DEBUG
#include <sys/sysctl.h>
int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
#endif /* DEBUG */

/*
 * Create a snapshot file and initialize it for the filesystem.
 *
 * mp       - mounted filesystem to be snapshotted
 * snapfile - user-space pathname at which to create the snapshot file
 *
 * Returns 0 on success or an errno.  The sequence, visible below, is:
 * reserve a superblock snapshot slot; create the file; preallocate
 * every block the copy phase will touch (so no allocation happens
 * while the filesystem is suspended); suspend writes; copy in the
 * cylinder group maps, superblock and summary information; expunge
 * the other snapshots from this snapshot's view; link the inode onto
 * the device inode's copy-on-write list; resume the filesystem.
 */
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	ufs_daddr_t rlbn;
	ufs_daddr_t lbn, blkno, copyblkno, inoblks[FSMAXSNAP];
	int error, cg, snaploc, indiroff, numblks;
	int i, size, base, len, loc, inoblkcnt;
	int blksperindir, flag = mp->mnt_flag;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct proc *p = CURPROC;
	struct inode *devip, *ip, *xp;
	struct buf *bp, *nbp, *ibp;
	struct vnode *vp, *devvp;
	struct nameidata nd;
	struct mount *wrtmp;
	struct dinode *dip;
	struct vattr vat;
	struct cg *cgp;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 * NOTE(review): no such serialization is actually taken here yet.
	 */
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Create the snapshot file.
	 */
restart:
	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, p);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		vput(nd.ni_vp);
		error = EEXIST;
	}
	if (nd.ni_dvp->v_mount != mp)
		error = EXDEV;	/* overrides any EEXIST set just above */
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		return (error);
	}
	VATTR_NULL(&vat);
	vat.va_type = VREG;
	vat.va_mode = S_IRUSR;
	vat.va_vaflags |= VA_EXCLUSIVE;
	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
		wrtmp = NULL;
	if (wrtmp != mp)
		panic("ffs_snapshot: mount mismatch");
	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &wrtmp,
		    V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VOP_LEASE(nd.ni_dvp, p, KERNCRED, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
	vput(nd.ni_dvp);
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vn_finished_write(wrtmp);
		return (error);
	}
	vp = nd.ni_vp;
	ip = VTOI(vp);
	devvp = ip->i_devvp;
	devip = VTOI(devvp);
	/*
	 * Allocate and copy the last block contents so as to be able
	 * to set size to that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, B_CLRBUF, &bp);
	if (error)
		goto out;
	ip->i_size = lblktosize(fs, (off_t)numblks);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if ((error = readblock(bp, numblks - 1)) != 0)
		goto out;
	bawrite(bp);
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks. Also allocate shadow copies
	 * for each of the indirect blocks.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
		if (error)
			goto out;
		copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno));
		bdwrite(ibp);
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno),
		    fs->fs_bsize, p->p_ucred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate shadow blocks to copy all of the other snapshot inodes
	 * so that we will be able to expunge them from this snapshot.
	 * inoblks[] de-duplicates inode blocks shared by several snapshots.
	 */
	for (loc = 0, inoblkcnt = 0; loc < snaploc; loc++) {
		blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc]));
		for (i = 0; i < inoblkcnt; i++)
			if (inoblks[i] == blkno)
				break;
		if (i == inoblkcnt) {
			inoblks[inoblkcnt++] = blkno;
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
			    fs->fs_bsize, KERNCRED, 0, &nbp);
			if (error)
				goto out;
			bawrite(nbp);
		}
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = VOP_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_IMMUTABLE | SF_SNAPSHOT;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p)) != 0)
		goto out;
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Suspend operation on filesystem.  Loop until the suspension
	 * actually takes (MNTK_SUSPENDED), re-acquiring write permission
	 * between attempts.
	 */
	for (;;) {
		vn_finished_write(wrtmp);
		vfs_write_suspend(vp->v_mount);
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	/*
	 * First, copy all the cylinder group maps. All the unallocated
	 * blocks are marked BLK_NOCOPY so that the snapshot knows that
	 * it need not copy them if they are later written.
	 */
	len = howmany(fs->fs_fpg, fs->fs_frag);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		    (int)fs->fs_cgsize, KERNCRED, &bp);
		if (error) {
			brelse(bp);
			goto out1;
		}
		cgp = (struct cg *)bp->b_data;
		if (!cg_chkmagic(cgp)) {
			brelse(bp);
			error = EIO;
			goto out1;
		}
		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
		    KERNCRED, &nbp);
		if (error) {
			brelse(bp);
			brelse(nbp);
			goto out1;
		}
		bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
		if (fs->fs_cgsize < fs->fs_bsize)
			bzero(&nbp->b_data[fs->fs_cgsize],
			    fs->fs_bsize - fs->fs_cgsize);
		nbp->b_flags |= B_VALIDSUSPWRT;
		bawrite(nbp);
		/*
		 * Walk this cylinder group's free map, marking every free
		 * block BLK_NOCOPY in the snapshot's block pointers.
		 * NOTE(review): len is clamped here but never restored for
		 * later iterations; only the last cg can be short, so this
		 * appears harmless -- confirm.
		 */
		base = cg * fs->fs_fpg / fs->fs_frag;
		if (base + len > numblks)
			len = numblks - base;
		loc = 0;
		if (base < NDADDR) {
			for ( ; loc < NDADDR; loc++) {
				if (!ffs_isblock(fs, cg_blksfree(cgp), loc))
					continue;
				ip->i_db[loc] = BLK_NOCOPY;
			}
		}
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error) {
			brelse(bp);
			goto out1;
		}
		indiroff = (base + loc - NDADDR) % NINDIR(fs);
		for ( ; loc < len; loc++, indiroff++) {
			if (indiroff >= NINDIR(fs)) {
				/* crossed into the next indirect block */
				ibp->b_flags |= B_VALIDSUSPWRT;
				bawrite(ibp);
				error = VOP_BALLOC(vp,
				    lblktosize(fs, (off_t)(base + loc)),
				    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
				if (error) {
					brelse(bp);
					goto out1;
				}
				indiroff = 0;
			}
			if (!ffs_isblock(fs, cg_blksfree(cgp), loc))
				continue;
			((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		}
		bqrelse(bp);
		ibp->b_flags |= B_VALIDSUSPWRT;
		bdwrite(ibp);
	}
	/*
	 * Snapshot the superblock and its summary information.
	 */
	error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out1;
	bcopy(fs, nbp->b_data, fs->fs_sbsize);
	((struct fs *)(nbp->b_data))->fs_clean = 1;
	if (fs->fs_sbsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_sbsize],
		    fs->fs_bsize - fs->fs_sbsize);
	nbp->b_flags |= B_VALIDSUSPWRT;
	bawrite(nbp);
	/*
	 * The last summary block may be partial; read its on-disk tail
	 * first so the bcopy of the short csp chunk leaves valid data.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize) - 1;
	size = fs->fs_bsize;
	for (loc = 0; loc <= len; loc++) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		if (loc == len) {
			readblock(nbp, blkno + loc);
			size = fs->fs_cssize % fs->fs_bsize;
		}
		bcopy(fs->fs_csp[loc], nbp->b_data, size);
		nbp->b_flags |= B_VALIDSUSPWRT;
		bawrite(nbp);
	}
	/*
	 * Copy the shadow blocks for the snapshot inodes so that
	 * the copies can be expunged.
	 */
	for (loc = 0; loc < inoblkcnt; loc++) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)inoblks[loc]),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		readblock(nbp, inoblks[loc]);
		nbp->b_flags |= B_VALIDSUSPWRT;
		bdwrite(nbp);
	}
	/*
	 * Copy allocation information from other snapshots and then
	 * expunge them from the view of the current snapshot.
	 */
	for (xp = devip->i_copyonwrite; xp; xp = xp->i_copyonwrite) {
		/*
		 * Before expunging a snapshot inode, note all the
		 * blocks that it claims with BLK_SNAP so that fsck will
		 * be able to account for those blocks properly and so
		 * that this snapshot knows that it need not copy them
		 * if the other snapshot holding them is freed.
		 *
		 * The direct pointer range below relies on i_ib[]
		 * immediately following i_db[] in struct inode.
		 */
		if ((error = snapacct(vp, &xp->i_db[0], &xp->i_ib[NIADDR])) != 0)
			goto out1;
		blksperindir = 1;
		lbn = -NDADDR;	/* meta (indirect) blocks use negative lbns */
		len = numblks - NDADDR;
		rlbn = NDADDR;
		for (i = 0; len > 0 && i < NIADDR; i++) {
			error = indiracct(vp, ITOV(xp), i, xp->i_ib[i], lbn,
			    rlbn, len, blksperindir);
			if (error)
				goto out1;
			blksperindir *= NINDIR(fs);
			lbn -= blksperindir + 1;
			len -= blksperindir;
			rlbn += blksperindir;
		}
		/*
		 * Set copied snapshot inode to be a zero length file.
		 */
		blkno = fragstoblks(fs, ino_to_fsba(fs, xp->i_number));
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		dip = (struct dinode *)nbp->b_data +
		    ino_to_fsbo(fs, xp->i_number);
		dip->di_size = 0;
		dip->di_blocks = 0;
		dip->di_flags &= ~(SF_IMMUTABLE | SF_SNAPSHOT);
		bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t));
		nbp->b_flags |= B_VALIDSUSPWRT;
		bdwrite(nbp);
	}
	/*
	 * Copy all indirect blocks to their shadows (allocated above)
	 * to avoid deadlock in ffs_copyonwrite.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
		if (error)
			goto out1;
		copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno));
		bqrelse(ibp);
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno),
		    fs->fs_bsize, p->p_ucred, 0, &nbp);
		if (error)
			goto out1;
		/* re-fetch the indirect block now that its shadow is held */
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
		if (error) {
			brelse(nbp);
			goto out1;
		}
		bcopy(ibp->b_data, nbp->b_data, fs->fs_bsize);
		bqrelse(ibp);
		nbp->b_flags |= B_VALIDSUSPWRT;
		bawrite(nbp);
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_copyonwrite != 0)
		panic("ffs_snapshot: %d already on list", ip->i_number);
	if (devip->i_copyonwrite == 0) {
		devvp->v_flag |= VCOPYONWRITE;
		devip->i_copyonwrite = ip;
	} else {
		for (xp = devip->i_copyonwrite; xp->i_copyonwrite != 0; )
			xp = xp->i_copyonwrite;
		xp->i_copyonwrite = ip;
	}
	vp->v_flag |= VSYSTEM;
	/*
	 * Resume operation on filesystem.
	 */
out1:
	vfs_write_resume(vp->v_mount);
	vn_start_write(NULL, &wrtmp, V_WAIT);
out:
	mp->mnt_flag = flag;
	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
	if (error)
		vput(vp);
	else
		VOP_UNLOCK(vp, 0, p);
	vn_finished_write(wrtmp);
	return (error);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 *
 * snapvp       - snapshot vnode in which the accounting is recorded
 * cancelvp     - vnode whose indirect chain is traversed; may equal
 *                snapvp (in which case the pointer block is copied out
 *                before accounting, see below)
 * level        - remaining depth below this indirect block (0 = leaf)
 * blkno        - filesystem block number of this indirect block
 * lbn          - logical block number of this indirect block in cancelvp
 *                (negative: the meta-block name space -- TODO confirm)
 * rlbn         - logical number of the first data block it maps
 * remblks      - data blocks still to be accounted for
 * blksperindir - data blocks mapped per pointer at this level
 *
 * Returns 0 or an errno from ufs_getlbns/readblock/snapacct.
 */
static int
indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs_daddr_t blkno;
	int lbn;
	int rlbn;
	int remblks;
	int blksperindir;
{
	int subblksperindir, error, last, num, i;
	struct indir indirs[NIADDR + 2];
	ufs_daddr_t *bap;
	struct buf *bp;
	struct fs *fs;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	fs = VTOI(cancelvp)->i_fs;
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	if (snapvp != cancelvp) {
		bap = (ufs_daddr_t *)bp->b_data;
	} else {
		/*
		 * Accounting against our own snapshot: copy the pointers
		 * aside and release the buffer before snapacct, which may
		 * need to take buffers of its own.
		 */
		MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
		bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
		bqrelse(bp);
	}
	error = snapacct(snapvp, &bap[0], &bap[last]);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	if (snapvp != cancelvp)
		bqrelse(bp);
	else
		FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
/*
 * Mark, in snapshot vp, the slot for every block named in the pointer
 * range [oldblkp, lastblkp) as BLK_SNAP.  Slots already holding 0,
 * BLK_NOCOPY or BLK_SNAP in the source range are skipped; a destination
 * slot that is already nonzero is a fatal inconsistency (panic).
 * Returns 0 or an errno from VOP_BALLOC.
 */
static int
snapacct(vp, oldblkp, lastblkp)
	struct vnode *vp;
	ufs_daddr_t *oldblkp, *lastblkp;
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	ufs_daddr_t lbn, blkno, *blkp;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			/* direct block: pointer lives in the inode itself */
			blkp = &ip->i_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/* indirect: fetch the indirect block holding the slot */
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		if (*blkp != 0)
			panic("snapacct: bad block");
		*blkp = BLK_SNAP;
		if (lbn >= NDADDR) {
			ibp->b_flags |= B_VALIDSUSPWRT;
			bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Prepare a snapshot file for being removed: delete the inode from the
 * superblock snapshot list (keeping it dense) and from the in-core
 * copy-on-write list, hand any blocks this snapshot claims off to the
 * remaining snapshots via ffs_snapblkfree, clear the snapshot flags,
 * and drop the list's reference on the vnode.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip, *xp;
	struct vnode *devvp;
	struct buf *ibp;
	struct fs *fs;
	ufs_daddr_t blkno, dblk;
	int error, snaploc, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	/*
	 * Delete from incore list.
	 * Clear copy-on-write flag if last snapshot.
	 */
	devvp = ip->i_devvp;
	for (xp = VTOI(devvp); xp; xp = xp->i_copyonwrite) {
		if (xp->i_copyonwrite != ip)
			continue;
		xp->i_copyonwrite = ip->i_copyonwrite;
		ip->i_copyonwrite = 0;
		break;
	}
	if (xp == 0) {
		/* not on the list: take a ref to balance the vrele below */
		printf("ffs_snapremove: lost snapshot vnode %d\n",
		    ip->i_number);
		vref(vp);
	}
	if (VTOI(devvp)->i_copyonwrite == 0)
		devvp->v_flag &= ~VCOPYONWRITE;
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 * A claimed block is recognized by its pointer equaling its own
	 * logical address (dblk == blkstofrags(fs, blkno)).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = ip->i_db[blkno];
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP ||
		    (dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(ip, dblk, fs->fs_bsize)))
			ip->i_db[blkno] = 0;
	}
	for (blkno = NDADDR; blkno < fs->fs_size; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if ((last = fs->fs_size - blkno) > NINDIR(fs))
			last = NINDIR(fs);
		for (loc = 0; loc < last; loc++) {
			dblk = ((ufs_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP ||
			    (dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(ip, dblk, fs->fs_bsize)))
				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~(SF_IMMUTABLE | SF_SNAPSHOT);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	vrele(vp);
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
/*
 * freeip - inode in the filesystem from which the block is being freed
 * bno    - filesystem (fragment) address being freed
 * size   - size in bytes of the block/fragment being freed
 */
int
ffs_snapblkfree(freeip, bno, size)
	struct inode *freeip;
	ufs_daddr_t bno;
	long size;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct fs *fs = freeip->i_fs;
	struct proc *p = CURPROC;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff = 0, error = 0, claimedblk = 0;

	lbn = fragstoblks(fs, bno);
	for (ip = VTOI(freeip->i_devvp)->i_copyonwrite; ip;
	    ip = ip->i_copyonwrite) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = ip->i_db[lbn];
		} else {
			/*
			 * P_COWINPROGRESS suppresses re-entry into
			 * ffs_copyonwrite while we hold the snapshot lock.
			 */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			p->p_flag |= P_COWINPROGRESS;
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			p->p_flag &= ~P_COWINPROGRESS;
			VOP_UNLOCK(vp, 0, p);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		switch (blkno) {
		/*
		 * If the snapshot has already copied the block (default),
		 * or does not care about the block, it is not needed.
		 */
		default:
		case BLK_NOCOPY:
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		/*
		 * No previous snapshot claimed the block, so it will be
		 * freed and become a BLK_NOCOPY (don't care) for us.
		 */
		case BLK_SNAP:
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			if (lbn < NDADDR) {
				ip->i_db[lbn] = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				((ufs_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			VOP_UNLOCK(vp, 0, p);
			continue;
		/*
		 * A block that we map is being freed. If it has not been
		 * claimed yet, we will claim or copy it (below).
		 */
		case 0:
			claimedblk = 1;
			break;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %d from inum %d\n",
				    "Grabonremove: snapino", ip->i_number, lbn,
				    freeip->i_number);
#endif
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			if (lbn < NDADDR) {
				ip->i_db[lbn] = bno;
			} else {
				((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			ip->i_blocks += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, p);
			return (1);	/* claimed: caller must not free it */
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
		p->p_flag |= P_COWINPROGRESS;
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		p->p_flag &= ~P_COWINPROGRESS;
		VOP_UNLOCK(vp, 0, p);
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %d for inum %d size %ld to blkno %d\n",
			    "Copyonremove: snapino ", ip->i_number, lbn,
			    freeip->i_number, size, cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block.  (The first snapshot
		 * to need the data reads it; later ones copy from
		 * savedcbp to avoid rereading the device.)
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0)
			break;
		savedcbp = cbp;
	}
	if (savedcbp)
		bawrite(savedcbp);
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
/*
 * For each inode listed in fs_snapinum: vget it, drop (and compact out
 * of the superblock list) any entry that is no longer a snapshot inode,
 * and append the rest to the device inode's copy-on-write list in
 * superblock order.  Sets VCOPYONWRITE on the device vnode once at
 * least one snapshot is linked.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct fs *fs = ump->um_fs;
	struct proc *p = CURPROC;
	struct inode *ip, **listtailp;
	struct vnode *vp;
	int error, snaploc, loc;

	listtailp = &VTOI(ump->um_devvp)->i_copyonwrite;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			return;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], &vp)) != 0){
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
			/* stale entry: drop it and keep the list dense */
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;	/* re-examine the slot just shifted in */
			continue;
		}
		if (ip->i_copyonwrite != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		*listtailp = ip;
		listtailp = &ip->i_copyonwrite;
		/* keep the vnode referenced (no vput) but unlocked */
		vp->v_flag |= VSYSTEM;
		VOP_UNLOCK(vp, 0, p);
		ump->um_devvp->v_flag |= VCOPYONWRITE;
	}
}

/*
 * Disassociate snapshot files when unmounting: unlink every inode from
 * the device inode's copy-on-write list and release the reference that
 * the list held on each snapshot vnode.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct inode *devip = VTOI(ump->um_devvp);
	struct inode *xp;

	while ((xp = devip->i_copyonwrite) != 0) {
		devip->i_copyonwrite = xp->i_copyonwrite;
		xp->i_copyonwrite = 0;
		vrele(ITOV(xp));
	}
	ump->um_devvp->v_flag &= ~VCOPYONWRITE;
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
/*
 * Invoked for writes to the snapshotted device vnode (ap->a_vp) with the
 * buffer about to be written in ap->a_bp.  For each snapshot on the
 * device's copy-on-write list that has neither copied nor dismissed the
 * block (pointer still 0), a new block is allocated in the snapshot and
 * the old contents are copied in before the write proceeds.  Returns 0
 * or an errno.
 */
int
ffs_copyonwrite(ap)
	struct vop_copyonwrite_args /* {
		struct vnode *a_vp;
		struct buf *a_bp;
	} */ *ap;
{
	struct buf *ibp, *cbp, *savedcbp = 0, *bp = ap->a_bp;
	struct fs *fs = VTOI(bp->b_vp)->i_fs;
	struct proc *p = CURPROC;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff, error = 0;

	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	if (p->p_flag & P_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	for (ip = VTOI(ap->a_vp)->i_copyonwrite; ip; ip = ip->i_copyonwrite) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in VOP_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We have to
		 * be able to do the VOP_BALLOC without blocking, otherwise
		 * we may get in a deadlock with another process also
		 * trying to allocate. If we find ourselves unable to
		 * get the buffer lock, we unlock the snapshot vnode,
		 * sleep briefly, and try again.
		 */
retry:
		vn_lock(vp, LK_SHARED | LK_RETRY, p);
		if (lbn < NDADDR) {
			blkno = ip->i_db[lbn];
		} else {
			p->p_flag |= P_COWINPROGRESS;
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp);
			p->p_flag &= ~P_COWINPROGRESS;
			if (error) {
				VOP_UNLOCK(vp, 0, p);
				if (error != EWOULDBLOCK)
					break;
				/* brief backoff before retrying B_NOWAIT */
				tsleep(vp, p->p_usrpri, "nap", 1);
				goto retry;
			}
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0) {
			/* already copied, claimed, or marked don't-care */
			VOP_UNLOCK(vp, 0, p);
			continue;
		}
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		p->p_flag |= P_COWINPROGRESS;
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
		p->p_flag &= ~P_COWINPROGRESS;
		VOP_UNLOCK(vp, 0, p);
		if (error) {
			if (error != EWOULDBLOCK)
				break;
			tsleep(vp, p->p_usrpri, "nap", 1);
			goto retry;
		}
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %d for ",
			    ip->i_number, lbn);
			if (bp->b_vp == ap->a_vp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %d to blkno %d\n", bp->b_lblkno,
			    cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block.  (Only the first
		 * snapshot needing the data reads it from disk.)
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0)
			break;
		savedcbp = cbp;
	}
	if (savedcbp)
		bawrite(savedcbp);
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 *
 * bp  - buffer to fill; b_data/b_bcount describe the destination
 * lbn - logical filesystem block number to read; the read goes
 *       straight to the underlying device via physio(), bypassing
 *       the buffer cache.
 */
static int
readblock(bp, lbn)
	struct buf *bp;
	daddr_t lbn;
{
	struct uio auio;
	struct iovec aiov;
	struct proc *p = CURPROC;
	struct inode *ip = VTOI(bp->b_vp);

	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_procp = p;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}