1 /* 2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. 3 * 4 * Further information about snapshots can be obtained from: 5 * 6 * Marshall Kirk McKusick http://www.mckusick.com/softdep/ 7 * 1614 Oxford Street mckusick@mckusick.com 8 * Berkeley, CA 94709-1608 +1-510-843-9542 9 * USA 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY 22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR 25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 
 *
 * @(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

/* Credential used for internal snapshot I/O: the kernel's own ucred. */
#define KERNCRED proc0.p_ucred
#define DEBUG 1

static int indiracct __P((struct vnode *, struct vnode *, int, ufs_daddr_t,
    int, int, int, int));
static int snapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *));
static int readblock __P((struct buf *, daddr_t));

#ifdef DEBUG
#include <sys/sysctl.h>
/* Set debug.snapdebug to a non-zero value to trace block grabs/copies. */
int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
#endif /* DEBUG */

/*
 * Create a snapshot file and initialize it for the filesystem.
 *
 * mp       - the mounted filesystem to snapshot
 * snapfile - userspace pathname at which to create the snapshot file
 *
 * Returns 0 on success or an errno value (e.g. ENOSPC when all
 * FSMAXSNAP superblock slots are in use, EEXIST if the path already
 * exists, EXDEV if the path is not on the target filesystem).
 */
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	ufs_daddr_t rlbn;
	ufs_daddr_t lbn, blkno, copyblkno, inoblks[FSMAXSNAP];
	int error, cg, snaploc, indiroff, numblks;
	int i, size, base, len, loc, inoblkcnt;
	int blksperindir, flag = mp->mnt_flag;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct proc *p = CURPROC;
	struct inode *devip, *ip, *xp;
	struct buf *bp, *nbp, *ibp;
	struct vnode *vp, *devvp;
	struct nameidata nd;
	struct mount *wrtmp;
	struct dinode *dip;
	struct vattr vat;
	struct cg *cgp;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Create the snapshot file.
	 */
restart:
	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, p);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		/* Target already exists; release it and fail below. */
		vput(nd.ni_vp);
		error = EEXIST;
	}
	if (nd.ni_dvp->v_mount != mp)
		error = EXDEV;
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		return (error);
	}
	VATTR_NULL(&vat);
	vat.va_type = VREG;
	vat.va_mode = S_IRUSR;
	vat.va_vaflags |= VA_EXCLUSIVE;
	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
		wrtmp = NULL;
	if (wrtmp != mp)
		panic("ffs_snapshot: mount mismatch");
	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
		/*
		 * A suspension is in progress; back out, wait for it to
		 * finish (interruptibly), and retry the whole lookup.
		 */
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &wrtmp,
		    V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VOP_LEASE(nd.ni_dvp, p, KERNCRED, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
	vput(nd.ni_dvp);
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vn_finished_write(wrtmp);
		return (error);
	}
	vp = nd.ni_vp;
	ip = VTOI(vp);
	devvp = ip->i_devvp;
	devip = VTOI(devvp);
	/*
	 * Allocate and copy the last block contents so as to be able
	 * to set size to that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, B_CLRBUF, &bp);
	if (error)
		goto out;
	ip->i_size = lblktosize(fs, (off_t)numblks);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if ((error = readblock(bp, numblks - 1)) != 0)
		goto out;
	bawrite(bp);
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks. Also allocate shadow copies
	 * for each of the indirect blocks.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
		if (error)
			goto out;
		copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno));
		bdwrite(ibp);
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno),
		    fs->fs_bsize, p->p_ucred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate shadow blocks to copy all of the other snapshot inodes
	 * so that we will be able to expunge them from this snapshot.
	 * inoblks[] collects the distinct inode-block numbers (several
	 * snapshot inodes may share one block).
	 */
	for (loc = 0, inoblkcnt = 0; loc < snaploc; loc++) {
		blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc]));
		for (i = 0; i < inoblkcnt; i++)
			if (inoblks[i] == blkno)
				break;
		if (i == inoblkcnt) {
			inoblks[inoblkcnt++] = blkno;
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
			    fs->fs_bsize, KERNCRED, 0, &nbp);
			if (error)
				goto out;
			bawrite(nbp);
		}
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = VOP_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p)) != 0)
		goto out;
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Suspend operation on filesystem. Loop until the suspension
	 * has actually taken effect (MNTK_SUSPENDED set); another
	 * writer may have slipped in before the suspension completed.
	 */
	for (;;) {
		vn_finished_write(wrtmp);
		vfs_write_suspend(vp->v_mount);
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	/*
	 * First, copy all the cylinder group maps. All the unallocated
	 * blocks are marked BLK_NOCOPY so that the snapshot knows that
	 * it need not copy them if they are later written.
	 */
	len = howmany(fs->fs_fpg, fs->fs_frag);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		    (int)fs->fs_cgsize, KERNCRED, &bp);
		if (error) {
			brelse(bp);
			goto out1;
		}
		cgp = (struct cg *)bp->b_data;
		if (!cg_chkmagic(cgp)) {
			brelse(bp);
			error = EIO;
			goto out1;
		}
		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
		    KERNCRED, &nbp);
		if (error) {
			brelse(bp);
			brelse(nbp);
			goto out1;
		}
		bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
		if (fs->fs_cgsize < fs->fs_bsize)
			bzero(&nbp->b_data[fs->fs_cgsize],
			    fs->fs_bsize - fs->fs_cgsize);
		/*
		 * B_VALIDSUSPWRT lets this buffer be written out even
		 * though the filesystem is suspended.
		 */
		nbp->b_flags |= B_VALIDSUSPWRT;
		bawrite(nbp);
		/*
		 * Walk this cylinder group's free-block map and mark
		 * every free block BLK_NOCOPY in the snapshot's block
		 * pointers (direct pointers first, then indirects).
		 */
		base = cg * fs->fs_fpg / fs->fs_frag;
		if (base + len > numblks)
			len = numblks - base;
		loc = 0;
		if (base < NDADDR) {
			for ( ; loc < NDADDR; loc++) {
				if (!ffs_isblock(fs, cg_blksfree(cgp), loc))
					continue;
				ip->i_db[loc] = BLK_NOCOPY;
			}
		}
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error) {
			brelse(bp);
			goto out1;
		}
		indiroff = (base + loc - NDADDR) % NINDIR(fs);
		for ( ; loc < len; loc++, indiroff++) {
			if (indiroff >= NINDIR(fs)) {
				/* Crossed into the next indirect block. */
				ibp->b_flags |= B_VALIDSUSPWRT;
				bawrite(ibp);
				error = VOP_BALLOC(vp,
				    lblktosize(fs, (off_t)(base + loc)),
				    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
				if (error) {
					brelse(bp);
					goto out1;
				}
				indiroff = 0;
			}
			if (!ffs_isblock(fs, cg_blksfree(cgp), loc))
				continue;
			((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		}
		bqrelse(bp);
		ibp->b_flags |= B_VALIDSUSPWRT;
		bdwrite(ibp);
	}
	/*
	 * Snapshot the superblock and its summary information.
	 */
	error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out1;
	bcopy(fs, nbp->b_data, fs->fs_sbsize);
	/* The snapshot's view of the filesystem is always clean. */
	((struct fs *)(nbp->b_data))->fs_clean = 1;
	if (fs->fs_sbsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_sbsize],
		    fs->fs_bsize - fs->fs_sbsize);
	nbp->b_flags |= B_VALIDSUSPWRT;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize) - 1;
	size = fs->fs_bsize;
	for (loc = 0; loc <= len; loc++) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		if (loc == len) {
			/*
			 * The last summary block may be partial; read the
			 * on-disk contents first so the tail is preserved.
			 */
			readblock(nbp, blkno + loc);
			size = fs->fs_cssize % fs->fs_bsize;
		}
		bcopy(fs->fs_csp[loc], nbp->b_data, size);
		nbp->b_flags |= B_VALIDSUSPWRT;
		bawrite(nbp);
	}
	/*
	 * Copy the shadow blocks for the snapshot inodes so that
	 * the copies can be expunged.
	 */
	for (loc = 0; loc < inoblkcnt; loc++) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)inoblks[loc]),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		readblock(nbp, inoblks[loc]);
		nbp->b_flags |= B_VALIDSUSPWRT;
		bdwrite(nbp);
	}
	/*
	 * Copy allocation information from other snapshots and then
	 * expunge them from the view of the current snapshot.
	 */
	for (xp = devip->i_copyonwrite; xp; xp = xp->i_copyonwrite) {
		/*
		 * Before expunging a snapshot inode, note all the
		 * blocks that it claims with BLK_SNAP so that fsck will
		 * be able to account for those blocks properly and so
		 * that this snapshot knows that it need not copy them
		 * if the other snapshot holding them is freed.
		 *
		 * NOTE(review): the range &i_db[0]..&i_ib[NIADDR] walks
		 * both the direct and indirect pointer arrays in one
		 * pass — this assumes i_ib immediately follows i_db in
		 * struct inode; confirm against the inode layout.
		 */
		if ((error = snapacct(vp, &xp->i_db[0], &xp->i_ib[NIADDR])) != 0)
			goto out1;
		blksperindir = 1;
		lbn = -NDADDR;
		len = numblks - NDADDR;
		rlbn = NDADDR;
		for (i = 0; len > 0 && i < NIADDR; i++) {
			error = indiracct(vp, ITOV(xp), i, xp->i_ib[i], lbn,
			    rlbn, len, blksperindir);
			if (error)
				goto out1;
			blksperindir *= NINDIR(fs);
			lbn -= blksperindir + 1;
			len -= blksperindir;
			rlbn += blksperindir;
		}
		/*
		 * Set copied snapshot inode to be a zero length file.
		 */
		blkno = fragstoblks(fs, ino_to_fsba(fs, xp->i_number));
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		dip = (struct dinode *)nbp->b_data +
		    ino_to_fsbo(fs, xp->i_number);
		dip->di_size = 0;
		dip->di_blocks = 0;
		dip->di_flags &= ~SF_SNAPSHOT;
		bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t));
		nbp->b_flags |= B_VALIDSUSPWRT;
		bdwrite(nbp);
	}
	/*
	 * Copy all indirect blocks to their shadows (allocated above)
	 * to avoid deadlock in ffs_copyonwrite.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
		if (error)
			goto out1;
		copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno));
		bqrelse(ibp);
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno),
		    fs->fs_bsize, p->p_ucred, 0, &nbp);
		if (error)
			goto out1;
		/* Re-fetch the indirect block now that nbp is held. */
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
		if (error) {
			brelse(nbp);
			goto out1;
		}
		bcopy(ibp->b_data, nbp->b_data, fs->fs_bsize);
		bqrelse(ibp);
		nbp->b_flags |= B_VALIDSUSPWRT;
		bawrite(nbp);
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_copyonwrite != 0)
		panic("ffs_snapshot: %d already on list", ip->i_number);
	if (devip->i_copyonwrite == 0) {
		devvp->v_flag |= VCOPYONWRITE;
		devip->i_copyonwrite = ip;
	} else {
		for (xp = devip->i_copyonwrite; xp->i_copyonwrite != 0; )
			xp = xp->i_copyonwrite;
		xp->i_copyonwrite = ip;
	}
	vp->v_flag |= VSYSTEM;
	/*
	 * Resume operation on filesystem.
	 */
out1:
	vfs_write_resume(vp->v_mount);
	vn_start_write(NULL, &wrtmp, V_WAIT);
out:
	mp->mnt_flag = flag;
	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
	if (error)
		vput(vp);
	else
		VOP_UNLOCK(vp, 0, p);
	vn_finished_write(wrtmp);
	return (error);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 *
 * snapvp       - snapshot vnode in which claimed blocks are recorded
 * cancelvp     - vnode whose indirect chain is being walked (may equal
 *                snapvp, in which case the block list is copied out of
 *                the buffer before accounting to avoid a buffer deadlock)
 * level        - number of indirection levels below this block
 * blkno        - filesystem block number of the indirect block
 * lbn          - logical block number of this indirect block
 * rlbn         - logical block number of the first data block it maps
 * remblks      - data blocks remaining to be accounted for
 * blksperindir - data blocks mapped per pointer at this level
 */
static int
indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs_daddr_t blkno;
	int lbn;
	int rlbn;
	int remblks;
	int blksperindir;
{
	int subblksperindir, error, last, num, i;
	struct indir indirs[NIADDR + 2];
	ufs_daddr_t *bap;
	struct buf *bp;
	struct fs *fs;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	fs = VTOI(cancelvp)->i_fs;
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	if (snapvp != cancelvp) {
		bap = (ufs_daddr_t *)bp->b_data;
	} else {
		/*
		 * Accounting against our own snapshot: copy the pointers
		 * and release the buffer first, since snapacct may need
		 * to lock buffers on the same vnode.
		 */
		MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
		bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
		bqrelse(bp);
	}
	error = snapacct(snapvp, &bap[0], &bap[last]);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	if (snapvp != cancelvp)
		bqrelse(bp);
	else
		FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 *
 * Each allocated block in [oldblkp, lastblkp) is recorded as BLK_SNAP
 * in vp's corresponding block pointer (direct or via an indirect
 * block allocated with B_METAONLY). Panics if the slot is already
 * occupied. Returns 0 on success or an error from VOP_BALLOC.
 */
static int
snapacct(vp, oldblkp, lastblkp)
	struct vnode *vp;
	ufs_daddr_t *oldblkp, *lastblkp;
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	ufs_daddr_t lbn, blkno, *blkp;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		/* Skip holes and blocks already marked for snapshot use. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		if (*blkp != 0)
			panic("snapacct: bad block");
		*blkp = BLK_SNAP;
		if (lbn >= NDADDR) {
			ibp->b_flags |= B_VALIDSUSPWRT;
			bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip, *xp;
	struct vnode *devvp;
	struct buf *ibp;
	struct fs *fs;
	ufs_daddr_t blkno, dblk;
	int error, snaploc, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	/*
	 * Delete from incore list.
	 * Clear copy-on-write flag if last snapshot.
	 */
	devvp = ip->i_devvp;
	for (xp = VTOI(devvp); xp; xp = xp->i_copyonwrite) {
		if (xp->i_copyonwrite != ip)
			continue;
		xp->i_copyonwrite = ip->i_copyonwrite;
		ip->i_copyonwrite = 0;
		break;
	}
	if (xp == 0)
		printf("ffs_snapremove: lost snapshot vnode %d\n",
		    ip->i_number);
	if (VTOI(devvp)->i_copyonwrite == 0)
		devvp->v_flag &= ~VCOPYONWRITE;
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 * A claimed block is identified by dblk == blkstofrags(fs, blkno),
	 * i.e. its physical address equals its logical position.
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = ip->i_db[blkno];
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP ||
		    (dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(ip, dblk, fs->fs_bsize)))
			ip->i_db[blkno] = 0;
	}
	for (blkno = NDADDR; blkno < fs->fs_size; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if ((last = fs->fs_size - blkno) > NINDIR(fs))
			last = NINDIR(fs);
		for (loc = 0; loc < last; loc++) {
			dblk = ((ufs_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP ||
			    (dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(ip, dblk, fs->fs_bsize)))
				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(freeip, bno, size)
	struct inode *freeip;
	ufs_daddr_t bno;
	long size;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct fs *fs = freeip->i_fs;
	struct proc *p = CURPROC;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff = 0, error = 0, claimedblk = 0;

	lbn = fragstoblks(fs, bno);
	for (ip = VTOI(freeip->i_devvp)->i_copyonwrite; ip;
	     ip = ip->i_copyonwrite) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = ip->i_db[lbn];
		} else {
			/*
			 * P_COWINPROGRESS keeps this VOP_BALLOC from
			 * re-entering the copy-on-write code.
			 */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			p->p_flag |= P_COWINPROGRESS;
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			p->p_flag &= ~P_COWINPROGRESS;
			VOP_UNLOCK(vp, 0, p);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		switch (blkno) {
		/*
		 * If the snapshot has already copied the block (default),
		 * or does not care about the block, it is not needed.
		 */
		default:
		case BLK_NOCOPY:
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		/*
		 * No previous snapshot claimed the block, so it will be
		 * freed and become a BLK_NOCOPY (don't care) for us.
		 */
		case BLK_SNAP:
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			if (lbn < NDADDR) {
				ip->i_db[lbn] = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				((ufs_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			VOP_UNLOCK(vp, 0, p);
			continue;
		/*
		 * A block that we map is being freed. If it has not been
		 * claimed yet, we will claim or copy it (below).
		 */
		case 0:
			claimedblk = 1;
			break;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %d from inum %d\n",
				    "Grabonremove: snapino", ip->i_number, lbn,
				    freeip->i_number);
#endif
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			if (lbn < NDADDR) {
				ip->i_db[lbn] = bno;
			} else {
				((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			ip->i_blocks += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, p);
			/* Non-zero: the free must not proceed. */
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
		p->p_flag |= P_COWINPROGRESS;
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		p->p_flag &= ~P_COWINPROGRESS;
		VOP_UNLOCK(vp, 0, p);
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %d for inum %d size %ld to blkno %d\n",
			    "Copyonremove: snapino ", ip->i_number, lbn,
			    freeip->i_number, size, cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0)
			break;
		savedcbp = cbp;
	}
	if (savedcbp)
		bawrite(savedcbp);
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	return (error);
}

/*
 * Associate snapshot files when mounting.
 *
 * Walks the superblock's fs_snapinum[] list, vgets each snapshot
 * inode, and rebuilds the in-core copy-on-write chain hanging off the
 * device vnode's inode. Stale entries (non-snapshot inodes) are
 * removed from the superblock list.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct fs *fs = ump->um_fs;
	struct proc *p = CURPROC;
	struct inode *ip, **listtailp;
	struct vnode *vp;
	int error, snaploc, loc;

	listtailp = &VTOI(ump->um_devvp)->i_copyonwrite;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			return;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
			/*
			 * Not a snapshot inode: drop it and compact the
			 * remainder of the superblock list down a slot.
			 */
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		if (ip->i_copyonwrite != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		*listtailp = ip;
		listtailp = &ip->i_copyonwrite;
		vp->v_flag |= VSYSTEM;
		VOP_UNLOCK(vp, 0, p);
		ump->um_devvp->v_flag |= VCOPYONWRITE;
	}
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct inode *devip = VTOI(ump->um_devvp);
	struct inode *xp;

	while ((xp = devip->i_copyonwrite) != 0) {
		devip->i_copyonwrite = xp->i_copyonwrite;
		xp->i_copyonwrite = 0;
		if (xp->i_effnlink > 0)
			vrele(ITOV(xp));
	}
	ump->um_devvp->v_flag &= ~VCOPYONWRITE;
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
int
ffs_copyonwrite(ap)
	struct vop_copyonwrite_args /* {
		struct vnode *a_vp;
		struct buf *a_bp;
	} */ *ap;
{
	struct buf *ibp, *cbp, *savedcbp = 0, *bp = ap->a_bp;
	struct fs *fs = VTOI(bp->b_vp)->i_fs;
	struct proc *p = CURPROC;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff, error = 0;

	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	/* P_COWINPROGRESS guards against re-entering this routine. */
	if (p->p_flag & P_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	for (ip = VTOI(ap->a_vp)->i_copyonwrite; ip; ip = ip->i_copyonwrite) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in VOP_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We have to
		 * be able to do the VOP_BALLOC without blocking, otherwise
		 * we may get in a deadlock with another process also
		 * trying to allocate. If we find ourselves unable to
		 * get the buffer lock, we unlock the snapshot vnode,
		 * sleep briefly, and try again.
		 */
retry:
		vn_lock(vp, LK_SHARED | LK_RETRY, p);
		if (lbn < NDADDR) {
			blkno = ip->i_db[lbn];
		} else {
			p->p_flag |= P_COWINPROGRESS;
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp);
			p->p_flag &= ~P_COWINPROGRESS;
			if (error) {
				VOP_UNLOCK(vp, 0, p);
				if (error != EWOULDBLOCK)
					break;
				tsleep(vp, p->p_usrpri, "nap", 1);
				goto retry;
			}
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0) {
			/* Already copied or not our concern; next snapshot. */
			VOP_UNLOCK(vp, 0, p);
			continue;
		}
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		p->p_flag |= P_COWINPROGRESS;
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
		p->p_flag &= ~P_COWINPROGRESS;
		VOP_UNLOCK(vp, 0, p);
		if (error) {
			if (error != EWOULDBLOCK)
				break;
			tsleep(vp, p->p_usrpri, "nap", 1);
			goto retry;
		}
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %d for ",
			    ip->i_number, lbn);
			if (bp->b_vp == ap->a_vp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %d to blkno %d\n", bp->b_lblkno,
			    cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0)
			break;
		savedcbp = cbp;
	}
	if (savedcbp)
		bawrite(savedcbp);
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 *
 * Reads directly from the underlying device via physio, bypassing
 * the buffer cache (which is what allows callers above to avoid
 * buffer-cache deadlocks).
 */
static int
readblock(bp, lbn)
	struct buf *bp;
	daddr_t lbn;
{
	struct uio auio;
	struct iovec aiov;
	struct proc *p = CURPROC;
	struct inode *ip = VTOI(bp->b_vp);

	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_procp = p;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}