xref: /freebsd/sys/ufs/ffs/ffs_snapshot.c (revision 77a0943ded95b9e6438f7db70c4a28e4d93946d4)
1 /*
2  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
3  *
4  * Further information about snapshots can be obtained from:
5  *
6  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
7  *	1614 Oxford Street		mckusick@mckusick.com
8  *	Berkeley, CA 94709-1608		+1-510-843-9542
9  *	USA
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  *
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
22  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
25  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
34  * $FreeBSD$
35  */
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/bio.h>
40 #include <sys/buf.h>
41 #include <sys/proc.h>
42 #include <sys/namei.h>
43 #include <sys/stat.h>
44 #include <sys/malloc.h>
45 #include <sys/mount.h>
46 #include <sys/resource.h>
47 #include <sys/resourcevar.h>
48 #include <sys/vnode.h>
49 
50 #include <ufs/ufs/extattr.h>
51 #include <ufs/ufs/quota.h>
52 #include <ufs/ufs/ufsmount.h>
53 #include <ufs/ufs/inode.h>
54 #include <ufs/ufs/ufs_extern.h>
55 
56 #include <ufs/ffs/fs.h>
57 #include <ufs/ffs/ffs_extern.h>
58 
/* Credential used for snapshot I/O performed on behalf of the kernel. */
#define KERNCRED proc0.p_ucred
/* Compile in the snapdebug sysctl and debug printfs below. */
#define DEBUG 1

/* Internal helpers, defined at the bottom of this file. */
static int indiracct __P((struct vnode *, struct vnode *, int, ufs_daddr_t,
	int, int, int, int));
static int snapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *));
static int readblock __P((struct buf *, daddr_t));

#ifdef DEBUG
#include <sys/sysctl.h>
/* Set debug.snapdebug to a non-zero value to trace block claims/copies. */
int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
#endif /* DEBUG */
72 
/*
 * Create a snapshot file and initialize it for the filesystem.
 *
 * mp       - filesystem to be snapshotted
 * snapfile - user-space pathname at which the snapshot file is created
 *
 * Returns 0 on success or an errno (ENOSPC when all FSMAXSNAP superblock
 * snapshot slots are in use, EEXIST when the path already exists, or an
 * error from name lookup, allocation, or fsync).
 *
 * Strategy: preallocate every block the snapshot will ever need while the
 * filesystem is still running, suspend all write activity, copy in the
 * cylinder group maps, superblock, and summary information, then hook the
 * snapshot inode onto the device inode's copy-on-write list and resume.
 */
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	ufs_daddr_t rlbn;
	ufs_daddr_t lbn, blkno, copyblkno, inoblks[FSMAXSNAP];
	int error, cg, snaploc, indiroff, numblks;
	int i, size, base, len, loc, inoblkcnt;
	int blksperindir, flag = mp->mnt_flag;
	struct fs *fs = VFSTOUFS(mp)->um_fs;
	struct proc *p = CURPROC;
	struct inode *devip, *ip, *xp;
	struct buf *bp, *nbp, *ibp;
	struct vnode *vp, *devvp;
	struct nameidata nd;
	struct mount *wrtmp;
	struct dinode *dip;
	struct vattr vat;
	struct cg *cgp;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Create the snapshot file. The target must not already exist
	 * and must live on the filesystem being snapshotted.
	 */
restart:
	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, p);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		vput(nd.ni_vp);
		error = EEXIST;
	}
	if (nd.ni_dvp->v_mount != mp)
		error = EXDEV;
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		return (error);
	}
	VATTR_NULL(&vat);
	vat.va_type = VREG;
	vat.va_mode = S_IRUSR;
	vat.va_vaflags |= VA_EXCLUSIVE;
	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
		wrtmp = NULL;
	if (wrtmp != mp)
		panic("ffs_snapshot: mount mismatch");
	/*
	 * Obtain write permission on the mount; if it is temporarily
	 * unavailable, drop everything, wait, and retry the lookup.
	 */
	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &wrtmp,
		    V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VOP_LEASE(nd.ni_dvp, p, KERNCRED, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
	vput(nd.ni_dvp);
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vn_finished_write(wrtmp);
		return (error);
	}
	vp = nd.ni_vp;
	ip = VTOI(vp);
	devvp = ip->i_devvp;
	devip = VTOI(devvp);
	/*
	 * Allocate and copy the last block contents so as to be able
	 * to set size to that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, B_CLRBUF, &bp);
	if (error)
		goto out;
	ip->i_size = lblktosize(fs, (off_t)numblks);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if ((error = readblock(bp, numblks - 1)) != 0)
		goto out;
	bawrite(bp);
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks. Also allocate shadow copies
	 * for each of the indirect blocks.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		/*
		 * B_METAONLY yields the indirect block itself; its disk
		 * address tells us which logical block shadows it.
		 */
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
		if (error)
			goto out;
		copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno));
		bdwrite(ibp);
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno),
		    fs->fs_bsize, p->p_ucred, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate shadow blocks to copy all of the other snapshot inodes
	 * so that we will be able to expunge them from this snapshot.
	 * The inoblks[] list is kept free of duplicates so each inode
	 * block is allocated only once.
	 */
	for (loc = 0, inoblkcnt = 0; loc < snaploc; loc++) {
		blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc]));
		for (i = 0; i < inoblkcnt; i++)
			if (inoblks[i] == blkno)
				break;
		if (i == inoblkcnt) {
			inoblks[inoblkcnt++] = blkno;
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
			    fs->fs_bsize, KERNCRED, 0, &nbp);
			if (error)
				goto out;
			bawrite(nbp);
		}
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = VOP_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p)) != 0)
		goto out;
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Suspend operation on filesystem. Loop until we observe the
	 * MNTK_SUSPENDED flag, reacquiring write permission between
	 * attempts.
	 */
	for (;;) {
		vn_finished_write(wrtmp);
		vfs_write_suspend(vp->v_mount);
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	/*
	 * First, copy all the cylinder group maps. All the unallocated
	 * blocks are marked BLK_NOCOPY so that the snapshot knows that
	 * it need not copy them if they are later written.
	 */
	len = howmany(fs->fs_fpg, fs->fs_frag);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
			(int)fs->fs_cgsize, KERNCRED, &bp);
		if (error) {
			brelse(bp);
			goto out1;
		}
		cgp = (struct cg *)bp->b_data;
		if (!cg_chkmagic(cgp)) {
			brelse(bp);
			error = EIO;
			goto out1;
		}
		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
			KERNCRED, &nbp);
		if (error) {
			brelse(bp);
			brelse(nbp);
			goto out1;
		}
		bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
		if (fs->fs_cgsize < fs->fs_bsize)
			bzero(&nbp->b_data[fs->fs_cgsize],
			    fs->fs_bsize - fs->fs_cgsize);
		/*
		 * B_VALIDSUSPWRT allows this write to proceed while the
		 * filesystem is suspended.
		 */
		nbp->b_flags |= B_VALIDSUSPWRT;
		bawrite(nbp);
		/*
		 * Walk the free-block map of this cylinder group, marking
		 * each free block BLK_NOCOPY in the snapshot's block map:
		 * first in the direct pointers, then in the preallocated
		 * indirect blocks.
		 */
		base = cg * fs->fs_fpg / fs->fs_frag;
		if (base + len > numblks)
			len = numblks - base;
		loc = 0;
		if (base < NDADDR) {
			for ( ; loc < NDADDR; loc++) {
				if (!ffs_isblock(fs, cg_blksfree(cgp), loc))
					continue;
				ip->i_db[loc] = BLK_NOCOPY;
			}
		}
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error) {
			brelse(bp);
			goto out1;
		}
		indiroff = (base + loc - NDADDR) % NINDIR(fs);
		for ( ; loc < len; loc++, indiroff++) {
			if (indiroff >= NINDIR(fs)) {
				/* Advance to the next indirect block. */
				ibp->b_flags |= B_VALIDSUSPWRT;
				bawrite(ibp);
				error = VOP_BALLOC(vp,
				    lblktosize(fs, (off_t)(base + loc)),
				    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
				if (error) {
					brelse(bp);
					goto out1;
				}
				indiroff = 0;
			}
			if (!ffs_isblock(fs, cg_blksfree(cgp), loc))
				continue;
			((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		}
		bqrelse(bp);
		ibp->b_flags |= B_VALIDSUSPWRT;
		bdwrite(ibp);
	}
	/*
	 * Snapshot the superblock and its summary information.
	 */
	error = VOP_BALLOC(vp, (off_t)(SBOFF), fs->fs_bsize, KERNCRED,
	    0, &nbp);
	if (error)
		goto out1;
	bcopy(fs, nbp->b_data, fs->fs_sbsize);
	/* The snapshot's view of the filesystem is always clean. */
	((struct fs *)(nbp->b_data))->fs_clean = 1;
	if (fs->fs_sbsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_sbsize],
		    fs->fs_bsize - fs->fs_sbsize);
	nbp->b_flags |= B_VALIDSUSPWRT;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize) - 1;
	size = fs->fs_bsize;
	for (loc = 0; loc <= len; loc++) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		if (loc == len) {
			/*
			 * Last summary block may be partial; preload the
			 * on-disk contents, then copy only the valid part.
			 */
			readblock(nbp, blkno + loc);
			size = fs->fs_cssize % fs->fs_bsize;
		}
		bcopy(fs->fs_csp[loc], nbp->b_data, size);
		nbp->b_flags |= B_VALIDSUSPWRT;
		bawrite(nbp);
	}
	/*
	 * Copy the shadow blocks for the snapshot inodes so that
	 * the copies can be expunged.
	 */
	for (loc = 0; loc < inoblkcnt; loc++) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)inoblks[loc]),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		readblock(nbp, inoblks[loc]);
		nbp->b_flags |= B_VALIDSUSPWRT;
		bdwrite(nbp);
	}
	/*
	 * Copy allocation information from other snapshots and then
	 * expunge them from the view of the current snapshot.
	 */
	for (xp = devip->i_copyonwrite; xp; xp = xp->i_copyonwrite) {
		/*
		 * Before expunging a snapshot inode, note all the
		 * blocks that it claims with BLK_SNAP so that fsck will
		 * be able to account for those blocks properly and so
		 * that this snapshot knows that it need not copy them
		 * if the other snapshot holding them is freed.
		 */
		if ((error = snapacct(vp, &xp->i_db[0], &xp->i_ib[NIADDR])) !=0)
			goto out1;
		/*
		 * Descend each level of indirection of the other
		 * snapshot, accounting for its blocks in this one.
		 */
		blksperindir = 1;
		lbn = -NDADDR;
		len = numblks - NDADDR;
		rlbn = NDADDR;
		for (i = 0; len > 0 && i < NIADDR; i++) {
			error = indiracct(vp, ITOV(xp), i, xp->i_ib[i], lbn,
			    rlbn, len, blksperindir);
			if (error)
				goto out1;
			blksperindir *= NINDIR(fs);
			lbn -= blksperindir + 1;
			len -= blksperindir;
			rlbn += blksperindir;
		}
		/*
		 * Set copied snapshot inode to be a zero length file.
		 */
		blkno = fragstoblks(fs, ino_to_fsba(fs, xp->i_number));
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out1;
		dip = (struct dinode *)nbp->b_data +
		    ino_to_fsbo(fs, xp->i_number);
		dip->di_size = 0;
		dip->di_blocks = 0;
		dip->di_flags &= ~SF_SNAPSHOT;
		bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t));
		nbp->b_flags |= B_VALIDSUSPWRT;
		bdwrite(nbp);
	}
	/*
	 * Copy all indirect blocks to their shadows (allocated above)
	 * to avoid deadlock in ffs_copyonwrite.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
		if (error)
			goto out1;
		copyblkno = fragstoblks(fs, dbtofsb(fs, ibp->b_blkno));
		bqrelse(ibp);
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)copyblkno),
		    fs->fs_bsize, p->p_ucred, 0, &nbp);
		if (error)
			goto out1;
		/*
		 * Reacquire the indirect block after allocating its
		 * shadow; holding both at once across the allocation
		 * could deadlock.
		 */
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
		if (error) {
			brelse(nbp);
			goto out1;
		}
		bcopy(ibp->b_data, nbp->b_data, fs->fs_bsize);
		bqrelse(ibp);
		nbp->b_flags |= B_VALIDSUSPWRT;
		bawrite(nbp);
	}
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_copyonwrite != 0)
		panic("ffs_snapshot: %d already on list", ip->i_number);
	if (devip->i_copyonwrite == 0) {
		devvp->v_flag |= VCOPYONWRITE;
		devip->i_copyonwrite = ip;
	} else {
		for (xp = devip->i_copyonwrite; xp->i_copyonwrite != 0; )
			xp = xp->i_copyonwrite;
		xp->i_copyonwrite = ip;
	}
	vp->v_flag |= VSYSTEM;
	/*
	 * Resume operation on filesystem.
	 */
out1:
	vfs_write_resume(vp->v_mount);
	vn_start_write(NULL, &wrtmp, V_WAIT);
out:
	mp->mnt_flag = flag;
	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
	if (error)
		vput(vp);
	else
		VOP_UNLOCK(vp, 0, p);
	vn_finished_write(wrtmp);
	return (error);
}
479 
/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 *
 * snapvp       - snapshot being built, whose block map is updated
 * cancelvp     - snapshot whose blocks are being accounted for
 * level        - levels of indirection remaining below this block
 * blkno        - filesystem block number of this indirect block
 * lbn          - logical block number of this indirect block (negative,
 *                per the UFS indirect-block numbering; caller starts
 *                at -NDADDR)
 * rlbn         - first data logical block covered by this indirect block
 * remblks      - data blocks remaining to be accounted for
 * blksperindir - data blocks mapped by each pointer at this level
 */
static int
indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs_daddr_t blkno;
	int lbn;
	int rlbn;
	int remblks;
	int blksperindir;
{
	int subblksperindir, error, last, num, i;
	struct indir indirs[NIADDR + 2];
	ufs_daddr_t *bap;
	struct buf *bp;
	struct fs *fs;

	/* Cross-check the caller's lbn against ufs_getlbns's view. */
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	fs = VTOI(cancelvp)->i_fs;
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	if (snapvp != cancelvp) {
		bap = (ufs_daddr_t *)bp->b_data;
	} else {
		/*
		 * Accounting our own blocks: work from a private copy and
		 * release the buffer now, presumably so snapacct's
		 * VOP_BALLOC on the same vnode cannot deadlock on this
		 * buffer -- NOTE(review): confirm.
		 */
		MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
		bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
		bqrelse(bp);
	}
	error = snapacct(snapvp, &bap[0], &bap[last]);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	/* Release whichever form of the pointer array we used above. */
	if (snapvp != cancelvp)
		bqrelse(bp);
	else
		FREE(bap, M_DEVBUF);
	return (error);
}
554 
/*
 * Account for a set of blocks allocated in a snapshot inode.
 *
 * Scan the block pointers in [oldblkp, lastblkp) and, for every live
 * block found, mark the corresponding logical block of vp's own block
 * map as BLK_SNAP. Pointers that are 0, BLK_NOCOPY, or BLK_SNAP are
 * skipped. Panics if the slot to be marked is already in use.
 * Returns 0 on success or an error from VOP_BALLOC.
 */
static int
snapacct(vp, oldblkp, lastblkp)
	struct vnode *vp;
	ufs_daddr_t *oldblkp, *lastblkp;
{
	struct inode *ip = VTOI(vp);
	struct fs *fs = ip->i_fs;
	ufs_daddr_t lbn, blkno, *blkp;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		/* The snapshot's logical block at this physical address. */
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			/* Direct pointer: update the inode in place. */
			blkp = &ip->i_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/* Indirect pointer: fetch the indirect block. */
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		if (*blkp != 0)
			panic("snapacct: bad block");
		*blkp = BLK_SNAP;
		/* ibp is only held when the pointer came from an indirect. */
		if (lbn >= NDADDR) {
			ibp->b_flags |= B_VALIDSUSPWRT;
			bdwrite(ibp);
		}
	}
	return (0);
}
595 
/*
 * Prepare a snapshot file for being removed.
 *
 * Removes the inode from the superblock's snapshot list and from the
 * device inode's in-core copy-on-write list, passes any blocks the
 * snapshot claimed on to other snapshots that want them, and finally
 * clears the SF_SNAPSHOT flag.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip, *xp;
	struct vnode *devvp;
	struct buf *ibp;
	struct fs *fs;
	ufs_daddr_t blkno, dblk;
	int error, snaploc, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		/* Shift later entries down over the removed slot. */
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
	/*
	 * Delete from incore list.
	 * Clear copy-on-write flag if last snapshot.
	 */
	devvp = ip->i_devvp;
	for (xp = VTOI(devvp); xp; xp = xp->i_copyonwrite) {
		if (xp->i_copyonwrite != ip)
			continue;
		/* Unlink ip from the singly-linked list. */
		xp->i_copyonwrite = ip->i_copyonwrite;
		ip->i_copyonwrite = 0;
		break;
	}
	if (xp == 0)
		printf("ffs_snapremove: lost snapshot vnode %d\n",
		    ip->i_number);
	if (VTOI(devvp)->i_copyonwrite == 0)
		devvp->v_flag &= ~VCOPYONWRITE;
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 * A claimed block is recognizable because its block number
	 * equals its logical position (dblk == blkstofrags(fs, blkno)).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = ip->i_db[blkno];
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP ||
		    (dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(ip, dblk, fs->fs_bsize)))
			ip->i_db[blkno] = 0;
	}
	for (blkno = NDADDR; blkno < fs->fs_size; blkno += NINDIR(fs)) {
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if ((last = fs->fs_size - blkno) > NINDIR(fs))
			last = NINDIR(fs);
		for (loc = 0; loc < last; loc++) {
			dblk = ((ufs_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP ||
			    (dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(ip, dblk, fs->fs_bsize)))
				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
676 
/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(freeip, bno, size)
	struct inode *freeip;	/* inode the block is being freed from */
	ufs_daddr_t bno;	/* block being freed */
	long size;		/* size of the block or fragment */
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct fs *fs = freeip->i_fs;
	struct proc *p = CURPROC;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff = 0, error = 0, claimedblk = 0;

	lbn = fragstoblks(fs, bno);
	/* Give every active snapshot a chance to claim or copy the block. */
	for (ip = VTOI(freeip->i_devvp)->i_copyonwrite; ip;
	     ip = ip->i_copyonwrite) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = ip->i_db[lbn];
		} else {
			/*
			 * P_COWINPROGRESS suppresses recursive
			 * copy-on-write while we allocate (see
			 * ffs_copyonwrite's recursion panic).
			 */
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			p->p_flag |= P_COWINPROGRESS;
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			p->p_flag &= ~P_COWINPROGRESS;
			VOP_UNLOCK(vp, 0, p);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		switch (blkno) {
		/*
		 * If the snapshot has already copied the block (default),
		 * or does not care about the block, it is not needed.
		 */
		default:
		case BLK_NOCOPY:
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		/*
		 * No previous snapshot claimed the block, so it will be
		 * freed and become a BLK_NOCOPY (don't care) for us.
		 */
		case BLK_SNAP:
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			if (lbn < NDADDR) {
				ip->i_db[lbn] = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				((ufs_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			VOP_UNLOCK(vp, 0, p);
			continue;
		/*
		 * A block that we map is being freed. If it has not been
		 * claimed yet, we will claim or copy it (below).
		 */
		case 0:
			claimedblk = 1;
			break;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %d from inum %d\n",
				    "Grabonremove: snapino", ip->i_number, lbn,
				    freeip->i_number);
#endif
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			if (lbn < NDADDR) {
				ip->i_db[lbn] = bno;
			} else {
				((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			ip->i_blocks += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, p);
			/* Non-zero: we claimed the block; don't free it. */
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
		p->p_flag |= P_COWINPROGRESS;
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		p->p_flag &= ~P_COWINPROGRESS;
		VOP_UNLOCK(vp, 0, p);
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %d for inum %d size %ld to blkno %d\n",
			    "Copyonremove: snapino ", ip->i_number, lbn,
			    freeip->i_number, size, cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0)
			break;
		savedcbp = cbp;
	}
	/* Flush the buffer that holds the (shared) copied contents. */
	if (savedcbp)
		bawrite(savedcbp);
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	return (error);
}
842 
/*
 * Associate snapshot files when mounting.
 *
 * Walks the superblock's fs_snapinum list, fetches each snapshot inode,
 * and links it onto the device inode's copy-on-write list. Inodes that
 * no longer carry SF_SNAPSHOT are dropped from the superblock list
 * (keeping it dense) rather than associated.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct fs *fs = ump->um_fs;
	struct proc *p = CURPROC;
	struct inode *ip, **listtailp;
	struct vnode *vp;
	int error, snaploc, loc;

	/* Append to the tail so list order matches fs_snapinum order. */
	listtailp = &VTOI(ump->um_devvp)->i_copyonwrite;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		/* The list is dense; the first zero entry ends it. */
		if (fs->fs_snapinum[snaploc] == 0)
			return;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], &vp)) != 0){
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			/* Compact the list over the stale entry and retry
			 * this slot on the next iteration. */
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		if (ip->i_copyonwrite != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		*listtailp = ip;
		listtailp = &ip->i_copyonwrite;
		vp->v_flag |= VSYSTEM;
		/*
		 * Note: the vnode stays referenced (no vrele) so the
		 * snapshot remains active until unmount; see
		 * ffs_snapshot_unmount.
		 */
		VOP_UNLOCK(vp, 0, p);
		ump->um_devvp->v_flag |= VCOPYONWRITE;
	}
}
889 
890 /*
891  * Disassociate snapshot files when unmounting.
892  */
893 void
894 ffs_snapshot_unmount(mp)
895 	struct mount *mp;
896 {
897 	struct ufsmount *ump = VFSTOUFS(mp);
898 	struct inode *devip = VTOI(ump->um_devvp);
899 	struct inode *xp;
900 
901 	while ((xp = devip->i_copyonwrite) != 0) {
902 		devip->i_copyonwrite = xp->i_copyonwrite;
903 		xp->i_copyonwrite = 0;
904 		if (xp->i_effnlink > 0)
905 			vrele(ITOV(xp));
906 	}
907 	ump->um_devvp->v_flag &= ~VCOPYONWRITE;
908 }
909 
/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 *
 * Called for a buffer bp about to be written to the device; for each
 * active snapshot that has not yet copied this block (block map entry
 * is 0), allocate a snapshot block and copy the old contents into it.
 * Returns 0 on success or an error from allocation or the read.
 */
int
ffs_copyonwrite(ap)
	struct vop_copyonwrite_args /* {
		struct vnode *a_vp;
		struct buf *a_bp;
	} */ *ap;
{
	struct buf *ibp, *cbp, *savedcbp = 0, *bp = ap->a_bp;
	struct fs *fs = VTOI(bp->b_vp)->i_fs;
	struct proc *p = CURPROC;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff, error = 0;

	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	/* Our own allocations must never re-enter copy-on-write. */
	if (p->p_flag & P_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	for (ip = VTOI(ap->a_vp)->i_copyonwrite; ip; ip = ip->i_copyonwrite) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in VOP_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We have to
		 * be able to do the VOP_BALLOC without blocking, otherwise
		 * we may get in a deadlock with another process also
		 * trying to allocate. If we find ourselves unable to
		 * get the buffer lock, we unlock the snapshot vnode,
		 * sleep briefly, and try again.
		 */
retry:
		vn_lock(vp, LK_SHARED | LK_RETRY, p);
		if (lbn < NDADDR) {
			blkno = ip->i_db[lbn];
		} else {
			p->p_flag |= P_COWINPROGRESS;
			error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			   fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp);
			p->p_flag &= ~P_COWINPROGRESS;
			if (error) {
				VOP_UNLOCK(vp, 0, p);
				/* EWOULDBLOCK: buffer busy; back off. */
				if (error != EWOULDBLOCK)
					break;
				tsleep(vp, p->p_usrpri, "nap", 1);
				goto retry;
			}
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Non-zero means already copied or not wanted; skip. */
		if (blkno != 0) {
			VOP_UNLOCK(vp, 0, p);
			continue;
		}
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		p->p_flag |= P_COWINPROGRESS;
		error = VOP_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
		p->p_flag &= ~P_COWINPROGRESS;
		VOP_UNLOCK(vp, 0, p);
		if (error) {
			if (error != EWOULDBLOCK)
				break;
			tsleep(vp, p->p_usrpri, "nap", 1);
			goto retry;
		}
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %d for ",
			    ip->i_number, lbn);
			if (bp->b_vp == ap->a_vp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %d to blkno %d\n", bp->b_lblkno,
			    cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0)
			break;
		savedcbp = cbp;
	}
	/* Flush the buffer holding the shared copy of the old contents. */
	if (savedcbp)
		bawrite(savedcbp);
	return (error);
}
1026 
/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 *
 * bp  - buffer to fill; b_bcount bytes are read into b_data
 * lbn - logical block number of the block to read
 *
 * Goes directly to the underlying device via physio, bypassing the
 * buffer cache (the callers cannot use bread without deadlocking).
 */
static int
readblock(bp, lbn)
	struct buf *bp;
	daddr_t lbn;
{
	struct uio auio;
	struct iovec aiov;
	struct proc *p = CURPROC;
	struct inode *ip = VTOI(bp->b_vp);

	/* Build a single-segment uio describing the buffer's data area. */
	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	/* Byte offset on the device corresponding to logical block lbn. */
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_procp = p;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}
1052