xref: /freebsd/sys/ufs/ffs/ffs_snapshot.c (revision 3ff369fed2a08f32dda232c10470b949bef9489f)
/*
 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
 *
 * Further information about snapshots can be obtained from:
 *
 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
 *	1614 Oxford Street		mckusick@mckusick.com
 *	Berkeley, CA 94709-1608		+1-510-843-9542
 *	USA
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/disklabel.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

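/*
 * KERNCRED is the kernel's own credential (that of thread0), used for
 * the internal I/O done on behalf of snapshots. DEBUG enables the
 * sysctl knobs defined below.
 */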
#define KERNCRED thread0.td_ucred
#define DEBUG 1

static int cgaccount(int, struct vnode *, struct buf *, int);
static int expunge(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs_daddr_t *, ufs_daddr_t *, struct fs *,
    ufs_daddr_t, int), int);
static int indiracct(struct vnode *, struct vnode *, int, ufs_daddr_t,
    int, int, int, int, struct fs *, int (*)(struct vnode *,
    ufs_daddr_t *, ufs_daddr_t *, struct fs *, ufs_daddr_t, int), int);
static int fullacct(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
    struct fs *, ufs_daddr_t, int);
static int snapacct(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
    struct fs *, ufs_daddr_t, int);
static int mapacct(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
    struct fs *, ufs_daddr_t, int);
static int ffs_copyonwrite(struct vnode *, struct buf *);
static int readblock(struct buf *, daddr_t);

/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the following flag allows this
 * crash persistence to be disabled.
 */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
	0, "");
#endif /* DEBUG */
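/*
 * For example, the knobs above appear under the debug sysctl tree and
 * can be toggled from userland at run time:
 *
 *	sysctl debug.snapdebug=1	 # verbose snapshot tracing
 *	sysctl debug.dopersistence=1	 # synchronous copy-on-write
 *	sysctl debug.collectsnapstats=1	 # report suspension statistics
 */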

/*
 * Create a snapshot file and initialize it for the filesystem.
 */
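/*
 * In outline (summarizing the code below): a free slot is claimed in
 * fs_snapinum, the snapshot file is created and grown to the size of
 * the filesystem, and the indirect, superblock, summary, and cylinder
 * group blocks are preallocated and given a first-pass copy while the
 * filesystem is still active. Only then is the filesystem suspended,
 * so that the suspension window need cover only the cylinder groups
 * that changed in the interim, the superblock copy, and the expunging
 * of unlinked files and of the other snapshots from this one's view.
 */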
int
ffs_snapshot(mp, snapfile)
	struct mount *mp;
	char *snapfile;
{
	ufs_daddr_t blkno;
	int error, cg, snaploc, numblks;
	int i, size, len, loc;
	int flag = mp->mnt_flag;
	struct timespec starttime = {0, 0}, endtime;
	char saved_nice = 0;
	long redo = 0;
	int32_t *lp;
	void *space;
	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
	struct snaphead *snaphead;
	struct thread *td = curthread;
	struct inode *ip, *xp;
	struct buf *bp, *nbp, *ibp, *sbp = NULL;
	struct nameidata nd;
	struct mount *wrtmp;
	struct vattr vat;
	struct vnode *vp, *xvp, *nvp;

	/*
	 * Need to serialize access to snapshot code per filesystem.
	 */
	/*
	 * Assign a snapshot slot in the superblock.
	 */
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == 0)
			break;
	if (snaploc == FSMAXSNAP)
		return (ENOSPC);
	/*
	 * Create the snapshot file.
	 */
restart:
	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td);
	if ((error = namei(&nd)) != 0)
		return (error);
	if (nd.ni_vp != NULL) {
		vput(nd.ni_vp);
		error = EEXIST;
	}
	if (nd.ni_dvp->v_mount != mp)
		error = EXDEV;
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == nd.ni_vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		return (error);
	}
	VATTR_NULL(&vat);
	vat.va_type = VREG;
	vat.va_mode = S_IRUSR;
	vat.va_vaflags |= VA_EXCLUSIVE;
	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
		wrtmp = NULL;
	if (wrtmp != mp)
		panic("ffs_snapshot: mount mismatch");
	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vput(nd.ni_dvp);
		if ((error = vn_start_write(NULL, &wrtmp,
		    V_XSLEEP | PCATCH)) != 0)
			return (error);
		goto restart;
	}
	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
	vput(nd.ni_dvp);
	if (error) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		vn_finished_write(wrtmp);
		return (error);
	}
	vp = nd.ni_vp;
	ip = VTOI(vp);
	/*
	 * Allocate and copy the last block contents so as to be able
	 * to set size to that of the filesystem.
	 */
	numblks = howmany(fs->fs_size, fs->fs_frag);
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
	    fs->fs_bsize, KERNCRED, B_CLRBUF, &bp);
	if (error)
		goto out;
	ip->i_size = lblktosize(fs, (off_t)numblks);
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	if ((error = readblock(bp, numblks - 1)) != 0)
		goto out;
	bawrite(bp);
	/*
	 * Preallocate critical data structures so that we can copy
	 * them in without further allocation after we suspend all
	 * operations on the filesystem. We would like to just release
	 * the allocated buffers without writing them since they will
	 * be filled in below once we are ready to go, but this upsets
	 * the soft update code, so we go ahead and write the new buffers.
	 *
	 * Allocate all indirect blocks and mark all of them as not
	 * needing to be copied.
	 */
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, td->td_ucred, B_METAONLY, &ibp);
		if (error)
			goto out;
		bdwrite(ibp);
	}
	/*
	 * Allocate copies for the superblock and its summary information.
	 */
	error = UFS_BALLOC(vp, (off_t)(SBOFF), SBSIZE, KERNCRED, 0, &nbp);
	if (error)
		goto out;
	bawrite(nbp);
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	for (loc = 0; loc < len; loc++) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bawrite(nbp);
	}
	/*
	 * Allocate all cylinder group blocks.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
		    fs->fs_bsize, KERNCRED, 0, &nbp);
		if (error)
			goto out;
		bdwrite(nbp);
	}
	/*
	 * Copy all the cylinder group maps. Although the
	 * filesystem is still active, we hope that only a few
	 * cylinder groups will change between now and when we
	 * suspend operations. Thus, we will be able to quickly
	 * touch up the few cylinder groups that changed during
	 * the suspension period.
	 */
	len = howmany(fs->fs_ncg, NBBY);
	MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK);
	bzero(fs->fs_active, len);
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
		    KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			goto out;
		}
		error = cgaccount(cg, vp, nbp, 1);
		bawrite(nbp);
		if (error)
			goto out;
	}
	/*
	 * Change inode to snapshot type file.
	 */
	ip->i_flags |= SF_SNAPSHOT;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
	/*
	 * Ensure that the snapshot is completely on disk.
	 */
	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0)
		goto out;
	/*
	 * All allocations are done, so we can now snapshot the system.
	 *
	 * Rescind nice scheduling while running with the filesystem suspended.
	 */
	if (td->td_ksegrp->kg_nice > 0) {
		saved_nice = td->td_ksegrp->kg_nice;
		td->td_ksegrp->kg_nice = 0;
	}
	/*
	 * Suspend operation on filesystem.
	 */
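	/*
	 * The loop below drops our own write reference each time around,
	 * requests suspension, and, if MNTK_SUSPENDED did not take hold,
	 * waits for write access and tries again.
	 */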
	for (;;) {
		vn_finished_write(wrtmp);
		vfs_write_suspend(vp->v_mount);
		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
			break;
		vn_start_write(NULL, &wrtmp, V_WAIT);
	}
	if (collectsnapstats)
		nanotime(&starttime);
	/*
	 * First, copy all the cylinder group maps that have changed.
	 */
	for (cg = 0; cg < fs->fs_ncg; cg++) {
		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
			continue;
		redo++;
		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
			KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			goto out1;
		}
		error = cgaccount(cg, vp, nbp, 2);
		bawrite(nbp);
		if (error)
			goto out1;
	}
	/*
	 * Grab a copy of the superblock and its summary information.
	 * We delay writing it until the suspension is released below.
	 */
	error = bread(vp, lblkno(fs, SBOFF), fs->fs_bsize, KERNCRED, &sbp);
	if (error) {
		brelse(sbp);
		sbp = NULL;
		goto out1;
	}
	copy_fs = (struct fs *)(sbp->b_data + blkoff(fs, SBOFF));
	bcopy(fs, copy_fs, fs->fs_sbsize);
	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
		copy_fs->fs_clean = 1;
	if (fs->fs_sbsize < SBSIZE)
		bzero(&sbp->b_data[blkoff(fs, SBOFF) + fs->fs_sbsize],
		    SBSIZE - fs->fs_sbsize);
	size = blkroundup(fs, fs->fs_cssize);
	if (fs->fs_contigsumsize > 0)
		size += fs->fs_ncg * sizeof(int32_t);
	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
	copy_fs->fs_csp = space;
	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	(char *)space += fs->fs_cssize;
	loc = howmany(fs->fs_cssize, fs->fs_fsize);
	i = fs->fs_frag - loc % fs->fs_frag;
	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
	if (len > 0) {
		if ((error = bread(ip->i_devvp,
		    fsbtodb(fs, fs->fs_csaddr + loc),
		    len, KERNCRED, &bp)) != 0) {
			brelse(bp);
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		bcopy(bp->b_data, space, (u_int)len);
		(char *)space += len;
		bp->b_flags |= B_INVAL | B_NOCACHE;
		brelse(bp);
	}
	if (fs->fs_contigsumsize > 0) {
		copy_fs->fs_maxcluster = lp = space;
		for (i = 0; i < fs->fs_ncg; i++)
			*lp++ = fs->fs_contigsumsize;
	}
	/*
	 * We must check for active files that have been unlinked
	 * (e.g., with a zero link count). We have to expunge all
	 * trace of these files from the snapshot so that they are
	 * not reclaimed prematurely by fsck or unnecessarily dumped.
	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
	 * spec_strategy about writing on a suspended filesystem.
	 */
	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
	mtx_lock(&mntvnode_mtx);
loop:
	for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (xvp->v_mount != mp)
			goto loop;
		nvp = TAILQ_NEXT(xvp, v_nmntvnodes);
		mtx_unlock(&mntvnode_mtx);
		mtx_lock(&xvp->v_interlock);
		if (xvp->v_usecount == 0 || xvp->v_type == VNON ||
		    (VOP_GETATTR(xvp, &vat, td->td_proc->p_ucred, td) == 0 &&
		    vat.va_nlink > 0)) {
			mtx_unlock(&xvp->v_interlock);
			mtx_lock(&mntvnode_mtx);
			continue;
		}
		if (snapdebug)
			vprint("ffs_snapshot: busy vnode", xvp);
		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0)
			goto loop;
		xp = VTOI(xvp);
		/*
		 * If there is a fragment, clear it here.
		 */
		blkno = 0;
		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
		if (loc < NDADDR) {
			len = fragroundup(fs, blkoff(fs, xp->i_size));
			if (len < fs->fs_bsize) {
				ffs_blkfree(copy_fs, vp, xp->i_db[loc], len,
				    xp->i_number);
				blkno = xp->i_db[loc];
				xp->i_db[loc] = 0;
			}
		}
		error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
		if (blkno)
			xp->i_db[loc] = blkno;
		if (!error)
			error = ffs_freefile(copy_fs, vp, xp->i_number,
			    xp->i_mode);
		VOP_UNLOCK(xvp, 0, td);
		if (error) {
			free(copy_fs->fs_csp, M_UFSMNT);
			bawrite(sbp);
			sbp = NULL;
			goto out1;
		}
		mtx_lock(&mntvnode_mtx);
	}
	mtx_unlock(&mntvnode_mtx);
	/*
	 * Record snapshot inode. Since this is the newest snapshot,
	 * it must be placed at the end of the list.
	 */
	fs->fs_snapinum[snaploc] = ip->i_number;
	if (ip->i_nextsnap.tqe_prev != 0)
		panic("ffs_snapshot: %d already on list", ip->i_number);
	snaphead = &ip->i_devvp->v_rdev->si_snapshots;
	TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
	ip->i_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
	ip->i_devvp->v_flag |= VCOPYONWRITE;
	vp->v_flag |= VSYSTEM;
out1:
	/*
	 * Resume operation on filesystem.
	 */
	vfs_write_resume(vp->v_mount);
	if (saved_nice > 0)
		td->td_ksegrp->kg_nice = saved_nice;
	vn_start_write(NULL, &wrtmp, V_WAIT);
	if (collectsnapstats && starttime.tv_sec > 0) {
		nanotime(&endtime);
		timespecsub(&endtime, &starttime);
		printf("%s: suspended %d.%03ld sec, redo %ld of %d\n",
		    vp->v_mount->mnt_stat.f_mntonname, endtime.tv_sec,
		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
	}
	if (sbp == NULL)
		goto out;
	/*
	 * Copy allocation information from all the snapshots in
	 * this snapshot and then expunge them from its view.
	 */
	snaphead = &ip->i_devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(xp, snaphead, i_nextsnap) {
		if (xp == ip)
			break;
		if ((error = expunge(vp, xp, fs, snapacct, BLK_SNAP)) != 0) {
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
	}
	/*
	 * Expunge the blocks used by the snapshots from the set of
	 * blocks marked as used in the snapshot bitmaps.
	 */
	if ((error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP)) != 0) {
		fs->fs_snapinum[snaploc] = 0;
		goto done;
	}
	/*
	 * Write the superblock and its summary information
	 * to the snapshot.
	 */
	blkno = fragstoblks(fs, fs->fs_csaddr);
	len = howmany(fs->fs_cssize, fs->fs_bsize);
	space = copy_fs->fs_csp;
	for (loc = 0; loc < len; loc++) {
		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
		if (error) {
			brelse(nbp);
			fs->fs_snapinum[snaploc] = 0;
			goto done;
		}
		bcopy(space, nbp->b_data, fs->fs_bsize);
		space = (char *)space + fs->fs_bsize;
		bawrite(nbp);
	}
done:
	free(copy_fs->fs_csp, M_UFSMNT);
	bawrite(sbp);
out:
	if (fs->fs_active != 0) {
		FREE(fs->fs_active, M_DEVBUF);
		fs->fs_active = 0;
	}
	mp->mnt_flag = flag;
	if (error)
		(void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	if (error)
		vput(vp);
	else
		VOP_UNLOCK(vp, 0, td);
	vn_finished_write(wrtmp);
	return (error);
}

/*
 * Copy a cylinder group map. All the unallocated blocks are marked
 * BLK_NOCOPY so that the snapshot knows that it need not copy them
 * if they are later written. If passno is 1, then this is a first
 * pass, so only setting needs to be done. If passno is 2, then this
 * is a revision to a previous pass which must be undone as the
 * replacement pass is done.
 */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;
	struct vnode *vp;
	struct buf *nbp;
	int passno;
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	int error, numblks, base, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				ip->i_db[loc] = BLK_NOCOPY;
			else if (passno == 2 && ip->i_db[loc] == BLK_NOCOPY)
				ip->i_db[loc] = 0;
			else if (passno == 1 && ip->i_db[loc] == BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
	if (error) {
		brelse(bp);
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		if (indiroff >= NINDIR(fs)) {
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error) {
				brelse(bp);
				return (error);
			}
			indiroff = 0;
		}
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
			 ((ufs_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			((ufs_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
			 ((ufs_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	bqrelse(bp);
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}

/*
 * Before expunging a snapshot inode, note all the
 * blocks that it claims with BLK_SNAP so that fsck will
 * be able to account for those blocks properly and so
 * that this snapshot knows that it need not copy them
 * if the other snapshot holding them is freed.
 */
static int
expunge(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
	    struct fs *, ufs_daddr_t, int);
	int expungetype;
{
	int i, len, error, numblks, blksperindir;
	ufs_daddr_t lbn, rlbn, blkno, indiroff;
	struct thread *td = curthread;
	struct dinode *dip;
	struct buf *bp;

	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_db[0],
	     &cancelip->i_ib[NIADDR], fs, 0, expungetype)))
		return (error);
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
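	/*
	 * Walk down each level of indirect blocks. Note that indirect
	 * blocks are addressed by negative logical block numbers (the
	 * usual UFS metadata convention), so lbn steps downward while
	 * rlbn tracks the first data block covered at each level.
	 */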
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct(snapvp, ITOV(cancelip), i, cancelip->i_ib[i],
		    lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = cancelip->i_db[lbn];
	} else {
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		   fs->fs_bsize, KERNCRED, B_METAONLY, &bp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
	    fs->fs_bsize, KERNCRED, 0, &bp);
	if (error)
		return (error);
	if (blkno == 0 && (error = readblock(bp, lbn)))
		return (error);
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t));
	bdwrite(bp);
	return (0);
}

/*
 * Descend an indirect block chain for vnode cancelvp accounting for all
 * its indirect blocks in snapvp.
 */
static int
indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs,
	    acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs_daddr_t blkno;
	int lbn;
	int rlbn;
	int remblks;
	int blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
	    struct fs *, ufs_daddr_t, int);
	int expungetype;
{
	int subblksperindir, error, last, num, i;
	struct indir indirs[NIADDR + 2];
	ufs_daddr_t *bap;
	struct buf *bp;

	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, rlbn, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}

/*
 * Do both snap accounting and map accounting.
 */
static int
fullacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_daddr_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	int error;

	if ((error = snapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)))
		return (error);
	return (mapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype));
}

/*
 * Identify a set of blocks allocated in a snapshot inode.
 */
static int
snapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_daddr_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs_daddr_t lbn, blkno, *blkp;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}

/*
 * Account for a set of blocks allocated in a snapshot inode.
 */
static int
mapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_daddr_t lblkno;
	int expungetype;
{
	ufs_daddr_t blkno;
	ino_t inum;

	inum = VTOI(vp)->i_number;
	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
		blkno = *oldblkp;
		if (blkno == 0 || blkno == BLK_NOCOPY)
			continue;
		if (blkno == BLK_SNAP)
			blkno = blkstofrags(fs, lblkno);
		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
	}
	return (0);
}

/*
 * Decrement extra reference on snapshot when last name is removed.
 * It will not be freed until the last open reference goes away.
 */
void
ffs_snapgone(ip)
	struct inode *ip;
{
	struct inode *xp;
	struct fs *fs;
	int snaploc;

	/*
	 * Find snapshot in incore list.
	 */
	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
		if (xp == ip)
			break;
	if (xp == 0)
		printf("ffs_snapgone: lost snapshot vnode %d\n",
		    ip->i_number);
	else
		vrele(ITOV(ip));
	/*
	 * Delete snapshot inode from superblock. Keep list dense.
	 */
	fs = ip->i_fs;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
		if (fs->fs_snapinum[snaploc] == ip->i_number)
			break;
	if (snaploc < FSMAXSNAP) {
		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
			if (fs->fs_snapinum[snaploc] == 0)
				break;
			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
		}
		fs->fs_snapinum[snaploc - 1] = 0;
	}
}

/*
 * Prepare a snapshot file for being removed.
 */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct buf *ibp;
	struct fs *fs;
	ufs_daddr_t blkno, dblk;
	int error, numblks, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		devvp = ip->i_devvp;
		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) == 0) {
			devvp->v_rdev->si_copyonwrite = 0;
			devvp->v_flag &= ~VCOPYONWRITE;
		}
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = ip->i_db[blkno];
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			ip->i_db[blkno] = 0;
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			ip->i_blocks -= btodb(fs->fs_bsize);
			ip->i_db[blkno] = 0;
		}
	}
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
		if (error)
			continue;
		if ((last = fs->fs_size - blkno) > NINDIR(fs))
			last = NINDIR(fs);
		for (loc = 0; loc < last; loc++) {
			dblk = ((ufs_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
			     fs->fs_bsize, ip->i_number))) {
				ip->i_blocks -= btodb(fs->fs_bsize);
				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}

/*
 * Notification that a block is being freed. Return zero if the free
 * should be allowed to proceed. Return non-zero if the snapshot file
 * wants to claim the block. The block will be claimed if it is an
 * uncopied part of one of the snapshots. It will be freed if it is
 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
 * If a fragment is being freed, then all snapshots that care about
 * it must make a copy since a snapshot file can only claim full sized
 * blocks. Note that if more than one snapshot file maps the block,
 * we can pick one at random to claim it. Since none of the snapshots
 * can change, we are assured that they will all see the same unmodified
 * image. When deleting a snapshot file (see ffs_snapremove above), we
 * must push any of these claimed blocks to one of the other snapshots
 * that maps it. These claimed blocks are easily identified as they will
 * have a block number equal to their logical block number within the
 * snapshot. A copied block can never have this property because they
 * must always have been allocated from a BLK_NOCOPY location.
 */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;
	ufs_daddr_t bno;
	long size;
	ino_t inum;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff = 0, error = 0, claimedblk = 0;
	struct snaphead *snaphead;

	lbn = fragstoblks(fs, bno);
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = ip->i_db[lbn];
		} else {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			VOP_UNLOCK(vp, 0, td);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		switch (blkno) {
		/*
		 * If the snapshot has already copied the block (default),
		 * or does not care about the block, it is not needed.
		 */
		default:
		case BLK_NOCOPY:
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		/*
		 * No previous snapshot claimed the block, so it will be
		 * freed and become a BLK_NOCOPY (don't care) for us.
		 */
		case BLK_SNAP:
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			if (lbn < NDADDR) {
				ip->i_db[lbn] = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else {
				((ufs_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			VOP_UNLOCK(vp, 0, td);
			continue;
		/*
		 * A block that we map is being freed. If it has not been
		 * claimed yet, we will claim or copy it (below).
		 */
		case 0:
			claimedblk = 1;
			break;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %d from inum %d\n",
				    "Grabonremove: snapino", ip->i_number, lbn,
				    inum);
#endif
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			if (lbn < NDADDR) {
				ip->i_db[lbn] = bno;
			} else {
				((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			ip->i_blocks += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error) {
			VOP_UNLOCK(vp, 0, td);
			break;
		}
#ifdef DEBUG
		if (snapdebug)
			printf(
"Copyonremove: snapino %lu lbn %ld for inum %lu size %ld to blkno %lld\n",
			    (unsigned long)ip->i_number, (long)lbn,
			    (unsigned long)inum, size, (long long)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			break;
		}
		VOP_UNLOCK(vp, 0, td);
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
		}
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	return (error);
}

/*
 * Associate snapshot files when mounting.
 */
void
ffs_snapshot_mount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct fs *fs = ump->um_fs;
	struct thread *td = curthread;
	struct snaphead *snaphead;
	struct vnode *vp;
	struct inode *ip;
	int error, snaploc, loc;

	snaphead = &ump->um_devvp->v_rdev->si_snapshots;
	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
		if (fs->fs_snapinum[snaploc] == 0)
			return;
		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
		    LK_EXCLUSIVE, &vp)) != 0) {
			printf("ffs_snapshot_mount: vget failed %d\n", error);
			continue;
		}
		ip = VTOI(vp);
		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
			    fs->fs_snapinum[snaploc]);
			vput(vp);
			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
				if (fs->fs_snapinum[loc] == 0)
					break;
				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
			}
			fs->fs_snapinum[loc - 1] = 0;
			snaploc--;
			continue;
		}
		if (ip->i_nextsnap.tqe_prev != 0)
			panic("ffs_snapshot_mount: %d already on list",
			    ip->i_number);
		else
			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
		vp->v_flag |= VSYSTEM;
		ump->um_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
		ump->um_devvp->v_flag |= VCOPYONWRITE;
		VOP_UNLOCK(vp, 0, td);
	}
}

/*
 * Disassociate snapshot files when unmounting.
 */
void
ffs_snapshot_unmount(mp)
	struct mount *mp;
{
	struct ufsmount *ump = VFSTOUFS(mp);
	struct snaphead *snaphead = &ump->um_devvp->v_rdev->si_snapshots;
	struct inode *xp;

	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
		xp->i_nextsnap.tqe_prev = 0;
		if (xp->i_effnlink > 0)
			vrele(ITOV(xp));
	}
	ump->um_devvp->v_rdev->si_copyonwrite = 0;
	ump->um_devvp->v_flag &= ~VCOPYONWRITE;
}

/*
 * Check for need to copy block that is about to be written,
 * copying the block if necessary.
 */
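/*
 * This routine is installed above as the device's si_copyonwrite hook
 * (with VCOPYONWRITE set on the device vnode), so the device write
 * path calls it for each write to the filesystem's device while
 * snapshots exist.
 */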
static int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp;
	ufs_daddr_t lbn, blkno;
	int indiroff, error = 0;

	fs = TAILQ_FIRST(&devvp->v_rdev->si_snapshots)->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	if (td->td_proc->p_flag & P_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	TAILQ_FOREACH(ip, &devvp->v_rdev->si_snapshots, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
		/*
		 * Check to see if block needs to be copied. We have to
		 * be able to do the UFS_BALLOC without blocking, otherwise
		 * we may get in a deadlock with another process also
		 * trying to allocate. If we find ourselves unable to
		 * get the buffer lock, we unlock the snapshot vnode,
		 * sleep briefly, and try again.
		 */
retry:
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		if (lbn < NDADDR) {
			blkno = ip->i_db[lbn];
		} else {
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			   fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			if (error) {
				VOP_UNLOCK(vp, 0, td);
				if (error != EWOULDBLOCK)
					break;
				tsleep(vp, td->td_ksegrp->kg_user_pri, "nap", 1);
				goto retry;
			}
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		if (blkno != 0) {
			VOP_UNLOCK(vp, 0, td);
			continue;
		}
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error) {
			VOP_UNLOCK(vp, 0, td);
			if (error != EWOULDBLOCK)
				break;
			tsleep(vp, td->td_ksegrp->kg_user_pri, "nap", 1);
			goto retry;
		}
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %d for ",
			    ip->i_number, lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %lld to blkno %lld\n",
			    (long long)bp->b_lblkno, (long long)cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			break;
		}
		savedcbp = cbp;
		VOP_UNLOCK(vp, 0, td);
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
		}
	}
	return (error);
}

/*
 * Read the specified block into the given buffer.
 * Much of this boiler-plate comes from bwrite().
 */
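/*
 * Note that the read is issued with physio() against the raw device;
 * going through bread() here could deadlock on buffers that the
 * caller already holds locked (compare the remark in indiracct()).
 */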
static int
readblock(bp, lbn)
	struct buf *bp;
	daddr_t lbn;
{
	struct uio auio;
	struct iovec aiov;
	struct thread *td = curthread;
	struct inode *ip = VTOI(bp->b_vp);

	aiov.iov_base = bp->b_data;
	aiov.iov_len = bp->b_bcount;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
	auio.uio_resid = bp->b_bcount;
	auio.uio_rw = UIO_READ;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_td = td;
	return (physio(ip->i_devvp->v_rdev, &auio, 0));
}