xref: /freebsd/sys/ufs/ffs/ffs_snapshot.c (revision a3e8fd0b7f663db7eafff527d5c3ca3bcfa8a537)
1 /*
2  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
3  *
4  * Further information about snapshots can be obtained from:
5  *
6  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
7  *	1614 Oxford Street		mckusick@mckusick.com
8  *	Berkeley, CA 94709-1608		+1-510-843-9542
9  *	USA
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  *
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
22  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
25  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
34  * $FreeBSD$
35  */
36 
37 #include <sys/param.h>
38 #include <sys/stdint.h>
39 #include <sys/kernel.h>
40 #include <sys/systm.h>
41 #include <sys/conf.h>
42 #include <sys/bio.h>
43 #include <sys/buf.h>
44 #include <sys/proc.h>
45 #include <sys/namei.h>
46 #include <sys/stat.h>
47 #include <sys/malloc.h>
48 #include <sys/mount.h>
49 #include <sys/resource.h>
50 #include <sys/resourcevar.h>
51 #include <sys/vnode.h>
52 
53 #include <ufs/ufs/extattr.h>
54 #include <ufs/ufs/quota.h>
55 #include <ufs/ufs/ufsmount.h>
56 #include <ufs/ufs/inode.h>
57 #include <ufs/ufs/ufs_extern.h>
58 
59 #include <ufs/ffs/fs.h>
60 #include <ufs/ffs/ffs_extern.h>
61 
62 #define KERNCRED thread0.td_ucred
63 #define DEBUG 1
64 
65 static int cgaccount(int, struct vnode *, struct buf *, int);
66 static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
67     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
68     ufs_lbn_t, int), int);
69 static int indiracct_ufs1(struct vnode *, struct vnode *, int,
70     ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
71     int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
72     ufs_lbn_t, int), int);
73 static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
74     struct fs *, ufs_lbn_t, int);
75 static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
76     struct fs *, ufs_lbn_t, int);
77 static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
78     struct fs *, ufs_lbn_t, int);
79 static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
80     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
81     ufs_lbn_t, int), int);
82 static int indiracct_ufs2(struct vnode *, struct vnode *, int,
83     ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
84     int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
85     ufs_lbn_t, int), int);
86 static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
87     struct fs *, ufs_lbn_t, int);
88 static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
89     struct fs *, ufs_lbn_t, int);
90 static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
91     struct fs *, ufs_lbn_t, int);
92 static int ffs_copyonwrite(struct vnode *, struct buf *);
93 static int readblock(struct buf *, ufs2_daddr_t);
94 
95 /*
96  * To ensure the consistency of snapshots across crashes, we must
97  * synchronously write out copied blocks before allowing the
98  * originals to be modified. Because of the rather severe speed
99  * penalty that this imposes, the following flag allows this
100  * crash persistence to be disabled.
101  */
/* Non-zero forces synchronous writes of copied blocks (see comment above). */
int dopersistence = 0;

#ifdef DEBUG
#include <sys/sysctl.h>
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
/* Non-zero enables verbose logging of snapshot activity (e.g. busy vnodes). */
int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
/* Non-zero records and prints how long the filesystem stayed suspended. */
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
	0, "");
#endif /* DEBUG */
113 
114 /*
115  * Create a snapshot file and initialize it for the filesystem.
116  */
117 int
118 ffs_snapshot(mp, snapfile)
119 	struct mount *mp;
120 	char *snapfile;
121 {
122 	ufs2_daddr_t numblks, blkno;
123 	int error, cg, snaploc;
124 	int i, size, len, loc;
125 	int flag = mp->mnt_flag;
126 	struct timespec starttime = {0, 0}, endtime;
127 	char saved_nice = 0;
128 	long redo = 0;
129 	int32_t *lp;
130 	void *space;
131 	daddr_t *listhd;
132 	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
133 	struct snaphead *snaphead;
134 	struct thread *td = curthread;
135 	struct inode *ip, *xp;
136 	struct buf *bp, *nbp, *ibp, *sbp = NULL;
137 	struct nameidata nd;
138 	struct mount *wrtmp;
139 	struct vattr vat;
140 	struct vnode *vp, *xvp, *nvp;
141 	struct uio auio;
142 	struct iovec aiov;
143 
144 	/*
145 	 * Need to serialize access to snapshot code per filesystem.
146 	 */
147 	/*
148 	 * Assign a snapshot slot in the superblock.
149 	 */
150 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
151 		if (fs->fs_snapinum[snaploc] == 0)
152 			break;
153 	if (snaploc == FSMAXSNAP)
154 		return (ENOSPC);
155 	/*
156 	 * Create the snapshot file.
157 	 */
158 restart:
159 	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td);
160 	if ((error = namei(&nd)) != 0)
161 		return (error);
162 	if (nd.ni_vp != NULL) {
163 		vput(nd.ni_vp);
164 		error = EEXIST;
165 	}
166 	if (nd.ni_dvp->v_mount != mp)
167 		error = EXDEV;
168 	if (error) {
169 		NDFREE(&nd, NDF_ONLY_PNBUF);
170 		if (nd.ni_dvp == nd.ni_vp)
171 			vrele(nd.ni_dvp);
172 		else
173 			vput(nd.ni_dvp);
174 		return (error);
175 	}
176 	VATTR_NULL(&vat);
177 	vat.va_type = VREG;
178 	vat.va_mode = S_IRUSR;
179 	vat.va_vaflags |= VA_EXCLUSIVE;
180 	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
181 		wrtmp = NULL;
182 	if (wrtmp != mp)
183 		panic("ffs_snapshot: mount mismatch");
184 	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
185 		NDFREE(&nd, NDF_ONLY_PNBUF);
186 		vput(nd.ni_dvp);
187 		if ((error = vn_start_write(NULL, &wrtmp,
188 		    V_XSLEEP | PCATCH)) != 0)
189 			return (error);
190 		goto restart;
191 	}
192 	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
193 	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
194 	vput(nd.ni_dvp);
195 	if (error) {
196 		NDFREE(&nd, NDF_ONLY_PNBUF);
197 		vn_finished_write(wrtmp);
198 		return (error);
199 	}
200 	vp = nd.ni_vp;
201 	ip = VTOI(vp);
202 	/*
203 	 * Allocate and copy the last block contents so as to be able
204 	 * to set size to that of the filesystem.
205 	 */
206 	numblks = howmany(fs->fs_size, fs->fs_frag);
207 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
208 	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
209 	if (error)
210 		goto out;
211 	ip->i_size = lblktosize(fs, (off_t)numblks);
212 	DIP(ip, i_size) = ip->i_size;
213 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
214 	if ((error = readblock(bp, numblks - 1)) != 0)
215 		goto out;
216 	bawrite(bp);
217 	/*
218 	 * Preallocate critical data structures so that we can copy
219 	 * them in without further allocation after we suspend all
220 	 * operations on the filesystem. We would like to just release
221 	 * the allocated buffers without writing them since they will
222 	 * be filled in below once we are ready to go, but this upsets
223 	 * the soft update code, so we go ahead and write the new buffers.
224 	 *
225 	 * Allocate all indirect blocks and mark all of them as not
226 	 * needing to be copied.
227 	 */
228 	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
229 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
230 		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
231 		if (error)
232 			goto out;
233 		bdwrite(ibp);
234 	}
235 	/*
236 	 * Allocate copies for the superblock and its summary information.
237 	 */
238 	error = UFS_BALLOC(vp, lfragtosize(fs, fs->fs_sblockloc),
239 	    fs->fs_sbsize, KERNCRED, 0, &nbp);
240 	if (error)
241 		goto out;
242 	bawrite(nbp);
243 	blkno = fragstoblks(fs, fs->fs_csaddr);
244 	len = howmany(fs->fs_cssize, fs->fs_bsize);
245 	for (loc = 0; loc < len; loc++) {
246 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
247 		    fs->fs_bsize, KERNCRED, 0, &nbp);
248 		if (error)
249 			goto out;
250 		bawrite(nbp);
251 	}
252 	/*
253 	 * Allocate all cylinder group blocks.
254 	 */
255 	for (cg = 0; cg < fs->fs_ncg; cg++) {
256 		error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
257 		    fs->fs_bsize, KERNCRED, 0, &nbp);
258 		if (error)
259 			goto out;
260 		bdwrite(nbp);
261 	}
262 	/*
263 	 * Copy all the cylinder group maps. Although the
264 	 * filesystem is still active, we hope that only a few
265 	 * cylinder groups will change between now and when we
266 	 * suspend operations. Thus, we will be able to quickly
267 	 * touch up the few cylinder groups that changed during
268 	 * the suspension period.
269 	 */
270 	len = howmany(fs->fs_ncg, NBBY);
271 	MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK);
272 	bzero(fs->fs_active, len);
273 	for (cg = 0; cg < fs->fs_ncg; cg++) {
274 		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
275 		    KERNCRED, &nbp);
276 		if (error) {
277 			brelse(nbp);
278 			goto out;
279 		}
280 		error = cgaccount(cg, vp, nbp, 1);
281 		bawrite(nbp);
282 		if (error)
283 			goto out;
284 	}
285 	/*
286 	 * Change inode to snapshot type file.
287 	 */
288 	ip->i_flags |= SF_SNAPSHOT;
289 	DIP(ip, i_flags) = ip->i_flags;
290 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
291 	/*
292 	 * Ensure that the snapshot is completely on disk.
293 	 */
294 	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0)
295 		goto out;
296 	/*
297 	 * All allocations are done, so we can now snapshot the system.
298 	 *
299 	 * Recind nice scheduling while running with the filesystem suspended.
300 	 */
301 	if (td->td_ksegrp->kg_nice > 0) {
302 		saved_nice = td->td_ksegrp->kg_nice;
303 		td->td_ksegrp->kg_nice = 0;
304 	}
305 	/*
306 	 * Suspend operation on filesystem.
307 	 */
308 	for (;;) {
309 		vn_finished_write(wrtmp);
310 		if ((error = vfs_write_suspend(vp->v_mount)) != 0) {
311 			vn_start_write(NULL, &wrtmp, V_WAIT);
312 			goto out;
313 		}
314 		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
315 			break;
316 		vn_start_write(NULL, &wrtmp, V_WAIT);
317 	}
318 	if (collectsnapstats)
319 		nanotime(&starttime);
320 	/*
321 	 * First, copy all the cylinder group maps that have changed.
322 	 */
323 	for (cg = 0; cg < fs->fs_ncg; cg++) {
324 		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
325 			continue;
326 		redo++;
327 		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
328 			KERNCRED, &nbp);
329 		if (error) {
330 			brelse(nbp);
331 			goto out1;
332 		}
333 		error = cgaccount(cg, vp, nbp, 2);
334 		bawrite(nbp);
335 		if (error)
336 			goto out1;
337 	}
338 	/*
339 	 * Grab a copy of the superblock and its summary information.
340 	 * We delay writing it until the suspension is released below.
341 	 */
342 	error = bread(vp, fragstoblks(fs, fs->fs_sblockloc), fs->fs_bsize,
343 	    KERNCRED, &sbp);
344 	if (error) {
345 		brelse(sbp);
346 		sbp = NULL;
347 		goto out1;
348 	}
349 	loc = blkoff(fs, lfragtosize(fs, fs->fs_sblockloc));
350 	copy_fs = (struct fs *)(sbp->b_data + loc);
351 	bcopy(fs, copy_fs, fs->fs_sbsize);
352 	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
353 		copy_fs->fs_clean = 1;
354 	if (fs->fs_sbsize < SBLOCKSIZE)
355 		bzero(&sbp->b_data[loc + fs->fs_sbsize],
356 		    SBLOCKSIZE - fs->fs_sbsize);
357 	size = blkroundup(fs, fs->fs_cssize);
358 	if (fs->fs_contigsumsize > 0)
359 		size += fs->fs_ncg * sizeof(int32_t);
360 	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
361 	copy_fs->fs_csp = space;
362 	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
363 	(char *)space += fs->fs_cssize;
364 	loc = howmany(fs->fs_cssize, fs->fs_fsize);
365 	i = fs->fs_frag - loc % fs->fs_frag;
366 	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
367 	if (len > 0) {
368 		if ((error = bread(ip->i_devvp,
369 		    fsbtodb(fs, fs->fs_csaddr + loc),
370 		    len, KERNCRED, &bp)) != 0) {
371 			brelse(bp);
372 			free(copy_fs->fs_csp, M_UFSMNT);
373 			bawrite(sbp);
374 			sbp = NULL;
375 			goto out1;
376 		}
377 		bcopy(bp->b_data, space, (u_int)len);
378 		(char *)space += len;
379 		bp->b_flags |= B_INVAL | B_NOCACHE;
380 		brelse(bp);
381 	}
382 	if (fs->fs_contigsumsize > 0) {
383 		copy_fs->fs_maxcluster = lp = space;
384 		for (i = 0; i < fs->fs_ncg; i++)
385 			*lp++ = fs->fs_contigsumsize;
386 	}
387 	/*
388 	 * We must check for active files that have been unlinked
389 	 * (e.g., with a zero link count). We have to expunge all
390 	 * trace of these files from the snapshot so that they are
391 	 * not reclaimed prematurely by fsck or unnecessarily dumped.
392 	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
393 	 * spec_strategy about writing on a suspended filesystem.
394 	 * Note that we skip unlinked snapshot files as they will
395 	 * be handled separately below.
396 	 */
397 	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
398 	mtx_lock(&mntvnode_mtx);
399 loop:
400 	for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) {
401 		/*
402 		 * Make sure this vnode wasn't reclaimed in getnewvnode().
403 		 * Start over if it has (it won't be on the list anymore).
404 		 */
405 		if (xvp->v_mount != mp)
406 			goto loop;
407 		nvp = TAILQ_NEXT(xvp, v_nmntvnodes);
408 		mtx_unlock(&mntvnode_mtx);
409 		mp_fixme("Unlocked GETATTR.");
410 		if (vrefcnt(xvp) == 0 || xvp->v_type == VNON ||
411 		    (VTOI(xvp)->i_flags & SF_SNAPSHOT) ||
412 		    (VOP_GETATTR(xvp, &vat, td->td_proc->p_ucred, td) == 0 &&
413 		    vat.va_nlink > 0)) {
414 			mtx_lock(&mntvnode_mtx);
415 			continue;
416 		}
417 		if (snapdebug)
418 			vprint("ffs_snapshot: busy vnode", xvp);
419 		if (vn_lock(xvp, LK_EXCLUSIVE, td) != 0)
420 			goto loop;
421 		xp = VTOI(xvp);
422 		/*
423 		 * If there is a fragment, clear it here.
424 		 */
425 		blkno = 0;
426 		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
427 		if (loc < NDADDR) {
428 			len = fragroundup(fs, blkoff(fs, xp->i_size));
429 			if (len < fs->fs_bsize) {
430 				ffs_blkfree(copy_fs, vp, DIP(xp, i_db[loc]),
431 				    len, xp->i_number);
432 				blkno = DIP(xp, i_db[loc]);
433 				DIP(xp, i_db[loc]) = 0;
434 			}
435 		}
436 		if (xp->i_ump->um_fstype == UFS1)
437 			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
438 			    BLK_NOCOPY);
439 		else
440 			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
441 			    BLK_NOCOPY);
442 		if (blkno)
443 			DIP(xp, i_db[loc]) = blkno;
444 		if (!error)
445 			error = ffs_freefile(copy_fs, vp, xp->i_number,
446 			    xp->i_mode);
447 		VOP_UNLOCK(xvp, 0, td);
448 		if (error) {
449 			free(copy_fs->fs_csp, M_UFSMNT);
450 			bawrite(sbp);
451 			sbp = NULL;
452 			goto out1;
453 		}
454 		mtx_lock(&mntvnode_mtx);
455 	}
456 	mtx_unlock(&mntvnode_mtx);
457 	/*
458 	 * If there already exist snapshots on this filesystem, grab a
459 	 * reference to their shared lock. If this is the first snapshot
460 	 * on this filesystem, we need to allocate a lock for the snapshots
461 	 * to share. In either case, acquire the snapshot lock and give
462 	 * up our original private lock.
463 	 */
464 	snaphead = &ip->i_devvp->v_rdev->si_snapshots;
465 	if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
466 		VI_LOCK(vp);
467 		vp->v_vnlock = ITOV(xp)->v_vnlock;
468 	} else {
469 		struct lock *lkp;
470 
471 		MALLOC(lkp, struct lock *, sizeof(struct lock), M_UFSMNT,
472 		    M_WAITOK);
473 		lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
474 		    LK_CANRECURSE | LK_NOPAUSE);
475 		VI_LOCK(vp);
476 		vp->v_vnlock = lkp;
477 	}
478 	vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
479 	VI_LOCK(vp);
480 	lockmgr(&vp->v_lock, LK_INTERLOCK | LK_RELEASE, VI_MTX(vp), td);
481 	/*
482 	 * Record snapshot inode. Since this is the newest snapshot,
483 	 * it must be placed at the end of the list.
484 	 */
485 	fs->fs_snapinum[snaploc] = ip->i_number;
486 	if (ip->i_nextsnap.tqe_prev != 0)
487 		panic("ffs_snapshot: %d already on list", ip->i_number);
488 	ASSERT_VOP_LOCKED(ip->i_devvp, "ffs_snapshot devvp");
489 	TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
490 	ip->i_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
491 	ip->i_devvp->v_vflag |= VV_COPYONWRITE;
492 	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
493 	vp->v_vflag |= VV_SYSTEM;
494 out1:
495 	/*
496 	 * Resume operation on filesystem.
497 	 */
498 	vfs_write_resume(vp->v_mount);
499 	vn_start_write(NULL, &wrtmp, V_WAIT);
500 	if (collectsnapstats && starttime.tv_sec > 0) {
501 		nanotime(&endtime);
502 		timespecsub(&endtime, &starttime);
503 		printf("%s: suspended %d.%03ld sec, redo %ld of %d\n",
504 		    vp->v_mount->mnt_stat.f_mntonname, endtime.tv_sec,
505 		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
506 	}
507 	if (sbp == NULL)
508 		goto out;
509 	/*
510 	 * Copy allocation information from all the snapshots in
511 	 * this snapshot and then expunge them from its view.
512 	 */
513 	snaphead = &ip->i_devvp->v_rdev->si_snapshots;
514 	TAILQ_FOREACH(xp, snaphead, i_nextsnap) {
515 		if (xp == ip)
516 			break;
517 		if (xp->i_ump->um_fstype == UFS1)
518 			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
519 			    BLK_SNAP);
520 		else
521 			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
522 			    BLK_SNAP);
523 		if (error) {
524 			fs->fs_snapinum[snaploc] = 0;
525 			goto done;
526 		}
527 	}
528 	/*
529 	 * Allocate the space for the list of preallocated snapshot blocks.
530 	 */
531 	ip->i_snaplistsize = fragstoblks(fs, dbtofsb(fs, DIP(ip,i_blocks))) + 1;
532 	MALLOC(listhd, daddr_t *, ip->i_snaplistsize * sizeof(daddr_t),
533 	    M_UFSMNT, M_WAITOK);
534 	ip->i_snapblklist = listhd;
535 	*ip->i_snapblklist++ = ip->i_snaplistsize;
536 	/*
537 	 * Expunge the blocks used by the snapshots from the set of
538 	 * blocks marked as used in the snapshot bitmaps. Also, collect
539 	 * the list of allocated blocks in i_snapblklist.
540 	 */
541 	if (ip->i_ump->um_fstype == UFS1)
542 		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP);
543 	else
544 		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP);
545 	if (error) {
546 		fs->fs_snapinum[snaploc] = 0;
547 		FREE(listhd, M_UFSMNT);
548 		goto done;
549 	}
550 	/*
551 	 * Write out the list of allocated blocks to the end of the snapshot.
552 	 */
553 	if (ip->i_snapblklist - listhd != ip->i_snaplistsize)
554 		printf("Snaplist mismatch, got %jd should be %jd\n",
555 		    (intmax_t)(ip->i_snapblklist - listhd),
556 		    (intmax_t)ip->i_snaplistsize);
557 	auio.uio_iov = &aiov;
558 	auio.uio_iovcnt = 1;
559 	aiov.iov_base = (void *)listhd;
560 	aiov.iov_len = ip->i_snaplistsize * sizeof(daddr_t);
561 	auio.uio_resid = aiov.iov_len;;
562 	auio.uio_offset = ip->i_size;
563 	auio.uio_segflg = UIO_SYSSPACE;
564 	auio.uio_rw = UIO_WRITE;
565 	auio.uio_td = td;
566 	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
567 		fs->fs_snapinum[snaploc] = 0;
568 		FREE(listhd, M_UFSMNT);
569 		goto done;
570 	}
571 	ip->i_snapblklist = listhd;
572 	/*
573 	 * Write the superblock and its summary information
574 	 * to the snapshot.
575 	 */
576 	blkno = fragstoblks(fs, fs->fs_csaddr);
577 	len = howmany(fs->fs_cssize, fs->fs_bsize);
578 	space = copy_fs->fs_csp;
579 	for (loc = 0; loc < len; loc++) {
580 		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
581 		if (error) {
582 			brelse(nbp);
583 			fs->fs_snapinum[snaploc] = 0;
584 			FREE(listhd, M_UFSMNT);
585 			ip->i_snapblklist = NULL;
586 			goto done;
587 		}
588 		bcopy(space, nbp->b_data, fs->fs_bsize);
589 		space = (char *)space + fs->fs_bsize;
590 		bawrite(nbp);
591 	}
592 done:
593 	free(copy_fs->fs_csp, M_UFSMNT);
594 	bawrite(sbp);
595 out:
596 	if (saved_nice > 0)
597 		td->td_ksegrp->kg_nice = saved_nice;
598 	if (fs->fs_active != 0) {
599 		FREE(fs->fs_active, M_DEVBUF);
600 		fs->fs_active = 0;
601 	}
602 	mp->mnt_flag = flag;
603 	if (error)
604 		(void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
605 	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
606 	if (error)
607 		vput(vp);
608 	else
609 		VOP_UNLOCK(vp, 0, td);
610 	vn_finished_write(wrtmp);
611 	return (error);
612 }
613 
614 /*
615  * Copy a cylinder group map. All the unallocated blocks are marked
616  * BLK_NOCOPY so that the snapshot knows that it need not copy them
617  * if they are later written. If passno is one, then this is a first
618  * pass, so only setting needs to be done. If passno is 2, then this
619  * is a revision to a previous pass which must be undone as the
620  * replacement pass is done.
621  */
static int
cgaccount(cg, vp, nbp, passno)
	int cg;
	struct vnode *vp;
	struct buf *nbp;
	int passno;
{
	struct buf *bp, *ibp;
	struct inode *ip;
	struct cg *cgp;
	struct fs *fs;
	ufs2_daddr_t base, numblks;
	int error, len, loc, indiroff;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/* Read the on-disk cylinder group and validate its magic number. */
	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
		(int)fs->fs_cgsize, KERNCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	cgp = (struct cg *)bp->b_data;
	if (!cg_chkmagic(cgp)) {
		brelse(bp);
		return (EIO);
	}
	/* Mark this cg as copied so the pass-2 sweep can skip it. */
	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
	/* Copy the map into the snapshot buffer, zero-padding any tail. */
	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
	if (fs->fs_cgsize < fs->fs_bsize)
		bzero(&nbp->b_data[fs->fs_cgsize],
		    fs->fs_bsize - fs->fs_cgsize);
	if (passno == 2)
		nbp->b_flags |= B_VALIDSUSPWRT;
	numblks = howmany(fs->fs_size, fs->fs_frag);
	len = howmany(fs->fs_fpg, fs->fs_frag);
	base = cg * fs->fs_fpg / fs->fs_frag;
	if (base + len >= numblks)
		len = numblks - base - 1;
	loc = 0;
	/*
	 * Free blocks in this cg that fall within the snapshot inode's
	 * direct block range are tagged BLK_NOCOPY; on pass 2, blocks
	 * that became allocated since pass 1 have the tag cleared.
	 */
	if (base < NDADDR) {
		for ( ; loc < NDADDR; loc++) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				DIP(ip, i_db[loc]) = BLK_NOCOPY;
			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
				DIP(ip, i_db[loc]) = 0;
			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
				panic("ffs_snapshot: lost direct block");
		}
	}
	/* The remaining blocks are mapped through indirect blocks. */
	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
	if (error) {
		brelse(bp);
		return (error);
	}
	indiroff = (base + loc - NDADDR) % NINDIR(fs);
	for ( ; loc < len; loc++, indiroff++) {
		/* Crossing into the next indirect block: flush and refetch. */
		if (indiroff >= NINDIR(fs)) {
			if (passno == 2)
				ibp->b_flags |= B_VALIDSUSPWRT;
			bawrite(ibp);
			error = UFS_BALLOC(vp,
			    lblktosize(fs, (off_t)(base + loc)),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error) {
				brelse(bp);
				return (error);
			}
			indiroff = 0;
		}
		if (ip->i_ump->um_fstype == UFS1) {
			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
			    [indiroff] == BLK_NOCOPY)
				panic("ffs_snapshot: lost indirect block");
			continue;
		}
		/* Same logic for UFS2-width block pointers. */
		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
		else if (passno == 2 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
		else if (passno == 1 &&
		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
			panic("ffs_snapshot: lost indirect block");
	}
	bqrelse(bp);
	if (passno == 2)
		ibp->b_flags |= B_VALIDSUSPWRT;
	bdwrite(ibp);
	return (0);
}
720 
721 /*
722  * Before expunging a snapshot inode, note all the
723  * blocks that it claims with BLK_SNAP so that fsck will
724  * be able to account for those blocks properly and so
725  * that this snapshot knows that it need not copy them
726  * if the other snapshot holding them is freed. This code
727  * is reproduced once each for UFS1 and UFS2.
728  */
729 static int
730 expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype)
731 	struct vnode *snapvp;
732 	struct inode *cancelip;
733 	struct fs *fs;
734 	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
735 	    struct fs *, ufs_lbn_t, int);
736 	int expungetype;
737 {
738 	int i, error, indiroff;
739 	ufs_lbn_t lbn, rlbn;
740 	ufs2_daddr_t len, blkno, numblks, blksperindir;
741 	struct ufs1_dinode *dip;
742 	struct thread *td = curthread;
743 	struct buf *bp;
744 
745 	numblks = howmany(cancelip->i_size, fs->fs_bsize);
746 	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
747 	    &cancelip->i_din1->di_ib[NIADDR], fs, 0, expungetype)))
748 		return (error);
749 	blksperindir = 1;
750 	lbn = -NDADDR;
751 	len = numblks - NDADDR;
752 	rlbn = NDADDR;
753 	for (i = 0; len > 0 && i < NIADDR; i++) {
754 		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
755 		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
756 		    blksperindir, fs, acctfunc, expungetype);
757 		if (error)
758 			return (error);
759 		blksperindir *= NINDIR(fs);
760 		lbn -= blksperindir + 1;
761 		len -= blksperindir;
762 		rlbn += blksperindir;
763 	}
764 	/*
765 	 * Prepare to expunge the inode. If its inode block has not
766 	 * yet been copied, then allocate and fill the copy.
767 	 */
768 	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
769 	blkno = 0;
770 	if (lbn < NDADDR) {
771 		blkno = cancelip->i_din1->di_db[lbn];
772 	} else {
773 		td->td_proc->p_flag |= P_COWINPROGRESS;
774 		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
775 		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
776 		td->td_proc->p_flag &= ~P_COWINPROGRESS;
777 		if (error)
778 			return (error);
779 		indiroff = (lbn - NDADDR) % NINDIR(fs);
780 		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
781 		bqrelse(bp);
782 	}
783 	error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
784 	    fs->fs_bsize, KERNCRED, 0, &bp);
785 	if (error)
786 		return (error);
787 	if (blkno == 0 && (error = readblock(bp, lbn)))
788 		return (error);
789 	/*
790 	 * Set a snapshot inode to be a zero length file, regular files
791 	 * to be completely unallocated.
792 	 */
793 	dip = (struct ufs1_dinode *)bp->b_data +
794 	    ino_to_fsbo(fs, cancelip->i_number);
795 	if (expungetype == BLK_NOCOPY)
796 		dip->di_mode = 0;
797 	dip->di_size = 0;
798 	dip->di_blocks = 0;
799 	dip->di_flags &= ~SF_SNAPSHOT;
800 	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
801 	bdwrite(bp);
802 	return (0);
803 }
804 
805 /*
806  * Descend an indirect block chain for vnode cancelvp accounting for all
807  * its indirect blocks in snapvp.
808  */
static int
indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;
	ufs1_daddr_t blkno;
	ufs_lbn_t lbn;
	ufs_lbn_t rlbn;
	ufs_lbn_t remblks;
	ufs_lbn_t blksperindir;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs1_daddr_t last, *bap;
	struct buf *bp;

	/* Sanity-check the indirect chain geometry we were handed. */
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 * Work on a private copy so the buffer can be released
	 * before the accounting function (and recursion) runs.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	MALLOC(bap, ufs1_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, rlbn, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}
876 
877 /*
878  * Do both snap accounting and map accounting.
879  */
880 static int
881 fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
882 	struct vnode *vp;
883 	ufs1_daddr_t *oldblkp, *lastblkp;
884 	struct fs *fs;
885 	ufs_lbn_t lblkno;
886 	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
887 {
888 	int error;
889 
890 	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
891 		return (error);
892 	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
893 }
894 
895 /*
896  * Identify a set of blocks allocated in a snapshot inode.
897  */
static int
snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs1_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs1_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		/* Skip holes and blocks already tagged for snapshot use. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		/*
		 * Find the pointer in the snapshot inode (direct slot or
		 * slot inside the covering indirect block) that maps the
		 * logical block holding this physical block.
		 */
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			blkp = &ip->i_din1->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs1_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}
947 
948 /*
949  * Account for a set of blocks allocated in a snapshot inode.
950  */
951 static int
952 mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
953 	struct vnode *vp;
954 	ufs1_daddr_t *oldblkp, *lastblkp;
955 	struct fs *fs;
956 	ufs_lbn_t lblkno;
957 	int expungetype;
958 {
959 	ufs1_daddr_t blkno;
960 	struct inode *ip;
961 	ino_t inum;
962 
963 	ip = VTOI(vp);
964 	inum = ip->i_number;
965 	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
966 		blkno = *oldblkp;
967 		if (blkno == 0 || blkno == BLK_NOCOPY)
968 			continue;
969 		if (expungetype == BLK_SNAP && blkno != BLK_SNAP)
970 			*ip->i_snapblklist++ = lblkno;
971 		if (blkno == BLK_SNAP)
972 			blkno = blkstofrags(fs, lblkno);
973 		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
974 	}
975 	return (0);
976 }
977 
978 /*
979  * Before expunging a snapshot inode, note all the
980  * blocks that it claims with BLK_SNAP so that fsck will
981  * be able to account for those blocks properly and so
982  * that this snapshot knows that it need not copy them
983  * if the other snapshot holding them is freed. This code
984  * is reproduced once each for UFS1 and UFS2.
985  */
static int
expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct inode *cancelip;
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int i, error, indiroff;
	ufs_lbn_t lbn, rlbn;
	ufs2_daddr_t len, blkno, numblks, blksperindir;
	struct ufs2_dinode *dip;
	struct thread *td = curthread;
	struct buf *bp;

	/*
	 * Account for all of cancelip's direct and indirect block
	 * pointers in one pass; the range &di_db[0] .. &di_ib[NIADDR]
	 * spans both arrays, which sit back to back in the dinode.
	 */
	numblks = howmany(cancelip->i_size, fs->fs_bsize);
	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
	    &cancelip->i_din2->di_ib[NIADDR], fs, 0, expungetype)))
		return (error);
	/*
	 * Descend each level of indirection. lbn walks the negative
	 * metadata block numbers of the indirect blocks, rlbn the first
	 * data block covered at each level, len the data blocks left.
	 */
	blksperindir = 1;
	lbn = -NDADDR;
	len = numblks - NDADDR;
	rlbn = NDADDR;
	for (i = 0; len > 0 && i < NIADDR; i++) {
		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
		    blksperindir, fs, acctfunc, expungetype);
		if (error)
			return (error);
		blksperindir *= NINDIR(fs);
		lbn -= blksperindir + 1;
		len -= blksperindir;
		rlbn += blksperindir;
	}
	/*
	 * Prepare to expunge the inode. If its inode block has not
	 * yet been copied, then allocate and fill the copy.
	 */
	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
	blkno = 0;
	if (lbn < NDADDR) {
		blkno = cancelip->i_din2->di_db[lbn];
	} else {
		/*
		 * P_COWINPROGRESS flags that we are allocating within a
		 * snapshot so this write is not itself copied-on-write
		 * (ffs_copyonwrite panics on recursion).
		 */
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			return (error);
		indiroff = (lbn - NDADDR) % NINDIR(fs);
		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
		bqrelse(bp);
	}
	error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
	    fs->fs_bsize, KERNCRED, 0, &bp);
	if (error)
		return (error);
	/* blkno == 0 means the inode block was never copied: read it now. */
	if (blkno == 0 && (error = readblock(bp, lbn)))
		return (error);
	/*
	 * Set a snapshot inode to be a zero length file, regular files
	 * to be completely unallocated.
	 */
	dip = (struct ufs2_dinode *)bp->b_data +
	    ino_to_fsbo(fs, cancelip->i_number);
	if (expungetype == BLK_NOCOPY)
		dip->di_mode = 0;
	dip->di_size = 0;
	dip->di_blocks = 0;
	dip->di_flags &= ~SF_SNAPSHOT;
	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
	bdwrite(bp);
	return (0);
}
1061 
1062 /*
1063  * Descend an indirect block chain for vnode cancelvp accounting for all
1064  * its indirect blocks in snapvp.
1065  */
static int
indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
	    blksperindir, fs, acctfunc, expungetype)
	struct vnode *snapvp;
	struct vnode *cancelvp;
	int level;		/* remaining levels of indirection below us */
	ufs2_daddr_t blkno;	/* physical address of this indirect block */
	ufs_lbn_t lbn;		/* metadata lbn of this indirect block */
	ufs_lbn_t rlbn;		/* first data block covered by this block */
	ufs_lbn_t remblks;	/* data blocks remaining in the file */
	ufs_lbn_t blksperindir;	/* data blocks covered per pointer here */
	struct fs *fs;
	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
	    struct fs *, ufs_lbn_t, int);
	int expungetype;
{
	int error, num, i;
	ufs_lbn_t subblksperindir;
	struct indir indirs[NIADDR + 2];
	ufs2_daddr_t last, *bap;
	struct buf *bp;

	/* Cross-check the caller's lbn against the canonical indir path. */
	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
		return (error);
	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
		panic("indiracct: botched params");
	/*
	 * We have to expand bread here since it will deadlock looking
	 * up the block number for any blocks that are not in the cache.
	 */
	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
	bp->b_blkno = fsbtodb(fs, blkno);
	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
		brelse(bp);
		return (error);
	}
	/*
	 * Account for the block pointers in this indirect block.
	 */
	last = howmany(remblks, blksperindir);
	if (last > NINDIR(fs))
		last = NINDIR(fs);
	/*
	 * Work from a private copy of the pointers so the buffer need
	 * not be held locked across the accounting calls and recursion.
	 */
	MALLOC(bap, ufs2_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
	bqrelse(bp);
	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, rlbn, expungetype);
	if (error || level == 0)
		goto out;
	/*
	 * Account for the block pointers in each of the indirect blocks
	 * in the levels below us.
	 */
	subblksperindir = blksperindir / NINDIR(fs);
	for (lbn++, level--, i = 0; i < last; i++) {
		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
		if (error)
			goto out;
		rlbn += blksperindir;
		lbn -= blksperindir;
		remblks -= blksperindir;
	}
out:
	FREE(bap, M_DEVBUF);
	return (error);
}
1133 
1134 /*
1135  * Do both snap accounting and map accounting.
1136  */
1137 static int
1138 fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
1139 	struct vnode *vp;
1140 	ufs2_daddr_t *oldblkp, *lastblkp;
1141 	struct fs *fs;
1142 	ufs_lbn_t lblkno;
1143 	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
1144 {
1145 	int error;
1146 
1147 	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
1148 		return (error);
1149 	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
1150 }
1151 
1152 /*
1153  * Identify a set of blocks allocated in a snapshot inode.
1154  */
/*
 * For each valid block address in [oldblkp, lastblkp), mark the slot in
 * snapshot vp that covers that physical block with expungetype, recording
 * that the block is claimed elsewhere and need not be copied.
 */
static int
snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
	struct vnode *vp;
	ufs2_daddr_t *oldblkp, *lastblkp;
	struct fs *fs;
	ufs_lbn_t lblkno;
	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
{
	struct inode *ip = VTOI(vp);
	ufs2_daddr_t blkno, *blkp;
	ufs_lbn_t lbn;
	struct buf *ibp;
	int error;

	for ( ; oldblkp < lastblkp; oldblkp++) {
		blkno = *oldblkp;
		/* Skip holes and slots that carry no real disk address. */
		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
			continue;
		/*
		 * A snapshot maps filesystem block N at its own logical
		 * block N (see the block comment above ffs_snapblkfree),
		 * so converting the physical address to block units yields
		 * the logical block to mark in vp.
		 */
		lbn = fragstoblks(fs, blkno);
		if (lbn < NDADDR) {
			/* Direct block: the slot lives in the inode. */
			blkp = &ip->i_din2->di_db[lbn];
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
		} else {
			/*
			 * Indirect block: fetch (allocating if necessary)
			 * the indirect block holding the slot for lbn.
			 */
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			if (error)
				return (error);
			blkp = &((ufs2_daddr_t *)(ibp->b_data))
			    [(lbn - NDADDR) % NINDIR(fs)];
		}
		/*
		 * If we are expunging a snapshot vnode and we
		 * find a block marked BLK_NOCOPY, then it is
		 * one that has been allocated to this snapshot after
		 * we took our current snapshot and can be ignored.
		 */
		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
			if (lbn >= NDADDR)
				brelse(ibp);
		} else {
			/* A slot being claimed must not already be in use. */
			if (*blkp != 0)
				panic("snapacct: bad block");
			*blkp = expungetype;
			if (lbn >= NDADDR)
				bdwrite(ibp);
		}
	}
	return (0);
}
1204 
1205 /*
1206  * Account for a set of blocks allocated in a snapshot inode.
1207  */
1208 static int
1209 mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1210 	struct vnode *vp;
1211 	ufs2_daddr_t *oldblkp, *lastblkp;
1212 	struct fs *fs;
1213 	ufs_lbn_t lblkno;
1214 	int expungetype;
1215 {
1216 	ufs2_daddr_t blkno;
1217 	struct inode *ip;
1218 	ino_t inum;
1219 
1220 	ip = VTOI(vp);
1221 	inum = ip->i_number;
1222 	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1223 		blkno = *oldblkp;
1224 		if (blkno == 0 || blkno == BLK_NOCOPY)
1225 			continue;
1226 		if (expungetype == BLK_SNAP && blkno != BLK_SNAP)
1227 			*ip->i_snapblklist++ = lblkno;
1228 		if (blkno == BLK_SNAP)
1229 			blkno = blkstofrags(fs, lblkno);
1230 		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
1231 	}
1232 	return (0);
1233 }
1234 
1235 /*
1236  * Decrement extra reference on snapshot when last name is removed.
1237  * It will not be freed until the last open reference goes away.
1238  */
1239 void
1240 ffs_snapgone(ip)
1241 	struct inode *ip;
1242 {
1243 	struct inode *xp;
1244 	struct fs *fs;
1245 	int snaploc;
1246 
1247 	/*
1248 	 * Find snapshot in incore list.
1249 	 */
1250 	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
1251 		if (xp == ip)
1252 			break;
1253 	if (xp == 0)
1254 		printf("ffs_snapgone: lost snapshot vnode %d\n",
1255 		    ip->i_number);
1256 	else
1257 		vrele(ITOV(ip));
1258 	/*
1259 	 * Delete snapshot inode from superblock. Keep list dense.
1260 	 */
1261 	fs = ip->i_fs;
1262 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
1263 		if (fs->fs_snapinum[snaploc] == ip->i_number)
1264 			break;
1265 	if (snaploc < FSMAXSNAP) {
1266 		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
1267 			if (fs->fs_snapinum[snaploc] == 0)
1268 				break;
1269 			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
1270 		}
1271 		fs->fs_snapinum[snaploc - 1] = 0;
1272 	}
1273 }
1274 
1275 /*
1276  * Prepare a snapshot file for being removed.
1277  */
void
ffs_snapremove(vp)
	struct vnode *vp;
{
	struct inode *ip;
	struct vnode *devvp;
	struct lock *lkp;
	struct buf *ibp;
	struct fs *fs;
	struct thread *td = curthread;
	ufs2_daddr_t numblks, blkno, dblk;
	int error, loc, last;

	ip = VTOI(vp);
	fs = ip->i_fs;
	/*
	 * If active, delete from incore list (this snapshot may
	 * already have been in the process of being deleted, so
	 * would not have been active).
	 *
	 * Clear copy-on-write flag if last snapshot.
	 */
	if (ip->i_nextsnap.tqe_prev != 0) {
		/*
		 * Reclaim the vnode's private lock: acquire v_lock, point
		 * v_vnlock back at it, then release our hold on the shared
		 * snapshot lock (lkp). Each lockmgr call consumes the
		 * interlock, hence the re-acquisition in between.
		 */
		VI_LOCK(vp);
		lockmgr(&vp->v_lock, LK_INTERLOCK|LK_EXCLUSIVE, VI_MTX(vp), td);
		VI_LOCK(vp);
		lkp = vp->v_vnlock;
		vp->v_vnlock = &vp->v_lock;
		lockmgr(lkp, LK_INTERLOCK | LK_RELEASE, VI_MTX(vp), td);
		devvp = ip->i_devvp;
		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
		ip->i_nextsnap.tqe_prev = 0;
		ASSERT_VOP_LOCKED(devvp, "ffs_snapremove devvp");
		/*
		 * Last snapshot gone: destroy the shared lock and disable
		 * copy-on-write on the device.
		 */
		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) == 0) {
			lockdestroy(lkp);
			FREE(lkp, M_UFSMNT);
			devvp->v_rdev->si_copyonwrite = 0;
			devvp->v_vflag &= ~VV_COPYONWRITE;
		}
	}
	/*
	 * Get rid of its hints list.
	 */
	if (ip->i_snapblklist != NULL) {
		FREE(ip->i_snapblklist, M_UFSMNT);
		ip->i_snapblklist = NULL;
	}
	/*
	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
	 * snapshots that want them (see ffs_snapblkfree below).
	 * A claimed block is recognized by dblk == blkstofrags(fs, blkno),
	 * i.e. its address equals its logical block number.
	 */
	for (blkno = 1; blkno < NDADDR; blkno++) {
		dblk = DIP(ip, i_db[blkno]);
		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
			DIP(ip, i_db[blkno]) = 0;
		else if ((dblk == blkstofrags(fs, blkno) &&
		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
		     ip->i_number))) {
			DIP(ip, i_blocks) -= btodb(fs->fs_bsize);
			DIP(ip, i_db[blkno]) = 0;
		}
	}
	/* Same cleanup for all pointers held in the indirect blocks. */
	numblks = howmany(ip->i_size, fs->fs_bsize);
	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
		if (error)
			continue;
		if (fs->fs_size - blkno > NINDIR(fs))
			last = NINDIR(fs);
		else
			last = fs->fs_size - blkno;
		for (loc = 0; loc < last; loc++) {
			if (ip->i_ump->um_fstype == UFS1) {
				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				else if ((dblk == blkstofrags(fs, blkno) &&
				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
				     fs->fs_bsize, ip->i_number))) {
					ip->i_din1->di_blocks -=
					    btodb(fs->fs_bsize);
					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
				}
				continue;
			}
			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			else if ((dblk == blkstofrags(fs, blkno) &&
			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
			     fs->fs_bsize, ip->i_number))) {
				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
			}
		}
		bawrite(ibp);
	}
	/*
	 * Clear snapshot flag and drop reference.
	 */
	ip->i_flags &= ~SF_SNAPSHOT;
	DIP(ip, i_flags) = ip->i_flags;
	ip->i_flag |= IN_CHANGE | IN_UPDATE;
}
1383 
1384 /*
1385  * Notification that a block is being freed. Return zero if the free
1386  * should be allowed to proceed. Return non-zero if the snapshot file
1387  * wants to claim the block. The block will be claimed if it is an
1388  * uncopied part of one of the snapshots. It will be freed if it is
1389  * either a BLK_NOCOPY or has already been copied in all of the snapshots.
1390  * If a fragment is being freed, then all snapshots that care about
1391  * it must make a copy since a snapshot file can only claim full sized
1392  * blocks. Note that if more than one snapshot file maps the block,
1393  * we can pick one at random to claim it. Since none of the snapshots
1394  * can change, we are assurred that they will all see the same unmodified
1395  * image. When deleting a snapshot file (see ffs_snapremove above), we
1396  * must push any of these claimed blocks to one of the other snapshots
1397  * that maps it. These claimed blocks are easily identified as they will
1398  * have a block number equal to their logical block number within the
1399  * snapshot. A copied block can never have this property because they
1400  * must always have been allocated from a BLK_NOCOPY location.
1401  */
int
ffs_snapblkfree(fs, devvp, bno, size, inum)
	struct fs *fs;
	struct vnode *devvp;
	ufs2_daddr_t bno;	/* block being freed */
	long size;		/* size of the free (frag or full block) */
	ino_t inum;		/* inode releasing the block */
{
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct inode *ip;
	struct vnode *vp;
	ufs_lbn_t lbn;
	ufs2_daddr_t blkno;
	int indiroff = 0, error = 0, claimedblk = 0;
	struct snaphead *snaphead;

	lbn = fragstoblks(fs, bno);
	snaphead = &devvp->v_rdev->si_snapshots;
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * Lookup block being written.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			/* Suppress recursive copy-on-write of this lookup. */
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			VOP_UNLOCK(vp, 0, td);
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
		}
		/*
		 * Check to see if block needs to be copied.
		 */
		if (blkno == 0) {
			/*
			 * A block that we map is being freed. If it has not
			 * been claimed yet, we will claim or copy it (below).
			 * Note: ibp (if any) stays held for the claim/copy.
			 */
			claimedblk = 1;
		} else if (blkno == BLK_SNAP) {
			/*
			 * No previous snapshot claimed the block,
			 * so it will be freed and become a BLK_NOCOPY
			 * (don't care) for us.
			 */
			if (claimedblk)
				panic("snapblkfree: inconsistent block type");
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = BLK_NOCOPY;
				ip->i_flag |= IN_CHANGE | IN_UPDATE;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
				    BLK_NOCOPY;
				bdwrite(ibp);
			}
			VOP_UNLOCK(vp, 0, td);
			continue;
		} else /* BLK_NOCOPY or default */ {
			/*
			 * If the snapshot has already copied the block
			 * (default), or does not care about the block,
			 * it is not needed.
			 */
			if (lbn >= NDADDR)
				bqrelse(ibp);
			continue;
		}
		/*
		 * If this is a full size block, we will just grab it
		 * and assign it to the snapshot inode. Otherwise we
		 * will proceed to copy it. See explanation for this
		 * routine as to why only a single snapshot needs to
		 * claim this block.
		 */
		if (size == fs->fs_bsize) {
#ifdef DEBUG
			if (snapdebug)
				printf("%s %d lbn %jd from inum %d\n",
				    "Grabonremove: snapino", ip->i_number,
				    (intmax_t)lbn, inum);
#endif
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			if (lbn < NDADDR) {
				DIP(ip, i_db[lbn]) = bno;
			} else if (ip->i_ump->um_fstype == UFS1) {
				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			} else {
				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
				bdwrite(ibp);
			}
			DIP(ip, i_blocks) += btodb(size);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			VOP_UNLOCK(vp, 0, td);
			/* Non-zero: we claimed the block; do not free it. */
			return (1);
		}
		if (lbn >= NDADDR)
			bqrelse(ibp);
		/*
		 * Allocate the block into which to do the copy. Note that this
		 * allocation will never require any additional allocations for
		 * the snapshot inode.
		 */
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error) {
			VOP_UNLOCK(vp, 0, td);
			break;
		}
#ifdef DEBUG
		if (snapdebug)
			printf("%s%d lbn %jd %s %d size %ld to blkno %jd\n",
			    "Copyonremove: snapino ", ip->i_number,
			    (intmax_t)lbn, "for inum", inum, size,
			    (intmax_t)cbp->b_blkno);
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
			break;
		}
		/* Keep the first copy to seed the remaining snapshots. */
		VOP_UNLOCK(vp, 0, td);
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			VOP_UNLOCK(vp, 0, td);
		}
	}
	/*
	 * If we have been unable to allocate a block in which to do
	 * the copy, then return non-zero so that the fragment will
	 * not be freed. Although space will be lost, the snapshot
	 * will stay consistent.
	 */
	return (error);
}
1588 
1589 /*
1590  * Associate snapshot files when mounting.
1591  */
1592 void
1593 ffs_snapshot_mount(mp)
1594 	struct mount *mp;
1595 {
1596 	struct ufsmount *ump = VFSTOUFS(mp);
1597 	struct fs *fs = ump->um_fs;
1598 	struct thread *td = curthread;
1599 	struct snaphead *snaphead;
1600 	struct vnode *vp;
1601 	struct inode *ip, *xp;
1602 	struct uio auio;
1603 	struct iovec aiov;
1604 	void *listhd;
1605 	char *reason;
1606 	int error, snaploc, loc;
1607 
1608 	/*
1609 	 * XXX The following needs to be set before UFS_TRUNCATE or
1610 	 * VOP_READ can be called.
1611 	 */
1612 	mp->mnt_stat.f_iosize = fs->fs_bsize;
1613 	/*
1614 	 * Process each snapshot listed in the superblock.
1615 	 */
1616 	snaphead = &ump->um_devvp->v_rdev->si_snapshots;
1617 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1618 		if (fs->fs_snapinum[snaploc] == 0)
1619 			return;
1620 		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
1621 		    LK_EXCLUSIVE, &vp)) != 0){
1622 			printf("ffs_snapshot_mount: vget failed %d\n", error);
1623 			continue;
1624 		}
1625 		ip = VTOI(vp);
1626 		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
1627 		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
1628 			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
1629 				reason = "non-snapshot";
1630 			} else {
1631 				reason = "old format snapshot";
1632 				(void)UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
1633 				(void)VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
1634 			}
1635 			printf("ffs_snapshot_mount: %s inode %d\n",
1636 			    reason, fs->fs_snapinum[snaploc]);
1637 			vput(vp);
1638 			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
1639 				if (fs->fs_snapinum[loc] == 0)
1640 					break;
1641 				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
1642 			}
1643 			fs->fs_snapinum[loc - 1] = 0;
1644 			snaploc--;
1645 			continue;
1646 		}
1647 		/*
1648 		 * Allocate the space for the block hints list.
1649 		 */
1650 		auio.uio_iov = &aiov;
1651 		auio.uio_iovcnt = 1;
1652 		aiov.iov_base = (void *)&ip->i_snaplistsize;
1653 		aiov.iov_len = sizeof(ip->i_snaplistsize);
1654 		auio.uio_resid = aiov.iov_len;
1655 		auio.uio_offset =
1656 		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
1657 		auio.uio_segflg = UIO_SYSSPACE;
1658 		auio.uio_rw = UIO_READ;
1659 		auio.uio_td = td;
1660 		if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
1661 			printf("ffs_snapshot_mount: read_1 failed %d\n", error);
1662 			continue;
1663 		}
1664 		MALLOC(listhd, void *, ip->i_snaplistsize * sizeof(daddr_t),
1665 		    M_UFSMNT, M_WAITOK);
1666 		auio.uio_iovcnt = 1;
1667 		aiov.iov_base = listhd;
1668 		aiov.iov_len = ip->i_snaplistsize * sizeof (daddr_t);
1669 		auio.uio_resid = aiov.iov_len;
1670 		auio.uio_offset -= sizeof(ip->i_snaplistsize);
1671 		if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
1672 			printf("ffs_snapshot_mount: read_2 failed %d\n", error);
1673 			FREE(listhd, M_UFSMNT);
1674 			continue;
1675 		}
1676 		ip->i_snapblklist = (daddr_t *)listhd;
1677 		/*
1678 		 * If there already exist snapshots on this filesystem, grab a
1679 		 * reference to their shared lock. If this is the first snapshot
1680 		 * on this filesystem, we need to allocate a lock for the
1681 		 * snapshots to share. In either case, acquire the snapshot
1682 		 * lock and give up our original private lock.
1683 		 */
1684 		if ((xp = TAILQ_FIRST(snaphead)) != NULL) {
1685 			VI_LOCK(vp);
1686 			vp->v_vnlock = ITOV(xp)->v_vnlock;
1687 		} else {
1688 			struct lock *lkp;
1689 
1690 			MALLOC(lkp, struct lock *, sizeof(struct lock),
1691 			    M_UFSMNT, M_WAITOK);
1692 			lockinit(lkp, PVFS, "snaplk", VLKTIMEOUT,
1693 			    LK_CANRECURSE | LK_NOPAUSE);
1694 			VI_LOCK(vp);
1695 			vp->v_vnlock = lkp;
1696 		}
1697 		vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, td);
1698 		VI_LOCK(vp);
1699 		lockmgr(&vp->v_lock, LK_INTERLOCK | LK_RELEASE, VI_MTX(vp), td);
1700 		/*
1701 		 * Link it onto the active snapshot list.
1702 		 */
1703 		if (ip->i_nextsnap.tqe_prev != 0)
1704 			panic("ffs_snapshot_mount: %d already on list",
1705 			    ip->i_number);
1706 		else
1707 			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
1708 		vp->v_vflag |= VV_SYSTEM;
1709 		ump->um_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
1710 		ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_snapshot_mount");
1711 		ump->um_devvp->v_vflag |= VV_COPYONWRITE;
1712 		VOP_UNLOCK(vp, 0, td);
1713 	}
1714 }
1715 
1716 /*
1717  * Disassociate snapshot files when unmounting.
1718  */
1719 void
1720 ffs_snapshot_unmount(mp)
1721 	struct mount *mp;
1722 {
1723 	struct ufsmount *ump = VFSTOUFS(mp);
1724 	struct snaphead *snaphead = &ump->um_devvp->v_rdev->si_snapshots;
1725 	struct lock *lkp = NULL;
1726 	struct inode *xp;
1727 	struct vnode *vp;
1728 
1729 	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
1730 		vp = ITOV(xp);
1731 		lkp = vp->v_vnlock;
1732 		vp->v_vnlock = &vp->v_lock;
1733 		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
1734 		if (xp->i_snapblklist != NULL) {
1735 			FREE(xp->i_snapblklist, M_UFSMNT);
1736 			xp->i_snapblklist = NULL;
1737 		}
1738 		xp->i_nextsnap.tqe_prev = 0;
1739 		if (xp->i_effnlink > 0)
1740 			vrele(vp);
1741 	}
1742 	if (lkp != NULL) {
1743 		lockdestroy(lkp);
1744 		FREE(lkp, M_UFSMNT);
1745 	}
1746 	ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_snapshot_unmount");
1747 	ump->um_devvp->v_rdev->si_copyonwrite = 0;
1748 	ump->um_devvp->v_vflag &= ~VV_COPYONWRITE;
1749 }
1750 
1751 /*
1752  * Check for need to copy block that is about to be written,
1753  * copying the block if necessary.
1754  */
static int
ffs_copyonwrite(devvp, bp)
	struct vnode *devvp;
	struct buf *bp;	/* buffer about to be written to the device */
{
	struct snaphead *snaphead;
	struct buf *ibp, *cbp, *savedcbp = 0;
	struct thread *td = curthread;
	struct fs *fs;
	struct inode *ip;
	struct vnode *vp = 0;
	ufs2_daddr_t lbn, blkno;
	int lower, upper, mid, indiroff, snapshot_locked = 0, error = 0;

	if (td->td_proc->p_flag & P_COWINPROGRESS)
		panic("ffs_copyonwrite: recursive call");
	snaphead = &devvp->v_rdev->si_snapshots;
	ip = TAILQ_FIRST(snaphead);
	fs = ip->i_fs;
	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
		vp = ITOV(ip);
		/*
		 * We ensure that everything of our own that needs to be
		 * copied will be done at the time that ffs_snapshot is
		 * called. Thus we can skip the check here which can
		 * deadlock in doing the lookup in UFS_BALLOC.
		 */
		if (bp->b_vp == vp)
			continue;
	retry:
		/*
		 * First check to see if it is in the preallocated list.
		 * By doing this check we avoid several potential deadlocks.
		 * Binary search; entry 0 of i_snapblklist holds the list
		 * length, so the search starts at index 1.
		 */
		lower = 1;
		upper = ip->i_snaplistsize - 1;
		while (lower <= upper) {
			mid = (lower + upper) / 2;
			if (ip->i_snapblklist[mid] == lbn)
				break;
			if (ip->i_snapblklist[mid] < lbn)
				lower = mid + 1;
			else
				upper = mid - 1;
		}
		/* Found in the hints list: block already handled. */
		if (lower <= upper)
			continue;
		/*
		 * Check to see if block needs to be copied. We do not have
		 * to hold the snapshot lock while doing this lookup as it
		 * will never require any additional allocations for the
		 * snapshot inode.
		 */
		if (lbn < NDADDR) {
			blkno = DIP(ip, i_db[lbn]);
		} else {
			td->td_proc->p_flag |= P_COWINPROGRESS;
			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
			td->td_proc->p_flag &= ~P_COWINPROGRESS;
			if (error)
				break;
			indiroff = (lbn - NDADDR) % NINDIR(fs);
			if (ip->i_ump->um_fstype == UFS1)
				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
			else
				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
			bqrelse(ibp);
		}
#ifdef DIAGNOSTIC
		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
			panic("ffs_copyonwrite: bad copy block");
#endif
		/* Non-zero slot: already copied or deliberately ignored. */
		if (blkno != 0)
			continue;
		/*
		 * Allocate the block into which to do the copy. Since
		 * multiple processes may all try to copy the same block,
		 * we have to recheck our need to do a copy if we sleep
		 * waiting for the lock.
		 *
		 * Because all snapshots on a filesystem share a single
		 * lock, we ensure that we will never be in competition
		 * with another process to allocate a block.
		 */
		if (snapshot_locked == 0 &&
		    vn_lock(vp, LK_EXCLUSIVE | LK_SLEEPFAIL, td) != 0)
			goto retry;
		snapshot_locked = 1;
		td->td_proc->p_flag |= P_COWINPROGRESS;
		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
		    fs->fs_bsize, KERNCRED, 0, &cbp);
		td->td_proc->p_flag &= ~P_COWINPROGRESS;
		if (error)
			break;
#ifdef DEBUG
		if (snapdebug) {
			printf("Copyonwrite: snapino %d lbn %jd for ",
			    ip->i_number, (intmax_t)lbn);
			if (bp->b_vp == devvp)
				printf("fs metadata");
			else
				printf("inum %d", VTOI(bp->b_vp)->i_number);
			printf(" lblkno %jd to blkno %jd\n",
			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
		}
#endif
		/*
		 * If we have already read the old block contents, then
		 * simply copy them to the new block. Note that we need
		 * to synchronously write snapshots that have not been
		 * unlinked, and hence will be visible after a crash,
		 * to ensure their integrity.
		 */
		if (savedcbp != 0) {
			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			continue;
		}
		/*
		 * Otherwise, read the old block contents into the buffer.
		 */
		if ((error = readblock(cbp, lbn)) != 0) {
			bzero(cbp->b_data, fs->fs_bsize);
			bawrite(cbp);
			if (dopersistence && ip->i_effnlink > 0)
				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
			break;
		}
		/* Keep the first copy to seed the remaining snapshots. */
		savedcbp = cbp;
	}
	/*
	 * Note that we need to synchronously write snapshots that
	 * have not been unlinked, and hence will be visible after
	 * a crash, to ensure their integrity.
	 */
	if (savedcbp) {
		vp = savedcbp->b_vp;
		bawrite(savedcbp);
		if (dopersistence && VTOI(vp)->i_effnlink > 0)
			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
	}
	if (snapshot_locked)
		VOP_UNLOCK(vp, 0, td);
	return (error);
}
1904 
1905 /*
1906  * Read the specified block into the given buffer.
1907  * Much of this boiler-plate comes from bwrite().
1908  */
1909 static int
1910 readblock(bp, lbn)
1911 	struct buf *bp;
1912 	ufs2_daddr_t lbn;
1913 {
1914 	struct uio auio;
1915 	struct iovec aiov;
1916 	struct thread *td = curthread;
1917 	struct inode *ip = VTOI(bp->b_vp);
1918 
1919 	aiov.iov_base = bp->b_data;
1920 	aiov.iov_len = bp->b_bcount;
1921 	auio.uio_iov = &aiov;
1922 	auio.uio_iovcnt = 1;
1923 	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
1924 	auio.uio_resid = bp->b_bcount;
1925 	auio.uio_rw = UIO_READ;
1926 	auio.uio_segflg = UIO_SYSSPACE;
1927 	auio.uio_td = td;
1928 	return (physio(ip->i_devvp->v_rdev, &auio, 0));
1929 }
1930