xref: /freebsd/sys/ufs/ffs/ffs_snapshot.c (revision ee2ea5ceafed78a5bd9810beb9e3ca927180c226)
1 /*
2  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
3  *
4  * Further information about snapshots can be obtained from:
5  *
6  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
7  *	1614 Oxford Street		mckusick@mckusick.com
8  *	Berkeley, CA 94709-1608		+1-510-843-9542
9  *	USA
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  *
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
22  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
25  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
34  * $FreeBSD$
35  */
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/conf.h>
40 #include <sys/bio.h>
41 #include <sys/buf.h>
42 #include <sys/proc.h>
43 #include <sys/namei.h>
44 #include <sys/stat.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/resource.h>
48 #include <sys/resourcevar.h>
49 #include <sys/vnode.h>
50 
51 #include <ufs/ufs/extattr.h>
52 #include <ufs/ufs/quota.h>
53 #include <ufs/ufs/ufsmount.h>
54 #include <ufs/ufs/inode.h>
55 #include <ufs/ufs/ufs_extern.h>
56 
57 #include <ufs/ffs/fs.h>
58 #include <ufs/ffs/ffs_extern.h>
59 
60 #define KERNCRED thread0.td_ucred
61 #define DEBUG 1
62 
63 static int cgaccount(int, struct vnode *, struct buf *, int);
64 static int expunge(struct vnode *, struct inode *, struct fs *,
65     int (*)(struct vnode *, ufs_daddr_t *, ufs_daddr_t *, struct fs *,
66     ufs_daddr_t, int), int);
67 static int indiracct(struct vnode *, struct vnode *, int, ufs_daddr_t,
68     int, int, int, int, struct fs *, int (*)(struct vnode *,
69     ufs_daddr_t *, ufs_daddr_t *, struct fs *, ufs_daddr_t, int), int);
70 static int fullacct(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
71     struct fs *, ufs_daddr_t, int);
72 static int snapacct(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
73     struct fs *, ufs_daddr_t, int);
74 static int mapacct(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
75     struct fs *, ufs_daddr_t, int);
76 static int ffs_copyonwrite(struct vnode *, struct buf *);
77 static int readblock(struct buf *, daddr_t);
78 
79 /*
80  * To ensure the consistency of snapshots across crashes, we must
81  * synchronously write out copied blocks before allowing the
82  * originals to be modified. Because of the rather severe speed
83  * penalty that this imposes, the following flag allows this
84  * crash persistence to be disabled.
85  */
86 int dopersistence = 0;
87 
88 #ifdef DEBUG
89 #include <sys/sysctl.h>
90 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
91 int snapdebug = 0;
92 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
93 int collectsnapstats = 0;
94 SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
95 	0, "");
96 #endif /* DEBUG */
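
/*
 * A minimal userland sketch (not part of this revision) showing how the
 * debug.dopersistence and debug.snapdebug knobs above can be flipped with
 * sysctlbyname(3), equivalent to "sysctl debug.snapdebug=1". The helper
 * name set_snap_knob() is hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

static int
set_snap_knob(const char *name, int newval)
{
	int oldval;
	size_t oldlen = sizeof(oldval);

	/* Store the new value and retrieve the previous one. */
	if (sysctlbyname(name, &oldval, &oldlen, &newval, sizeof(newval)) < 0)
		return (-1);
	printf("%s: %d -> %d\n", name, oldval, newval);
	return (0);
}
#endif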
97 
98 /*
99  * Create a snapshot file and initialize it for the filesystem.
100  */
101 int
102 ffs_snapshot(mp, snapfile)
103 	struct mount *mp;
104 	char *snapfile;
105 {
106 	ufs_daddr_t blkno;
107 	int error, cg, snaploc, numblks;
108 	int i, size, len, loc;
109 	int flag = mp->mnt_flag;
110 	struct timespec starttime = {0, 0}, endtime;
111 	char saved_nice = 0;
112 	long redo = 0;
113 	int32_t *lp;
114 	void *space;
115 	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
116 	struct snaphead *snaphead;
117 	struct thread *td = curthread;
118 	struct inode *ip, *xp;
119 	struct buf *bp, *nbp, *ibp, *sbp = NULL;
120 	struct nameidata nd;
121 	struct mount *wrtmp;
122 	struct vattr vat;
123 	struct vnode *vp, *xvp, *nvp;
124 
125 	/*
126 	 * Need to serialize access to snapshot code per filesystem.
127 	 */
128 	/*
129 	 * Assign a snapshot slot in the superblock.
130 	 */
131 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
132 		if (fs->fs_snapinum[snaploc] == 0)
133 			break;
134 	if (snaploc == FSMAXSNAP)
135 		return (ENOSPC);
136 	/*
137 	 * Create the snapshot file.
138 	 */
139 restart:
140 	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, td);
141 	if ((error = namei(&nd)) != 0)
142 		return (error);
143 	if (nd.ni_vp != NULL) {
144 		vput(nd.ni_vp);
145 		error = EEXIST;
146 	}
147 	if (nd.ni_dvp->v_mount != mp)
148 		error = EXDEV;
149 	if (error) {
150 		NDFREE(&nd, NDF_ONLY_PNBUF);
151 		if (nd.ni_dvp == nd.ni_vp)
152 			vrele(nd.ni_dvp);
153 		else
154 			vput(nd.ni_dvp);
155 		return (error);
156 	}
157 	VATTR_NULL(&vat);
158 	vat.va_type = VREG;
159 	vat.va_mode = S_IRUSR;
160 	vat.va_vaflags |= VA_EXCLUSIVE;
161 	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
162 		wrtmp = NULL;
163 	if (wrtmp != mp)
164 		panic("ffs_snapshot: mount mismatch");
165 	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
166 		NDFREE(&nd, NDF_ONLY_PNBUF);
167 		vput(nd.ni_dvp);
168 		if ((error = vn_start_write(NULL, &wrtmp,
169 		    V_XSLEEP | PCATCH)) != 0)
170 			return (error);
171 		goto restart;
172 	}
173 	VOP_LEASE(nd.ni_dvp, td, KERNCRED, LEASE_WRITE);
174 	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
175 	vput(nd.ni_dvp);
176 	if (error) {
177 		NDFREE(&nd, NDF_ONLY_PNBUF);
178 		vn_finished_write(wrtmp);
179 		return (error);
180 	}
181 	vp = nd.ni_vp;
182 	ip = VTOI(vp);
183 	/*
184 	 * Allocate and copy the last block contents so that we can
185 	 * set the snapshot file's size to that of the filesystem.
186 	 */
187 	numblks = howmany(fs->fs_size, fs->fs_frag);
188 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
189 	    fs->fs_bsize, KERNCRED, B_CLRBUF, &bp);
190 	if (error)
191 		goto out;
192 	ip->i_size = lblktosize(fs, (off_t)numblks);
193 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
194 	if ((error = readblock(bp, numblks - 1)) != 0)
195 		goto out;
196 	bawrite(bp);
197 	/*
198 	 * Preallocate critical data structures so that we can copy
199 	 * them in without further allocation after we suspend all
200 	 * operations on the filesystem. We would like to just release
201 	 * the allocated buffers without writing them since they will
202 	 * be filled in below once we are ready to go, but this upsets
203 	 * the soft update code, so we go ahead and write the new buffers.
204 	 *
205 	 * Allocate all indirect blocks and mark all of them as not
206 	 * needing to be copied.
207 	 */
208 	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
209 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
210 		    fs->fs_bsize, td->td_ucred, B_METAONLY, &ibp);
211 		if (error)
212 			goto out;
213 		bdwrite(ibp);
214 	}
215 	/*
216 	 * Allocate copies for the superblock and its summary information.
217 	 */
218 	error = UFS_BALLOC(vp, (off_t)(SBOFF), SBSIZE, KERNCRED, 0, &nbp);
219 	if (error)
220 		goto out;
221 	bawrite(nbp);
222 	blkno = fragstoblks(fs, fs->fs_csaddr);
223 	len = howmany(fs->fs_cssize, fs->fs_bsize);
224 	for (loc = 0; loc < len; loc++) {
225 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
226 		    fs->fs_bsize, KERNCRED, 0, &nbp);
227 		if (error)
228 			goto out;
229 		bawrite(nbp);
230 	}
231 	/*
232 	 * Allocate all cylinder group blocks.
233 	 */
234 	for (cg = 0; cg < fs->fs_ncg; cg++) {
235 		error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
236 		    fs->fs_bsize, KERNCRED, 0, &nbp);
237 		if (error)
238 			goto out;
239 		bdwrite(nbp);
240 	}
241 	/*
242 	 * Copy all the cylinder group maps. Although the
243 	 * filesystem is still active, we hope that only a few
244 	 * cylinder groups will change between now and when we
245 	 * suspend operations. Thus, we will be able to quickly
246 	 * touch up the few cylinder groups that changed during
247 	 * the suspension period.
248 	 */
249 	len = howmany(fs->fs_ncg, NBBY);
250 	MALLOC(fs->fs_active, int *, len, M_DEVBUF, M_WAITOK);
251 	bzero(fs->fs_active, len);
252 	for (cg = 0; cg < fs->fs_ncg; cg++) {
253 		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
254 		    KERNCRED, &nbp);
255 		if (error) {
256 			brelse(nbp);
257 			goto out;
258 		}
259 		error = cgaccount(cg, vp, nbp, 1);
260 		bawrite(nbp);
261 		if (error)
262 			goto out;
263 	}
264 	/*
265 	 * Change inode to snapshot type file.
266 	 */
267 	ip->i_flags |= SF_SNAPSHOT;
268 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
269 	/*
270 	 * Ensure that the snapshot is completely on disk.
271 	 */
272 	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td)) != 0)
273 		goto out;
274 	/*
275 	 * All allocations are done, so we can now snapshot the system.
276 	 *
277 	 * Rescind nice scheduling while running with the filesystem suspended.
278 	 */
279 	if (td->td_ksegrp->kg_nice > 0) {
280 		saved_nice = td->td_ksegrp->kg_nice;
281 		td->td_ksegrp->kg_nice = 0;
282 	}
283 	/*
284 	 * Suspend operation on filesystem.
285 	 */
286 	for (;;) {
287 		vn_finished_write(wrtmp);
288 		vfs_write_suspend(vp->v_mount);
289 		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
290 			break;
291 		vn_start_write(NULL, &wrtmp, V_WAIT);
292 	}
293 	if (collectsnapstats)
294 		nanotime(&starttime);
295 	/*
296 	 * First, copy all the cylinder group maps that have changed.
297 	 */
298 	for (cg = 0; cg < fs->fs_ncg; cg++) {
299 		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
300 			continue;
301 		redo++;
302 		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
303 			KERNCRED, &nbp);
304 		if (error) {
305 			brelse(nbp);
306 			goto out1;
307 		}
308 		error = cgaccount(cg, vp, nbp, 2);
309 		bawrite(nbp);
310 		if (error)
311 			goto out1;
312 	}
313 	/*
314 	 * Grab a copy of the superblock and its summary information.
315 	 * We delay writing it until the suspension is released below.
316 	 */
317 	error = bread(vp, lblkno(fs, SBOFF), fs->fs_bsize, KERNCRED, &sbp);
318 	if (error) {
319 		brelse(sbp);
320 		sbp = NULL;
321 		goto out1;
322 	}
323 	copy_fs = (struct fs *)(sbp->b_data + blkoff(fs, SBOFF));
324 	bcopy(fs, copy_fs, fs->fs_sbsize);
325 	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
326 		copy_fs->fs_clean = 1;
327 	if (fs->fs_sbsize < SBSIZE)
328 		bzero(&sbp->b_data[blkoff(fs, SBOFF) + fs->fs_sbsize],
329 		    SBSIZE - fs->fs_sbsize);
330 	size = blkroundup(fs, fs->fs_cssize);
331 	if (fs->fs_contigsumsize > 0)
332 		size += fs->fs_ncg * sizeof(int32_t);
333 	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
334 	copy_fs->fs_csp = space;
335 	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
336 	space = (char *)space + fs->fs_cssize;
337 	loc = howmany(fs->fs_cssize, fs->fs_fsize);
338 	i = fs->fs_frag - loc % fs->fs_frag;
339 	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
340 	if (len > 0) {
341 		if ((error = bread(ip->i_devvp,
342 		    fsbtodb(fs, fs->fs_csaddr + loc),
343 		    len, KERNCRED, &bp)) != 0) {
344 			brelse(bp);
345 			free(copy_fs->fs_csp, M_UFSMNT);
346 			bawrite(sbp);
347 			sbp = NULL;
348 			goto out1;
349 		}
350 		bcopy(bp->b_data, space, (u_int)len);
351 		space = (char *)space + len;
352 		bp->b_flags |= B_INVAL | B_NOCACHE;
353 		brelse(bp);
354 	}
355 	if (fs->fs_contigsumsize > 0) {
356 		copy_fs->fs_maxcluster = lp = space;
357 		for (i = 0; i < fs->fs_ncg; i++)
358 			*lp++ = fs->fs_contigsumsize;
359 	}
360 	/*
361 	 * We must check for active files that have been unlinked
362 	 * (i.e., with a zero link count). We have to expunge all
363 	 * trace of these files from the snapshot so that they are
364 	 * not reclaimed prematurely by fsck or unnecessarily dumped.
365 	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
366 	 * spec_strategy about writing on a suspended filesystem.
367 	 */
368 	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
369 	mtx_lock(&mntvnode_mtx);
370 loop:
371 	for (xvp = TAILQ_FIRST(&mp->mnt_nvnodelist); xvp; xvp = nvp) {
372 		/*
373 		 * Make sure this vnode wasn't reclaimed in getnewvnode().
374 		 * Start over if it has (it won't be on the list anymore).
375 		 * Start over if it has been (it won't be on the list anymore).
376 		if (xvp->v_mount != mp)
377 			goto loop;
378 		nvp = TAILQ_NEXT(xvp, v_nmntvnodes);
379 		mtx_unlock(&mntvnode_mtx);
380 		mtx_lock(&xvp->v_interlock);
381 		if (xvp->v_usecount == 0 || xvp->v_type == VNON ||
382 		    (VOP_GETATTR(xvp, &vat, td->td_proc->p_ucred, td) == 0 &&
383 		    vat.va_nlink > 0)) {
384 			mtx_unlock(&xvp->v_interlock);
385 			mtx_lock(&mntvnode_mtx);
386 			continue;
387 		}
388 		if (snapdebug)
389 			vprint("ffs_snapshot: busy vnode", xvp);
390 		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK, td) != 0)
391 			goto loop;
392 		xp = VTOI(xvp);
393 		/*
394 		 * If there is a fragment, clear it here.
395 		 */
396 		blkno = 0;
397 		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
398 		if (loc < NDADDR) {
399 			len = fragroundup(fs, blkoff(fs, xp->i_size));
400 			if (len < fs->fs_bsize) {
401 				ffs_blkfree(copy_fs, vp, xp->i_db[loc], len,
402 				    xp->i_number);
403 				blkno = xp->i_db[loc];
404 				xp->i_db[loc] = 0;
405 			}
406 		}
407 		error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
408 		if (blkno)
409 			xp->i_db[loc] = blkno;
410 		if (!error)
411 			error = ffs_freefile(copy_fs, vp, xp->i_number,
412 			    xp->i_mode);
413 		VOP_UNLOCK(xvp, 0, td);
414 		if (error) {
415 			free(copy_fs->fs_csp, M_UFSMNT);
416 			bawrite(sbp);
417 			sbp = NULL;
418 			goto out1;
419 		}
420 		mtx_lock(&mntvnode_mtx);
421 	}
422 	mtx_unlock(&mntvnode_mtx);
423 	/*
424 	 * Record snapshot inode. Since this is the newest snapshot,
425 	 * it must be placed at the end of the list.
426 	 */
427 	fs->fs_snapinum[snaploc] = ip->i_number;
428 	if (ip->i_nextsnap.tqe_prev != 0)
429 		panic("ffs_snapshot: %d already on list", ip->i_number);
430 	snaphead = &ip->i_devvp->v_rdev->si_snapshots;
431 	TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
432 	ip->i_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
433 	ip->i_devvp->v_flag |= VCOPYONWRITE;
434 	vp->v_flag |= VSYSTEM;
435 out1:
436 	/*
437 	 * Resume operation on filesystem.
438 	 */
439 	vfs_write_resume(vp->v_mount);
440 	if (saved_nice > 0)
441 		td->td_ksegrp->kg_nice = saved_nice;
442 	vn_start_write(NULL, &wrtmp, V_WAIT);
443 	if (collectsnapstats && starttime.tv_sec > 0) {
444 		nanotime(&endtime);
445 		timespecsub(&endtime, &starttime);
446 		printf("%s: suspended %d.%03ld sec, redo %ld of %d\n",
447 		    vp->v_mount->mnt_stat.f_mntonname, endtime.tv_sec,
448 		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
449 	}
450 	if (sbp == NULL)
451 		goto out;
452 	/*
453 	 * Copy allocation information from all the earlier snapshots
454 	 * into this snapshot and then expunge them from its view.
455 	 */
456 	snaphead = &ip->i_devvp->v_rdev->si_snapshots;
457 	TAILQ_FOREACH(xp, snaphead, i_nextsnap) {
458 		if (xp == ip)
459 			break;
460 		if ((error = expunge(vp, xp, fs, snapacct, BLK_SNAP)) != 0) {
461 			fs->fs_snapinum[snaploc] = 0;
462 			goto done;
463 		}
464 	}
465 	/*
466 	 * Expunge the blocks used by the snapshots from the set of
467 	 * blocks marked as used in the snapshot bitmaps.
468 	 */
469 	if ((error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP)) != 0) {
470 		fs->fs_snapinum[snaploc] = 0;
471 		goto done;
472 	}
473 	/*
474 	 * Write the superblock and its summary information
475 	 * to the snapshot.
476 	 */
477 	blkno = fragstoblks(fs, fs->fs_csaddr);
478 	len = howmany(fs->fs_cssize, fs->fs_bsize);
479 	space = copy_fs->fs_csp;
480 	for (loc = 0; loc < len; loc++) {
481 		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
482 		if (error) {
483 			brelse(nbp);
484 			fs->fs_snapinum[snaploc] = 0;
485 			goto done;
486 		}
487 		bcopy(space, nbp->b_data, fs->fs_bsize);
488 		space = (char *)space + fs->fs_bsize;
489 		bawrite(nbp);
490 	}
491 done:
492 	free(copy_fs->fs_csp, M_UFSMNT);
493 	bawrite(sbp);
494 out:
495 	if (fs->fs_active != 0) {
496 		FREE(fs->fs_active, M_DEVBUF);
497 		fs->fs_active = 0;
498 	}
499 	mp->mnt_flag = flag;
500 	if (error)
501 		(void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, td);
502 	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
503 	if (error)
504 		vput(vp);
505 	else
506 		VOP_UNLOCK(vp, 0, td);
507 	vn_finished_write(wrtmp);
508 	return (error);
509 }
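
/*
 * A hedged userland sketch (not part of this file) of how ffs_snapshot()
 * is reached: an update mount with MNT_SNAPSHOT set, the programmatic
 * equivalent of "mount -u -o snapshot /mnt/snapfile /mnt". The exact
 * plumbing of the snapshot file name through ffs_mount() is assumed
 * here, not shown in this file; take_snapshot() is a hypothetical
 * wrapper.
 */
#if 0
#include <sys/param.h>
#include <sys/mount.h>
#include <ufs/ufs/ufsmount.h>
#include <string.h>

static int
take_snapshot(const char *mntpoint, char *snapfile)
{
	struct ufs_args args;

	memset(&args, 0, sizeof(args));
	args.fspec = snapfile;	/* path at which the snapshot is created */
	return (mount("ufs", mntpoint, MNT_UPDATE | MNT_SNAPSHOT, &args));
}
#endif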
510 
511 /*
512  * Copy a cylinder group map. All the unallocated blocks are marked
513  * BLK_NOCOPY so that the snapshot knows that it need not copy them
514  * if they are later written. If passno is 1, then this is a first
515  * pass, so only the BLK_NOCOPY settings need to be made. If passno
516  * is 2, then this is a revision of a previous pass, whose settings
517  * must be undone as the replacement pass is done.
518  */
519 static int
520 cgaccount(cg, vp, nbp, passno)
521 	int cg;
522 	struct vnode *vp;
523 	struct buf *nbp;
524 	int passno;
525 {
526 	struct buf *bp, *ibp;
527 	struct inode *ip;
528 	struct cg *cgp;
529 	struct fs *fs;
530 	int error, numblks, base, len, loc, indiroff;
531 
532 	ip = VTOI(vp);
533 	fs = ip->i_fs;
534 	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
535 		(int)fs->fs_cgsize, KERNCRED, &bp);
536 	if (error) {
537 		brelse(bp);
538 		return (error);
539 	}
540 	cgp = (struct cg *)bp->b_data;
541 	if (!cg_chkmagic(cgp)) {
542 		brelse(bp);
543 		return (EIO);
544 	}
545 	atomic_set_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg));
546 	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
547 	if (fs->fs_cgsize < fs->fs_bsize)
548 		bzero(&nbp->b_data[fs->fs_cgsize],
549 		    fs->fs_bsize - fs->fs_cgsize);
550 	if (passno == 2)
551 		nbp->b_flags |= B_VALIDSUSPWRT;
552 	numblks = howmany(fs->fs_size, fs->fs_frag);
553 	len = howmany(fs->fs_fpg, fs->fs_frag);
554 	base = cg * fs->fs_fpg / fs->fs_frag;
555 	if (base + len >= numblks)
556 		len = numblks - base - 1;
557 	loc = 0;
558 	if (base < NDADDR) {
559 		for ( ; loc < NDADDR; loc++) {
560 			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
561 				ip->i_db[loc] = BLK_NOCOPY;
562 			else if (passno == 2 && ip->i_db[loc] == BLK_NOCOPY)
563 				ip->i_db[loc] = 0;
564 			else if (passno == 1 && ip->i_db[loc] == BLK_NOCOPY)
565 				panic("ffs_snapshot: lost direct block");
566 		}
567 	}
568 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
569 	    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
570 	if (error) {
571 		brelse(bp);
572 		return (error);
573 	}
574 	indiroff = (base + loc - NDADDR) % NINDIR(fs);
575 	for ( ; loc < len; loc++, indiroff++) {
576 		if (indiroff >= NINDIR(fs)) {
577 			if (passno == 2)
578 				ibp->b_flags |= B_VALIDSUSPWRT;
579 			bawrite(ibp);
580 			error = UFS_BALLOC(vp,
581 			    lblktosize(fs, (off_t)(base + loc)),
582 			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
583 			if (error) {
584 				brelse(bp);
585 				return (error);
586 			}
587 			indiroff = 0;
588 		}
589 		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
590 			((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
591 		else if (passno == 2 &&
592 			 ((ufs_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
593 			((ufs_daddr_t *)(ibp->b_data))[indiroff] = 0;
594 		else if (passno == 1 &&
595 			 ((ufs_daddr_t *)(ibp->b_data))[indiroff] == BLK_NOCOPY)
596 			panic("ffs_snapshot: lost indirect block");
597 	}
598 	bqrelse(bp);
599 	if (passno == 2)
600 		ibp->b_flags |= B_VALIDSUSPWRT;
601 	bdwrite(ibp);
602 	return (0);
603 }
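
/*
 * An illustrative, standalone expansion (with hypothetical filesystem
 * parameters) of the base/len arithmetic in cgaccount() above: cylinder
 * group cg spans fs_fpg fragments, so it covers fs_fpg / fs_frag full
 * blocks beginning at logical block cg * fs_fpg / fs_frag of the
 * snapshot file.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	int fs_frag = 8;	/* assumed: 16K blocks, 2K fragments */
	int fs_fpg = 11768;	/* assumed fragments per cylinder group */
	int cg = 3;
	int len = (fs_fpg + fs_frag - 1) / fs_frag;	/* howmany() */
	int base = cg * fs_fpg / fs_frag;

	printf("cg %d covers logical blocks [%d, %d)\n", cg, base, base + len);
	return (0);
}
#endif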
604 
605 /*
606  * Before expunging a snapshot inode, note all the
607  * blocks that it claims with BLK_SNAP so that fsck will
608  * be able to account for those blocks properly and so
609  * that this snapshot knows that it need not copy them
610  * if the other snapshot holding them is freed.
611  */
612 static int
613 expunge(snapvp, cancelip, fs, acctfunc, expungetype)
614 	struct vnode *snapvp;
615 	struct inode *cancelip;
616 	struct fs *fs;
617 	int (*acctfunc)(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
618 	    struct fs *, ufs_daddr_t, int);
619 	int expungetype;
620 {
621 	int i, len, error, numblks, blksperindir;
622 	ufs_daddr_t lbn, rlbn, blkno, indiroff;
623 	struct thread *td = curthread;
624 	struct dinode *dip;
625 	struct buf *bp;
626 
627 	numblks = howmany(cancelip->i_size, fs->fs_bsize);
628 	if ((error = (*acctfunc)(snapvp, &cancelip->i_db[0],
629 	     &cancelip->i_ib[NIADDR], fs, 0, expungetype)))
630 		return (error);
631 	blksperindir = 1;
632 	lbn = -NDADDR;
633 	len = numblks - NDADDR;
634 	rlbn = NDADDR;
635 	for (i = 0; len > 0 && i < NIADDR; i++) {
636 		error = indiracct(snapvp, ITOV(cancelip), i, cancelip->i_ib[i],
637 		    lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype);
638 		if (error)
639 			return (error);
640 		blksperindir *= NINDIR(fs);
641 		lbn -= blksperindir + 1;
642 		len -= blksperindir;
643 		rlbn += blksperindir;
644 	}
645 	/*
646 	 * Prepare to expunge the inode. If its inode block has not
647 	 * yet been copied, then allocate and fill the copy.
648 	 */
649 	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
650 	blkno = 0;
651 	if (lbn < NDADDR) {
652 		blkno = cancelip->i_db[lbn];
653 	} else {
654 		td->td_proc->p_flag |= P_COWINPROGRESS;
655 		error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
656 		   fs->fs_bsize, KERNCRED, B_METAONLY, &bp);
657 		td->td_proc->p_flag &= ~P_COWINPROGRESS;
658 		if (error)
659 			return (error);
660 		indiroff = (lbn - NDADDR) % NINDIR(fs);
661 		blkno = ((ufs_daddr_t *)(bp->b_data))[indiroff];
662 		bqrelse(bp);
663 	}
664 	error = UFS_BALLOC(snapvp, lblktosize(fs, (off_t)lbn),
665 	    fs->fs_bsize, KERNCRED, 0, &bp);
666 	if (error)
667 		return (error);
668 	if (blkno == 0 && (error = readblock(bp, lbn)))
669 		return (error);
670 	/*
671 	 * Set a snapshot inode to be a zero-length file; set regular
672 	 * files to be completely unallocated.
673 	 */
674 	dip = (struct dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number);
675 	if (expungetype == BLK_NOCOPY)
676 		dip->di_mode = 0;
677 	dip->di_size = 0;
678 	dip->di_blocks = 0;
679 	dip->di_flags &= ~SF_SNAPSHOT;
680 	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t));
681 	bdwrite(bp);
682 	return (0);
683 }
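
/*
 * A standalone sketch (hypothetical parameters) of the indirect-block
 * geometry walked by expunge() above: the indirect level i subtree
 * covers NINDIR(fs)^(i+1) data blocks, with NINDIR(fs) pointers per
 * indirect block (fs_bsize / sizeof(ufs_daddr_t), 4096 for a 16K
 * block).
 */
#if 0
#include <stdio.h>

int
main(void)
{
	long nindir = 16384 / 4;	/* pointers per 16K indirect block */
	long blksperindir = 1;
	int i;

	for (i = 0; i < 3; i++) {	/* NIADDR == 3 indirect levels */
		blksperindir *= nindir;
		printf("level %d covers %ld data blocks\n", i, blksperindir);
	}
	return (0);
}
#endif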
684 
685 /*
686  * Descend an indirect block chain for vnode cancelvp accounting for all
687  * its indirect blocks in snapvp.
688  */
689 static int
690 indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs,
691 	    acctfunc, expungetype)
692 	struct vnode *snapvp;
693 	struct vnode *cancelvp;
694 	int level;
695 	ufs_daddr_t blkno;
696 	int lbn;
697 	int rlbn;
698 	int remblks;
699 	int blksperindir;
700 	struct fs *fs;
701 	int (*acctfunc)(struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
702 	    struct fs *, ufs_daddr_t, int);
703 	int expungetype;
704 {
705 	int subblksperindir, error, last, num, i;
706 	struct indir indirs[NIADDR + 2];
707 	ufs_daddr_t *bap;
708 	struct buf *bp;
709 
710 	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
711 		return (error);
712 	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
713 		panic("indiracct: botched params");
714 	/*
715 	 * We have to open-code bread() here since it would deadlock
716 	 * looking up the block number for any blocks not in the cache.
717 	 */
718 	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
719 	bp->b_blkno = fsbtodb(fs, blkno);
720 	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
721 	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
722 		brelse(bp);
723 		return (error);
724 	}
725 	/*
726 	 * Account for the block pointers in this indirect block.
727 	 */
728 	last = howmany(remblks, blksperindir);
729 	if (last > NINDIR(fs))
730 		last = NINDIR(fs);
731 	MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
732 	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
733 	bqrelse(bp);
734 	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, rlbn, expungetype);
735 	if (error || level == 0)
736 		goto out;
737 	/*
738 	 * Account for the block pointers in each of the indirect blocks
739 	 * in the levels below us.
740 	 */
741 	subblksperindir = blksperindir / NINDIR(fs);
742 	for (lbn++, level--, i = 0; i < last; i++) {
743 		error = indiracct(snapvp, cancelvp, level, bap[i], lbn,
744 		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
745 		if (error)
746 			goto out;
747 		rlbn += blksperindir;
748 		lbn -= blksperindir;
749 		remblks -= blksperindir;
750 	}
751 out:
752 	FREE(bap, M_DEVBUF);
753 	return (error);
754 }
755 
756 /*
757  * Do both snap accounting and map accounting.
758  */
759 static int
760 fullacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
761 	struct vnode *vp;
762 	ufs_daddr_t *oldblkp, *lastblkp;
763 	struct fs *fs;
764 	ufs_daddr_t lblkno;
765 	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
766 {
767 	int error;
768 
769 	if ((error = snapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)))
770 		return (error);
771 	return (mapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype));
772 }
773 
774 /*
775  * Identify a set of blocks allocated in a snapshot inode.
776  */
777 static int
778 snapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
779 	struct vnode *vp;
780 	ufs_daddr_t *oldblkp, *lastblkp;
781 	struct fs *fs;
782 	ufs_daddr_t lblkno;
783 	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
784 {
785 	struct inode *ip = VTOI(vp);
786 	ufs_daddr_t lbn, blkno, *blkp;
787 	struct buf *ibp;
788 	int error;
789 
790 	for ( ; oldblkp < lastblkp; oldblkp++) {
791 		blkno = *oldblkp;
792 		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
793 			continue;
794 		lbn = fragstoblks(fs, blkno);
795 		if (lbn < NDADDR) {
796 			blkp = &ip->i_db[lbn];
797 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
798 		} else {
799 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
800 			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
801 			if (error)
802 				return (error);
803 			blkp = &((ufs_daddr_t *)(ibp->b_data))
804 			    [(lbn - NDADDR) % NINDIR(fs)];
805 		}
806 		/*
807 		 * If we are expunging a snapshot vnode and we
808 		 * find a block marked BLK_NOCOPY, then it is
809 		 * one that has been allocated to this snapshot after
810 		 * we took our current snapshot and can be ignored.
811 		 */
812 		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
813 			if (lbn >= NDADDR)
814 				brelse(ibp);
815 		} else {
816 			if (*blkp != 0)
817 				panic("snapacct: bad block");
818 			*blkp = expungetype;
819 			if (lbn >= NDADDR)
820 				bdwrite(ibp);
821 		}
822 	}
823 	return (0);
824 }
825 
826 /*
827  * Account for a set of blocks allocated in a snapshot inode.
828  */
829 static int
830 mapacct(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
831 	struct vnode *vp;
832 	ufs_daddr_t *oldblkp, *lastblkp;
833 	struct fs *fs;
834 	ufs_daddr_t lblkno;
835 	int expungetype;
836 {
837 	ufs_daddr_t blkno;
838 	ino_t inum;
839 
840 	inum = VTOI(vp)->i_number;
841 	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
842 		blkno = *oldblkp;
843 		if (blkno == 0 || blkno == BLK_NOCOPY)
844 			continue;
845 		if (blkno == BLK_SNAP)
846 			blkno = blkstofrags(fs, lblkno);
847 		ffs_blkfree(fs, vp, blkno, fs->fs_bsize, inum);
848 	}
849 	return (0);
850 }
851 
852 /*
853  * Decrement extra reference on snapshot when last name is removed.
854  * It will not be freed until the last open reference goes away.
855  */
856 void
857 ffs_snapgone(ip)
858 	struct inode *ip;
859 {
860 	struct inode *xp;
861 	struct fs *fs;
862 	int snaploc;
863 
864 	/*
865 	 * Find snapshot in incore list.
866 	 */
867 	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
868 		if (xp == ip)
869 			break;
870 	if (xp == 0)
871 		printf("ffs_snapgone: lost snapshot vnode %d\n",
872 		    ip->i_number);
873 	else
874 		vrele(ITOV(ip));
875 	/*
876 	 * Delete snapshot inode from superblock. Keep list dense.
877 	 */
878 	fs = ip->i_fs;
879 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
880 		if (fs->fs_snapinum[snaploc] == ip->i_number)
881 			break;
882 	if (snaploc < FSMAXSNAP) {
883 		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
884 			if (fs->fs_snapinum[snaploc] == 0)
885 				break;
886 			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
887 		}
888 		fs->fs_snapinum[snaploc - 1] = 0;
889 	}
890 }
891 
892 /*
893  * Prepare a snapshot file for being removed.
894  */
895 void
896 ffs_snapremove(vp)
897 	struct vnode *vp;
898 {
899 	struct inode *ip;
900 	struct vnode *devvp;
901 	struct buf *ibp;
902 	struct fs *fs;
903 	ufs_daddr_t blkno, dblk;
904 	int error, numblks, loc, last;
905 
906 	ip = VTOI(vp);
907 	fs = ip->i_fs;
908 	/*
909 	 * If active, delete from incore list (this snapshot may
910 	 * already have been in the process of being deleted, so
911 	 * would not have been active).
912 	 *
913 	 * Clear copy-on-write flag if last snapshot.
914 	 */
915 	if (ip->i_nextsnap.tqe_prev != 0) {
916 		devvp = ip->i_devvp;
917 		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
918 		ip->i_nextsnap.tqe_prev = 0;
919 		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) == 0) {
920 			devvp->v_rdev->si_copyonwrite = 0;
921 			devvp->v_flag &= ~VCOPYONWRITE;
922 		}
923 	}
924 	/*
925 	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
926 	 * snapshots that want them (see ffs_snapblkfree below).
927 	 */
928 	for (blkno = 1; blkno < NDADDR; blkno++) {
929 		dblk = ip->i_db[blkno];
930 		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
931 			ip->i_db[blkno] = 0;
932 		else if ((dblk == blkstofrags(fs, blkno) &&
933 		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
934 		     ip->i_number))) {
935 			ip->i_blocks -= btodb(fs->fs_bsize);
936 			ip->i_db[blkno] = 0;
937 		}
938 	}
939 	numblks = howmany(ip->i_size, fs->fs_bsize);
940 	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
941 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
942 		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
943 		if (error)
944 			continue;
945 		if ((last = fs->fs_size - blkno) > NINDIR(fs))
946 			last = NINDIR(fs);
947 		for (loc = 0; loc < last; loc++) {
948 			dblk = ((ufs_daddr_t *)(ibp->b_data))[loc];
949 			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
950 				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
951 			else if ((dblk == blkstofrags(fs, blkno) &&
952 			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
953 			     fs->fs_bsize, ip->i_number))) {
954 				ip->i_blocks -= btodb(fs->fs_bsize);
955 				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
956 			}
957 		}
958 		bawrite(ibp);
959 	}
960 	/*
961 	 * Clear snapshot flag and drop reference.
962 	 */
963 	ip->i_flags &= ~SF_SNAPSHOT;
964 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
965 }
966 
967 /*
968  * Notification that a block is being freed. Return zero if the free
969  * should be allowed to proceed. Return non-zero if the snapshot file
970  * wants to claim the block. The block will be claimed if it is an
971  * uncopied part of one of the snapshots. It will be freed if it is
972  * either a BLK_NOCOPY or has already been copied in all of the snapshots.
973  * If a fragment is being freed, then all snapshots that care about
974  * it must make a copy since a snapshot file can only claim full sized
975  * blocks. Note that if more than one snapshot file maps the block,
976  * we can pick one at random to claim it. Since none of the snapshots
977  * can change, we are assured that they will all see the same unmodified
978  * image. When deleting a snapshot file (see ffs_snapremove above), we
979  * must push any of these claimed blocks to one of the other snapshots
980  * that maps it. These claimed blocks are easily identified as they will
981  * have a block number equal to their logical block number within the
982  * snapshot. Copied blocks can never have this property because they
983  * must always have been allocated from a BLK_NOCOPY location.
984  */
985 int
986 ffs_snapblkfree(fs, devvp, bno, size, inum)
987 	struct fs *fs;
988 	struct vnode *devvp;
989 	ufs_daddr_t bno;
990 	long size;
991 	ino_t inum;
992 {
993 	struct buf *ibp, *cbp, *savedcbp = 0;
994 	struct thread *td = curthread;
995 	struct inode *ip;
996 	struct vnode *vp;
997 	ufs_daddr_t lbn, blkno;
998 	int indiroff = 0, error = 0, claimedblk = 0;
999 	struct snaphead *snaphead;
1000 
1001 	lbn = fragstoblks(fs, bno);
1002 	snaphead = &devvp->v_rdev->si_snapshots;
1003 	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
1004 		vp = ITOV(ip);
1005 		/*
1006 		 * Lookup block being written.
1007 		 */
1008 		if (lbn < NDADDR) {
1009 			blkno = ip->i_db[lbn];
1010 		} else {
1011 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1012 			td->td_proc->p_flag |= P_COWINPROGRESS;
1013 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1014 			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
1015 			td->td_proc->p_flag &= ~P_COWINPROGRESS;
1016 			VOP_UNLOCK(vp, 0, td);
1017 			if (error)
1018 				break;
1019 			indiroff = (lbn - NDADDR) % NINDIR(fs);
1020 			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
1021 		}
1022 		/*
1023 		 * Check to see if block needs to be copied.
1024 		 */
1025 		switch (blkno) {
1026 		/*
1027 		 * If the snapshot has already copied the block (default),
1028 		 * or does not care about the block, it is not needed.
1029 		 */
1030 		default:
1031 		case BLK_NOCOPY:
1032 			if (lbn >= NDADDR)
1033 				bqrelse(ibp);
1034 			continue;
1035 		/*
1036 		 * No previous snapshot claimed the block, so it will be
1037 		 * freed and become a BLK_NOCOPY (don't care) for us.
1038 		 */
1039 		case BLK_SNAP:
1040 			if (claimedblk)
1041 				panic("snapblkfree: inconsistent block type");
1042 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1043 			if (lbn < NDADDR) {
1044 				ip->i_db[lbn] = BLK_NOCOPY;
1045 				ip->i_flag |= IN_CHANGE | IN_UPDATE;
1046 			} else {
1047 				((ufs_daddr_t *)(ibp->b_data))[indiroff] =
1048 				    BLK_NOCOPY;
1049 				bdwrite(ibp);
1050 			}
1051 			VOP_UNLOCK(vp, 0, td);
1052 			continue;
1053 		/*
1054 		 * A block that we map is being freed. If it has not been
1055 		 * claimed yet, we will claim or copy it (below).
1056 		 */
1057 		case 0:
1058 			claimedblk = 1;
1059 			break;
1060 		}
1061 		/*
1062 		 * If this is a full size block, we will just grab it
1063 		 * and assign it to the snapshot inode. Otherwise we
1064 		 * will proceed to copy it. See explanation for this
1065 		 * routine as to why only a single snapshot needs to
1066 		 * claim this block.
1067 		 */
1068 		if (size == fs->fs_bsize) {
1069 #ifdef DEBUG
1070 			if (snapdebug)
1071 				printf("%s %d lbn %d from inum %d\n",
1072 				    "Grabonremove: snapino", ip->i_number, lbn,
1073 				    inum);
1074 #endif
1075 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1076 			if (lbn < NDADDR) {
1077 				ip->i_db[lbn] = bno;
1078 			} else {
1079 				((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno;
1080 				bdwrite(ibp);
1081 			}
1082 			ip->i_blocks += btodb(size);
1083 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1084 			VOP_UNLOCK(vp, 0, td);
1085 			return (1);
1086 		}
1087 		if (lbn >= NDADDR)
1088 			bqrelse(ibp);
1089 		/*
1090 		 * Allocate the block into which to do the copy. Note that this
1091 		 * allocation will never require any additional allocations for
1092 		 * the snapshot inode.
1093 		 */
1094 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1095 		td->td_proc->p_flag |= P_COWINPROGRESS;
1096 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1097 		    fs->fs_bsize, KERNCRED, 0, &cbp);
1098 		td->td_proc->p_flag &= ~P_COWINPROGRESS;
1099 		if (error) {
1100 			VOP_UNLOCK(vp, 0, td);
1101 			break;
1102 		}
1103 #ifdef DEBUG
1104 		if (snapdebug)
1105 			printf(
1106 "Copyonremove: snapino %lu lbn %ld for inum %lu size %ld to blkno %lld\n",
1107 			    (unsigned long)ip->i_number, (long)lbn,
1108 			    (unsigned long)inum, size, (long long)cbp->b_blkno);
1109 #endif
1110 		/*
1111 		 * If we have already read the old block contents, then
1112 		 * simply copy them to the new block. Note that we need
1113 		 * to synchronously write snapshots that have not been
1114 		 * unlinked, and hence will be visible after a crash,
1115 		 * to ensure their integrity.
1116 		 */
1117 		if (savedcbp != 0) {
1118 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
1119 			bawrite(cbp);
1120 			if (dopersistence && ip->i_effnlink > 0)
1121 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
1122 			VOP_UNLOCK(vp, 0, td);
1123 			continue;
1124 		}
1125 		/*
1126 		 * Otherwise, read the old block contents into the buffer.
1127 		 */
1128 		if ((error = readblock(cbp, lbn)) != 0) {
1129 			bzero(cbp->b_data, fs->fs_bsize);
1130 			bawrite(cbp);
1131 			if (dopersistence && ip->i_effnlink > 0)
1132 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
1133 			VOP_UNLOCK(vp, 0, td);
1134 			break;
1135 		}
1136 		VOP_UNLOCK(vp, 0, td);
1137 		savedcbp = cbp;
1138 	}
1139 	/*
1140 	 * Note that we need to synchronously write snapshots that
1141 	 * have not been unlinked, and hence will be visible after
1142 	 * a crash, to ensure their integrity.
1143 	 */
1144 	if (savedcbp) {
1145 		vp = savedcbp->b_vp;
1146 		bawrite(savedcbp);
1147 		if (dopersistence && VTOI(vp)->i_effnlink > 0) {
1148 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1149 			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
1150 			VOP_UNLOCK(vp, 0, td);
1151 		}
1152 	}
1153 	/*
1154 	 * If we have been unable to allocate a block in which to do
1155 	 * the copy, then return non-zero so that the fragment will
1156 	 * not be freed. Although space will be lost, the snapshot
1157 	 * will stay consistent.
1158 	 */
1159 	return (error);
1160 }
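
/*
 * A small standalone sketch of the "claimed block" property described
 * in the comment above ffs_snapblkfree(): a block claimed by a snapshot
 * is recorded at a disk address equal to its own logical block number
 * converted to fragments, something a copied block can never satisfy.
 * blkstofrags() is reduced to the shift it performs; the parameter
 * names are hypothetical.
 */
#if 0
static int
is_claimed_block(int fs_fragshift, int dblk, int lblkno)
{
	/* dblk == blkstofrags(fs, lblkno), i.e. lblkno << fs_fragshift */
	return (dblk == (lblkno << fs_fragshift));
}
#endif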
1161 
1162 /*
1163  * Associate snapshot files when mounting.
1164  */
1165 void
1166 ffs_snapshot_mount(mp)
1167 	struct mount *mp;
1168 {
1169 	struct ufsmount *ump = VFSTOUFS(mp);
1170 	struct fs *fs = ump->um_fs;
1171 	struct thread *td = curthread;
1172 	struct snaphead *snaphead;
1173 	struct vnode *vp;
1174 	struct inode *ip;
1175 	int error, snaploc, loc;
1176 
1177 	snaphead = &ump->um_devvp->v_rdev->si_snapshots;
1178 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1179 		if (fs->fs_snapinum[snaploc] == 0)
1180 			return;
1181 		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
1182 		    LK_EXCLUSIVE, &vp)) != 0){
1183 			printf("ffs_snapshot_mount: vget failed %d\n", error);
1184 			continue;
1185 		}
1186 		ip = VTOI(vp);
1187 		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
1188 			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
1189 			    fs->fs_snapinum[snaploc]);
1190 			vput(vp);
1191 			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
1192 				if (fs->fs_snapinum[loc] == 0)
1193 					break;
1194 				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
1195 			}
1196 			fs->fs_snapinum[loc - 1] = 0;
1197 			snaploc--;
1198 			continue;
1199 		}
1200 		if (ip->i_nextsnap.tqe_prev != 0)
1201 			panic("ffs_snapshot_mount: %d already on list",
1202 			    ip->i_number);
1203 		else
1204 			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
1205 		vp->v_flag |= VSYSTEM;
1206 		ump->um_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
1207 		ump->um_devvp->v_flag |= VCOPYONWRITE;
1208 		VOP_UNLOCK(vp, 0, td);
1209 	}
1210 }
1211 
1212 /*
1213  * Disassociate snapshot files when unmounting.
1214  */
1215 void
1216 ffs_snapshot_unmount(mp)
1217 	struct mount *mp;
1218 {
1219 	struct ufsmount *ump = VFSTOUFS(mp);
1220 	struct snaphead *snaphead = &ump->um_devvp->v_rdev->si_snapshots;
1221 	struct inode *xp;
1222 
1223 	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
1224 		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
1225 		xp->i_nextsnap.tqe_prev = 0;
1226 		if (xp->i_effnlink > 0)
1227 			vrele(ITOV(xp));
1228 	}
1229 	ump->um_devvp->v_rdev->si_copyonwrite = 0;
1230 	ump->um_devvp->v_flag &= ~VCOPYONWRITE;
1231 }
1232 
1233 /*
1234  * Check for need to copy block that is about to be written,
1235  * copying the block if necessary.
1236  */
1237 static int
1238 ffs_copyonwrite(devvp, bp)
1239 	struct vnode *devvp;
1240 	struct buf *bp;
1241 {
1242 	struct buf *ibp, *cbp, *savedcbp = 0;
1243 	struct thread *td = curthread;
1244 	struct fs *fs;
1245 	struct inode *ip;
1246 	struct vnode *vp;
1247 	ufs_daddr_t lbn, blkno;
1248 	int indiroff, error = 0;
1249 
1250 	fs = TAILQ_FIRST(&devvp->v_rdev->si_snapshots)->i_fs;
1251 	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
1252 	if (td->td_proc->p_flag & P_COWINPROGRESS)
1253 		panic("ffs_copyonwrite: recursive call");
1254 	TAILQ_FOREACH(ip, &devvp->v_rdev->si_snapshots, i_nextsnap) {
1255 		vp = ITOV(ip);
1256 		/*
1257 		 * We ensure that everything of our own that needs to be
1258 		 * copied will be done at the time that ffs_snapshot is
1259 		 * called. Thus we can skip the check here which can
1260 		 * deadlock in doing the lookup in UFS_BALLOC.
1261 		 */
1262 		if (bp->b_vp == vp)
1263 			continue;
1264 		/*
1265 		 * Check to see if block needs to be copied. We have to
1266 		 * be able to do the UFS_BALLOC without blocking, otherwise
1267 		 * we may get in a deadlock with another process also
1268 		 * trying to allocate. If we find ourselves unable to
1269 		 * get the buffer lock, we unlock the snapshot vnode,
1270 		 * sleep briefly, and try again.
1271 		 */
1272 retry:
1273 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1274 		if (lbn < NDADDR) {
1275 			blkno = ip->i_db[lbn];
1276 		} else {
1277 			td->td_proc->p_flag |= P_COWINPROGRESS;
1278 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1279 			   fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp);
1280 			td->td_proc->p_flag &= ~P_COWINPROGRESS;
1281 			if (error) {
1282 				VOP_UNLOCK(vp, 0, td);
1283 				if (error != EWOULDBLOCK)
1284 					break;
1285 				tsleep(vp, td->td_ksegrp->kg_user_pri, "nap", 1);
1286 				goto retry;
1287 			}
1288 			indiroff = (lbn - NDADDR) % NINDIR(fs);
1289 			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
1290 			bqrelse(ibp);
1291 		}
1292 #ifdef DIAGNOSTIC
1293 		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
1294 			panic("ffs_copyonwrite: bad copy block");
1295 #endif
1296 		if (blkno != 0) {
1297 			VOP_UNLOCK(vp, 0, td);
1298 			continue;
1299 		}
1300 		/*
1301 		 * Allocate the block into which to do the copy. Note that this
1302 		 * allocation will never require any additional allocations for
1303 		 * the snapshot inode.
1304 		 */
1305 		td->td_proc->p_flag |= P_COWINPROGRESS;
1306 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1307 		    fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
1308 		td->td_proc->p_flag &= ~P_COWINPROGRESS;
1309 		if (error) {
1310 			VOP_UNLOCK(vp, 0, td);
1311 			if (error != EWOULDBLOCK)
1312 				break;
1313 			tsleep(vp, td->td_ksegrp->kg_user_pri, "nap", 1);
1314 			goto retry;
1315 		}
1316 #ifdef DEBUG
1317 		if (snapdebug) {
1318 			printf("Copyonwrite: snapino %d lbn %d for ",
1319 			    ip->i_number, lbn);
1320 			if (bp->b_vp == devvp)
1321 				printf("fs metadata");
1322 			else
1323 				printf("inum %d", VTOI(bp->b_vp)->i_number);
1324 			printf(" lblkno %lld to blkno %lld\n",
1325 			    (long long)bp->b_lblkno, (long long)cbp->b_blkno);
1326 		}
1327 #endif
1328 		/*
1329 		 * If we have already read the old block contents, then
1330 		 * simply copy them to the new block. Note that we need
1331 		 * to synchronously write snapshots that have not been
1332 		 * unlinked, and hence will be visible after a crash,
1333 		 * to ensure their integrity.
1334 		 */
1335 		if (savedcbp != 0) {
1336 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
1337 			bawrite(cbp);
1338 			if (dopersistence && ip->i_effnlink > 0)
1339 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
1340 			VOP_UNLOCK(vp, 0, td);
1341 			continue;
1342 		}
1343 		/*
1344 		 * Otherwise, read the old block contents into the buffer.
1345 		 */
1346 		if ((error = readblock(cbp, lbn)) != 0) {
1347 			bzero(cbp->b_data, fs->fs_bsize);
1348 			bawrite(cbp);
1349 			if (dopersistence && ip->i_effnlink > 0)
1350 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
1351 			VOP_UNLOCK(vp, 0, td);
1352 			break;
1353 		}
1354 		savedcbp = cbp;
1355 		VOP_UNLOCK(vp, 0, td);
1356 	}
1357 	/*
1358 	 * Note that we need to synchronously write snapshots that
1359 	 * have not been unlinked, and hence will be visible after
1360 	 * a crash, to ensure their integrity.
1361 	 */
1362 	if (savedcbp) {
1363 		vp = savedcbp->b_vp;
1364 		bawrite(savedcbp);
1365 		if (dopersistence && VTOI(vp)->i_effnlink > 0) {
1366 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
1367 			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, td);
1368 			VOP_UNLOCK(vp, 0, td);
1369 		}
1370 	}
1371 	return (error);
1372 }
1373 
1374 /*
1375  * Read the specified block into the given buffer.
1376  * Much of this boiler-plate comes from bwrite().
1377  */
1378 static int
1379 readblock(bp, lbn)
1380 	struct buf *bp;
1381 	daddr_t lbn;
1382 {
1383 	struct uio auio;
1384 	struct iovec aiov;
1385 	struct thread *td = curthread;
1386 	struct inode *ip = VTOI(bp->b_vp);
1387 
1388 	aiov.iov_base = bp->b_data;
1389 	aiov.iov_len = bp->b_bcount;
1390 	auio.uio_iov = &aiov;
1391 	auio.uio_iovcnt = 1;
1392 	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
1393 	auio.uio_resid = bp->b_bcount;
1394 	auio.uio_rw = UIO_READ;
1395 	auio.uio_segflg = UIO_SYSSPACE;
1396 	auio.uio_td = td;
1397 	return (physio(ip->i_devvp->v_rdev, &auio, 0));
1398 }
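
/*
 * An illustrative, standalone expansion (with hypothetical parameters)
 * of the uio_offset computation in readblock() above: a logical block
 * number is converted to fragments, then to device blocks, then to a
 * byte offset. blkstofrags(), fsbtodb() and dbtob() are reduced to the
 * shifts they perform on a 16K/2K filesystem over 512-byte sectors.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	long lbn = 42;		/* logical block to read */
	int fs_fragshift = 3;	/* 8 fragments per 16K block */
	int fs_fsbtodb = 2;	/* one 2K fragment is 4 device blocks */

	long frags = lbn << fs_fragshift;	/* blkstofrags() */
	long db = frags << fs_fsbtodb;		/* fsbtodb() */
	long offset = db << 9;			/* dbtob(), DEV_BSHIFT == 9 */

	printf("lbn %ld -> device byte offset %ld\n", lbn, offset);
	return (0);
}
#endif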
1399