xref: /freebsd/sys/ufs/ffs/ffs_snapshot.c (revision 1b6c76a2fe091c74f08427e6c870851025a9cf67)
1 /*
2  * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
3  *
4  * Further information about snapshots can be obtained from:
5  *
6  *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
7  *	1614 Oxford Street		mckusick@mckusick.com
8  *	Berkeley, CA 94709-1608		+1-510-843-9542
9  *	USA
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  *
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
22  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24  * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
25  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
34  * $FreeBSD$
35  */
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/conf.h>
40 #include <sys/bio.h>
41 #include <sys/buf.h>
42 #include <sys/proc.h>
43 #include <sys/namei.h>
44 #include <sys/stat.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/resource.h>
48 #include <sys/resourcevar.h>
49 #include <sys/vnode.h>
50 
51 #include <ufs/ufs/extattr.h>
52 #include <ufs/ufs/quota.h>
53 #include <ufs/ufs/ufsmount.h>
54 #include <ufs/ufs/inode.h>
55 #include <ufs/ufs/ufs_extern.h>
56 
57 #include <ufs/ffs/fs.h>
58 #include <ufs/ffs/ffs_extern.h>
59 
60 #define KERNCRED proc0.p_ucred
61 #define DEBUG 1
62 
63 static int expunge __P((struct vnode *, struct inode *, struct fs *,
64     int (*) __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *, struct fs *,
65     ufs_daddr_t))));
66 static int indiracct __P((struct vnode *, struct vnode *, int, ufs_daddr_t,
67     int, int, int, int, struct fs *, int (*) __P((struct vnode *,
68     ufs_daddr_t *, ufs_daddr_t *, struct fs *, ufs_daddr_t))));
69 static int snapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
70     struct fs *, ufs_daddr_t));
71 static int mapacct __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
72     struct fs *, ufs_daddr_t));
73 static int ffs_copyonwrite __P((struct vnode *, struct buf *));
74 static int readblock __P((struct buf *, daddr_t));
75 
76 /*
77  * To ensure the consistency of snapshots across crashes, we must
78  * synchronously write out copied blocks before allowing the
79  * originals to be modified. Because of the rather severe speed
80  * penalty that this imposes, the following flag allows this
81  * crash persistence to be disabled.
82  */
83 int dopersistence = 0;
84 
85 #ifdef DEBUG
86 #include <sys/sysctl.h>
87 SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
88 int snapdebug = 0;
89 SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
90 #endif /* DEBUG */
91 
92 /*
93  * Create a snapshot file and initialize it for the filesystem.
94  */
/*
 * ffs_snapshot -- create a snapshot of the filesystem mounted at "mp"
 * in the file named by "snapfile" (a user-space path; see the
 * UIO_USERSPACE lookup below).  All needed blocks are preallocated,
 * the filesystem is then suspended while the cylinder group maps,
 * superblock, and summary information are captured, and operation is
 * resumed.  Returns 0 on success or an errno; on success the snapshot
 * vnode is left referenced but unlocked.
 */
95 int
96 ffs_snapshot(mp, snapfile)
97 	struct mount *mp;
98 	char *snapfile;
99 {
100 	ufs_daddr_t blkno, inoblks[FSMAXSNAP];
101 	int error, cg, snaploc, indiroff, numblks;
102 	int i, size, base, len, loc, inoblkcnt;
103 	int flag = mp->mnt_flag;
104 	int32_t *lp;
105 	void *space;
106 	struct fs *copy_fs = NULL, *fs = VFSTOUFS(mp)->um_fs;
107 	struct snaphead *snaphead;
108 	struct proc *p = CURPROC;
109 	struct inode *ip, *xp;
110 	struct buf *bp, *nbp, *ibp, *sbp = NULL;
111 	struct nameidata nd;
112 	struct mount *wrtmp;
113 	struct vattr vat;
114 	struct vnode *vp;
115 	struct cg *cgp;
116 
117 	/*
118 	 * Need to serialize access to snapshot code per filesystem.
119 	 */
120 	/*
121 	 * Assign a snapshot slot in the superblock.
122 	 */
123 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
124 		if (fs->fs_snapinum[snaploc] == 0)
125 			break;
126 	if (snaploc == FSMAXSNAP)
127 		return (ENOSPC);
128 	/*
129 	 * Create the snapshot file.
130 	 */
131 restart:
132 	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, snapfile, p);
133 	if ((error = namei(&nd)) != 0)
134 		return (error);
135 	if (nd.ni_vp != NULL) {
136 		vput(nd.ni_vp);
137 		error = EEXIST;
138 	}
139 	if (nd.ni_dvp->v_mount != mp)
140 		error = EXDEV;
141 	if (error) {
142 		NDFREE(&nd, NDF_ONLY_PNBUF);
143 		if (nd.ni_dvp == nd.ni_vp)
144 			vrele(nd.ni_dvp);
145 		else
146 			vput(nd.ni_dvp);
147 		return (error);
148 	}
149 	VATTR_NULL(&vat);
150 	vat.va_type = VREG;
151 	vat.va_mode = S_IRUSR;
152 	vat.va_vaflags |= VA_EXCLUSIVE;
153 	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
154 		wrtmp = NULL;
155 	if (wrtmp != mp)
156 		panic("ffs_snapshot: mount mismatch");
157 	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
158 		NDFREE(&nd, NDF_ONLY_PNBUF);
159 		vput(nd.ni_dvp);
160 		if ((error = vn_start_write(NULL, &wrtmp,
161 		    V_XSLEEP | PCATCH)) != 0)
162 			return (error);
163 		goto restart;
164 	}
165 	VOP_LEASE(nd.ni_dvp, p, KERNCRED, LEASE_WRITE);
166 	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
167 	vput(nd.ni_dvp);
168 	if (error) {
169 		NDFREE(&nd, NDF_ONLY_PNBUF);
170 		vn_finished_write(wrtmp);
171 		return (error);
172 	}
173 	vp = nd.ni_vp;
174 	ip = VTOI(vp);
175 	/*
176 	 * Allocate and copy the last block contents so as to be able
177 	 * to set size to that of the filesystem.
178 	 */
179 	numblks = howmany(fs->fs_size, fs->fs_frag);
180 	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
181 	    fs->fs_bsize, KERNCRED, B_CLRBUF, &bp);
182 	if (error)
183 		goto out;
184 	ip->i_size = lblktosize(fs, (off_t)numblks);
185 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
186 	if ((error = readblock(bp, numblks - 1)) != 0)
187 		goto out;
188 	bawrite(bp);
189 	/*
190 	 * Preallocate critical data structures so that we can copy
191 	 * them in without further allocation after we suspend all
192 	 * operations on the filesystem. We would like to just release
193 	 * the allocated buffers without writing them since they will
194 	 * be filled in below once we are ready to go, but this upsets
195 	 * the soft update code, so we go ahead and write the new buffers.
196 	 *
197 	 * Allocate all indirect blocks and mark all of them as not
198 	 * needing to be copied.
199 	 */
200 	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
201 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
202 		    fs->fs_bsize, p->p_ucred, B_METAONLY, &ibp);
203 		if (error)
204 			goto out;
205 		bdwrite(ibp);
206 	}
207 	/*
208 	 * Allocate shadow blocks to copy all of the other snapshot inodes
209 	 * so that we will be able to expunge them from this snapshot. Also
210 	 * include a copy of ourselves so that we do not deadlock trying
211 	 * to copyonwrite ourselves when VOP_FSYNC'ing below.
212 	 */
213 	fs->fs_snapinum[snaploc] = ip->i_number;
214 	for (loc = snaploc, inoblkcnt = 0; loc >= 0; loc--) {
215 		blkno = fragstoblks(fs, ino_to_fsba(fs, fs->fs_snapinum[loc]));
		/*
		 * NOTE(review): this re-zeroes the same slot on every
		 * iteration; only the first pass (loc == snaploc) appears
		 * to need it -- confirm before restructuring.
		 */
216 		fs->fs_snapinum[snaploc] = 0;
217 		for (i = 0; i < inoblkcnt; i++)
218 			if (inoblks[i] == blkno)
219 				break;
220 		if (i == inoblkcnt) {
221 			inoblks[inoblkcnt++] = blkno;
222 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
223 			    fs->fs_bsize, KERNCRED, 0, &nbp);
224 			if (error)
225 				goto out;
226 			bawrite(nbp);
227 		}
228 	}
229 	/*
230 	 * Allocate all cylinder group blocks.
231 	 */
232 	for (cg = 0; cg < fs->fs_ncg; cg++) {
233 		error = UFS_BALLOC(vp, (off_t)(cgtod(fs, cg)) << fs->fs_fshift,
234 		    fs->fs_bsize, KERNCRED, 0, &nbp);
235 		if (error)
236 			goto out;
237 		bawrite(nbp);
238 	}
239 	/*
240 	 * Allocate copies for the superblock and its summary information.
241 	 */
242 	error = UFS_BALLOC(vp, (off_t)(SBOFF), SBSIZE, KERNCRED, 0, &nbp);
243 	if (error)
244 		goto out;
245 	bawrite(nbp);
246 	blkno = fragstoblks(fs, fs->fs_csaddr);
247 	len = howmany(fs->fs_cssize, fs->fs_bsize);
248 	for (loc = 0; loc < len; loc++) {
249 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
250 		    fs->fs_bsize, KERNCRED, 0, &nbp);
251 		if (error)
252 			goto out;
253 		bawrite(nbp);
254 	}
255 	/*
256 	 * Change inode to snapshot type file.
257 	 */
258 	ip->i_flags |= SF_SNAPSHOT;
259 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
260 	/*
261 	 * Ensure that the snapshot is completely on disk.
262 	 */
263 	if ((error = VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p)) != 0)
264 		goto out;
265 	/*
266 	 * All allocations are done, so we can now snapshot the system.
267 	 *
268 	 * Suspend operation on filesystem.
269 	 */
270 	for (;;) {
271 		vn_finished_write(wrtmp);
272 		vfs_write_suspend(vp->v_mount);
273 		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
274 			break;
275 		vn_start_write(NULL, &wrtmp, V_WAIT);
276 	}
277 	/*
278 	 * First, copy all the cylinder group maps. All the unallocated
279 	 * blocks are marked BLK_NOCOPY so that the snapshot knows that
280 	 * it need not copy them if they are later written.
281 	 */
282 	len = howmany(fs->fs_fpg, fs->fs_frag);
283 	for (cg = 0; cg < fs->fs_ncg; cg++) {
284 		error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
285 			(int)fs->fs_cgsize, KERNCRED, &bp);
286 		if (error) {
287 			brelse(bp);
288 			goto out1;
289 		}
290 		cgp = (struct cg *)bp->b_data;
291 		if (!cg_chkmagic(cgp)) {
292 			brelse(bp);
293 			error = EIO;
294 			goto out1;
295 		}
296 		error = bread(vp, fragstoblks(fs, cgtod(fs, cg)), fs->fs_bsize,
297 			KERNCRED, &nbp);
298 		if (error) {
299 			brelse(bp);
300 			brelse(nbp);
301 			goto out1;
302 		}
303 		bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
304 		if (fs->fs_cgsize < fs->fs_bsize)
305 			bzero(&nbp->b_data[fs->fs_cgsize],
306 			    fs->fs_bsize - fs->fs_cgsize);
307 		nbp->b_flags |= B_VALIDSUSPWRT;
308 		bawrite(nbp);
309 		base = cg * fs->fs_fpg / fs->fs_frag;
310 		if (base + len >= numblks)
311 			len = numblks - base - 1;
312 		loc = 0;
313 		if (base < NDADDR) {
314 			for ( ; loc < NDADDR; loc++) {
315 				if (!ffs_isblock(fs, cg_blksfree(cgp), loc))
316 					continue;
317 				ip->i_db[loc] = BLK_NOCOPY;
318 			}
319 		}
320 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
321 		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
322 		if (error) {
323 			brelse(bp);
324 			goto out1;
325 		}
326 		indiroff = (base + loc - NDADDR) % NINDIR(fs);
327 		for ( ; loc < len; loc++, indiroff++) {
328 			if (indiroff >= NINDIR(fs)) {
329 				ibp->b_flags |= B_VALIDSUSPWRT;
330 				bawrite(ibp);
331 				error = UFS_BALLOC(vp,
332 				    lblktosize(fs, (off_t)(base + loc)),
333 				    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
334 				if (error) {
335 					brelse(bp);
336 					goto out1;
337 				}
338 				indiroff = 0;
339 			}
340 			if (!ffs_isblock(fs, cg_blksfree(cgp), loc))
341 				continue;
342 			if (((ufs_daddr_t *)(ibp->b_data))[indiroff] != 0)
343 				panic("ffs_snapshot: lost block");
344 			((ufs_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
345 		}
346 		bqrelse(bp);
347 		ibp->b_flags |= B_VALIDSUSPWRT;
348 		bdwrite(ibp);
349 	}
350 	/*
351 	 * Copy the shadow blocks for the snapshot inodes so that
352 	 * the copies can be expunged.
353 	 */
354 	for (loc = 0; loc < inoblkcnt; loc++) {
355 		error = bread(vp, inoblks[loc], fs->fs_bsize, KERNCRED, &nbp);
356 		if (error)
357 			goto out1;
358 		readblock(nbp, inoblks[loc]);
359 		nbp->b_flags |= B_VALIDSUSPWRT;
360 		bdwrite(nbp);
361 	}
362 	/*
363 	 * Copy allocation information from all the snapshots in
364 	 * this snapshot and then expunge them from its view.
365 	 */
366 	snaphead = &ip->i_devvp->v_rdev->si_snapshots;
367 	TAILQ_FOREACH(xp, snaphead, i_nextsnap)
368 		if ((error = expunge(vp, xp, fs, snapacct)) != 0)
369 			goto out1;
370 	/*
371 	 * Grab a copy of the superblock and its summary information.
372 	 * We delay writing it until the suspension is released below.
373 	 */
374 	error = bread(vp, lblkno(fs, SBOFF), fs->fs_bsize, KERNCRED, &sbp);
375 	if (error)
376 		goto out1;
377 	copy_fs = (struct fs *)(sbp->b_data + blkoff(fs, SBOFF));
378 	bcopy(fs, copy_fs, fs->fs_sbsize);
379 	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
380 		copy_fs->fs_clean = 1;
381 	if (fs->fs_sbsize < SBSIZE)
382 		bzero(&sbp->b_data[blkoff(fs, SBOFF) + fs->fs_sbsize],
383 		    SBSIZE - fs->fs_sbsize);
384 	size = blkroundup(fs, fs->fs_cssize);
385 	if (fs->fs_contigsumsize > 0)
386 		size += fs->fs_ncg * sizeof(int32_t);
387 	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
388 	copy_fs->fs_csp = space;
389 	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
	/*
	 * Advance the cursor by assignment; the former
	 * "(char *)space += ..." form relied on the non-standard
	 * cast-as-lvalue extension.  This matches the idiom already
	 * used when writing the summary information out below.
	 */
390 	space = (char *)space + fs->fs_cssize;
391 	loc = howmany(fs->fs_cssize, fs->fs_fsize);
392 	i = fs->fs_frag - loc % fs->fs_frag;
393 	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
394 	if (len > 0) {
395 		if ((error = bread(ip->i_devvp,
396 		    fsbtodb(fs, fs->fs_csaddr + loc),
397 		    len, KERNCRED, &bp)) != 0) {
398 			free(copy_fs->fs_csp, M_UFSMNT);
399 			goto out1;
400 		}
401 		bcopy(bp->b_data, space, (u_int)len);
		/* avoid cast-as-lvalue; see note above */
402 		space = (char *)space + len;
403 		bp->b_flags |= B_INVAL | B_NOCACHE;
404 		brelse(bp);
405 	}
406 	if (fs->fs_contigsumsize > 0) {
407 		copy_fs->fs_maxcluster = lp = space;
408 		for (i = 0; i < fs->fs_ncg; i++)
409 			*lp++ = fs->fs_contigsumsize;
410 	}
411 	/*
412 	 * Record snapshot inode. Since this is the newest snapshot,
413 	 * it must be placed at the end of the list.
414 	 */
415 	fs->fs_snapinum[snaploc] = ip->i_number;
416 	if (ip->i_nextsnap.tqe_prev != 0)
417 		panic("ffs_snapshot: %d already on list", ip->i_number);
418 	TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
419 	ip->i_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
420 	ip->i_devvp->v_flag |= VCOPYONWRITE;
421 	vp->v_flag |= VSYSTEM;
422 	/*
423 	 * Resume operation on filesystem.
424 	 */
425 out1:
426 	vfs_write_resume(vp->v_mount);
427 	vn_start_write(NULL, &wrtmp, V_WAIT);
428 	if (sbp != NULL) {
429 		/*
430 		 * Expunge the blocks used by the snapshots from the set of
431 		 * blocks marked as used in the snapshot bitmaps.
432 		 */
433 		if ((error = expunge(vp, VTOI(vp), copy_fs, mapacct)) != 0) {
434 			vref(vp);
435 			ffs_snapgone(VTOI(vp));
436 			free(copy_fs->fs_csp, M_UFSMNT);
437 			bawrite(sbp);
438 			goto out;
439 		}
440 		/*
441 		 * Write the superblock and its summary information
442 		 * to the snapshot.
443 		 */
444 		blkno = fragstoblks(fs, fs->fs_csaddr);
445 		len = howmany(fs->fs_cssize, fs->fs_bsize);
446 		space = copy_fs->fs_csp;
447 		for (loc = 0; loc < len; loc++) {
448 			error = bread(vp, blkno + loc, fs->fs_bsize,
449 				KERNCRED, &nbp);
450 			if (error) {
451 				vref(vp);
452 				ffs_snapgone(VTOI(vp));
453 				free(copy_fs->fs_csp, M_UFSMNT);
454 				bawrite(sbp);
455 				goto out;
456 			}
457 			bcopy(space, nbp->b_data, fs->fs_bsize);
458 			space = (char *)space + fs->fs_bsize;
459 			bawrite(nbp);
460 		}
461 		free(copy_fs->fs_csp, M_UFSMNT);
462 		bawrite(sbp);
463 	}
464 out:
465 	mp->mnt_flag = flag;
466 	if (error)
467 		(void) UFS_TRUNCATE(vp, (off_t)0, 0, NOCRED, p);
468 	(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
469 	if (error)
470 		vput(vp);
471 	else
472 		VOP_UNLOCK(vp, 0, p);
473 	vn_finished_write(wrtmp);
474 	return (error);
475 }
476 
477 /*
478  * Before expunging a snapshot inode, note all the
479  * blocks that it claims with BLK_SNAP so that fsck will
480  * be able to account for those blocks properly and so
481  * that this snapshot knows that it need not copy them
482  * if the other snapshot holding them is freed.
483  */
/*
 * expunge -- apply "acctfunc" to every block pointer (direct and
 * indirect) claimed by snapshot inode "xp" as recorded through
 * snapshot vnode "vp", then rewrite the copy of xp's inode held in
 * this snapshot so that it reads back as an empty non-snapshot file.
 * Returns 0 on success or an errno from the accounting function or
 * from bread().
 */
484 static int
485 expunge(vp, xp, fs, acctfunc)
486 	struct vnode *vp;
487 	struct inode *xp;
488 	struct fs *fs;
489 	int (*acctfunc) __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
490 	    struct fs *, ufs_daddr_t));
491 {
492 	int i, len, error, numblks, blksperindir;
493 	ufs_daddr_t lbn, rlbn, blkno;
494 	struct dinode *dip;
495 	struct buf *bp;
496 
	/* First account for the pointers stored directly in the inode. */
497 	if ((error = (*acctfunc)(vp, &xp->i_db[0], &xp->i_ib[NIADDR], fs, 0)))
498 		return (error);
499 	numblks = howmany(fs->fs_size, fs->fs_frag);
500 	blksperindir = 1;
	/*
	 * lbn counts down through the (negative) logical block numbers
	 * of the indirect blocks themselves; rlbn/len track the data
	 * blocks they map; blksperindir grows by NINDIR per level.
	 */
501 	lbn = -NDADDR;
502 	len = numblks - NDADDR;
503 	rlbn = NDADDR;
504 	for (i = 0; len > 0 && i < NIADDR; i++) {
505 		error = indiracct(vp, ITOV(xp), i, xp->i_ib[i], lbn,
506 		    rlbn, len, blksperindir, fs, acctfunc);
507 		if (error)
508 			return (error);
509 		blksperindir *= NINDIR(fs);
510 		lbn -= blksperindir + 1;
511 		len -= blksperindir;
512 		rlbn += blksperindir;
513 	}
514 	/*
515 	 * Set copied snapshot inode to be a zero length file.
516 	 */
517 	blkno = fragstoblks(fs, ino_to_fsba(fs, xp->i_number));
518 	if ((error = bread(vp, blkno, fs->fs_bsize, KERNCRED, &bp)) != 0)
519 		return (error);
520 	dip = (struct dinode *)bp->b_data +
521 	    ino_to_fsbo(fs, xp->i_number);
522 	dip->di_size = 0;
523 	dip->di_blocks = 0;
524 	dip->di_flags &= ~SF_SNAPSHOT;
525 	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs_daddr_t));
	/* B_VALIDSUSPWRT presumably marks the write as permitted while
	 * the filesystem is suspended -- confirm against buf(9). */
526 	bp->b_flags |= B_VALIDSUSPWRT;
527 	bdwrite(bp);
528 	return (0);
529 }
530 
531 /*
532  * Descend an indirect block chain for vnode cancelvp accounting for all
533  * its indirect blocks in snapvp.
534  */
/*
 * indiracct -- apply "acctfunc" to the block pointers held in the
 * indirect block "blkno" of vnode "cancelvp" (and, when level > 0,
 * recursively to the indirect blocks below it), accounting them in
 * snapshot vnode "snapvp".  "lbn" is the (negative) logical number of
 * the indirect block itself, "rlbn" the first data block it maps,
 * "remblks" the number of data blocks remaining, and "blksperindir"
 * the number of data blocks spanned by each pointer at this level.
 * Returns 0 on success or an errno.
 */
535 static int
536 indiracct(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs,
537 	    acctfunc)
538 	struct vnode *snapvp;
539 	struct vnode *cancelvp;
540 	int level;
541 	ufs_daddr_t blkno;
542 	int lbn;
543 	int rlbn;
544 	int remblks;
545 	int blksperindir;
546 	struct fs *fs;
547 	int (*acctfunc) __P((struct vnode *, ufs_daddr_t *, ufs_daddr_t *,
548 	    struct fs *, ufs_daddr_t));
549 {
550 	int subblksperindir, error, last, num, i;
551 	struct indir indirs[NIADDR + 2];
552 	ufs_daddr_t *bap;
553 	struct buf *bp;
554 
	/* Sanity-check the caller's lbn against the indirect chain. */
555 	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
556 		return (error);
557 	if (lbn != indirs[num - 1 - level].in_lbn || blkno == 0 || num < 2)
558 		panic("indiracct: botched params");
559 	/*
560 	 * We have to expand bread here since it will deadlock looking
561 	 * up the block number for any blocks that are not in the cache.
562 	 */
563 	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0);
564 	bp->b_blkno = fsbtodb(fs, blkno);
565 	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
566 	    (error = readblock(bp, fragstoblks(fs, blkno)))) {
567 		brelse(bp);
568 		return (error);
569 	}
570 	/*
571 	 * Account for the block pointers in this indirect block.
572 	 */
573 	last = howmany(remblks, blksperindir);
574 	if (last > NINDIR(fs))
575 		last = NINDIR(fs);
	/* Copy the pointers aside so the buffer can be released before
	 * the (possibly deep) recursion below. */
576 	MALLOC(bap, ufs_daddr_t *, fs->fs_bsize, M_DEVBUF, M_WAITOK);
577 	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
578 	bqrelse(bp);
579 	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, rlbn);
580 	if (error || level == 0)
581 		goto out;
582 	/*
583 	 * Account for the block pointers in each of the indirect blocks
584 	 * in the levels below us.
585 	 */
586 	subblksperindir = blksperindir / NINDIR(fs);
587 	for (lbn++, level--, i = 0; i < last; i++) {
588 		error = indiracct(snapvp, cancelvp, level, bap[i], lbn,
589 		    rlbn, remblks, subblksperindir, fs, acctfunc);
590 		if (error)
591 			goto out;
592 		rlbn += blksperindir;
593 		lbn -= blksperindir;
594 		remblks -= blksperindir;
595 	}
596 out:
597 	FREE(bap, M_DEVBUF);
598 	return (error);
599 }
600 
601 /*
602  * Account for a set of blocks allocated in a snapshot inode.
603  */
/*
 * snapacct -- for each block in [oldblkp, lastblkp) mark the
 * corresponding slot in snapshot "vp" as BLK_SNAP, recording that the
 * block is claimed by another snapshot and need not be copied.  The
 * "lblkno" parameter is unused here (it exists to match the acctfunc
 * signature shared with mapacct).  Returns 0 or an errno from
 * UFS_BALLOC.
 */
604 static int
605 snapacct(vp, oldblkp, lastblkp, fs, lblkno)
606 	struct vnode *vp;
607 	ufs_daddr_t *oldblkp, *lastblkp;
608 	struct fs *fs;
609 	ufs_daddr_t lblkno;
610 {
611 	struct inode *ip = VTOI(vp);
612 	ufs_daddr_t lbn, blkno, *blkp;
613 	struct buf *ibp;
614 	int error;
615 
616 	for ( ; oldblkp < lastblkp; oldblkp++) {
617 		blkno = *oldblkp;
		/* Skip holes and blocks already marked as not-needed or
		 * claimed by a snapshot. */
618 		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
619 			continue;
620 		lbn = fragstoblks(fs, blkno);
621 		if (lbn < NDADDR) {
			/* Direct block: patch the in-core inode. */
622 			blkp = &ip->i_db[lbn];
623 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
624 		} else {
			/* Indirect block: fetch the indirect buffer and
			 * patch the slot in place. */
625 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
626 			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
627 			if (error)
628 				return (error);
629 			blkp = &((ufs_daddr_t *)(ibp->b_data))
630 			    [(lbn - NDADDR) % NINDIR(fs)];
631 		}
632 		if (*blkp != 0)
633 			panic("snapacct: bad block");
634 		*blkp = BLK_SNAP;
635 		if (lbn >= NDADDR) {
636 			ibp->b_flags |= B_VALIDSUSPWRT;
637 			bdwrite(ibp);
638 		}
639 	}
640 	return (0);
641 }
642 
643 /*
644  * Release a set of blocks back to the free-block maps held in the
645  * snapshot's private copies of the cylinder groups.
646  */
/*
 * mapacct -- for each block in [oldblkp, lastblkp), mark the block
 * free in the copy of its cylinder group kept in snapshot "vp" and
 * credit the (copied) free-space statistics.  BLK_SNAP entries stand
 * for the block at logical offset "lblkno" within the snapshot.
 * Returns 0, EIO on a corrupt cylinder group copy, or an errno from
 * bread().
 */
646 static int
647 mapacct(vp, oldblkp, lastblkp, fs, lblkno)
648 	struct vnode *vp;
649 	ufs_daddr_t *oldblkp, *lastblkp;
650 	struct fs *fs;
651 	ufs_daddr_t lblkno;
652 {
653 	struct inode *ip = VTOI(vp);
654 	ufs_daddr_t blkno, cgblkno, fragno;
655 	struct buf *bp;
656 	struct cg *cgp;
657 	char *blksfree;
658 	int i, cg, error;
659 
660 	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
661 		blkno = *oldblkp;
662 		if (blkno == 0 || blkno == BLK_NOCOPY)
663 			continue;
		/* A BLK_SNAP slot maps the block at its own logical
		 * offset (see the ffs_snapblkfree commentary). */
664 		if (blkno == BLK_SNAP)
665 			blkno = blkstofrags(fs, lblkno);
666 		cg = dtog(fs, blkno);
		/* Read the snapshot's copy of the cylinder group, not
		 * the live one on the device. */
667 		cgblkno = fragstoblks(fs, cgtod(fs, cg));
668 		if ((error = bread(vp, cgblkno, fs->fs_bsize, KERNCRED, &bp)))
669 			return (error);
670 		cgp = (struct cg *)bp->b_data;
671 		if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) {
672 			if (!cg_chkmagic(cgp))
673 				printf("mapacct: bad magic 0x%x\n",
674 				    cgp->cg_magic);
675 			else
676 				printf("%s: mismatched cg %d != cg_cgx %d\n",
677 				    "mapacct", cg, cgp->cg_cgx);
678 			brelse(bp);
679 			return (EIO);
680 		}
681 		cgp->cg_time = time_second;
682 		cgblkno = dtogd(fs, blkno);
683 		blksfree = cg_blksfree(cgp);
684 		fragno = fragstoblks(fs, cgblkno);
685 		if (!ffs_isfreeblock(fs, blksfree, fragno)) {
686 			printf("dev = %s, block = %ld, fs = %s\n",
687 			    devtoname(ip->i_dev), (long)blkno, fs->fs_fsmnt);
688 			panic("mapacct: freeing free block");
689 		}
		/* Mark the block free and update the copied statistics
		 * (cluster summary, per-cg and per-cylinder counts). */
690 		ffs_setblock(fs, blksfree, fragno);
691 		ffs_clusteracct(fs, cgp, fragno, 1);
692 		cgp->cg_cs.cs_nbfree++;
693 		fs->fs_cstotal.cs_nbfree++;
694 		fs->fs_cs(fs, cg).cs_nbfree++;
695 		i = cbtocylno(fs, cgblkno);
696 		cg_blks(fs, cgp, i)[cbtorpos(fs, cgblkno)]++;
697 		cg_blktot(cgp)[i]++;
698 		fs->fs_fmod = 1;
699 		bdwrite(bp);
700 	}
701 	return (0);
702 }
703 
704 /*
705  * Decrement extra reference on snapshot when last name is removed.
706  * It will not be freed until the last open reference goes away.
707  */
708 void
709 ffs_snapgone(ip)
710 	struct inode *ip;
711 {
712 	struct inode *xp;
713 	struct fs *fs;
714 	int snaploc;
715 
716 	/*
717 	 * Find snapshot in incore list.
718 	 */
719 	TAILQ_FOREACH(xp, &ip->i_devvp->v_rdev->si_snapshots, i_nextsnap)
720 		if (xp == ip)
721 			break;
	/* Only drop the extra reference if the snapshot was actually on
	 * the list; otherwise just report the inconsistency. */
722 	if (xp == 0)
723 		printf("ffs_snapgone: lost snapshot vnode %d\n",
724 		    ip->i_number);
725 	else
726 		vrele(ITOV(ip));
727 	/*
728 	 * Delete snapshot inode from superblock. Keep list dense.
729 	 */
730 	fs = ip->i_fs;
731 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
732 		if (fs->fs_snapinum[snaploc] == ip->i_number)
733 			break;
734 	if (snaploc < FSMAXSNAP) {
		/* Shift the remaining entries down over the vacated slot
		 * and zero the last occupied position. */
735 		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
736 			if (fs->fs_snapinum[snaploc] == 0)
737 				break;
738 			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
739 		}
740 		fs->fs_snapinum[snaploc - 1] = 0;
741 	}
742 }
743 
744 /*
745  * Prepare a snapshot file for being removed.
746  */
747 void
748 ffs_snapremove(vp)
749 	struct vnode *vp;
750 {
751 	struct inode *ip;
752 	struct vnode *devvp;
753 	struct buf *ibp;
754 	struct fs *fs;
755 	ufs_daddr_t blkno, dblk;
756 	int error, numblks, loc, last;
757 
758 	ip = VTOI(vp);
759 	fs = ip->i_fs;
760 	/*
761 	 * If active, delete from incore list (this snapshot may
762 	 * already have been in the process of being deleted, so
763 	 * would not have been active).
764 	 *
765 	 * Clear copy-on-write flag if last snapshot.
766 	 */
767 	if (ip->i_nextsnap.tqe_prev != 0) {
768 		devvp = ip->i_devvp;
769 		TAILQ_REMOVE(&devvp->v_rdev->si_snapshots, ip, i_nextsnap);
770 		ip->i_nextsnap.tqe_prev = 0;
771 		if (TAILQ_FIRST(&devvp->v_rdev->si_snapshots) == 0) {
772 			devvp->v_rdev->si_copyonwrite = 0;
773 			devvp->v_flag &= ~VCOPYONWRITE;
774 		}
775 	}
776 	/*
777 	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
778 	 * snapshots that want them (see ffs_snapblkfree below).
779 	 */
	/*
	 * Direct blocks: a claimed block is recognizable because its
	 * block number equals its own logical offset (see the
	 * ffs_snapblkfree commentary); offer such blocks to the other
	 * snapshots, and only release them here if one takes the claim.
	 */
780 	for (blkno = 1; blkno < NDADDR; blkno++) {
781 		dblk = ip->i_db[blkno];
782 		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
783 			ip->i_db[blkno] = 0;
784 		else if ((dblk == blkstofrags(fs, blkno) &&
785 		     ffs_snapblkfree(ip, dblk, fs->fs_bsize))) {
786 			ip->i_blocks -= btodb(fs->fs_bsize);
787 			ip->i_db[blkno] = 0;
788 		}
789 	}
	/* Repeat the same sweep for every indirect block. */
790 	numblks = howmany(ip->i_size, fs->fs_bsize);
791 	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
792 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
793 		    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
794 		if (error)
795 			continue;
796 		if ((last = fs->fs_size - blkno) > NINDIR(fs))
797 			last = NINDIR(fs);
798 		for (loc = 0; loc < last; loc++) {
799 			dblk = ((ufs_daddr_t *)(ibp->b_data))[loc];
800 			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
801 				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
802 			else if ((dblk == blkstofrags(fs, blkno) &&
803 			     ffs_snapblkfree(ip, dblk, fs->fs_bsize))) {
804 				ip->i_blocks -= btodb(fs->fs_bsize);
805 				((ufs_daddr_t *)(ibp->b_data))[loc] = 0;
806 			}
807 		}
808 		bawrite(ibp);
809 	}
810 	/*
811 	 * Clear snapshot flag and drop reference.
812 	 */
813 	ip->i_flags &= ~SF_SNAPSHOT;
814 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
815 }
816 
817 /*
818  * Notification that a block is being freed. Return zero if the free
819  * should be allowed to proceed. Return non-zero if the snapshot file
820  * wants to claim the block. The block will be claimed if it is an
821  * uncopied part of one of the snapshots. It will be freed if it is
822  * either a BLK_NOCOPY or has already been copied in all of the snapshots.
823  * If a fragment is being freed, then all snapshots that care about
824  * it must make a copy since a snapshot file can only claim full sized
825  * blocks. Note that if more than one snapshot file maps the block,
826  * we can pick one at random to claim it. Since none of the snapshots
827  * can change, we are assured that they will all see the same unmodified
828  * image. When deleting a snapshot file (see ffs_snapremove above), we
829  * must push any of these claimed blocks to one of the other snapshots
830  * that maps it. These claimed blocks are easily identified as they will
831  * have a block number equal to their logical block number within the
832  * snapshot. A copied block can never have this property because they
833  * must always have been allocated from a BLK_NOCOPY location.
834  */
835 int
836 ffs_snapblkfree(freeip, bno, size)
837 	struct inode *freeip;
838 	ufs_daddr_t bno;
839 	long size;
840 {
841 	struct buf *ibp, *cbp, *savedcbp = 0;
842 	struct fs *fs = freeip->i_fs;
843 	struct proc *p = CURPROC;
844 	struct inode *ip;
845 	struct vnode *vp;
846 	ufs_daddr_t lbn, blkno;
847 	int indiroff = 0, error = 0, claimedblk = 0;
848 	struct snaphead *snaphead;
849 
850 	lbn = fragstoblks(fs, bno);
851 	snaphead = &freeip->i_devvp->v_rdev->si_snapshots;
852 	TAILQ_FOREACH(ip, snaphead, i_nextsnap) {
853 		vp = ITOV(ip);
854 		/*
855 		 * Lookup block being written.
856 		 */
857 		if (lbn < NDADDR) {
858 			blkno = ip->i_db[lbn];
859 		} else {
860 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			/* NOTE(review): P_COWINPROGRESS presumably stops the
			 * balloc below from recursing into copy-on-write --
			 * confirm against ffs_copyonwrite. */
861 			p->p_flag |= P_COWINPROGRESS;
862 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
863 			    fs->fs_bsize, KERNCRED, B_METAONLY, &ibp);
864 			p->p_flag &= ~P_COWINPROGRESS;
865 			VOP_UNLOCK(vp, 0, p);
866 			if (error)
867 				break;
868 			indiroff = (lbn - NDADDR) % NINDIR(fs);
869 			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
870 		}
871 		/*
872 		 * Check to see if block needs to be copied.
873 		 */
874 		switch (blkno) {
875 		/*
876 		 * If the snapshot has already copied the block (default),
877 		 * or does not care about the block, it is not needed.
878 		 */
879 		default:
880 		case BLK_NOCOPY:
881 			if (lbn >= NDADDR)
882 				bqrelse(ibp);
883 			continue;
884 		/*
885 		 * No previous snapshot claimed the block, so it will be
886 		 * freed and become a BLK_NOCOPY (don't care) for us.
887 		 */
888 		case BLK_SNAP:
889 			if (claimedblk)
890 				panic("snapblkfree: inconsistent block type");
891 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
892 			if (lbn < NDADDR) {
893 				ip->i_db[lbn] = BLK_NOCOPY;
894 				ip->i_flag |= IN_CHANGE | IN_UPDATE;
895 			} else {
896 				((ufs_daddr_t *)(ibp->b_data))[indiroff] =
897 				    BLK_NOCOPY;
898 				bdwrite(ibp);
899 			}
900 			VOP_UNLOCK(vp, 0, p);
901 			continue;
902 		/*
903 		 * A block that we map is being freed. If it has not been
904 		 * claimed yet, we will claim or copy it (below).
905 		 */
906 		case 0:
907 			claimedblk = 1;
908 			break;
909 		}
910 		/*
911 		 * If this is a full size block, we will just grab it
912 		 * and assign it to the snapshot inode. Otherwise we
913 		 * will proceed to copy it. See explanation for this
914 		 * routine as to why only a single snapshot needs to
915 		 * claim this block.
916 		 */
917 		if (size == fs->fs_bsize) {
918 #ifdef DEBUG
919 			if (snapdebug)
920 				printf("%s %d lbn %d from inum %d\n",
921 				    "Grabonremove: snapino", ip->i_number, lbn,
922 				    freeip->i_number);
923 #endif
924 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
925 			if (lbn < NDADDR) {
926 				ip->i_db[lbn] = bno;
927 			} else {
928 				((ufs_daddr_t *)(ibp->b_data))[indiroff] = bno;
929 				bdwrite(ibp);
930 			}
931 			ip->i_blocks += btodb(size);
932 			ip->i_flag |= IN_CHANGE | IN_UPDATE;
933 			VOP_UNLOCK(vp, 0, p);
			/* Non-zero return: the caller must not free the
			 * block; this snapshot now owns it. */
934 			return (1);
935 		}
936 		if (lbn >= NDADDR)
937 			bqrelse(ibp);
938 		/*
939 		 * Allocate the block into which to do the copy. Note that this
940 		 * allocation will never require any additional allocations for
941 		 * the snapshot inode.
942 		 */
943 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
944 		p->p_flag |= P_COWINPROGRESS;
945 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
946 		    fs->fs_bsize, KERNCRED, 0, &cbp);
947 		p->p_flag &= ~P_COWINPROGRESS;
948 		if (error) {
949 			VOP_UNLOCK(vp, 0, p);
950 			break;
951 		}
952 #ifdef DEBUG
953 		if (snapdebug)
954 			printf("%s%d lbn %d for inum %d size %ld to blkno %d\n",
955 			    "Copyonremove: snapino ", ip->i_number, lbn,
956 			    freeip->i_number, size, cbp->b_blkno);
957 #endif
958 		/*
959 		 * If we have already read the old block contents, then
960 		 * simply copy them to the new block. Note that we need
961 		 * to synchronously write snapshots that have not been
962 		 * unlinked, and hence will be visible after a crash,
963 		 * to ensure their integrity.
964 		 */
965 		if (savedcbp != 0) {
966 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
967 			bawrite(cbp);
968 			if (dopersistence && ip->i_effnlink > 0)
969 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
970 			VOP_UNLOCK(vp, 0, p);
971 			continue;
972 		}
973 		/*
974 		 * Otherwise, read the old block contents into the buffer.
975 		 */
976 		if ((error = readblock(cbp, lbn)) != 0) {
977 			bzero(cbp->b_data, fs->fs_bsize);
978 			bawrite(cbp);
979 			if (dopersistence && ip->i_effnlink > 0)
980 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
981 			VOP_UNLOCK(vp, 0, p);
982 			break;
983 		}
984 		VOP_UNLOCK(vp, 0, p);
		/* Keep the first read copy so later snapshots in the loop
		 * can reuse it instead of re-reading the device. */
985 		savedcbp = cbp;
986 	}
987 	/*
988 	 * Note that we need to synchronously write snapshots that
989 	 * have not been unlinked, and hence will be visible after
990 	 * a crash, to ensure their integrity.
991 	 */
992 	if (savedcbp) {
993 		vp = savedcbp->b_vp;
994 		bawrite(savedcbp);
995 		if (dopersistence && VTOI(vp)->i_effnlink > 0) {
996 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
997 			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
998 			VOP_UNLOCK(vp, 0, p);
999 		}
1000 	}
1001 	/*
1002 	 * If we have been unable to allocate a block in which to do
1003 	 * the copy, then return non-zero so that the fragment will
1004 	 * not be freed. Although space will be lost, the snapshot
1005 	 * will stay consistent.
1006 	 */
1007 	return (error);
1008 }
1009 
1010 /*
1011  * Associate snapshot files when mounting.
1012  */
1013 void
1014 ffs_snapshot_mount(mp)
1015 	struct mount *mp;
1016 {
1017 	struct ufsmount *ump = VFSTOUFS(mp);
1018 	struct fs *fs = ump->um_fs;
1019 	struct proc *p = CURPROC;
1020 	struct snaphead *snaphead;
1021 	struct vnode *vp;
1022 	struct inode *ip;
1023 	int error, snaploc, loc;
1024 
1025 	snaphead = &ump->um_devvp->v_rdev->si_snapshots;
1026 	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1027 		if (fs->fs_snapinum[snaploc] == 0)
1028 			return;
1029 		if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc], &vp)) != 0){
1030 			printf("ffs_snapshot_mount: vget failed %d\n", error);
1031 			continue;
1032 		}
1033 		ip = VTOI(vp);
1034 		if ((ip->i_flags & SF_SNAPSHOT) == 0) {
1035 			printf("ffs_snapshot_mount: non-snapshot inode %d\n",
1036 			    fs->fs_snapinum[snaploc]);
1037 			vput(vp);
1038 			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
1039 				if (fs->fs_snapinum[loc] == 0)
1040 					break;
1041 				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
1042 			}
1043 			fs->fs_snapinum[loc - 1] = 0;
1044 			snaploc--;
1045 			continue;
1046 		}
1047 		if (ip->i_nextsnap.tqe_prev != 0)
1048 			panic("ffs_snapshot_mount: %d already on list",
1049 			    ip->i_number);
1050 		else
1051 			TAILQ_INSERT_TAIL(snaphead, ip, i_nextsnap);
1052 		vp->v_flag |= VSYSTEM;
1053 		ump->um_devvp->v_rdev->si_copyonwrite = ffs_copyonwrite;
1054 		ump->um_devvp->v_flag |= VCOPYONWRITE;
1055 		VOP_UNLOCK(vp, 0, p);
1056 	}
1057 }
1058 
1059 /*
1060  * Disassociate snapshot files when unmounting.
1061  */
1062 void
1063 ffs_snapshot_unmount(mp)
1064 	struct mount *mp;
1065 {
1066 	struct ufsmount *ump = VFSTOUFS(mp);
1067 	struct snaphead *snaphead = &ump->um_devvp->v_rdev->si_snapshots;
1068 	struct inode *xp;
1069 
1070 	while ((xp = TAILQ_FIRST(snaphead)) != 0) {
1071 		TAILQ_REMOVE(snaphead, xp, i_nextsnap);
1072 		xp->i_nextsnap.tqe_prev = 0;
1073 		if (xp->i_effnlink > 0)
1074 			vrele(ITOV(xp));
1075 	}
1076 	ump->um_devvp->v_rdev->si_copyonwrite = 0;
1077 	ump->um_devvp->v_flag &= ~VCOPYONWRITE;
1078 }
1079 
1080 /*
1081  * Check for need to copy block that is about to be written,
1082  * copying the block if necessary.
1083  */
1084 static int
1085 ffs_copyonwrite(devvp, bp)
1086 	struct vnode *devvp;
1087 	struct buf *bp;
1088 {
1089 	struct buf *ibp, *cbp, *savedcbp = 0;
1090 	struct proc *p = CURPROC;
1091 	struct fs *fs;
1092 	struct inode *ip;
1093 	struct vnode *vp;
1094 	ufs_daddr_t lbn, blkno;
1095 	int indiroff, error = 0;
1096 
1097 	fs = TAILQ_FIRST(&devvp->v_rdev->si_snapshots)->i_fs;
1098 	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
1099 	if (p->p_flag & P_COWINPROGRESS)
1100 		panic("ffs_copyonwrite: recursive call");
1101 	TAILQ_FOREACH(ip, &devvp->v_rdev->si_snapshots, i_nextsnap) {
1102 		vp = ITOV(ip);
1103 		/*
1104 		 * We ensure that everything of our own that needs to be
1105 		 * copied will be done at the time that ffs_snapshot is
1106 		 * called. Thus we can skip the check here which can
1107 		 * deadlock in doing the lookup in UFS_BALLOC.
1108 		 */
1109 		if (bp->b_vp == vp)
1110 			continue;
1111 		/*
1112 		 * Check to see if block needs to be copied. We have to
1113 		 * be able to do the UFS_BALLOC without blocking, otherwise
1114 		 * we may get in a deadlock with another process also
1115 		 * trying to allocate. If we find outselves unable to
1116 		 * get the buffer lock, we unlock the snapshot vnode,
1117 		 * sleep briefly, and try again.
1118 		 */
1119 retry:
1120 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
1121 		if (lbn < NDADDR) {
1122 			blkno = ip->i_db[lbn];
1123 		} else {
1124 			p->p_flag |= P_COWINPROGRESS;
1125 			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1126 			   fs->fs_bsize, KERNCRED, B_METAONLY | B_NOWAIT, &ibp);
1127 			p->p_flag &= ~P_COWINPROGRESS;
1128 			if (error) {
1129 				VOP_UNLOCK(vp, 0, p);
1130 				if (error != EWOULDBLOCK)
1131 					break;
1132 				tsleep(vp, p->p_pri.pri_user, "nap", 1);
1133 				goto retry;
1134 			}
1135 			indiroff = (lbn - NDADDR) % NINDIR(fs);
1136 			blkno = ((ufs_daddr_t *)(ibp->b_data))[indiroff];
1137 			bqrelse(ibp);
1138 		}
1139 #ifdef DIAGNOSTIC
1140 		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
1141 			panic("ffs_copyonwrite: bad copy block");
1142 #endif
1143 		if (blkno != 0) {
1144 			VOP_UNLOCK(vp, 0, p);
1145 			continue;
1146 		}
1147 		/*
1148 		 * Allocate the block into which to do the copy. Note that this
1149 		 * allocation will never require any additional allocations for
1150 		 * the snapshot inode.
1151 		 */
1152 		p->p_flag |= P_COWINPROGRESS;
1153 		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1154 		    fs->fs_bsize, KERNCRED, B_NOWAIT, &cbp);
1155 		p->p_flag &= ~P_COWINPROGRESS;
1156 		if (error) {
1157 			VOP_UNLOCK(vp, 0, p);
1158 			if (error != EWOULDBLOCK)
1159 				break;
1160 			tsleep(vp, p->p_pri.pri_user, "nap", 1);
1161 			goto retry;
1162 		}
1163 #ifdef DEBUG
1164 		if (snapdebug) {
1165 			printf("Copyonwrite: snapino %d lbn %d for ",
1166 			    ip->i_number, lbn);
1167 			if (bp->b_vp == devvp)
1168 				printf("fs metadata");
1169 			else
1170 				printf("inum %d", VTOI(bp->b_vp)->i_number);
1171 			printf(" lblkno %d to blkno %d\n", bp->b_lblkno,
1172 			    cbp->b_blkno);
1173 		}
1174 #endif
1175 		/*
1176 		 * If we have already read the old block contents, then
1177 		 * simply copy them to the new block. Note that we need
1178 		 * to synchronously write snapshots that have not been
1179 		 * unlinked, and hence will be visible after a crash,
1180 		 * to ensure their integrity.
1181 		 */
1182 		if (savedcbp != 0) {
1183 			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
1184 			bawrite(cbp);
1185 			if (dopersistence && ip->i_effnlink > 0)
1186 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
1187 			VOP_UNLOCK(vp, 0, p);
1188 			continue;
1189 		}
1190 		/*
1191 		 * Otherwise, read the old block contents into the buffer.
1192 		 */
1193 		if ((error = readblock(cbp, lbn)) != 0) {
1194 			bzero(cbp->b_data, fs->fs_bsize);
1195 			bawrite(cbp);
1196 			if (dopersistence && ip->i_effnlink > 0)
1197 				(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
1198 			VOP_UNLOCK(vp, 0, p);
1199 			break;
1200 		}
1201 		savedcbp = cbp;
1202 		VOP_UNLOCK(vp, 0, p);
1203 	}
1204 	/*
1205 	 * Note that we need to synchronously write snapshots that
1206 	 * have not been unlinked, and hence will be visible after
1207 	 * a crash, to ensure their integrity.
1208 	 */
1209 	if (savedcbp) {
1210 		vp = savedcbp->b_vp;
1211 		bawrite(savedcbp);
1212 		if (dopersistence && VTOI(vp)->i_effnlink > 0) {
1213 			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
1214 			(void) VOP_FSYNC(vp, KERNCRED, MNT_WAIT, p);
1215 			VOP_UNLOCK(vp, 0, p);
1216 		}
1217 	}
1218 	return (error);
1219 }
1220 
1221 /*
1222  * Read the specified block into the given buffer.
1223  * Much of this boiler-plate comes from bwrite().
1224  */
1225 static int
1226 readblock(bp, lbn)
1227 	struct buf *bp;
1228 	daddr_t lbn;
1229 {
1230 	struct uio auio;
1231 	struct iovec aiov;
1232 	struct proc *p = CURPROC;
1233 	struct inode *ip = VTOI(bp->b_vp);
1234 
1235 	aiov.iov_base = bp->b_data;
1236 	aiov.iov_len = bp->b_bcount;
1237 	auio.uio_iov = &aiov;
1238 	auio.uio_iovcnt = 1;
1239 	auio.uio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
1240 	auio.uio_resid = bp->b_bcount;
1241 	auio.uio_rw = UIO_READ;
1242 	auio.uio_segflg = UIO_SYSSPACE;
1243 	auio.uio_procp = p;
1244 	return (physio(ip->i_devvp->v_rdev, &auio, 0));
1245 }
1246