xref: /titanic_50/usr/src/uts/common/io/fssnap.c (revision 142c9f13e148d687426ed2d4e8bd93717eeaebbc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/debug.h>
29 #include <sys/types.h>
30 #include <sys/file.h>
31 #include <sys/errno.h>
32 #include <sys/uio.h>
33 #include <sys/open.h>
34 #include <sys/cred.h>
35 #include <sys/kmem.h>
36 #include <sys/conf.h>
37 #include <sys/cmn_err.h>
38 #include <sys/modctl.h>
39 #include <sys/disp.h>
40 #include <sys/atomic.h>
41 #include <sys/filio.h>
42 #include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */
43 #include <sys/kstat.h>
44 
45 #include <sys/ddi.h>
46 #include <sys/devops.h>
47 #include <sys/sunddi.h>
48 #include <sys/esunddi.h>
49 #include <sys/priv_names.h>
50 
51 #include <sys/fssnap.h>
52 #include <sys/fssnap_if.h>
53 
54 /*
55  * This module implements the file system snapshot code, which provides a
56  * point-in-time image of a file system for the purposes of online backup.
57  * There are essentially two parts to this project: the driver half and the
58  * file system half.  The driver half is a pseudo device driver called
59  * "fssnap" that represents the snapshot.  Each snapshot is assigned a
60  * number that corresponds to the minor number of the device, and a control
61  * device with a high minor number is used to initiate snapshot creation and
62  * deletion.  For all practical purposes the driver half acts like a
63  * read-only disk device whose contents are exactly the same as the master
64  * file system at the time the snapshot was created.
65  *
66  * The file system half provides interfaces necessary for performing the
67  * file system dependent operations required to create and delete snapshots
68  * and a special driver strategy routine that must always be used by the file
69  * system for snapshots to work correctly.
70  *
71  * When a snapshot is to be created, the user utility will send an ioctl to
72  * the control device of the driver half specifying the file system to be
73  * snapshotted, the file descriptor of a backing-store file which is used to
74  * hold old data before it is overwritten, and other snapshot parameters.
75  * This ioctl is passed on to the file system specified in the original
76  * ioctl request.  The file system is expected to be able to flush
77  * everything out to make the file system consistent and lock it to ensure
78  * no changes occur while the snapshot is being created.  It then calls
79  * fssnap_create() to create state for a new snapshot, from which an opaque
80  * handle is returned with the snapshot locked.  Next, the file system must
81  * populate the "candidate bitmap", which tells the snapshot code which
82  * "chunks" should be considered for copy-on-write (a chunk is the unit of
83  * granularity used for copy-on-write, which is independent of the device
84  * and file system block sizes).  This is typically done by scanning the
85  * file system allocation bitmaps to determine which chunks contain
86  * allocated blocks in the file system at the time the snapshot was created.
87  * If a chunk has no allocated blocks, it does not need to be copied before
88  * being written to.  Once the candidate bitmap is populated with
89  * fssnap_set_candidate(), the file system calls fssnap_create_done() to
90  * complete the snapshot creation and unlock the snapshot.  The file system
91  * may now be unlocked and modifications to it resumed.
92  *
93  * Once a snapshot is created, the file system must perform all writes
94  * through a special strategy routine, fssnap_strategy().  This strategy
95  * routine determines whether the chunks contained by the write must be
96  * copied before being overwritten by consulting the candidate bitmap
97  * described above, and the "hastrans bitmap" which tells it whether the chunk
98  * has been copied already or not.  If the chunk is a candidate but has not
99  * been copied, it reads the old data in and adds it to a queue.  The
100  * old data can then be overwritten with the new data.  An asynchronous
101  * task queue is dispatched for each old chunk read in which writes the old
102  * data to the backing file specified at snapshot creation time.  The
103  * backing file is a sparse file the same size as the file system that
104  * contains the old data at the offset that data originally had in the
105  * file system.  If the queue containing in-memory chunks gets too large,
106  * writes to the file system may be throttled by a semaphore until the
107  * task queues have a chance to push some of the chunks to the backing file.
108  *
109  * With the candidate bitmap, the hastrans bitmap, the data on the master
110  * file system, and the old data in memory and in the backing file, the
111  * snapshot pseudo-driver can piece together the original file system
112  * information to satisfy read requests.  If the requested chunk is not a
113  * candidate, it returns a zeroed buffer.  If the chunk is a candidate but
114  * has not been copied it reads it from the master file system.  If it is a
115  * candidate and has been copied, it either copies the data from the
116  * in-memory queue or it reads it in from the backing file.  The result is
117  * a replication of the original file system that can be backed up, mounted,
118  * or manipulated by other file system utilities that work on a read-only
119  * device.
120  *
121  * This module is divided into three roughly logical sections:
122  *
123  *     - The snapshot driver, which is a character/block driver
124  *       representing the snapshot itself.  These routines are
125  *       prefixed with "snap_".
126  *
127  *     - The library routines that are defined in fssnap_if.h that
128  *       are used by file systems that use this snapshot implementation.
129  *       These functions are prefixed with "fssnap_" and are called through
130  *       a function vector from the file system.
131  *
132  *     - The helper routines used by the snapshot driver and the fssnap
133  *       library routines for managing the translation table and other
134  *       useful functions.  These routines are all static and are
135  *       prefixed with either "fssnap_" or "transtbl_" if they
136  *       are specifically used for translation table activities.
137  */
138 
139 static dev_info_t		*fssnap_dip = NULL;
140 static struct snapshot_id	*snapshot = NULL;
141 static struct snapshot_id	snap_ctl;
142 static int			num_snapshots = 0;
143 static kmutex_t			snapshot_mutex;
144 static char			snapname[] = SNAP_NAME;
145 
146 /* "tunable" parameters */
147 static int		fssnap_taskq_nthreads = FSSNAP_TASKQ_THREADS;
148 static uint_t		fssnap_max_mem_chunks = FSSNAP_MAX_MEM_CHUNKS;
149 static int		fssnap_taskq_maxtasks = FSSNAP_TASKQ_MAXTASKS;
150 
151 /* static function prototypes */
152 
153 /* snapshot driver */
154 static int snap_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
155 static int snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
156 static int snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
157 static int snap_open(dev_t *devp, int flag, int otyp, cred_t *cred);
158 static int snap_close(dev_t dev, int flag, int otyp, cred_t *cred);
159 static int snap_strategy(struct buf *bp);
160 static int snap_read(dev_t dev, struct uio *uiop, cred_t *credp);
161 static int snap_print(dev_t dev, char *str);
162 static int snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
163     cred_t *credp, int *rvalp);
164 static int snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
165     int flags, char *name, caddr_t valuep, int *lengthp);
166 static int snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk,
167     int offset, int len, char *buffer);
168 
169 
170 /* fssnap interface implementations (see fssnap_if.h) */
171 static void fssnap_strategy_impl(void *, struct buf *);
172 static void *fssnap_create_impl(chunknumber_t, uint_t, u_offset_t,
173     struct vnode *, int, struct vnode **, char *, u_offset_t);
174 static void fssnap_set_candidate_impl(void *, chunknumber_t);
175 static int fssnap_is_candidate_impl(void *, u_offset_t);
176 static int fssnap_create_done_impl(void *);
177 static int fssnap_delete_impl(void *);
178 
179 /* fssnap interface support routines */
180 static int  fssnap_translate(struct snapshot_id **, struct buf *);
181 static void fssnap_write_taskq(void *);
182 static void fssnap_create_kstats(snapshot_id_t *, int, const char *,
183     const char *);
184 static int  fssnap_update_kstat_num(kstat_t *, int);
185 static void fssnap_delete_kstats(struct cow_info *);
186 
187 /* translation table prototypes */
188 static cow_map_node_t *transtbl_add(cow_map_t *, chunknumber_t, caddr_t);
189 static cow_map_node_t *transtbl_get(cow_map_t *, chunknumber_t);
190 static void transtbl_delete(cow_map_t *, cow_map_node_t *);
191 static void transtbl_free(cow_map_t *);
192 
193 static kstat_t *fssnap_highwater_kstat;
194 
195 /* ************************************************************************ */
196 
197 /* Device and Module Structures */
198 
199 static struct cb_ops snap_cb_ops = {
200 	snap_open,
201 	snap_close,
202 	snap_strategy,
203 	snap_print,
204 	nodev,		/* no snap_dump */
205 	snap_read,
206 	nodev,		/* no snap_write */
207 	snap_ioctl,
208 	nodev,		/* no snap_devmap */
209 	nodev,		/* no snap_mmap   */
210 	nodev,		/* no snap_segmap */
211 	nochpoll,
212 	snap_prop_op,
213 	NULL,		/* streamtab */
214 	D_64BIT | D_NEW | D_MP, /* driver compatibility */
215 	CB_REV,
216 	nodev,		/* async I/O read entry point */
217 	nodev		/* async I/O write entry point */
218 };
219 
220 static struct dev_ops snap_ops = {
221 	DEVO_REV,
222 	0,			/* ref count */
223 	snap_getinfo,
224 	nulldev,		/* snap_identify obsolete */
225 	nulldev,		/* no snap_probe */
226 	snap_attach,
227 	snap_detach,
228 	nodev,			/* no snap_reset */
229 	&snap_cb_ops,
230 	(struct bus_ops *)NULL,
231 	nulldev			/* no snap_power() */
232 };
233 
234 extern struct mod_ops mod_driverops;
235 
236 static struct modldrv md = {
237 	&mod_driverops, /* Type of module. This is a driver */
238 	"snapshot driver %I%", 	/* Name of the module */
239 	&snap_ops,
240 };
241 
242 static struct modlinkage ml = {
243 	MODREV_1,
244 	&md,
245 	NULL
246 };
247 
248 static void *statep;
249 
250 int
251 _init(void)
252 {
253 	int	error;
254 	kstat_t	*ksp;
255 	kstat_named_t	*ksdata;
256 
257 	error = ddi_soft_state_init(&statep, sizeof (struct snapshot_id *), 1);
258 	if (error) {
259 		cmn_err(CE_WARN, "_init: failed to init ddi_soft_state.");
260 		return (error);
261 	}
262 
263 	error = mod_install(&ml);
264 
265 	if (error) {
266 		cmn_err(CE_WARN, "_init: failed to mod_install.");
267 		ddi_soft_state_fini(&statep);
268 		return (error);
269 	}
270 
271 	/*
272 	 * Fill in the snapshot operations vector for file systems
273 	 * (defined in fssnap_if.c)
274 	 */
275 
276 	snapops.fssnap_create = fssnap_create_impl;
277 	snapops.fssnap_set_candidate = fssnap_set_candidate_impl;
278 	snapops.fssnap_is_candidate = fssnap_is_candidate_impl;
279 	snapops.fssnap_create_done = fssnap_create_done_impl;
280 	snapops.fssnap_delete = fssnap_delete_impl;
281 	snapops.fssnap_strategy = fssnap_strategy_impl;
282 
283 	mutex_init(&snapshot_mutex, NULL, MUTEX_DEFAULT, NULL);
284 
285 	/*
286 	 * Initialize the fssnap highwater kstat
287 	 */
288 	ksp = kstat_create(snapname, 0, FSSNAP_KSTAT_HIGHWATER, "misc",
289 	    KSTAT_TYPE_NAMED, 1, 0);
290 	if (ksp != NULL) {
291 		ksdata = (kstat_named_t *)ksp->ks_data;
292 		kstat_named_init(ksdata, FSSNAP_KSTAT_HIGHWATER,
293 		    KSTAT_DATA_UINT32);
294 		ksdata->value.ui32 = 0;
295 		kstat_install(ksp);
296 	} else {
297 		cmn_err(CE_WARN, "_init: failed to create highwater kstat.");
298 	}
299 	fssnap_highwater_kstat = ksp;
300 
301 	return (0);
302 }
303 
304 int
305 _info(struct modinfo *modinfop)
306 {
307 	return (mod_info(&ml, modinfop));
308 }
309 
310 int
311 _fini(void)
312 {
313 	int	error;
314 
315 	error = mod_remove(&ml);
316 	if (error)
317 		return (error);
318 	ddi_soft_state_fini(&statep);
319 
320 	/*
321 	 * delete the fssnap highwater kstat
322 	 */
323 	kstat_delete(fssnap_highwater_kstat);
324 
325 	mutex_destroy(&snapshot_mutex);
326 
327 	/* Clear out the file system operations vector */
328 	snapops.fssnap_create = NULL;
329 	snapops.fssnap_set_candidate = NULL;
330 	snapops.fssnap_create_done = NULL;
331 	snapops.fssnap_delete = NULL;
332 	snapops.fssnap_strategy = NULL;
333 
334 	return (0);
335 }
336 
337 /* ************************************************************************ */
338 
339 /*
340  * Snapshot Driver Routines
341  *
342  * This section implements the snapshot character and block drivers.  The
343  * device will appear to be a consistent read-only file system to
344  * applications that wish to back it up or mount it.  The snapshot driver
345  * communicates with the file system through the translation table, which
346  * tells the snapshot driver where to find the data necessary to piece
347  * together the frozen file system.  The data may either be on the master
348  * device (no translation exists), in memory (a translation exists but has
349  * not been flushed to the backing store), or in the backing store file.
350  * The read request may require the snapshot driver to retrieve data from
351  * several different places and piece it together to look like a single
352  * contiguous read.
353  *
354  * The device minor number corresponds to the snapshot number in the list of
355  * snapshot identifiers.  The soft state for each minor number is simply a
356  * pointer to the snapshot id, which holds all of the snapshot state.  One
357  * minor number is designated as the control device.  All snapshot create
358  * and delete requests go through the control device to ensure this module
359  * is properly loaded and attached before the file system starts calling
360  * routines defined here.
361  */
362 
363 
364 /*
365  * snap_getinfo() - snapshot driver getinfo(9E) routine
366  *
367  */
368 /*ARGSUSED*/
369 static int
370 snap_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
371 {
372 	switch (infocmd) {
373 	case DDI_INFO_DEVT2DEVINFO:
374 		*result = fssnap_dip;
375 		return (DDI_SUCCESS);
376 	case DDI_INFO_DEVT2INSTANCE:
377 		*result = 0;	/* we only have one instance */
378 		return (DDI_SUCCESS);
379 	}
380 	return (DDI_FAILURE);
381 }
382 
383 /*
384  * snap_attach() - snapshot driver attach(9E) routine
385  *
386  *    sets up snapshot control device and control state.  The control state
387  *    is a pointer to an "anonymous" snapshot_id for tracking opens and closes
388  */
389 static int
390 snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
391 {
392 	int			error;
393 
394 	switch (cmd) {
395 	case DDI_ATTACH:
396 		/* create the control device */
397 		error = ddi_create_priv_minor_node(dip, SNAP_CTL_NODE, S_IFCHR,
398 		    SNAP_CTL_MINOR, DDI_PSEUDO, PRIVONLY_DEV,
399 		    PRIV_SYS_CONFIG, PRIV_SYS_CONFIG, 0666);
400 		if (error == DDI_FAILURE) {
401 			return (DDI_FAILURE);
402 		}
403 
404 		rw_init(&snap_ctl.sid_rwlock, NULL, RW_DEFAULT, NULL);
405 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
406 		fssnap_dip = dip;
407 		snap_ctl.sid_snapnumber = SNAP_CTL_MINOR;
408 		/* the control sid is not linked into the snapshot list */
409 		snap_ctl.sid_next = NULL;
410 		snap_ctl.sid_cowinfo = NULL;
411 		snap_ctl.sid_flags = 0;
412 		rw_exit(&snap_ctl.sid_rwlock);
413 		ddi_report_dev(dip);
414 
415 		return (DDI_SUCCESS);
416 	case DDI_PM_RESUME:
417 		return (DDI_SUCCESS);
418 
419 	case DDI_RESUME:
420 		return (DDI_SUCCESS);
421 
422 	default:
423 		return (DDI_FAILURE);
424 	}
425 }
426 
427 /*
428  * snap_detach() - snapshot driver detach(9E) routine
429  *
430  *    destroys snapshot control device and control state.  If any snapshots
431  *    are active (ie. num_snapshots != 0), the device will refuse to detach.
432  */
433 static int
434 snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
435 {
436 	struct snapshot_id *sidp, *sidnextp;
437 
438 	switch (cmd) {
439 	case DDI_DETACH:
440 		/* do not detach if the device is active */
441 		mutex_enter(&snapshot_mutex);
442 		if ((num_snapshots != 0) ||
443 		    ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0)) {
444 			mutex_exit(&snapshot_mutex);
445 			return (DDI_FAILURE);
446 		}
447 
448 		/* free up the snapshot list */
449 		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
450 			ASSERT(SID_AVAILABLE(sidp) &&
451 			    !RW_LOCK_HELD(&sidp->sid_rwlock));
452 			sidnextp = sidp->sid_next;
453 			rw_destroy(&sidp->sid_rwlock);
454 			kmem_free(sidp, sizeof (struct snapshot_id));
455 		}
456 		snapshot = NULL;
457 
458 		/* delete the control device */
459 		ddi_remove_minor_node(dip, SNAP_CTL_NODE);
460 		fssnap_dip = NULL;
461 
462 		ASSERT((snap_ctl.sid_flags & SID_CHAR_BUSY) == 0);
463 		rw_destroy(&snap_ctl.sid_rwlock);
464 		mutex_exit(&snapshot_mutex);
465 
466 		return (DDI_SUCCESS);
467 
468 	default:
469 		return (DDI_FAILURE);
470 	}
471 }
472 
473 /*
474  * snap_open() - snapshot driver open(9E) routine
475  *
476  *     marks the snapshot id as busy so it will not be recycled when deleted
477  *     until the snapshot is closed.
478  */
479 /* ARGSUSED */
480 static int
481 snap_open(dev_t *devp, int flag, int otyp, cred_t *cred)
482 {
483 	minor_t	minor;
484 	struct snapshot_id **sidpp, *sidp;
485 
486 	/* snapshots are read-only */
487 	if (flag & FWRITE)
488 		return (EROFS);
489 
490 	minor = getminor(*devp);
491 
492 	if (minor == SNAP_CTL_MINOR) {
493 		/* control device must be opened exclusively */
494 		if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR))
495 			return (EINVAL);
496 
497 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
498 		if ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0) {
499 			rw_exit(&snap_ctl.sid_rwlock);
500 			return (EBUSY);
501 		}
502 
503 		snap_ctl.sid_flags |= SID_CHAR_BUSY;
504 		rw_exit(&snap_ctl.sid_rwlock);
505 
506 		return (0);
507 	}
508 
509 	sidpp = ddi_get_soft_state(statep, minor);
510 	if (sidpp == NULL || *sidpp == NULL)
511 		return (ENXIO);
512 	sidp = *sidpp;
513 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
514 
515 	if ((flag & FEXCL) && SID_BUSY(sidp)) {
516 		rw_exit(&sidp->sid_rwlock);
517 		return (EAGAIN);
518 	}
519 
520 	ASSERT(sidpp != NULL && sidp != NULL);
521 	/* check to see if this snapshot has been killed on us */
522 	if (SID_INACTIVE(sidp)) {
523 		cmn_err(CE_WARN, "snap_open: snapshot %d does not exist.",
524 		    minor);
525 		rw_exit(&sidp->sid_rwlock);
526 		return (ENXIO);
527 	}
528 
529 	switch (otyp) {
530 	case OTYP_CHR:
531 		sidp->sid_flags |= SID_CHAR_BUSY;
532 		break;
533 	case OTYP_BLK:
534 		sidp->sid_flags |= SID_BLOCK_BUSY;
535 		break;
536 	default:
537 		rw_exit(&sidp->sid_rwlock);
538 		return (EINVAL);
539 	}
540 
541 	rw_exit(&sidp->sid_rwlock);
542 
543 	/*
544 	 * at this point if a valid snapshot was found then it has
545 	 * been marked busy and we can use it.
546 	 */
547 	return (0);
548 }
549 
550 /*
551  * snap_close() - snapshot driver close(9E) routine
552  *
553  *    unsets the busy bits in the snapshot id.  If the snapshot has been
554  *    deleted while the snapshot device was open, the close call will clean
555  *    up the remaining state information.
556  */
557 /* ARGSUSED */
558 static int
559 snap_close(dev_t dev, int flag, int otyp, cred_t *cred)
560 {
561 	struct snapshot_id	**sidpp, *sidp;
562 	minor_t			minor;
563 	char			name[20];
564 
565 	minor = getminor(dev);
566 
567 	/* if this is the control device, close it and return */
568 	if (minor == SNAP_CTL_MINOR) {
569 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
570 		snap_ctl.sid_flags &= ~(SID_CHAR_BUSY);
571 		rw_exit(&snap_ctl.sid_rwlock);
572 		return (0);
573 	}
574 
575 	sidpp = ddi_get_soft_state(statep, minor);
576 	if (sidpp == NULL || *sidpp == NULL) {
577 		cmn_err(CE_WARN, "snap_close: could not find state for "
578 		    "snapshot %d.", minor);
579 		return (ENXIO);
580 	}
581 	sidp = *sidpp;
582 	mutex_enter(&snapshot_mutex);
583 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
584 
585 	/* Mark the snapshot as not being busy anymore */
586 	switch (otyp) {
587 	case OTYP_CHR:
588 		sidp->sid_flags &= ~(SID_CHAR_BUSY);
589 		break;
590 	case OTYP_BLK:
591 		sidp->sid_flags &= ~(SID_BLOCK_BUSY);
592 		break;
593 	default:
594 		mutex_exit(&snapshot_mutex);
595 		rw_exit(&sidp->sid_rwlock);
596 		return (EINVAL);
597 	}
598 
599 	if (SID_AVAILABLE(sidp)) {
600 		/*
601 		 * if this is the last close on a snapshot that has been
602 		 * deleted, then free up the soft state.  The snapdelete
603 		 * ioctl does not free this when the device is in use so
604 		 * we do it here after the last reference goes away.
605 		 */
606 
607 		/* remove the device nodes */
608 		ASSERT(fssnap_dip != NULL);
609 		(void) snprintf(name, sizeof (name), "%d",
610 		    sidp->sid_snapnumber);
611 		ddi_remove_minor_node(fssnap_dip, name);
612 		(void) snprintf(name, sizeof (name), "%d,raw",
613 		    sidp->sid_snapnumber);
614 		ddi_remove_minor_node(fssnap_dip, name);
615 
616 		/* delete the state structure */
617 		ddi_soft_state_free(statep, sidp->sid_snapnumber);
618 		num_snapshots--;
619 	}
620 
621 	mutex_exit(&snapshot_mutex);
622 	rw_exit(&sidp->sid_rwlock);
623 
624 	return (0);
625 }
626 
627 /*
628  * snap_read() - snapshot driver read(9E) routine
629  *
630  *    reads data from the snapshot by calling snap_strategy() through physio()
631  */
632 /* ARGSUSED */
633 static int
634 snap_read(dev_t dev, struct uio *uiop, cred_t *credp)
635 {
636 	minor_t		minor;
637 	struct snapshot_id **sidpp;
638 
639 	minor = getminor(dev);
640 	sidpp = ddi_get_soft_state(statep, minor);
641 	if (sidpp == NULL || *sidpp == NULL) {
642 		cmn_err(CE_WARN,
643 		    "snap_read: could not find state for snapshot %d.", minor);
644 		return (ENXIO);
645 	}
646 	return (physio(snap_strategy, NULL, dev, B_READ, minphys, uiop));
647 }
648 
649 /*
650  * snap_strategy() - snapshot driver strategy(9E) routine
651  *
652  *    cycles through each chunk in the requested buffer and calls
653  *    snap_getchunk() on each chunk to retrieve it from the appropriate
654  *    place.  Once all of the parts are put together the requested buffer
655  *    is returned.  The snapshot driver is read-only, so a write is invalid.
656  */
657 static int
658 snap_strategy(struct buf *bp)
659 {
660 	struct snapshot_id **sidpp, *sidp;
661 	minor_t		minor;
662 	chunknumber_t	chunk;
663 	int		off, len;
664 	u_longlong_t	reqptr;
665 	int		error = 0;
666 	size_t		chunksz;
667 	caddr_t		buf;
668 
669 	/* snapshot device is read-only */
670 	if (bp->b_flags & B_WRITE) {
671 		bioerror(bp, EROFS);
672 		bp->b_resid = bp->b_bcount;
673 		biodone(bp);
674 		return (0);
675 	}
676 
677 	minor = getminor(bp->b_edev);
678 	sidpp = ddi_get_soft_state(statep, minor);
679 	if (sidpp == NULL || *sidpp == NULL) {
680 		cmn_err(CE_WARN,
681 		    "snap_strategy: could not find state for snapshot %d.",
682 		    minor);
683 		bioerror(bp, ENXIO);
684 		bp->b_resid = bp->b_bcount;
685 		biodone(bp);
686 		return (0);
687 	}
688 	sidp = *sidpp;
689 	ASSERT(sidp);
690 	rw_enter(&sidp->sid_rwlock, RW_READER);
691 
692 	if (SID_INACTIVE(sidp)) {
693 		bioerror(bp, ENXIO);
694 		bp->b_resid = bp->b_bcount;
695 		biodone(bp);
696 		rw_exit(&sidp->sid_rwlock);
697 		return (0);
698 	}
699 
700 	if (bp->b_flags & (B_PAGEIO|B_PHYS))
701 		bp_mapin(bp);
702 
703 	bp->b_resid = bp->b_bcount;
704 	ASSERT(bp->b_un.b_addr);
705 	buf = bp->b_un.b_addr;
706 
707 	chunksz = sidp->sid_cowinfo->cow_map.cmap_chunksz;
708 
709 	/* reqptr is the current DEV_BSIZE offset into the device */
710 	/* chunk is the chunk containing reqptr */
711 	/* len is the length of the request (in the current chunk) in bytes */
712 	/* off is the byte offset into the current chunk */
713 	reqptr = bp->b_lblkno;
714 	while (bp->b_resid > 0) {
715 		chunk = dbtocowchunk(&sidp->sid_cowinfo->cow_map, reqptr);
716 		off = (reqptr % (chunksz >> DEV_BSHIFT)) << DEV_BSHIFT;
717 		len = min(chunksz - off, bp->b_resid);
718 		ASSERT((off + len) <= chunksz);
719 
720 		if ((error = snap_getchunk(sidp, chunk, off, len, buf)) != 0) {
721 			/*
722 			 * EINVAL means the user tried to go out of range.
723 			 * Anything else means it's likely that we're
724 			 * confused.
725 			 */
726 			if (error != EINVAL) {
727 				cmn_err(CE_WARN, "snap_strategy: error "
728 				    "calling snap_getchunk, chunk = %llu, "
729 				    "offset = %d, len = %d, resid = %lu, "
730 				    "error = %d.",
731 				    chunk, off, len, bp->b_resid, error);
732 			}
733 			bioerror(bp, error);
734 			biodone(bp);
735 			rw_exit(&sidp->sid_rwlock);
736 			return (0);
737 		}
738 		bp->b_resid -= len;
739 		reqptr += (len >> DEV_BSHIFT);
740 		buf += len;
741 	}
742 
743 	ASSERT(bp->b_resid == 0);
744 	biodone(bp);
745 
746 	rw_exit(&sidp->sid_rwlock);
747 	return (0);
748 }
749 
750 /*
751  * snap_getchunk() - helper function for snap_strategy()
752  *
753  *    gets the requested data from the appropriate place and fills in the
754  *    buffer.  chunk is the chunk number of the request, offset is the
755  *    offset into that chunk and must be less than the chunk size.  len is
756  *    the length of the request starting at offset, and must not exceed a
757  *    chunk boundary.  buffer is the address to copy the data to.  len
758  *    bytes are copied into the buffer starting at the location specified.
759  *
760  *    A chunk is located according to the following algorithm:
761  *        - If the chunk does not have a translation or is not a candidate
762  *          for translation, it is read straight from the master device.
763  *        - If the chunk does have a translation, then it is either on
764  *          disk or in memory:
765  *            o If it is in memory the requested data is simply copied out
766  *              of the in-memory buffer.
767  *            o If it is in the backing store, it is read from there.
768  *
769  *    This function does the real work of the snapshot driver.
770  */
771 static int
772 snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk, int offset,
773     int len, char *buffer)
774 {
775 	cow_map_t	*cmap = &sidp->sid_cowinfo->cow_map;
776 	cow_map_node_t	*cmn;
777 	struct buf	*snapbuf;
778 	int		error = 0;
779 	char		*newbuffer;
780 	int		newlen = 0;
781 	int		partial = 0;
782 
783 	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
784 	ASSERT(offset + len <= cmap->cmap_chunksz);
785 
786 	/*
787 	 * Check if the chunk number is out of range and if so bail out
788 	 */
789 	if (chunk >= (cmap->cmap_bmsize * NBBY)) {
790 		return (EINVAL);
791 	}
792 
793 	/*
794 	 * If the chunk is not a candidate for translation, then the chunk
795 	 * was not allocated when the snapshot was taken.  Since it does
796 	 * not contain data associated with this snapshot, just return a
797 	 * zero buffer instead.
798 	 */
799 	if (isclr(cmap->cmap_candidate, chunk)) {
800 		bzero(buffer, len);
801 		return (0);
802 	}
803 
804 	/*
805 	 * if the chunk is a candidate for translation but a
806 	 * translation does not exist, then read through to the
807 	 * original file system.  The rwlock is held until the read
808 	 * completes if it hasn't been translated to make sure the
809 	 * file system does not translate the block before we
810 	 * access it. If it has already been translated we don't
811 	 * need the lock, because the translation will never go away.
812 	 */
813 	rw_enter(&cmap->cmap_rwlock, RW_READER);
814 	if (isclr(cmap->cmap_hastrans, chunk)) {
815 		snapbuf = getrbuf(KM_SLEEP);
816 		/*
817 		 * Reading into the buffer saves having to do a copy,
818 		 * but gets tricky if the request size is not a
819 		 * multiple of DEV_BSIZE.  However, we are filling the
820 		 * buffer left to right, so future reads will write
821 		 * over any extra data we might have read.
822 		 */
823 
824 		partial = len % DEV_BSIZE;
825 
826 		snapbuf->b_bcount = len;
827 		snapbuf->b_lblkno = lbtodb(chunk * cmap->cmap_chunksz + offset);
828 		snapbuf->b_un.b_addr = buffer;
829 
830 		snapbuf->b_iodone = NULL;
831 		snapbuf->b_proc = NULL;		/* i.e. the kernel */
832 		snapbuf->b_flags = B_READ | B_BUSY;
833 		snapbuf->b_edev = sidp->sid_fvp->v_vfsp->vfs_dev;
834 
835 		if (partial) {
836 			/*
837 			 * Partial block read in progress.
838 			 * This is bad as modules further down the line
839 			 * assume buf's are exact multiples of DEV_BSIZE
840 			 * and we end up with fewer, or zero, bytes read.
841 			 * To get round this we need to round up to the
842 			 * nearest full block read and then return only
843 			 * len bytes.
844 			 */
845 			newlen = (len - partial) + DEV_BSIZE;
846 			newbuffer = kmem_alloc(newlen, KM_SLEEP);
847 
848 			snapbuf->b_bcount = newlen;
849 			snapbuf->b_un.b_addr = newbuffer;
850 		}
851 
852 		(void) bdev_strategy(snapbuf);
853 		(void) biowait(snapbuf);
854 
855 		error = geterror(snapbuf);
856 
857 		if (partial) {
858 			/*
859 			 * Partial block read. Now we need to bcopy the
860 			 * correct number of bytes back into the
861 			 * supplied buffer, and tidy up our temp
862 			 * buffer.
863 			 */
864 			bcopy(newbuffer, buffer, len);
865 			kmem_free(newbuffer, newlen);
866 		}
867 
868 		freerbuf(snapbuf);
869 		rw_exit(&cmap->cmap_rwlock);
870 
871 		return (error);
872 	}
873 
874 	/*
875 	 * finally, if the chunk is a candidate for translation and it
876 	 * has been translated, then we clone the chunk of the buffer
877 	 * that was copied aside by the file system.
878 	 * The cmap_rwlock does not need to be held after we know the
879 	 * data has already been copied. Once a chunk has been copied
880 	 * to the backing file, it is stable read only data.
881 	 */
882 	cmn = transtbl_get(cmap, chunk);
883 
884 	/* check whether the data is in memory or in the backing file */
885 	if (cmn != NULL) {
886 		ASSERT(cmn->cmn_buf);
887 		/* already in memory */
888 		bcopy(cmn->cmn_buf + offset, buffer, len);
889 		rw_exit(&cmap->cmap_rwlock);
890 	} else {
891 		ssize_t resid = len;
892 		int	bf_index;
893 		/*
894 		 * can cause deadlock with writer if we don't drop the
895 		 * cmap_rwlock before trying to get the backing store file
896 		 * vnode rwlock.
897 		 */
898 		rw_exit(&cmap->cmap_rwlock);
899 
900 		bf_index = chunk / cmap->cmap_chunksperbf;
901 
902 		/* read buffer from backing file */
903 		error = vn_rdwr(UIO_READ,
904 		    (sidp->sid_cowinfo->cow_backfile_array)[bf_index],
905 		    buffer, len, ((chunk % cmap->cmap_chunksperbf) *
906 		    cmap->cmap_chunksz) + offset, UIO_SYSSPACE, 0,
907 		    RLIM64_INFINITY, kcred, &resid);
908 	}
909 
910 	return (error);
911 }
912 
913 /*
914  * snap_print() - snapshot driver print(9E) routine
915  *
916  *    prints the device identification string.
917  */
918 static int
919 snap_print(dev_t dev, char *str)
920 {
921 	struct snapshot_id **sidpp;
922 	minor_t		minor;
923 
924 	minor = getminor(dev);
925 	sidpp = ddi_get_soft_state(statep, minor);
926 	if (sidpp == NULL || *sidpp == NULL) {
927 		cmn_err(CE_WARN,
928 		    "snap_print: could not find state for snapshot %d.", minor);
929 		return (ENXIO);
930 	}
931 
932 	cmn_err(CE_NOTE, "snap_print: snapshot %d: %s",  minor, str);
933 
934 	return (0);
935 }
936 
937 /*
938  * snap_prop_op() - snapshot driver prop_op(9E) routine
939  *
940  *    get 32-bit and 64-bit values for size (character driver) and nblocks
941  *    (block driver).
942  */
943 static int
944 snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
945     int flags, char *name, caddr_t valuep, int *lengthp)
946 {
947 	int		minor;
948 	struct snapshot_id **sidpp;
949 	dev_t		mdev;
950 	dev_info_t	*mdip;
951 	int		error;
952 
953 	minor = getminor(dev);
954 
955 	/* if this is the control device just check for .conf properties */
956 	if (minor == SNAP_CTL_MINOR)
957 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
958 		    valuep, lengthp));
959 
960 	/* check to see if there is a master device plumbed */
961 	sidpp = ddi_get_soft_state(statep, minor);
962 	if (sidpp == NULL || *sidpp == NULL) {
963 		cmn_err(CE_WARN,
964 		    "snap_prop_op: could not find state for "
965 		    "snapshot %d.", minor);
966 		return (DDI_PROP_NOT_FOUND);
967 	}
968 
969 	if (((*sidpp)->sid_fvp == NULL) || ((*sidpp)->sid_fvp->v_vfsp == NULL))
970 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
971 		    valuep, lengthp));
972 
973 	/* hold master device and pass operation down */
974 	mdev = (*sidpp)->sid_fvp->v_vfsp->vfs_dev;
975 	if (mdip = e_ddi_hold_devi_by_dev(mdev, 0)) {
976 
977 		/* get size information from the master device. */
978 		error = cdev_prop_op(mdev, mdip,
979 		    prop_op, flags, name, valuep, lengthp);
980 		ddi_release_devi(mdip);
981 		if (error == DDI_PROP_SUCCESS)
982 			return (error);
983 	}
984 
985 	/* master device did not service the request, try framework */
986 	return (ddi_prop_op(dev, dip, prop_op, flags, name, valuep, lengthp));
987 
988 }
989 
990 /*
991  * snap_ioctl() - snapshot driver ioctl(9E) routine
992  *
993  *    only applies to the control device.  The control device accepts two
994  *    ioctl requests: create a snapshot or delete a snapshot.  In either
995  *    case, the vnode for the requested file system is extracted, and the
996  *    request is passed on to the file system via the same ioctl.  The file
997  *    system is responsible for doing the things necessary for creating or
998  *    destroying a snapshot, including any file system specific operations
999  *    that must be performed as well as setting up and deleting the snapshot
1000  *    state through the fssnap interfaces.
1001  */
1002 static int
1003 snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1004 int *rvalp)
1005 {
1006 	minor_t	minor;
1007 	int error = 0;
1008 
1009 	minor = getminor(dev);
1010 
1011 	if (minor != SNAP_CTL_MINOR) {
1012 		return (EINVAL);
1013 	}
1014 
1015 	switch (cmd) {
1016 	case _FIOSNAPSHOTCREATE:
1017 	{
1018 		struct fiosnapcreate	fc;
1019 		struct file		*fp;
1020 		struct vnode		*vp;
1021 
1022 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1023 			return (EFAULT);
1024 
1025 		/* get vnode for file system mount point */
1026 		if ((fp = getf(fc.rootfiledesc)) == NULL)
1027 			return (EBADF);
1028 
1029 		ASSERT(fp->f_vnode);
1030 		vp = fp->f_vnode;
1031 		VN_HOLD(vp);
1032 		releasef(fc.rootfiledesc);
1033 
1034 		/* pass ioctl request to file system */
1035 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
1036 		VN_RELE(vp);
1037 		break;
1038 	}
1039 	case _FIOSNAPSHOTCREATE_MULTI:
1040 	{
1041 		struct fiosnapcreate_multi	fc;
1042 		struct file		*fp;
1043 		struct vnode		*vp;
1044 
1045 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1046 			return (EFAULT);
1047 
1048 		/* get vnode for file system mount point */
1049 		if ((fp = getf(fc.rootfiledesc)) == NULL)
1050 			return (EBADF);
1051 
1052 		ASSERT(fp->f_vnode);
1053 		vp = fp->f_vnode;
1054 		VN_HOLD(vp);
1055 		releasef(fc.rootfiledesc);
1056 
1057 		/* pass ioctl request to file system */
1058 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
1059 		VN_RELE(vp);
1060 		break;
1061 	}
1062 	case _FIOSNAPSHOTDELETE:
1063 	{
1064 		major_t			major;
1065 		struct fiosnapdelete	fc;
1066 		snapshot_id_t		*sidp = NULL;
1067 		snapshot_id_t		*sidnextp = NULL;
1068 		struct file		*fp = NULL;
1069 		struct vnode		*vp = NULL;
1070 		struct vfs 		*vfsp = NULL;
1071 		vfsops_t		*vfsops = EIO_vfsops;
1072 
1073 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1074 			return (EFAULT);
1075 
1076 		/* get vnode for file system mount point */
1077 		if ((fp = getf(fc.rootfiledesc)) == NULL)
1078 			return (EBADF);
1079 
1080 		ASSERT(fp->f_vnode);
1081 		vp = fp->f_vnode;
1082 		VN_HOLD(vp);
1083 		releasef(fc.rootfiledesc);
1084 		/*
1085 		 * Test for two formats of delete and set correct minor/vp:
1086 		 * pseudo device:
1087 		 * fssnap -d [/dev/fssnap/x]
1088 		 * or
1089 		 * mount point:
1090 		 * fssnap -d [/mntpt]
1091 		 * Note that minor is verified to be equal to SNAP_CTL_MINOR
1092 		 * at this point which is an invalid minor number.
1093 		 */
1094 		ASSERT(fssnap_dip != NULL);
1095 		major = ddi_driver_major(fssnap_dip);
1096 		mutex_enter(&snapshot_mutex);
1097 		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
1098 			rw_enter(&sidp->sid_rwlock, RW_READER);
1099 			sidnextp = sidp->sid_next;
1100 			/* pseudo device: */
1101 			if (major == getmajor(vp->v_rdev)) {
1102 				minor = getminor(vp->v_rdev);
1103 				if (sidp->sid_snapnumber == (uint_t)minor &&
1104 				    sidp->sid_fvp) {
1105 					VN_RELE(vp);
1106 					vp = sidp->sid_fvp;
1107 					VN_HOLD(vp);
1108 					rw_exit(&sidp->sid_rwlock);
1109 					break;
1110 				}
1111 			/* Mount point: */
1112 			} else {
1113 				if (sidp->sid_fvp == vp) {
1114 					minor = sidp->sid_snapnumber;
1115 					rw_exit(&sidp->sid_rwlock);
1116 					break;
1117 				}
1118 			}
1119 			rw_exit(&sidp->sid_rwlock);
1120 		}
1121 		mutex_exit(&snapshot_mutex);
1122 		/* Verify minor got set correctly above */
1123 		if (minor == SNAP_CTL_MINOR) {
1124 			VN_RELE(vp);
1125 			return (EINVAL);
1126 		}
1127 		dev = makedevice(major, minor);
1128 		/*
1129 		 * Create dummy vfs entry
1130 		 * to use as a locking semaphore across the IOCTL
1131 		 * for mount in progress cases...
1132 		 */
1133 		vfsp = vfs_alloc(KM_SLEEP);
1134 		VFS_INIT(vfsp, vfsops, NULL);
1135 		VFS_HOLD(vfsp);
1136 		vfs_addmip(dev, vfsp);
1137 		if ((vfs_devmounting(dev, vfsp)) ||
1138 		    (vfs_devismounted(dev))) {
1139 			vfs_delmip(vfsp);
1140 			VFS_RELE(vfsp);
1141 			VN_RELE(vp);
1142 			return (EBUSY);
1143 		}
1144 		/*
1145 		 * Nobody mounted but do not release mount in progress lock
1146 		 * until IOCTL complete to prohibit a mount sneaking
1147 		 * in
1148 		 */
1149 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
1150 		vfs_delmip(vfsp);
1151 		VFS_RELE(vfsp);
1152 		VN_RELE(vp);
1153 		break;
1154 	}
1155 	default:
1156 		cmn_err(CE_WARN, "snap_ioctl: Invalid ioctl cmd %d, minor %d.",
1157 		    cmd, minor);
1158 		return (EINVAL);
1159 	}
1160 
1161 	return (error);
1162 }
1163 
1164 
1165 /* ************************************************************************ */
1166 
1167 /*
1168  * Translation Table Routines
1169  *
1170  *    These support routines implement a simple doubly linked list
1171  *    to keep track of chunks that are currently in memory.  The maximum
1172  *    size of the list is determined by the fssnap_max_mem_chunks variable.
1173  *    The cmap_rwlock is used to protect the linkage of the list.
1174  */
1175 
1176 /*
1177  * transtbl_add() - add a node to the translation table
1178  *
1179  *    allocates a new node and points it at the buffer passed in.  The node
1180  *    is added to the beginning of the doubly linked list and the head of
1181  *    the list is moved.  The cmap_rwlock must be held as a writer through
1182  *    this operation.
1183  */
1184 static cow_map_node_t *
1185 transtbl_add(cow_map_t *cmap, chunknumber_t chunk, caddr_t buf)
1186 {
1187 	cow_map_node_t	*cmnode;
1188 
1189 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1190 
1191 	cmnode = kmem_alloc(sizeof (cow_map_node_t), KM_SLEEP);
1192 
1193 	/*
1194 	 * insert new translations at the beginning so cmn_table is always
1195 	 * the first node.
1196 	 */
1197 	cmnode->cmn_chunk = chunk;
1198 	cmnode->cmn_buf = buf;
1199 	cmnode->cmn_prev = NULL;
1200 	cmnode->cmn_next = cmap->cmap_table;
1201 	if (cmnode->cmn_next)
1202 		cmnode->cmn_next->cmn_prev = cmnode;
1203 	cmap->cmap_table = cmnode;
1204 
1205 	return (cmnode);
1206 }
1207 
1208 /*
1209  * transtbl_get() - look up a node in the translation table
1210  *
1211  *    called by the snapshot driver to find data that has been translated.
1212  *    The lookup is done by the chunk number, and the node is returned.
1213  *    If the node was not found, NULL is returned.
1214  */
1215 static cow_map_node_t *
1216 transtbl_get(cow_map_t *cmap, chunknumber_t chunk)
1217 {
1218 	cow_map_node_t *cmn;
1219 
1220 	ASSERT(RW_READ_HELD(&cmap->cmap_rwlock));
1221 	ASSERT(cmap);
1222 
1223 	/* search the translation table */
1224 	for (cmn = cmap->cmap_table; cmn != NULL; cmn = cmn->cmn_next) {
1225 		if (cmn->cmn_chunk == chunk)
1226 			return (cmn);
1227 	}
1228 
1229 	/* not found */
1230 	return (NULL);
1231 }
1232 
1233 /*
1234  * transtbl_delete() - delete a node from the translation table
1235  *
1236  *    called when a node's data has been written out to disk.  The
1237  *    cmap_rwlock must be held as a writer for this operation.  If the node
1238  *    being deleted is the head of the list, then the head is moved to the
1239  *    next node.  Both the node's data and the node itself are freed.
1240  */
1241 static void
1242 transtbl_delete(cow_map_t *cmap, cow_map_node_t *cmn)
1243 {
1244 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1245 	ASSERT(cmn);
1246 	ASSERT(cmap->cmap_table);
1247 
1248 	/* if the head of the list is being deleted, then move the head up */
1249 	if (cmap->cmap_table == cmn) {
1250 		ASSERT(cmn->cmn_prev == NULL);
1251 		cmap->cmap_table = cmn->cmn_next;
1252 	}
1253 
1254 
1255 	/* make previous node's next pointer skip over current node */
1256 	if (cmn->cmn_prev != NULL) {
1257 		ASSERT(cmn->cmn_prev->cmn_next == cmn);
1258 		cmn->cmn_prev->cmn_next = cmn->cmn_next;
1259 	}
1260 
1261 	/* make next node's previous pointer skip over current node */
1262 	if (cmn->cmn_next != NULL) {
1263 		ASSERT(cmn->cmn_next->cmn_prev == cmn);
1264 		cmn->cmn_next->cmn_prev = cmn->cmn_prev;
1265 	}
1266 
1267 	/* free the data and the node */
1268 	ASSERT(cmn->cmn_buf);
1269 	kmem_free(cmn->cmn_buf, cmap->cmap_chunksz);
1270 	kmem_free(cmn, sizeof (cow_map_node_t));
1271 }
1272 
1273 /*
1274  * transtbl_free() - free the entire translation table
1275  *
1276  *    called when the snapshot is deleted.  This frees all of the nodes in
1277  *    the translation table (but not the bitmaps).
1278  */
1279 static void
1280 transtbl_free(cow_map_t *cmap)
1281 {
1282 	cow_map_node_t	*curnode;
1283 	cow_map_node_t	*tempnode;
1284 
1285 	for (curnode = cmap->cmap_table; curnode != NULL; curnode = tempnode) {
1286 		tempnode = curnode->cmn_next;
1287 
1288 		kmem_free(curnode->cmn_buf, cmap->cmap_chunksz);
1289 		kmem_free(curnode, sizeof (cow_map_node_t));
1290 	}
1291 }
1292 
1293 
1294 /* ************************************************************************ */
1295 
1296 /*
1297  * Interface Implementation Routines
1298  *
1299  * The following functions implement snapshot interface routines that are
1300  * called by the file system to create, delete, and use a snapshot.  The
1301  * interfaces are defined in fssnap_if.c and are filled in by this driver
1302  * when it is loaded.  This technique allows the file system to depend on
1303  * the interface module without having to load the full implementation and
1304  * snapshot device drivers.
1305  */
1306 
1307 /*
1308  * fssnap_strategy_impl() - strategy routine called by the file system
1309  *
1310  *    called by the file system to handle copy-on-write when necessary.  All
1311  *    reads and writes that the file system performs should go through this
1312  *    function.  If the file system calls the underlying device's strategy
1313  *    routine without going through fssnap_strategy() (eg. by calling
1314  *    bdev_strategy()), the snapshot may not be consistent.
1315  *
1316  *    This function starts by doing significant sanity checking to insure
1317  *    the snapshot was not deleted out from under it or deleted and then
1318  *    recreated.  To do this, it checks the actual pointer passed into it
1319  *    (ie. the handle held by the file system).  NOTE that the parameter is
1320  *    a POINTER TO A POINTER to the snapshot id.  Once the snapshot id is
1321  *    locked, it knows things are ok and that this snapshot is really for
1322  *    this file system.
1323  *
1324  *    If the request is a write, fssnap_translate() is called to determine
1325  *    whether a copy-on-write is required.  If it is a read, the read is
1326  *    simply passed on to the underlying device.
1327  */
1328 static void
1329 fssnap_strategy_impl(void *snapshot_id, buf_t *bp)
1330 {
1331 	struct snapshot_id **sidpp;
1332 	struct snapshot_id *sidp;
1333 	int error;
1334 
1335 	/* read requests are always passed through */
1336 	if (bp->b_flags & B_READ) {
1337 		(void) bdev_strategy(bp);
1338 		return;
1339 	}
1340 
1341 	/*
1342 	 * Because we were not able to take the snapshot read lock BEFORE
1343 	 * checking for a snapshot back in the file system, things may have
1344 	 * drastically changed out from under us.  For instance, the snapshot
1345 	 * may have been deleted, deleted and recreated, or worse yet, deleted
1346 	 * for this file system but now the snapshot number is in use by another
1347 	 * file system.
1348 	 *
1349 	 * Having a pointer to the file system's snapshot id pointer allows us
1350 	 * to sanity check most of this, though it assumes the file system is
1351 	 * keeping track of a pointer to the snapshot_id somewhere.
1352 	 */
1353 	sidpp = (struct snapshot_id **)snapshot_id;
1354 	sidp = *sidpp;
1355 
1356 	/*
1357 	 * if this file system's snapshot was disabled, just pass the
1358 	 * request through.
1359 	 */
1360 	if (sidp == NULL) {
1361 		(void) bdev_strategy(bp);
1362 		return;
1363 	}
1364 
1365 	/*
1366 	 * Once we have the reader lock the snapshot will not magically go
1367 	 * away.  But things may have changed on us before this so double check.
1368 	 */
1369 	rw_enter(&sidp->sid_rwlock, RW_READER);
1370 
1371 	/*
1372 	 * if an error was founds somewhere the DELETE flag will be
1373 	 * set to indicate the snapshot should be deleted and no new
1374 	 * translations should occur.
1375 	 */
1376 	if (sidp->sid_flags & SID_DELETE) {
1377 		rw_exit(&sidp->sid_rwlock);
1378 		(void) fssnap_delete_impl(sidpp);
1379 		(void) bdev_strategy(bp);
1380 		return;
1381 	}
1382 
1383 	/*
1384 	 * If the file system is no longer pointing to the snapshot we were
1385 	 * called with, then it should not attempt to translate this buffer as
1386 	 * it may be going to a snapshot for a different file system.
1387 	 * Even if the file system snapshot pointer is still the same, the
1388 	 * snapshot may have been disabled before we got the reader lock.
1389 	 */
1390 	if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1391 		rw_exit(&sidp->sid_rwlock);
1392 		(void) bdev_strategy(bp);
1393 		return;
1394 	}
1395 
1396 	/*
1397 	 * At this point we're sure the snapshot will not go away while the
1398 	 * reader lock is held, and we are reasonably certain that we are
1399 	 * writing to the correct snapshot.
1400 	 */
1401 	if ((error = fssnap_translate(sidpp, bp)) != 0) {
1402 		/*
1403 		 * fssnap_translate can release the reader lock if it
1404 		 * has to wait for a semaphore.  In this case it is possible
1405 		 * for the snapshot to be deleted in this time frame.  If this
1406 		 * happens just sent the buf thru to the filesystems device.
1407 		 */
1408 		if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1409 			rw_exit(&sidp->sid_rwlock);
1410 			(void) bdev_strategy(bp);
1411 			return;
1412 		}
1413 		bioerror(bp, error);
1414 		biodone(bp);
1415 	}
1416 	rw_exit(&sidp->sid_rwlock);
1417 }
1418 
1419 /*
1420  * fssnap_translate() - helper function for fssnap_strategy()
1421  *
1422  *    performs the actual copy-on-write for write requests, if required.
1423  *    This function does the real work of the file system side of things.
1424  *
1425  *    It first checks the candidate bitmap to quickly determine whether any
1426  *    action is necessary.  If the candidate bitmap indicates the chunk was
1427  *    allocated when the snapshot was created, then it checks to see whether
1428  *    a translation already exists.  If a translation already exists then no
1429  *    action is required.  If the chunk is a candidate for copy-on-write,
1430  *    and a translation does not already exist, then the chunk is read in
1431  *    and a node is added to the translation table.
1432  *
1433  *    Once all of the chunks in the request range have been copied (if they
1434  *    needed to be), then the original request can be satisfied and the old
1435  *    data can be overwritten.
1436  */
1437 static int
1438 fssnap_translate(struct snapshot_id **sidpp, struct buf *wbp)
1439 {
1440 	snapshot_id_t	*sidp = *sidpp;
1441 	struct buf	*oldbp;	/* buffer to store old data in */
1442 	struct cow_info	*cowp = sidp->sid_cowinfo;
1443 	cow_map_t	*cmap = &cowp->cow_map;
1444 	cow_map_node_t	*cmn;
1445 	chunknumber_t	cowchunk, startchunk, endchunk;
1446 	int		error;
1447 	int	throttle_write = 0;
1448 
1449 	/* make sure the snapshot is active */
1450 	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
1451 
1452 	startchunk = dbtocowchunk(cmap, wbp->b_lblkno);
1453 	endchunk   = dbtocowchunk(cmap, wbp->b_lblkno +
1454 	    ((wbp->b_bcount-1) >> DEV_BSHIFT));
1455 
1456 	/*
1457 	 * Do not throttle the writes of the fssnap taskq thread and
1458 	 * the log roll (trans_roll) thread. Furthermore the writes to
1459 	 * the on-disk log are also not subject to throttling.
1460 	 * The fssnap_write_taskq thread's write can block on the throttling
1461 	 * semaphore which leads to self-deadlock as this same thread
1462 	 * releases the throttling semaphore after completing the IO.
1463 	 * If the trans_roll thread's write is throttled then we can deadlock
1464 	 * because the fssnap_taskq_thread which releases the throttling
1465 	 * semaphore can block waiting for log space which can only be
1466 	 * released by the trans_roll thread.
1467 	 */
1468 
1469 	throttle_write = !(taskq_member(cowp->cow_taskq, curthread) ||
1470 	    tsd_get(bypass_snapshot_throttle_key));
1471 
1472 	/*
1473 	 * Iterate through all chunks covered by this write and perform the
1474 	 * copy-aside if necessary.  Once all chunks have been safely
1475 	 * stowed away, the new data may be written in a single sweep.
1476 	 *
1477 	 * For each chunk in the range, the following sequence is performed:
1478 	 *	- Is the chunk a candidate for translation?
1479 	 *		o If not, then no translation is necessary, continue
1480 	 *	- If it is a candidate, then does it already have a translation?
1481 	 *		o If so, then no translation is necessary, continue
1482 	 *	- If it is a candidate, but does not yet have a translation,
1483 	 *	  then read the old data and schedule an asynchronous taskq
1484 	 *	  to write the old data to the backing file.
1485 	 *
1486 	 * Once this has been performed over the entire range of chunks, then
1487 	 * it is safe to overwrite the data that is there.
1488 	 *
1489 	 * Note that no lock is required to check the candidate bitmap because
1490 	 * it never changes once the snapshot is created.  The reader lock is
1491 	 * taken to check the hastrans bitmap since it may change.  If it
1492 	 * turns out a copy is required, then the lock is upgraded to a
1493 	 * writer, and the bitmap is re-checked as it may have changed while
1494 	 * the lock was released.  Finally, the write lock is held while
1495 	 * reading the old data to make sure it is not translated out from
1496 	 * under us.
1497 	 *
1498 	 * This locking mechanism should be sufficient to handle multiple
1499 	 * threads writing to overlapping chunks simultaneously.
1500 	 */
1501 	for (cowchunk = startchunk; cowchunk <= endchunk; cowchunk++) {
1502 		/*
1503 		 * If the cowchunk is outside of the range of our
1504 		 * candidate maps, then simply break out of the
1505 		 * loop and pass the I/O through to bdev_strategy.
1506 		 * This would occur if the file system has grown
1507 		 * larger since the snapshot was taken.
1508 		 */
1509 		if (cowchunk >= (cmap->cmap_bmsize * NBBY))
1510 			break;
1511 
1512 		/*
1513 		 * If no disk blocks were allocated in this chunk when the
1514 		 * snapshot was created then no copy-on-write will be
1515 		 * required.  Since this bitmap is read-only no locks are
1516 		 * necessary.
1517 		 */
1518 		if (isclr(cmap->cmap_candidate, cowchunk)) {
1519 			continue;
1520 		}
1521 
1522 		/*
1523 		 * If a translation already exists, the data can be written
1524 		 * through since the old data has already been saved off.
1525 		 */
1526 		if (isset(cmap->cmap_hastrans, cowchunk)) {
1527 			continue;
1528 		}
1529 
1530 
1531 		/*
1532 		 * Throttle translations if there are too many outstanding
1533 		 * chunks in memory.  The semaphore is sema_v'd by the taskq.
1534 		 *
1535 		 * You can't keep the sid_rwlock if you would go to sleep.
1536 		 * This will result in deadlock when someone tries to delete
1537 		 * the snapshot (wants the sid_rwlock as a writer, but can't
1538 		 * get it).
1539 		 */
1540 		if (throttle_write) {
1541 			if (sema_tryp(&cmap->cmap_throttle_sem) == 0) {
1542 				rw_exit(&sidp->sid_rwlock);
1543 				atomic_add_32(&cmap->cmap_waiters, 1);
1544 				sema_p(&cmap->cmap_throttle_sem);
1545 				atomic_add_32(&cmap->cmap_waiters, -1);
1546 				rw_enter(&sidp->sid_rwlock, RW_READER);
1547 
1548 			/*
1549 			 * Now since we released the sid_rwlock the state may
1550 			 * have transitioned underneath us. so check that again.
1551 			 */
1552 				if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1553 					sema_v(&cmap->cmap_throttle_sem);
1554 					return (ENXIO);
1555 				}
1556 			}
1557 		}
1558 
1559 		/*
1560 		 * Acquire the lock as a writer and check to see if a
1561 		 * translation has been added in the meantime.
1562 		 */
1563 		rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1564 		if (isset(cmap->cmap_hastrans, cowchunk)) {
1565 			if (throttle_write)
1566 				sema_v(&cmap->cmap_throttle_sem);
1567 			rw_exit(&cmap->cmap_rwlock);
1568 			continue; /* go to the next chunk */
1569 		}
1570 
1571 		/*
1572 		 * read a full chunk of data from the requested offset rounded
1573 		 * down to the nearest chunk size.
1574 		 */
1575 		oldbp = getrbuf(KM_SLEEP);
1576 		oldbp->b_lblkno = cowchunktodb(cmap, cowchunk);
1577 		oldbp->b_edev = wbp->b_edev;
1578 		oldbp->b_bcount = cmap->cmap_chunksz;
1579 		oldbp->b_bufsize = cmap->cmap_chunksz;
1580 		oldbp->b_iodone = NULL;
1581 		oldbp->b_proc = NULL;
1582 		oldbp->b_flags = B_READ;
1583 		oldbp->b_un.b_addr = kmem_alloc(cmap->cmap_chunksz, KM_SLEEP);
1584 
1585 		(void) bdev_strategy(oldbp);
1586 		(void) biowait(oldbp);
1587 
1588 		/*
1589 		 * It's ok to bail in the middle of translating the range
1590 		 * because the extra copy-asides will not hurt anything
1591 		 * (except by using extra space in the backing store).
1592 		 */
1593 		if ((error = geterror(oldbp)) != 0) {
1594 			cmn_err(CE_WARN, "fssnap_translate: error reading "
1595 			    "old data for snapshot %d, chunk %llu, disk block "
1596 			    "%lld, size %lu, error %d.", sidp->sid_snapnumber,
1597 			    cowchunk, oldbp->b_lblkno, oldbp->b_bcount, error);
1598 			kmem_free(oldbp->b_un.b_addr, cmap->cmap_chunksz);
1599 			freerbuf(oldbp);
1600 			rw_exit(&cmap->cmap_rwlock);
1601 			if (throttle_write)
1602 				sema_v(&cmap->cmap_throttle_sem);
1603 			return (error);
1604 		}
1605 
1606 		/*
1607 		 * add the node to the translation table and save a reference
1608 		 * to pass to the taskq for writing out to the backing file
1609 		 */
1610 		cmn = transtbl_add(cmap, cowchunk, oldbp->b_un.b_addr);
1611 		freerbuf(oldbp);
1612 
1613 		/*
1614 		 * Add a reference to the snapshot id so the lower level
1615 		 * processing (ie. the taskq) can get back to the state
1616 		 * information.
1617 		 */
1618 		cmn->cmn_sid = sidp;
1619 		cmn->release_sem = throttle_write;
1620 		setbit(cmap->cmap_hastrans, cowchunk);
1621 
1622 		rw_exit(&cmap->cmap_rwlock);
1623 
1624 		/*
1625 		 * schedule the asynchronous write to the backing file
1626 		 */
1627 		if (cowp->cow_backfile_array != NULL)
1628 			(void) taskq_dispatch(cowp->cow_taskq,
1629 			    fssnap_write_taskq, cmn, TQ_SLEEP);
1630 	}
1631 
1632 	/*
1633 	 * Write new data in place of the old data.  At this point all of the
1634 	 * chunks touched by this write have been copied aside and so the new
1635 	 * data can be written out all at once.
1636 	 */
1637 	(void) bdev_strategy(wbp);
1638 
1639 	return (0);
1640 }
1641 
1642 /*
1643  * fssnap_write_taskq() - write in-memory translations to the backing file
1644  *
1645  *    writes in-memory translations to the backing file asynchronously.  A
1646  *    task is dispatched each time a new translation is created.  The task
1647  *    writes the data to the backing file and removes it from the memory
1648  *    list. The throttling semaphore is released only if the particular
1649  *    translation was throttled in fssnap_translate.
1650  */
1651 static void
1652 fssnap_write_taskq(void *arg)
1653 {
1654 	cow_map_node_t	*cmn = (cow_map_node_t *)arg;
1655 	snapshot_id_t	*sidp = cmn->cmn_sid;
1656 	cow_info_t	*cowp = sidp->sid_cowinfo;
1657 	cow_map_t	*cmap = &cowp->cow_map;
1658 	int		error;
1659 	int		bf_index;
1660 	int		release_sem = cmn->release_sem;
1661 
1662 	/*
1663 	 * The sid_rwlock does not need to be held here because the taskqs
1664 	 * are destroyed explicitly by fssnap_delete (with the sid_rwlock
1665 	 * held as a writer).  taskq_destroy() will flush all of the tasks
1666 	 * out before fssnap_delete frees up all of the structures.
1667 	 */
1668 
1669 	/* if the snapshot was disabled from under us, drop the request. */
1670 	rw_enter(&sidp->sid_rwlock, RW_READER);
1671 	if (SID_INACTIVE(sidp)) {
1672 		rw_exit(&sidp->sid_rwlock);
1673 		if (release_sem)
1674 			sema_v(&cmap->cmap_throttle_sem);
1675 		return;
1676 	}
1677 	rw_exit(&sidp->sid_rwlock);
1678 
1679 	atomic_add_64((uint64_t *)&cmap->cmap_nchunks, 1);
1680 
1681 	if ((cmap->cmap_maxsize != 0) &&
1682 	    ((cmap->cmap_nchunks * cmap->cmap_chunksz) > cmap->cmap_maxsize)) {
1683 		cmn_err(CE_WARN, "fssnap_write_taskq: snapshot %d (%s) has "
1684 		    "reached the maximum backing file size specified (%llu "
1685 		    "bytes) and will be deleted.", sidp->sid_snapnumber,
1686 		    (char *)cowp->cow_kstat_mntpt->ks_data,
1687 		    cmap->cmap_maxsize);
1688 		if (release_sem)
1689 			sema_v(&cmap->cmap_throttle_sem);
1690 		atomic_or_uint(&sidp->sid_flags, SID_DELETE);
1691 		return;
1692 	}
1693 
1694 	/* perform the write */
1695 	bf_index = cmn->cmn_chunk / cmap->cmap_chunksperbf;
1696 
1697 	if (error = vn_rdwr(UIO_WRITE, (cowp->cow_backfile_array)[bf_index],
1698 	    cmn->cmn_buf, cmap->cmap_chunksz,
1699 	    (cmn->cmn_chunk % cmap->cmap_chunksperbf) * cmap->cmap_chunksz,
1700 	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, (ssize_t *)NULL)) {
1701 		cmn_err(CE_WARN, "fssnap_write_taskq: error writing to "
1702 		    "backing file.  DELETING SNAPSHOT %d, backing file path "
1703 		    "%s, offset %llu bytes, error %d.", sidp->sid_snapnumber,
1704 		    (char *)cowp->cow_kstat_bfname->ks_data,
1705 		    cmn->cmn_chunk * cmap->cmap_chunksz, error);
1706 		if (release_sem)
1707 			sema_v(&cmap->cmap_throttle_sem);
1708 		atomic_or_uint(&sidp->sid_flags, SID_DELETE);
1709 		return;
1710 	}
1711 
1712 	/*
1713 	 * now remove the node and buffer from memory
1714 	 */
1715 	rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1716 	transtbl_delete(cmap, cmn);
1717 	rw_exit(&cmap->cmap_rwlock);
1718 
1719 	/* Allow more translations */
1720 	if (release_sem)
1721 		sema_v(&cmap->cmap_throttle_sem);
1722 
1723 }
1724 
1725 /*
1726  * fssnap_create_impl() - called from the file system to create a new snapshot
1727  *
1728  *    allocates and initializes the structures needed for a new snapshot.
1729  *    This is called by the file system when it receives an ioctl request to
1730  *    create a new snapshot.  An unused snapshot identifier is either found
1731  *    or created, and eventually returned as the opaque handle the file
1732  *    system will use to identify this snapshot.  The snapshot number
1733  *    associated with the snapshot identifier is the same as the minor
1734  *    number for the snapshot device that is used to access that snapshot.
1735  *
1736  *    The snapshot can not be used until the candidate bitmap is populated
1737  *    by the file system (see fssnap_set_candidate_impl()), and the file
1738  *    system finishes the setup process by calling fssnap_create_done().
1739  *    Nearly all of the snapshot locks are held for the duration of the
 *    create, and are not released until fssnap_create_done() is called.
1741  */
1742 static void *
1743 fssnap_create_impl(chunknumber_t nchunks, uint_t chunksz, u_offset_t maxsize,
1744     struct vnode *fsvp, int backfilecount, struct vnode **bfvpp, char *backpath,
1745     u_offset_t max_backfile_size)
1746 {
1747 	refstr_t *mountpoint;
1748 	char taskqname[50];
1749 	struct cow_info *cowp;
1750 	struct cow_map	*cmap;
1751 	struct snapshot_id *sidp;
1752 	int lastsnap;
1753 
1754 	/*
1755 	 * Sanity check the parameters we care about
1756 	 * (we don't care about the informational parameters)
1757 	 */
1758 	if ((nchunks == 0) ||
1759 	    ((chunksz % DEV_BSIZE) != 0) ||
1760 	    (bfvpp == NULL)) {
1761 		return (NULL);
1762 	}
1763 
1764 	/*
1765 	 * Look for unused snapshot identifiers.  Snapshot ids are never
1766 	 * freed, but deleted snapshot ids will be recycled as needed.
1767 	 */
1768 	mutex_enter(&snapshot_mutex);
1769 
1770 findagain:
1771 	lastsnap = 0;
1772 	for (sidp = snapshot; sidp != NULL; sidp = sidp->sid_next) {
1773 		if (sidp->sid_snapnumber > lastsnap)
1774 			lastsnap = sidp->sid_snapnumber;
1775 
1776 		/*
1777 		 * The sid_rwlock is taken as a reader initially so that
1778 		 * activity on each snapshot is not stalled while searching
1779 		 * for a free snapshot id.
1780 		 */
1781 		rw_enter(&sidp->sid_rwlock, RW_READER);
1782 
1783 		/*
1784 		 * If the snapshot has been deleted and nobody is using the
1785 		 * snapshot device than we can reuse this snapshot_id.  If
1786 		 * the snapshot is marked to be deleted (SID_DELETE), then
1787 		 * it hasn't been deleted yet so don't reuse it.
1788 		 */
1789 		if (SID_AVAILABLE(sidp))
1790 			break; /* This spot is unused, so take it */
1791 		rw_exit(&sidp->sid_rwlock);
1792 	}
1793 
1794 	/*
1795 	 * add a new snapshot identifier if there are no deleted
1796 	 * entries.  Since it doesn't matter what order the entries
1797 	 * are in we can just add it to the beginning of the list.
1798 	 */
1799 	if (sidp) {
1800 		if (rw_tryupgrade(&sidp->sid_rwlock) == 0) {
1801 			/* someone else grabbed it as a writer, try again */
1802 			rw_exit(&sidp->sid_rwlock);
1803 			goto findagain;
1804 		}
1805 	} else {
1806 		/* Create a new node if we didn't find an unused one */
1807 		sidp = kmem_alloc(sizeof (struct snapshot_id), KM_SLEEP);
1808 		rw_init(&sidp->sid_rwlock, NULL, RW_DEFAULT, NULL);
1809 		rw_enter(&sidp->sid_rwlock, RW_WRITER);
1810 		sidp->sid_snapnumber = (snapshot == NULL) ? 0 : lastsnap + 1;
1811 		sidp->sid_cowinfo = NULL;
1812 		sidp->sid_flags = 0;
1813 		sidp->sid_next = snapshot;
1814 		snapshot = sidp;
1815 	}
1816 
1817 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
1818 	ASSERT(sidp->sid_cowinfo == NULL);
1819 	ASSERT(sidp->sid_snapnumber <= (lastsnap + 1));
1820 
1821 	sidp->sid_flags |= SID_CREATING;
1822 	/* The root vnode is held until snap_delete_impl() is called */
1823 	VN_HOLD(fsvp);
1824 	sidp->sid_fvp = fsvp;
1825 	num_snapshots++;
1826 
1827 	/* allocate and initialize structures */
1828 
1829 	cowp = kmem_zalloc(sizeof (struct cow_info), KM_SLEEP);
1830 
1831 	cowp->cow_backfile_array = bfvpp;
1832 	cowp->cow_backcount = backfilecount;
1833 	cowp->cow_backfile_sz = max_backfile_size;
1834 
1835 	/*
1836 	 * Initialize task queues for this snapshot.  Only a small number
1837 	 * of threads are required because they will be serialized on the
1838 	 * backing file's reader/writer lock anyway.
1839 	 */
1840 	(void) snprintf(taskqname, sizeof (taskqname), "%s_taskq_%d", snapname,
1841 	    sidp->sid_snapnumber);
1842 	cowp->cow_taskq = taskq_create(taskqname, fssnap_taskq_nthreads,
1843 	    minclsyspri, 1,  fssnap_taskq_maxtasks, 0);
1844 
1845 	/* don't allow tasks to start until after everything is ready */
1846 	taskq_suspend(cowp->cow_taskq);
1847 
1848 	/* initialize translation table */
1849 	cmap = &cowp->cow_map;
1850 	rw_init(&cmap->cmap_rwlock, NULL, RW_DEFAULT, NULL);
1851 	rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1852 
1853 	sema_init(&cmap->cmap_throttle_sem, fssnap_max_mem_chunks, NULL,
1854 	    SEMA_DEFAULT, NULL);
1855 
1856 	cmap->cmap_chunksz = chunksz;
1857 	cmap->cmap_maxsize = maxsize;
1858 	cmap->cmap_chunksperbf = max_backfile_size / chunksz;
1859 
1860 	/*
1861 	 * allocate one bit per chunk for the bitmaps, round up
1862 	 */
1863 	cmap->cmap_bmsize = (nchunks + (NBBY - 1)) / NBBY;
1864 	cmap->cmap_hastrans  = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
1865 	cmap->cmap_candidate = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
1866 
1867 	sidp->sid_cowinfo = cowp;
1868 
1869 	/* initialize kstats for this snapshot */
1870 	mountpoint = vfs_getmntpoint(fsvp->v_vfsp);
1871 	fssnap_create_kstats(sidp, sidp->sid_snapnumber,
1872 	    refstr_value(mountpoint), backpath);
1873 	refstr_rele(mountpoint);
1874 
1875 	mutex_exit(&snapshot_mutex);
1876 
1877 	/*
1878 	 * return with snapshot id rwlock held as a writer until
1879 	 * fssnap_create_done is called
1880 	 */
1881 	return (sidp);
1882 }
1883 
1884 /*
1885  * fssnap_set_candidate_impl() - mark a chunk as a candidate for copy-on-write
1886  *
1887  *    sets a bit in the candidate bitmap that indicates that a chunk is a
1888  *    candidate for copy-on-write.  Typically, chunks that are allocated on
1889  *    the file system at the time the snapshot is taken are candidates,
1890  *    while chunks that have no allocated data do not need to be copied.
1891  *    Chunks containing metadata must be marked as candidates as well.
1892  */
1893 static void
1894 fssnap_set_candidate_impl(void *snapshot_id, chunknumber_t chunknumber)
1895 {
1896 	struct snapshot_id	*sid = snapshot_id;
1897 	struct cow_info *cowp = sid->sid_cowinfo;
1898 	struct cow_map	*cmap = &cowp->cow_map;
1899 
1900 	/* simple bitmap operation for now */
1901 	ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
1902 	setbit(cmap->cmap_candidate, chunknumber);
1903 }
1904 
1905 /*
1906  * fssnap_is_candidate_impl() - check whether a chunk is a candidate
1907  *
1908  *    returns 0 if the chunk is not a candidate and 1 if the chunk is a
1909  *    candidate.  This can be used by the file system to change behavior for
1910  *    chunks that might induce a copy-on-write.  The offset is specified in
1911  *    bytes since the chunk size may not be known by the file system.
1912  */
1913 static int
1914 fssnap_is_candidate_impl(void *snapshot_id, u_offset_t off)
1915 {
1916 	struct snapshot_id	*sid = snapshot_id;
1917 	struct cow_info *cowp = sid->sid_cowinfo;
1918 	struct cow_map	*cmap = &cowp->cow_map;
1919 	ulong_t chunknumber = off / cmap->cmap_chunksz;
1920 
1921 	/* simple bitmap operation for now */
1922 	ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
1923 	return (isset(cmap->cmap_candidate, chunknumber));
1924 }
1925 
1926 /*
1927  * fssnap_create_done_impl() - complete the snapshot setup process
1928  *
1929  *    called when the file system is done populating the candidate bitmap
1930  *    and it is ready to start using the snapshot.  This routine releases
1931  *    the snapshot locks, allows taskq tasks to start processing, and
1932  *    creates the device minor nodes associated with the snapshot.
1933  */
1934 static int
1935 fssnap_create_done_impl(void *snapshot_id)
1936 {
1937 	struct snapshot_id	**sidpp, *sidp = snapshot_id;
1938 	struct cow_info		*cowp;
1939 	struct cow_map		*cmap;
1940 	int			snapnumber = -1;
1941 	char			name[20];
1942 
1943 	/* sid rwlock and cmap rwlock should be taken from fssnap_create */
1944 	ASSERT(sidp);
1945 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
1946 	ASSERT(sidp->sid_cowinfo);
1947 
1948 	cowp = sidp->sid_cowinfo;
1949 	cmap = &cowp->cow_map;
1950 
1951 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1952 
1953 	sidp->sid_flags &= ~(SID_CREATING | SID_DISABLED);
1954 	snapnumber = sidp->sid_snapnumber;
1955 
1956 	/* allocate state structure and find new snapshot id */
1957 	if (ddi_soft_state_zalloc(statep, snapnumber) != DDI_SUCCESS) {
1958 		cmn_err(CE_WARN,
1959 		    "snap_ioctl: create: could not allocate "
1960 		    "state for snapshot %d.", snapnumber);
1961 		snapnumber = -1;
1962 		goto out;
1963 	}
1964 
1965 	sidpp = ddi_get_soft_state(statep, snapnumber);
1966 	*sidpp = sidp;
1967 
1968 	/* create minor node based on snapshot number */
1969 	ASSERT(fssnap_dip != NULL);
1970 	(void) snprintf(name, sizeof (name), "%d", snapnumber);
1971 	if (ddi_create_minor_node(fssnap_dip, name, S_IFBLK,
1972 	    snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
1973 		cmn_err(CE_WARN, "snap_ioctl: could not create "
1974 		    "block minor node for snapshot %d.", snapnumber);
1975 		snapnumber = -1;
1976 		goto out;
1977 	}
1978 
1979 	(void) snprintf(name, sizeof (name), "%d,raw", snapnumber);
1980 	if (ddi_create_minor_node(fssnap_dip, name, S_IFCHR,
1981 	    snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
1982 		cmn_err(CE_WARN, "snap_ioctl: could not create "
1983 		    "character minor node for snapshot %d.", snapnumber);
1984 		snapnumber = -1;
1985 	}
1986 
1987 out:
1988 	rw_exit(&sidp->sid_rwlock);
1989 	rw_exit(&cmap->cmap_rwlock);
1990 
1991 	/* let the taskq threads start processing */
1992 	taskq_resume(cowp->cow_taskq);
1993 
1994 	return (snapnumber);
1995 }
1996 
1997 /*
1998  * fssnap_delete_impl() - delete a snapshot
1999  *
2000  *    used when a snapshot is no longer needed.  This is called by the file
2001  *    system when it receives an ioctl request to delete a snapshot.  It is
2002  *    also called internally when error conditions such as disk full, errors
2003  *    writing to the backing file, or backing file maxsize exceeded occur.
2004  *    If the snapshot device is busy when the delete request is received,
2005  *    all state will be deleted except for the soft state and device files
2006  *    associated with the snapshot; they will be deleted when the snapshot
2007  *    device is closed.
2008  *
2009  *    NOTE this function takes a POINTER TO A POINTER to the snapshot id,
2010  *    and expects to be able to set the handle held by the file system to
2011  *    NULL.  This depends on the file system checking that variable for NULL
2012  *    before calling fssnap_strategy().
2013  */
2014 static int
2015 fssnap_delete_impl(void *snapshot_id)
2016 {
2017 	struct snapshot_id	**sidpp = (struct snapshot_id **)snapshot_id;
2018 	struct snapshot_id	*sidp;
2019 	struct snapshot_id	**statesidpp;
2020 	struct cow_info		*cowp;
2021 	struct cow_map		*cmap;
2022 	char			name[20];
2023 	int			snapnumber = -1;
2024 	vnode_t			**vpp;
2025 
2026 	/*
2027 	 * sidp is guaranteed to be valid if sidpp is valid because
2028 	 * the snapshot list is append-only.
2029 	 */
2030 	if (sidpp == NULL) {
2031 		return (-1);
2032 	}
2033 
2034 	sidp = *sidpp;
2035 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
2036 
2037 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
2038 
2039 	/*
2040 	 * double check that the snapshot is still valid for THIS file system
2041 	 */
2042 	if (*sidpp == NULL) {
2043 		rw_exit(&sidp->sid_rwlock);
2044 		return (-1);
2045 	}
2046 
2047 	/*
2048 	 * Now we know the snapshot is still valid and will not go away
2049 	 * because we have the write lock.  Once the state is transitioned
2050 	 * to "disabling", the sid_rwlock can be released.  Any pending I/O
2051 	 * waiting for the lock as a reader will check for this state and
2052 	 * abort without touching data that may be getting freed.
2053 	 */
2054 	sidp->sid_flags |= SID_DISABLING;
2055 	if (sidp->sid_flags & SID_DELETE) {
2056 		cmn_err(CE_WARN, "Snapshot %d automatically deleted.",
2057 		    sidp->sid_snapnumber);
2058 		sidp->sid_flags &= ~(SID_DELETE);
2059 	}
2060 
2061 
2062 	/*
2063 	 * This is pointing into file system specific data!  The assumption is
2064 	 * that fssnap_strategy() gets called from the file system based on
2065 	 * whether this reference to the snapshot_id is NULL or not.  So
2066 	 * setting this to NULL should disable snapshots for the file system.
2067 	 */
2068 	*sidpp = NULL;
2069 
2070 	/* remove cowinfo */
2071 	cowp = sidp->sid_cowinfo;
2072 	if (cowp == NULL) {
2073 		rw_exit(&sidp->sid_rwlock);
2074 		return (-1);
2075 	}
2076 	rw_exit(&sidp->sid_rwlock);
2077 
2078 	/* destroy task queues first so they don't reference freed data. */
2079 	if (cowp->cow_taskq) {
2080 		taskq_destroy(cowp->cow_taskq);
2081 		cowp->cow_taskq = NULL;
2082 	}
2083 
2084 	if (cowp->cow_backfile_array != NULL) {
2085 		for (vpp = cowp->cow_backfile_array; *vpp; vpp++)
2086 			VN_RELE(*vpp);
2087 		kmem_free(cowp->cow_backfile_array,
2088 		    (cowp->cow_backcount + 1) * sizeof (vnode_t *));
2089 		cowp->cow_backfile_array = NULL;
2090 	}
2091 
2092 	sidp->sid_cowinfo = NULL;
2093 
2094 	/* remove cmap */
2095 	cmap = &cowp->cow_map;
2096 	ASSERT(cmap);
2097 
2098 	if (cmap->cmap_candidate)
2099 		kmem_free(cmap->cmap_candidate, cmap->cmap_bmsize);
2100 
2101 	if (cmap->cmap_hastrans)
2102 		kmem_free(cmap->cmap_hastrans, cmap->cmap_bmsize);
2103 
2104 	if (cmap->cmap_table)
2105 		transtbl_free(&cowp->cow_map);
2106 
2107 	rw_destroy(&cmap->cmap_rwlock);
2108 
2109 	while (cmap->cmap_waiters) {
2110 		sema_p(&cmap->cmap_throttle_sem);
2111 		sema_v(&cmap->cmap_throttle_sem);
2112 	}
2113 	sema_destroy(&cmap->cmap_throttle_sem);
2114 
2115 	/* remove kstats */
2116 	fssnap_delete_kstats(cowp);
2117 
2118 	kmem_free(cowp, sizeof (struct cow_info));
2119 
2120 	statesidpp = ddi_get_soft_state(statep, sidp->sid_snapnumber);
2121 	if (statesidpp == NULL || *statesidpp == NULL) {
2122 		cmn_err(CE_WARN,
2123 		    "fssnap_delete_impl: could not find state for snapshot %d.",
2124 		    sidp->sid_snapnumber);
2125 	}
2126 	ASSERT(*statesidpp == sidp);
2127 
2128 	/*
2129 	 * Leave the node in the list marked DISABLED so it can be reused
2130 	 * and avoid many race conditions.  Return the snapshot number
2131 	 * that was deleted.
2132 	 */
2133 	mutex_enter(&snapshot_mutex);
2134 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
2135 	sidp->sid_flags &= ~(SID_DISABLING);
2136 	sidp->sid_flags |= SID_DISABLED;
2137 	VN_RELE(sidp->sid_fvp);
2138 	sidp->sid_fvp = NULL;
2139 	snapnumber = sidp->sid_snapnumber;
2140 
2141 	/*
2142 	 * If the snapshot is not busy, free the device info now.  Otherwise
2143 	 * the device nodes are freed in snap_close() when the device is
2144 	 * closed.  The sid will not be reused until the device is not busy.
2145 	 */
2146 	if (SID_AVAILABLE(sidp)) {
2147 		/* remove the device nodes */
2148 		ASSERT(fssnap_dip != NULL);
2149 		(void) snprintf(name, sizeof (name), "%d",
2150 		    sidp->sid_snapnumber);
2151 		ddi_remove_minor_node(fssnap_dip, name);
2152 		(void) snprintf(name, sizeof (name), "%d,raw",
2153 		    sidp->sid_snapnumber);
2154 		ddi_remove_minor_node(fssnap_dip, name);
2155 
2156 		/* delete the state structure */
2157 		ddi_soft_state_free(statep, sidp->sid_snapnumber);
2158 		num_snapshots--;
2159 	}
2160 
2161 	mutex_exit(&snapshot_mutex);
2162 	rw_exit(&sidp->sid_rwlock);
2163 
2164 	return (snapnumber);
2165 }
2166 
2167 /*
2168  * fssnap_create_kstats() - allocate and initialize snapshot kstats
2169  *
2170  */
2171 static void
2172 fssnap_create_kstats(snapshot_id_t *sidp, int snapnum,
2173     const char *mountpoint, const char *backfilename)
2174 {
2175 	kstat_t *num, *mntpoint, *bfname;
2176 	kstat_named_t *hw;
2177 	struct cow_info *cowp = sidp->sid_cowinfo;
2178 	struct cow_kstat_num *stats;
2179 
2180 	/* update the high water mark */
2181 	if (fssnap_highwater_kstat == NULL) {
2182 		cmn_err(CE_WARN, "fssnap_create_kstats: failed to lookup "
2183 		    "high water mark kstat.");
2184 		return;
2185 	}
2186 
2187 	hw = (kstat_named_t *)fssnap_highwater_kstat->ks_data;
2188 	if (hw->value.ui32 < snapnum)
2189 		hw->value.ui32 = snapnum;
2190 
2191 	/* initialize the mount point kstat */
2192 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_MNTPT);
2193 
2194 	if (mountpoint != NULL) {
2195 		mntpoint = kstat_create(snapname, snapnum, FSSNAP_KSTAT_MNTPT,
2196 		    "misc", KSTAT_TYPE_RAW, strlen(mountpoint) + 1, 0);
2197 		if (mntpoint == NULL) {
2198 			cowp->cow_kstat_mntpt = NULL;
2199 			cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
2200 			    "create mount point kstat");
2201 		} else {
2202 			(void) strncpy(mntpoint->ks_data, mountpoint,
2203 			    strlen(mountpoint));
2204 			cowp->cow_kstat_mntpt = mntpoint;
2205 			kstat_install(mntpoint);
2206 		}
2207 	} else {
2208 		cowp->cow_kstat_mntpt = NULL;
2209 		cmn_err(CE_WARN, "fssnap_create_kstats: mount point not "
2210 		    "specified.");
2211 	}
2212 
2213 	/* initialize the backing file kstat */
2214 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_BFNAME);
2215 
2216 	if (backfilename == NULL) {
2217 		cowp->cow_kstat_bfname = NULL;
2218 	} else {
2219 		bfname = kstat_create(snapname, snapnum, FSSNAP_KSTAT_BFNAME,
2220 		    "misc", KSTAT_TYPE_RAW, strlen(backfilename) + 1, 0);
2221 		if (bfname != NULL) {
2222 			(void) strncpy(bfname->ks_data, backfilename,
2223 			    strlen(backfilename));
2224 			cowp->cow_kstat_bfname = bfname;
2225 			kstat_install(bfname);
2226 		} else {
2227 			cowp->cow_kstat_bfname = NULL;
2228 			cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
2229 			    "create backing file name kstat");
2230 		}
2231 	}
2232 
2233 	/* initialize numeric kstats */
2234 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_NUM);
2235 
2236 	num = kstat_create(snapname, snapnum, FSSNAP_KSTAT_NUM,
2237 	    "misc", KSTAT_TYPE_NAMED,
2238 	    sizeof (struct cow_kstat_num) / sizeof (kstat_named_t),
2239 	    0);
2240 	if (num == NULL) {
2241 		cmn_err(CE_WARN, "fssnap_create_kstats: failed to create "
2242 		    "numeric kstats");
2243 		cowp->cow_kstat_num = NULL;
2244 		return;
2245 	}
2246 
2247 	cowp->cow_kstat_num = num;
2248 	stats = num->ks_data;
2249 	num->ks_update = fssnap_update_kstat_num;
2250 	num->ks_private = sidp;
2251 
2252 	kstat_named_init(&stats->ckn_state, FSSNAP_KSTAT_NUM_STATE,
2253 	    KSTAT_DATA_INT32);
2254 	kstat_named_init(&stats->ckn_bfsize, FSSNAP_KSTAT_NUM_BFSIZE,
2255 	    KSTAT_DATA_UINT64);
2256 	kstat_named_init(&stats->ckn_maxsize, FSSNAP_KSTAT_NUM_MAXSIZE,
2257 	    KSTAT_DATA_UINT64);
2258 	kstat_named_init(&stats->ckn_createtime, FSSNAP_KSTAT_NUM_CREATETIME,
2259 	    KSTAT_DATA_LONG);
2260 	kstat_named_init(&stats->ckn_chunksize, FSSNAP_KSTAT_NUM_CHUNKSIZE,
2261 	    KSTAT_DATA_UINT32);
2262 
2263 	/* initialize the static kstats */
2264 	stats->ckn_chunksize.value.ui32 = cowp->cow_map.cmap_chunksz;
2265 	stats->ckn_maxsize.value.ui64 = cowp->cow_map.cmap_maxsize;
2266 	stats->ckn_createtime.value.l = gethrestime_sec();
2267 
2268 	kstat_install(num);
2269 }
2270 
2271 /*
2272  * fssnap_update_kstat_num() - update a numerical snapshot kstat value
2273  *
2274  */
2275 int
2276 fssnap_update_kstat_num(kstat_t *ksp, int rw)
2277 {
2278 	snapshot_id_t *sidp = (snapshot_id_t *)ksp->ks_private;
2279 	struct cow_info *cowp = sidp->sid_cowinfo;
2280 	struct cow_kstat_num *stats = ksp->ks_data;
2281 
2282 	if (rw == KSTAT_WRITE)
2283 		return (EACCES);
2284 
2285 	/* state */
2286 	if (sidp->sid_flags & SID_CREATING)
2287 		stats->ckn_state.value.i32 = COWSTATE_CREATING;
2288 	else if (SID_INACTIVE(sidp))
2289 		stats->ckn_state.value.i32 = COWSTATE_DISABLED;
2290 	else if (SID_BUSY(sidp))
2291 		stats->ckn_state.value.i32 = COWSTATE_ACTIVE;
2292 	else
2293 		stats->ckn_state.value.i32 = COWSTATE_IDLE;
2294 
2295 	/* bfsize */
2296 	stats->ckn_bfsize.value.ui64 = cowp->cow_map.cmap_nchunks *
2297 	    cowp->cow_map.cmap_chunksz;
2298 
2299 	return (0);
2300 }
2301 
2302 /*
2303  * fssnap_delete_kstats() - deallocate snapshot kstats
2304  *
2305  */
2306 void
2307 fssnap_delete_kstats(struct cow_info *cowp)
2308 {
2309 	if (cowp->cow_kstat_num != NULL) {
2310 		kstat_delete(cowp->cow_kstat_num);
2311 		cowp->cow_kstat_num = NULL;
2312 	}
2313 	if (cowp->cow_kstat_mntpt != NULL) {
2314 		kstat_delete(cowp->cow_kstat_mntpt);
2315 		cowp->cow_kstat_mntpt = NULL;
2316 	}
2317 	if (cowp->cow_kstat_bfname != NULL) {
2318 		kstat_delete(cowp->cow_kstat_bfname);
2319 		cowp->cow_kstat_bfname = NULL;
2320 	}
2321 }
2322