xref: /illumos-gate/usr/src/uts/common/io/fssnap.c (revision ddfcde867cfcf679df9c2825a008d604634c1193)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*ddfcde86Srsb  * Common Development and Distribution License (the "License").
6*ddfcde86Srsb  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
22*ddfcde86Srsb  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
237c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
247c478bd9Sstevel@tonic-gate  */
257c478bd9Sstevel@tonic-gate 
267c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
277c478bd9Sstevel@tonic-gate 
287c478bd9Sstevel@tonic-gate #include <sys/debug.h>
297c478bd9Sstevel@tonic-gate #include <sys/types.h>
307c478bd9Sstevel@tonic-gate #include <sys/file.h>
317c478bd9Sstevel@tonic-gate #include <sys/errno.h>
327c478bd9Sstevel@tonic-gate #include <sys/uio.h>
337c478bd9Sstevel@tonic-gate #include <sys/open.h>
347c478bd9Sstevel@tonic-gate #include <sys/cred.h>
357c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
367c478bd9Sstevel@tonic-gate #include <sys/conf.h>
377c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
387c478bd9Sstevel@tonic-gate #include <sys/modctl.h>
397c478bd9Sstevel@tonic-gate #include <sys/disp.h>
407c478bd9Sstevel@tonic-gate #include <sys/atomic.h>
417c478bd9Sstevel@tonic-gate #include <sys/filio.h>
427c478bd9Sstevel@tonic-gate #include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */
437c478bd9Sstevel@tonic-gate #include <sys/kstat.h>
447c478bd9Sstevel@tonic-gate 
457c478bd9Sstevel@tonic-gate #include <sys/ddi.h>
467c478bd9Sstevel@tonic-gate #include <sys/devops.h>
477c478bd9Sstevel@tonic-gate #include <sys/sunddi.h>
487c478bd9Sstevel@tonic-gate #include <sys/priv_names.h>
497c478bd9Sstevel@tonic-gate 
507c478bd9Sstevel@tonic-gate #include <sys/fssnap.h>
517c478bd9Sstevel@tonic-gate #include <sys/fssnap_if.h>
527c478bd9Sstevel@tonic-gate 
537c478bd9Sstevel@tonic-gate /*
547c478bd9Sstevel@tonic-gate  * This module implements the file system snapshot code, which provides a
557c478bd9Sstevel@tonic-gate  * point-in-time image of a file system for the purposes of online backup.
567c478bd9Sstevel@tonic-gate  * There are essentially two parts to this project: the driver half and the
577c478bd9Sstevel@tonic-gate  * file system half.  The driver half is a pseudo device driver called
587c478bd9Sstevel@tonic-gate  * "fssnap" that represents the snapshot.  Each snapshot is assigned a
597c478bd9Sstevel@tonic-gate  * number that corresponds to the minor number of the device, and a control
607c478bd9Sstevel@tonic-gate  * device with a high minor number is used to initiate snapshot creation and
617c478bd9Sstevel@tonic-gate  * deletion.  For all practical purposes the driver half acts like a
627c478bd9Sstevel@tonic-gate  * read-only disk device whose contents are exactly the same as the master
637c478bd9Sstevel@tonic-gate  * file system at the time the snapshot was created.
647c478bd9Sstevel@tonic-gate  *
657c478bd9Sstevel@tonic-gate  * The file system half provides interfaces necessary for performing the
667c478bd9Sstevel@tonic-gate  * file system dependent operations required to create and delete snapshots
677c478bd9Sstevel@tonic-gate  * and a special driver strategy routine that must always be used by the file
687c478bd9Sstevel@tonic-gate  * system for snapshots to work correctly.
697c478bd9Sstevel@tonic-gate  *
707c478bd9Sstevel@tonic-gate  * When a snapshot is to be created, the user utility will send an ioctl to
717c478bd9Sstevel@tonic-gate  * the control device of the driver half specifying the file system to be
727c478bd9Sstevel@tonic-gate  * snapshotted, the file descriptor of a backing-store file which is used to
737c478bd9Sstevel@tonic-gate  * hold old data before it is overwritten, and other snapshot parameters.
747c478bd9Sstevel@tonic-gate  * This ioctl is passed on to the file system specified in the original
757c478bd9Sstevel@tonic-gate  * ioctl request.  The file system is expected to be able to flush
767c478bd9Sstevel@tonic-gate  * everything out to make the file system consistent and lock it to ensure
777c478bd9Sstevel@tonic-gate  * no changes occur while the snapshot is being created.  It then calls
787c478bd9Sstevel@tonic-gate  * fssnap_create() to create state for a new snapshot, from which an opaque
797c478bd9Sstevel@tonic-gate  * handle is returned with the snapshot locked.  Next, the file system must
807c478bd9Sstevel@tonic-gate  * populate the "candidate bitmap", which tells the snapshot code which
817c478bd9Sstevel@tonic-gate  * "chunks" should be considered for copy-on-write (a chunk is the unit of
827c478bd9Sstevel@tonic-gate  * granularity used for copy-on-write, which is independent of the device
837c478bd9Sstevel@tonic-gate  * and file system block sizes).  This is typically done by scanning the
847c478bd9Sstevel@tonic-gate  * file system allocation bitmaps to determine which chunks contain
857c478bd9Sstevel@tonic-gate  * allocated blocks in the file system at the time the snapshot was created.
867c478bd9Sstevel@tonic-gate  * If a chunk has no allocated blocks, it does not need to be copied before
877c478bd9Sstevel@tonic-gate  * being written to.  Once the candidate bitmap is populated with
887c478bd9Sstevel@tonic-gate  * fssnap_set_candidate(), the file system calls fssnap_create_done() to
897c478bd9Sstevel@tonic-gate  * complete the snapshot creation and unlock the snapshot.  The file system
907c478bd9Sstevel@tonic-gate  * may now be unlocked and modifications to it resumed.
917c478bd9Sstevel@tonic-gate  *
927c478bd9Sstevel@tonic-gate  * Once a snapshot is created, the file system must perform all writes
937c478bd9Sstevel@tonic-gate  * through a special strategy routine, fssnap_strategy().  This strategy
947c478bd9Sstevel@tonic-gate  * routine determines whether the chunks contained by the write must be
957c478bd9Sstevel@tonic-gate  * copied before being overwritten by consulting the candidate bitmap
967c478bd9Sstevel@tonic-gate  * described above, and the "hastrans bitmap" which tells it whether the chunk
977c478bd9Sstevel@tonic-gate  * has been copied already or not.  If the chunk is a candidate but has not
987c478bd9Sstevel@tonic-gate  * been copied, it reads the old data in and adds it to a queue.  The
997c478bd9Sstevel@tonic-gate  * old data can then be overwritten with the new data.  An asynchronous
1007c478bd9Sstevel@tonic-gate  * task queue is dispatched for each old chunk read in which writes the old
1017c478bd9Sstevel@tonic-gate  * data to the backing file specified at snapshot creation time.  The
1027c478bd9Sstevel@tonic-gate  * backing file is a sparse file the same size as the file system that
1037c478bd9Sstevel@tonic-gate  * contains the old data at the offset that data originally had in the
1047c478bd9Sstevel@tonic-gate  * file system.  If the queue containing in-memory chunks gets too large,
1057c478bd9Sstevel@tonic-gate  * writes to the file system may be throttled by a semaphore until the
1067c478bd9Sstevel@tonic-gate  * task queues have a chance to push some of the chunks to the backing file.
1077c478bd9Sstevel@tonic-gate  *
1087c478bd9Sstevel@tonic-gate  * With the candidate bitmap, the hastrans bitmap, the data on the master
1097c478bd9Sstevel@tonic-gate  * file system, and the old data in memory and in the backing file, the
1107c478bd9Sstevel@tonic-gate  * snapshot pseudo-driver can piece together the original file system
1117c478bd9Sstevel@tonic-gate  * information to satisfy read requests.  If the requested chunk is not a
1127c478bd9Sstevel@tonic-gate  * candidate, it returns a zeroed buffer.  If the chunk is a candidate but
1137c478bd9Sstevel@tonic-gate  * has not been copied it reads it from the master file system.  If it is a
1147c478bd9Sstevel@tonic-gate  * candidate and has been copied, it either copies the data from the
1157c478bd9Sstevel@tonic-gate  * in-memory queue or it reads it in from the backing file.  The result is
1167c478bd9Sstevel@tonic-gate  * a replication of the original file system that can be backed up, mounted,
1177c478bd9Sstevel@tonic-gate  * or manipulated by other file system utilities that work on a read-only
1187c478bd9Sstevel@tonic-gate  * device.
1197c478bd9Sstevel@tonic-gate  *
1207c478bd9Sstevel@tonic-gate  * This module is divided into three roughly logical sections:
1217c478bd9Sstevel@tonic-gate  *
1227c478bd9Sstevel@tonic-gate  *     - The snapshot driver, which is a character/block driver
1237c478bd9Sstevel@tonic-gate  *       representing the snapshot itself.  These routines are
1247c478bd9Sstevel@tonic-gate  *       prefixed with "snap_".
1257c478bd9Sstevel@tonic-gate  *
1267c478bd9Sstevel@tonic-gate  *     - The library routines that are defined in fssnap_if.h that
1277c478bd9Sstevel@tonic-gate  *       are used by file systems that use this snapshot implementation.
1287c478bd9Sstevel@tonic-gate  *       These functions are prefixed with "fssnap_" and are called through
1297c478bd9Sstevel@tonic-gate  *       a function vector from the file system.
1307c478bd9Sstevel@tonic-gate  *
1317c478bd9Sstevel@tonic-gate  *     - The helper routines used by the snapshot driver and the fssnap
1327c478bd9Sstevel@tonic-gate  *       library routines for managing the translation table and other
1337c478bd9Sstevel@tonic-gate  *       useful functions.  These routines are all static and are
1347c478bd9Sstevel@tonic-gate  *       prefixed with either "fssnap_" or "transtbl_" if they
1357c478bd9Sstevel@tonic-gate  *       are specifically used for translation table activities.
1367c478bd9Sstevel@tonic-gate  */
1377c478bd9Sstevel@tonic-gate 
/*
 * Driver-wide state.
 */

/*
 * devinfo node of the driver's single instance: recorded by snap_attach(),
 * cleared by snap_detach(), and handed back by snap_getinfo().
 */
static dev_info_t		*fssnap_dip = NULL;
/*
 * Head of the list of snapshot ids, chained through sid_next.  The whole
 * list is freed by snap_detach().
 */
static struct snapshot_id	*snapshot = NULL;
/*
 * State for the control device.  It is not linked into the snapshot list;
 * its SID_CHAR_BUSY flag marks the control device as open.
 */
static struct snapshot_id	snap_ctl;
/* number of active snapshots; snap_detach() refuses while it is non-zero */
static int			num_snapshots = 0;
/*
 * Held by snap_detach() across its busy check and list teardown.
 * NOTE(review): presumably also guards the snapshot list and num_snapshots
 * elsewhere in the file -- confirm against the other users.
 */
static kmutex_t			snapshot_mutex;
/* driver name; used as the module name of the highwater kstat in _init() */
static char			snapname[] = SNAP_NAME;

/*
 * "tunable" parameters
 *
 * NOTE(review): none of these is referenced in this part of the file.  Going
 * by the names they size the write taskq (threads, outstanding tasks) and cap
 * the number of chunks held in memory -- confirm at the points of use.
 */
static int		fssnap_taskq_nthreads = FSSNAP_TASKQ_THREADS;
static uint_t		fssnap_max_mem_chunks = FSSNAP_MAX_MEM_CHUNKS;
static int		fssnap_taskq_maxtasks = FSSNAP_TASKQ_MAXTASKS;
1497c478bd9Sstevel@tonic-gate 
/* static function prototypes */

/*
 * snapshot driver
 *
 * snap_getinfo(), snap_attach() and snap_detach() are installed in snap_ops;
 * snap_open() through snap_prop_op() are installed in snap_cb_ops.
 * snap_getchunk() is a helper and is not part of either vector.
 */
static int snap_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int snap_open(dev_t *devp, int flag, int otyp, cred_t *cred);
static int snap_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int snap_strategy(struct buf *bp);
static int snap_read(dev_t dev, struct uio *uiop, cred_t *credp);
static int snap_print(dev_t dev, char *str);
static int snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
    cred_t *credp, int *rvalp);
static int snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
    int flags, char *name, caddr_t valuep, int *lengthp);
static int snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk,
    int offset, int len, char *buffer);


/*
 * fssnap interface implementations (see fssnap_if.h)
 *
 * _init() stores each of these in the corresponding slot of the snapops
 * vector, which is how file systems reach them.
 */
static void fssnap_strategy_impl(void *, struct buf *);
static void *fssnap_create_impl(chunknumber_t, uint_t, u_offset_t,
    struct vnode *, int, struct vnode **, char *, u_offset_t);
static void fssnap_set_candidate_impl(void *, chunknumber_t);
static int fssnap_is_candidate_impl(void *, u_offset_t);
static int fssnap_create_done_impl(void *);
static int fssnap_delete_impl(void *);

/* fssnap interface support routines */
static int  fssnap_translate(struct snapshot_id **, struct buf *);
static void fssnap_write_taskq(void *);
static void fssnap_create_kstats(snapshot_id_t *, int, const char *,
    const char *);
static int  fssnap_update_kstat_num(kstat_t *, int);
static void fssnap_delete_kstats(struct cow_info *);

/* translation table prototypes */
static cow_map_node_t *transtbl_add(cow_map_t *, chunknumber_t, caddr_t);
static cow_map_node_t *transtbl_get(cow_map_t *, chunknumber_t);
static void transtbl_delete(cow_map_t *, cow_map_node_t *);
static void transtbl_free(cow_map_t *);

/*
 * The highwater kstat: created in _init() and deleted in _fini().  It is
 * NULL when kstat creation failed, which _init() treats as non-fatal.
 */
static kstat_t *fssnap_highwater_kstat;
1937c478bd9Sstevel@tonic-gate 
1947c478bd9Sstevel@tonic-gate /* ************************************************************************ */
1957c478bd9Sstevel@tonic-gate 
1967c478bd9Sstevel@tonic-gate /* Device and Module Structures */
1977c478bd9Sstevel@tonic-gate 
/*
 * Character/block entry points of the snapshot device.  The initializers
 * are positional, so their order must track the layout of struct cb_ops.
 * The device is read-only: no write entry point is supplied and snap_open()
 * rejects FWRITE.
 */
static struct cb_ops snap_cb_ops = {
	snap_open,	/* open */
	snap_close,	/* close */
	snap_strategy,	/* strategy (block reads) */
	snap_print,	/* print */
	nodev,		/* no snap_dump */
	snap_read,	/* read (character device) */
	nodev,		/* no snap_write */
	snap_ioctl,	/* ioctl */
	nodev,		/* no snap_devmap */
	nodev,		/* no snap_mmap   */
	nodev,		/* no snap_segmap */
	nochpoll,	/* no poll support */
	snap_prop_op,	/* prop_op */
	NULL,		/* streamtab */
	D_64BIT | D_NEW | D_MP, /* driver compatibility */
	CB_REV,		/* cb_ops revision */
	nodev,		/* async I/O read entry point */
	nodev		/* async I/O write entry point */
};
2187c478bd9Sstevel@tonic-gate 
/*
 * Device operations vector, registered with the module framework through
 * the modldrv structure below.  The initializers are positional and must
 * stay in struct dev_ops order.
 */
static struct dev_ops snap_ops = {
	DEVO_REV,		/* dev_ops revision */
	0,			/* ref count */
	snap_getinfo,		/* getinfo */
	nulldev,		/* snap_identify obsolete */
	nulldev,		/* no snap_probe */
	snap_attach,		/* attach */
	snap_detach,		/* detach */
	nodev,			/* no snap_reset */
	&snap_cb_ops,		/* character/block entry points */
	(struct bus_ops *)NULL,	/* not a nexus driver: no bus operations */
	nulldev			/* no snap_power() */
};
2327c478bd9Sstevel@tonic-gate 
/* module operations for device drivers, supplied by the kernel */
extern struct mod_ops mod_driverops;

/* driver linkage: ties snap_ops to the module framework */
static struct modldrv md = {
	&mod_driverops, /* Type of module. This is a driver */
	"snapshot driver %I%", 	/* Name of the module */
	&snap_ops,
};

/*
 * Module linkage passed to mod_install(), mod_info() and mod_remove() by
 * _init(), _info() and _fini() respectively.
 */
static struct modlinkage ml = {
	MODREV_1,
	&md,
	NULL		/* terminates the linkage list */
};

/*
 * Soft state anchor, set up in _init().  Each item is only a pointer to a
 * struct snapshot_id; snap_open() looks the pointer up by minor number.
 */
static void *statep;
2487c478bd9Sstevel@tonic-gate 
2497c478bd9Sstevel@tonic-gate int
2507c478bd9Sstevel@tonic-gate _init(void)
2517c478bd9Sstevel@tonic-gate {
2527c478bd9Sstevel@tonic-gate 	int	error;
2537c478bd9Sstevel@tonic-gate 	kstat_t	*ksp;
2547c478bd9Sstevel@tonic-gate 	kstat_named_t	*ksdata;
2557c478bd9Sstevel@tonic-gate 
2567c478bd9Sstevel@tonic-gate 	error = ddi_soft_state_init(&statep, sizeof (struct snapshot_id *), 1);
2577c478bd9Sstevel@tonic-gate 	if (error) {
2587c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "_init: failed to init ddi_soft_state.");
2597c478bd9Sstevel@tonic-gate 		return (error);
2607c478bd9Sstevel@tonic-gate 	}
2617c478bd9Sstevel@tonic-gate 
2627c478bd9Sstevel@tonic-gate 	error = mod_install(&ml);
2637c478bd9Sstevel@tonic-gate 
2647c478bd9Sstevel@tonic-gate 	if (error) {
2657c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "_init: failed to mod_install.");
2667c478bd9Sstevel@tonic-gate 		ddi_soft_state_fini(&statep);
2677c478bd9Sstevel@tonic-gate 		return (error);
2687c478bd9Sstevel@tonic-gate 	}
2697c478bd9Sstevel@tonic-gate 
2707c478bd9Sstevel@tonic-gate 	/*
2717c478bd9Sstevel@tonic-gate 	 * Fill in the snapshot operations vector for file systems
2727c478bd9Sstevel@tonic-gate 	 * (defined in fssnap_if.c)
2737c478bd9Sstevel@tonic-gate 	 */
2747c478bd9Sstevel@tonic-gate 
2757c478bd9Sstevel@tonic-gate 	snapops.fssnap_create = fssnap_create_impl;
2767c478bd9Sstevel@tonic-gate 	snapops.fssnap_set_candidate = fssnap_set_candidate_impl;
2777c478bd9Sstevel@tonic-gate 	snapops.fssnap_is_candidate = fssnap_is_candidate_impl;
2787c478bd9Sstevel@tonic-gate 	snapops.fssnap_create_done = fssnap_create_done_impl;
2797c478bd9Sstevel@tonic-gate 	snapops.fssnap_delete = fssnap_delete_impl;
2807c478bd9Sstevel@tonic-gate 	snapops.fssnap_strategy = fssnap_strategy_impl;
2817c478bd9Sstevel@tonic-gate 
2827c478bd9Sstevel@tonic-gate 	mutex_init(&snapshot_mutex, NULL, MUTEX_DEFAULT, NULL);
2837c478bd9Sstevel@tonic-gate 
2847c478bd9Sstevel@tonic-gate 	/*
2857c478bd9Sstevel@tonic-gate 	 * Initialize the fssnap highwater kstat
2867c478bd9Sstevel@tonic-gate 	 */
2877c478bd9Sstevel@tonic-gate 	ksp = kstat_create(snapname, 0, FSSNAP_KSTAT_HIGHWATER, "misc",
2887c478bd9Sstevel@tonic-gate 	    KSTAT_TYPE_NAMED, 1, 0);
2897c478bd9Sstevel@tonic-gate 	if (ksp != NULL) {
2907c478bd9Sstevel@tonic-gate 		ksdata = (kstat_named_t *)ksp->ks_data;
2917c478bd9Sstevel@tonic-gate 		kstat_named_init(ksdata, FSSNAP_KSTAT_HIGHWATER,
2927c478bd9Sstevel@tonic-gate 		    KSTAT_DATA_UINT32);
2937c478bd9Sstevel@tonic-gate 		ksdata->value.ui32 = 0;
2947c478bd9Sstevel@tonic-gate 		kstat_install(ksp);
2957c478bd9Sstevel@tonic-gate 	} else {
2967c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "_init: failed to create highwater kstat.");
2977c478bd9Sstevel@tonic-gate 	}
2987c478bd9Sstevel@tonic-gate 	fssnap_highwater_kstat = ksp;
2997c478bd9Sstevel@tonic-gate 
3007c478bd9Sstevel@tonic-gate 	return (0);
3017c478bd9Sstevel@tonic-gate }
3027c478bd9Sstevel@tonic-gate 
3037c478bd9Sstevel@tonic-gate int
3047c478bd9Sstevel@tonic-gate _info(struct modinfo *modinfop)
3057c478bd9Sstevel@tonic-gate {
3067c478bd9Sstevel@tonic-gate 	return (mod_info(&ml, modinfop));
3077c478bd9Sstevel@tonic-gate }
3087c478bd9Sstevel@tonic-gate 
3097c478bd9Sstevel@tonic-gate int
3107c478bd9Sstevel@tonic-gate _fini(void)
3117c478bd9Sstevel@tonic-gate {
3127c478bd9Sstevel@tonic-gate 	int	error;
3137c478bd9Sstevel@tonic-gate 
3147c478bd9Sstevel@tonic-gate 	error = mod_remove(&ml);
3157c478bd9Sstevel@tonic-gate 	if (error)
3167c478bd9Sstevel@tonic-gate 		return (error);
3177c478bd9Sstevel@tonic-gate 	ddi_soft_state_fini(&statep);
3187c478bd9Sstevel@tonic-gate 
3197c478bd9Sstevel@tonic-gate 	/*
3207c478bd9Sstevel@tonic-gate 	 * delete the fssnap highwater kstat
3217c478bd9Sstevel@tonic-gate 	 */
3227c478bd9Sstevel@tonic-gate 	kstat_delete(fssnap_highwater_kstat);
3237c478bd9Sstevel@tonic-gate 
3247c478bd9Sstevel@tonic-gate 	mutex_destroy(&snapshot_mutex);
3257c478bd9Sstevel@tonic-gate 
3267c478bd9Sstevel@tonic-gate 	/* Clear out the file system operations vector */
3277c478bd9Sstevel@tonic-gate 	snapops.fssnap_create = NULL;
3287c478bd9Sstevel@tonic-gate 	snapops.fssnap_set_candidate = NULL;
3297c478bd9Sstevel@tonic-gate 	snapops.fssnap_create_done = NULL;
3307c478bd9Sstevel@tonic-gate 	snapops.fssnap_delete = NULL;
3317c478bd9Sstevel@tonic-gate 	snapops.fssnap_strategy = NULL;
3327c478bd9Sstevel@tonic-gate 
3337c478bd9Sstevel@tonic-gate 	return (0);
3347c478bd9Sstevel@tonic-gate }
3357c478bd9Sstevel@tonic-gate 
3367c478bd9Sstevel@tonic-gate /* ************************************************************************ */
3377c478bd9Sstevel@tonic-gate 
/*
 * Snapshot Driver Routines
 *
 * This section implements the snapshot character and block drivers.  The
 * device will appear to be a consistent read-only file system to
 * applications that wish to back it up or mount it.  The snapshot driver
 * communicates with the file system through the translation table, which
 * tells the snapshot driver where to find the data necessary to piece
 * together the frozen file system.  The data may either be on the master
 * device (no translation exists), in memory (a translation exists but has
 * not been flushed to the backing store), or in the backing store file.
 * The read request may require the snapshot driver to retrieve data from
 * several different places and piece it together to look like a single
 * contiguous read.
 *
 * The device minor number corresponds to the snapshot number in the list of
 * snapshot identifiers.  The soft state for each minor number is simply a
 * pointer to the snapshot id, which holds all of the snapshot state.  One
 * minor number is designated as the control device.  All snapshot create
 * and delete requests go through the control device to ensure this module
 * is properly loaded and attached before the file system starts calling
 * routines defined here.
 */
3617c478bd9Sstevel@tonic-gate 
3627c478bd9Sstevel@tonic-gate 
3637c478bd9Sstevel@tonic-gate /*
3647c478bd9Sstevel@tonic-gate  * snap_getinfo() - snapshot driver getinfo(9E) routine
3657c478bd9Sstevel@tonic-gate  *
3667c478bd9Sstevel@tonic-gate  */
3677c478bd9Sstevel@tonic-gate /*ARGSUSED*/
3687c478bd9Sstevel@tonic-gate static int
3697c478bd9Sstevel@tonic-gate snap_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
3707c478bd9Sstevel@tonic-gate {
3717c478bd9Sstevel@tonic-gate 	switch (infocmd) {
3727c478bd9Sstevel@tonic-gate 	case DDI_INFO_DEVT2DEVINFO:
3737c478bd9Sstevel@tonic-gate 		*result = fssnap_dip;
3747c478bd9Sstevel@tonic-gate 		return (DDI_SUCCESS);
3757c478bd9Sstevel@tonic-gate 	case DDI_INFO_DEVT2INSTANCE:
3767c478bd9Sstevel@tonic-gate 		*result = 0;	/* we only have one instance */
3777c478bd9Sstevel@tonic-gate 		return (DDI_SUCCESS);
3787c478bd9Sstevel@tonic-gate 	}
3797c478bd9Sstevel@tonic-gate 	return (DDI_FAILURE);
3807c478bd9Sstevel@tonic-gate }
3817c478bd9Sstevel@tonic-gate 
3827c478bd9Sstevel@tonic-gate /*
3837c478bd9Sstevel@tonic-gate  * snap_attach() - snapshot driver attach(9E) routine
3847c478bd9Sstevel@tonic-gate  *
3857c478bd9Sstevel@tonic-gate  *    sets up snapshot control device and control state.  The control state
3867c478bd9Sstevel@tonic-gate  *    is a pointer to an "anonymous" snapshot_id for tracking opens and closes
3877c478bd9Sstevel@tonic-gate  */
3887c478bd9Sstevel@tonic-gate static int
3897c478bd9Sstevel@tonic-gate snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3907c478bd9Sstevel@tonic-gate {
3917c478bd9Sstevel@tonic-gate 	int			error;
3927c478bd9Sstevel@tonic-gate 
3937c478bd9Sstevel@tonic-gate 	switch (cmd) {
3947c478bd9Sstevel@tonic-gate 	case DDI_ATTACH:
3957c478bd9Sstevel@tonic-gate 		/* create the control device */
3967c478bd9Sstevel@tonic-gate 		error = ddi_create_priv_minor_node(dip, SNAP_CTL_NODE, S_IFCHR,
3977c478bd9Sstevel@tonic-gate 		    SNAP_CTL_MINOR, DDI_PSEUDO, PRIVONLY_DEV,
3987c478bd9Sstevel@tonic-gate 		    PRIV_SYS_CONFIG, PRIV_SYS_CONFIG, 0666);
3997c478bd9Sstevel@tonic-gate 		if (error == DDI_FAILURE) {
4007c478bd9Sstevel@tonic-gate 			return (DDI_FAILURE);
4017c478bd9Sstevel@tonic-gate 		}
4027c478bd9Sstevel@tonic-gate 
4037c478bd9Sstevel@tonic-gate 		rw_init(&snap_ctl.sid_rwlock, NULL, RW_DEFAULT, NULL);
4047c478bd9Sstevel@tonic-gate 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
4057c478bd9Sstevel@tonic-gate 		fssnap_dip = dip;
4067c478bd9Sstevel@tonic-gate 		snap_ctl.sid_snapnumber = SNAP_CTL_MINOR;
4077c478bd9Sstevel@tonic-gate 		/* the control sid is not linked into the snapshot list */
4087c478bd9Sstevel@tonic-gate 		snap_ctl.sid_next = NULL;
4097c478bd9Sstevel@tonic-gate 		snap_ctl.sid_cowinfo = NULL;
4107c478bd9Sstevel@tonic-gate 		snap_ctl.sid_flags = 0;
4117c478bd9Sstevel@tonic-gate 		rw_exit(&snap_ctl.sid_rwlock);
4127c478bd9Sstevel@tonic-gate 		ddi_report_dev(dip);
4137c478bd9Sstevel@tonic-gate 
4147c478bd9Sstevel@tonic-gate 		return (DDI_SUCCESS);
4157c478bd9Sstevel@tonic-gate 	case DDI_PM_RESUME:
4167c478bd9Sstevel@tonic-gate 		return (DDI_SUCCESS);
4177c478bd9Sstevel@tonic-gate 
4187c478bd9Sstevel@tonic-gate 	case DDI_RESUME:
4197c478bd9Sstevel@tonic-gate 		return (DDI_SUCCESS);
4207c478bd9Sstevel@tonic-gate 
4217c478bd9Sstevel@tonic-gate 	default:
4227c478bd9Sstevel@tonic-gate 		return (DDI_FAILURE);
4237c478bd9Sstevel@tonic-gate 	}
4247c478bd9Sstevel@tonic-gate }
4257c478bd9Sstevel@tonic-gate 
4267c478bd9Sstevel@tonic-gate /*
4277c478bd9Sstevel@tonic-gate  * snap_detach() - snapshot driver detach(9E) routine
4287c478bd9Sstevel@tonic-gate  *
4297c478bd9Sstevel@tonic-gate  *    destroys snapshot control device and control state.  If any snapshots
4307c478bd9Sstevel@tonic-gate  *    are active (ie. num_snapshots != 0), the device will refuse to detach.
4317c478bd9Sstevel@tonic-gate  */
4327c478bd9Sstevel@tonic-gate static int
4337c478bd9Sstevel@tonic-gate snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
4347c478bd9Sstevel@tonic-gate {
4357c478bd9Sstevel@tonic-gate 	struct snapshot_id *sidp, *sidnextp;
4367c478bd9Sstevel@tonic-gate 
4377c478bd9Sstevel@tonic-gate 	switch (cmd) {
4387c478bd9Sstevel@tonic-gate 	case DDI_DETACH:
4397c478bd9Sstevel@tonic-gate 		/* do not detach if the device is active */
4407c478bd9Sstevel@tonic-gate 		mutex_enter(&snapshot_mutex);
4417c478bd9Sstevel@tonic-gate 		if ((num_snapshots != 0) ||
4427c478bd9Sstevel@tonic-gate 		    ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0)) {
4437c478bd9Sstevel@tonic-gate 			mutex_exit(&snapshot_mutex);
4447c478bd9Sstevel@tonic-gate 			return (DDI_FAILURE);
4457c478bd9Sstevel@tonic-gate 		}
4467c478bd9Sstevel@tonic-gate 
4477c478bd9Sstevel@tonic-gate 		/* free up the snapshot list */
4487c478bd9Sstevel@tonic-gate 		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
4497c478bd9Sstevel@tonic-gate 			ASSERT(SID_AVAILABLE(sidp) &&
4507c478bd9Sstevel@tonic-gate 			    !RW_LOCK_HELD(&sidp->sid_rwlock));
4517c478bd9Sstevel@tonic-gate 			sidnextp = sidp->sid_next;
4527c478bd9Sstevel@tonic-gate 			rw_destroy(&sidp->sid_rwlock);
4537c478bd9Sstevel@tonic-gate 			kmem_free(sidp, sizeof (struct snapshot_id));
4547c478bd9Sstevel@tonic-gate 		}
4557c478bd9Sstevel@tonic-gate 		snapshot = NULL;
4567c478bd9Sstevel@tonic-gate 
4577c478bd9Sstevel@tonic-gate 		/* delete the control device */
4587c478bd9Sstevel@tonic-gate 		ddi_remove_minor_node(dip, SNAP_CTL_NODE);
4597c478bd9Sstevel@tonic-gate 		fssnap_dip = NULL;
4607c478bd9Sstevel@tonic-gate 
4617c478bd9Sstevel@tonic-gate 		ASSERT((snap_ctl.sid_flags & SID_CHAR_BUSY) == 0);
4627c478bd9Sstevel@tonic-gate 		rw_destroy(&snap_ctl.sid_rwlock);
4637c478bd9Sstevel@tonic-gate 		mutex_exit(&snapshot_mutex);
4647c478bd9Sstevel@tonic-gate 
4657c478bd9Sstevel@tonic-gate 		return (DDI_SUCCESS);
4667c478bd9Sstevel@tonic-gate 
4677c478bd9Sstevel@tonic-gate 	default:
4687c478bd9Sstevel@tonic-gate 		return (DDI_FAILURE);
4697c478bd9Sstevel@tonic-gate 	}
4707c478bd9Sstevel@tonic-gate }
4717c478bd9Sstevel@tonic-gate 
4727c478bd9Sstevel@tonic-gate /*
4737c478bd9Sstevel@tonic-gate  * snap_open() - snapshot driver open(9E) routine
4747c478bd9Sstevel@tonic-gate  *
4757c478bd9Sstevel@tonic-gate  *     marks the snapshot id as busy so it will not be recycled when deleted
4767c478bd9Sstevel@tonic-gate  *     until the snapshot is closed.
4777c478bd9Sstevel@tonic-gate  */
4787c478bd9Sstevel@tonic-gate /* ARGSUSED */
4797c478bd9Sstevel@tonic-gate static int
4807c478bd9Sstevel@tonic-gate snap_open(dev_t *devp, int flag, int otyp, cred_t *cred)
4817c478bd9Sstevel@tonic-gate {
4827c478bd9Sstevel@tonic-gate 	minor_t	minor;
4837c478bd9Sstevel@tonic-gate 	struct snapshot_id **sidpp, *sidp;
4847c478bd9Sstevel@tonic-gate 
4857c478bd9Sstevel@tonic-gate 	/* snapshots are read-only */
4867c478bd9Sstevel@tonic-gate 	if (flag & FWRITE)
4877c478bd9Sstevel@tonic-gate 		return (EROFS);
4887c478bd9Sstevel@tonic-gate 
4897c478bd9Sstevel@tonic-gate 	minor = getminor(*devp);
4907c478bd9Sstevel@tonic-gate 
4917c478bd9Sstevel@tonic-gate 	if (minor == SNAP_CTL_MINOR) {
4927c478bd9Sstevel@tonic-gate 		/* control device must be opened exclusively */
4937c478bd9Sstevel@tonic-gate 		if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR))
4947c478bd9Sstevel@tonic-gate 			return (EINVAL);
4957c478bd9Sstevel@tonic-gate 
4967c478bd9Sstevel@tonic-gate 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
4977c478bd9Sstevel@tonic-gate 		if ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0) {
4987c478bd9Sstevel@tonic-gate 			rw_exit(&snap_ctl.sid_rwlock);
4997c478bd9Sstevel@tonic-gate 			return (EBUSY);
5007c478bd9Sstevel@tonic-gate 		}
5017c478bd9Sstevel@tonic-gate 
5027c478bd9Sstevel@tonic-gate 		snap_ctl.sid_flags |= SID_CHAR_BUSY;
5037c478bd9Sstevel@tonic-gate 		rw_exit(&snap_ctl.sid_rwlock);
5047c478bd9Sstevel@tonic-gate 
5057c478bd9Sstevel@tonic-gate 		return (0);
5067c478bd9Sstevel@tonic-gate 	}
5077c478bd9Sstevel@tonic-gate 
5087c478bd9Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
5097c478bd9Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL)
5107c478bd9Sstevel@tonic-gate 		return (ENXIO);
5117c478bd9Sstevel@tonic-gate 	sidp = *sidpp;
5127c478bd9Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
5137c478bd9Sstevel@tonic-gate 
5147c478bd9Sstevel@tonic-gate 	if ((flag & FEXCL) && SID_BUSY(sidp)) {
5157c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
5167c478bd9Sstevel@tonic-gate 		return (EAGAIN);
5177c478bd9Sstevel@tonic-gate 	}
5187c478bd9Sstevel@tonic-gate 
5197c478bd9Sstevel@tonic-gate 	ASSERT(sidpp != NULL && sidp != NULL);
5207c478bd9Sstevel@tonic-gate 	/* check to see if this snapshot has been killed on us */
5217c478bd9Sstevel@tonic-gate 	if (SID_INACTIVE(sidp)) {
5227c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_open: snapshot %d does not exist.",
5237c478bd9Sstevel@tonic-gate 		    minor);
5247c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
5257c478bd9Sstevel@tonic-gate 		return (ENXIO);
5267c478bd9Sstevel@tonic-gate 	}
5277c478bd9Sstevel@tonic-gate 
5287c478bd9Sstevel@tonic-gate 	switch (otyp) {
5297c478bd9Sstevel@tonic-gate 	case OTYP_CHR:
5307c478bd9Sstevel@tonic-gate 		sidp->sid_flags |= SID_CHAR_BUSY;
5317c478bd9Sstevel@tonic-gate 		break;
5327c478bd9Sstevel@tonic-gate 	case OTYP_BLK:
5337c478bd9Sstevel@tonic-gate 		sidp->sid_flags |= SID_BLOCK_BUSY;
5347c478bd9Sstevel@tonic-gate 		break;
5357c478bd9Sstevel@tonic-gate 	default:
5367c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
5377c478bd9Sstevel@tonic-gate 		return (EINVAL);
5387c478bd9Sstevel@tonic-gate 	}
5397c478bd9Sstevel@tonic-gate 
5407c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
5417c478bd9Sstevel@tonic-gate 
5427c478bd9Sstevel@tonic-gate 	/*
5437c478bd9Sstevel@tonic-gate 	 * at this point if a valid snapshot was found then it has
5447c478bd9Sstevel@tonic-gate 	 * been marked busy and we can use it.
5457c478bd9Sstevel@tonic-gate 	 */
5467c478bd9Sstevel@tonic-gate 	return (0);
5477c478bd9Sstevel@tonic-gate }
5487c478bd9Sstevel@tonic-gate 
5497c478bd9Sstevel@tonic-gate /*
5507c478bd9Sstevel@tonic-gate  * snap_close() - snapshot driver close(9E) routine
5517c478bd9Sstevel@tonic-gate  *
5527c478bd9Sstevel@tonic-gate  *    unsets the busy bits in the snapshot id.  If the snapshot has been
5537c478bd9Sstevel@tonic-gate  *    deleted while the snapshot device was open, the close call will clean
5547c478bd9Sstevel@tonic-gate  *    up the remaining state information.
5557c478bd9Sstevel@tonic-gate  */
5567c478bd9Sstevel@tonic-gate /* ARGSUSED */
5577c478bd9Sstevel@tonic-gate static int
5587c478bd9Sstevel@tonic-gate snap_close(dev_t dev, int flag, int otyp, cred_t *cred)
5597c478bd9Sstevel@tonic-gate {
5607c478bd9Sstevel@tonic-gate 	struct snapshot_id	**sidpp, *sidp;
5617c478bd9Sstevel@tonic-gate 	minor_t			minor;
5627c478bd9Sstevel@tonic-gate 	char			name[20];
5637c478bd9Sstevel@tonic-gate 
5647c478bd9Sstevel@tonic-gate 	minor = getminor(dev);
5657c478bd9Sstevel@tonic-gate 
5667c478bd9Sstevel@tonic-gate 	/* if this is the control device, close it and return */
5677c478bd9Sstevel@tonic-gate 	if (minor == SNAP_CTL_MINOR) {
5687c478bd9Sstevel@tonic-gate 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
5697c478bd9Sstevel@tonic-gate 		snap_ctl.sid_flags &= ~(SID_CHAR_BUSY);
5707c478bd9Sstevel@tonic-gate 		rw_exit(&snap_ctl.sid_rwlock);
5717c478bd9Sstevel@tonic-gate 		return (0);
5727c478bd9Sstevel@tonic-gate 	}
5737c478bd9Sstevel@tonic-gate 
5747c478bd9Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
5757c478bd9Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
5767c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_close: could not find state for "
5777c478bd9Sstevel@tonic-gate 		    "snapshot %d.", minor);
5787c478bd9Sstevel@tonic-gate 		return (ENXIO);
5797c478bd9Sstevel@tonic-gate 	}
5807c478bd9Sstevel@tonic-gate 	sidp = *sidpp;
5817c478bd9Sstevel@tonic-gate 	mutex_enter(&snapshot_mutex);
5827c478bd9Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
5837c478bd9Sstevel@tonic-gate 
5847c478bd9Sstevel@tonic-gate 	/* Mark the snapshot as not being busy anymore */
5857c478bd9Sstevel@tonic-gate 	switch (otyp) {
5867c478bd9Sstevel@tonic-gate 	case OTYP_CHR:
5877c478bd9Sstevel@tonic-gate 		sidp->sid_flags &= ~(SID_CHAR_BUSY);
5887c478bd9Sstevel@tonic-gate 		break;
5897c478bd9Sstevel@tonic-gate 	case OTYP_BLK:
5907c478bd9Sstevel@tonic-gate 		sidp->sid_flags &= ~(SID_BLOCK_BUSY);
5917c478bd9Sstevel@tonic-gate 		break;
5927c478bd9Sstevel@tonic-gate 	default:
5937c478bd9Sstevel@tonic-gate 		mutex_exit(&snapshot_mutex);
5947c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
5957c478bd9Sstevel@tonic-gate 		return (EINVAL);
5967c478bd9Sstevel@tonic-gate 	}
5977c478bd9Sstevel@tonic-gate 
5987c478bd9Sstevel@tonic-gate 	if (SID_AVAILABLE(sidp)) {
5997c478bd9Sstevel@tonic-gate 		/*
6007c478bd9Sstevel@tonic-gate 		 * if this is the last close on a snapshot that has been
6017c478bd9Sstevel@tonic-gate 		 * deleted, then free up the soft state.  The snapdelete
6027c478bd9Sstevel@tonic-gate 		 * ioctl does not free this when the device is in use so
6037c478bd9Sstevel@tonic-gate 		 * we do it here after the last reference goes away.
6047c478bd9Sstevel@tonic-gate 		 */
6057c478bd9Sstevel@tonic-gate 
6067c478bd9Sstevel@tonic-gate 		/* remove the device nodes */
6077c478bd9Sstevel@tonic-gate 		ASSERT(fssnap_dip != NULL);
6087c478bd9Sstevel@tonic-gate 		(void) snprintf(name, sizeof (name), "%d",
6097c478bd9Sstevel@tonic-gate 		    sidp->sid_snapnumber);
6107c478bd9Sstevel@tonic-gate 		ddi_remove_minor_node(fssnap_dip, name);
6117c478bd9Sstevel@tonic-gate 		(void) snprintf(name, sizeof (name), "%d,raw",
6127c478bd9Sstevel@tonic-gate 		    sidp->sid_snapnumber);
6137c478bd9Sstevel@tonic-gate 		ddi_remove_minor_node(fssnap_dip, name);
6147c478bd9Sstevel@tonic-gate 
6157c478bd9Sstevel@tonic-gate 		/* delete the state structure */
6167c478bd9Sstevel@tonic-gate 		ddi_soft_state_free(statep, sidp->sid_snapnumber);
6177c478bd9Sstevel@tonic-gate 		num_snapshots--;
6187c478bd9Sstevel@tonic-gate 	}
6197c478bd9Sstevel@tonic-gate 
6207c478bd9Sstevel@tonic-gate 	mutex_exit(&snapshot_mutex);
6217c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
6227c478bd9Sstevel@tonic-gate 
6237c478bd9Sstevel@tonic-gate 	return (0);
6247c478bd9Sstevel@tonic-gate }
6257c478bd9Sstevel@tonic-gate 
6267c478bd9Sstevel@tonic-gate /*
6277c478bd9Sstevel@tonic-gate  * snap_read() - snapshot driver read(9E) routine
6287c478bd9Sstevel@tonic-gate  *
6297c478bd9Sstevel@tonic-gate  *    reads data from the snapshot by calling snap_strategy() through physio()
6307c478bd9Sstevel@tonic-gate  */
6317c478bd9Sstevel@tonic-gate /* ARGSUSED */
6327c478bd9Sstevel@tonic-gate static int
6337c478bd9Sstevel@tonic-gate snap_read(dev_t dev, struct uio *uiop, cred_t *credp)
6347c478bd9Sstevel@tonic-gate {
6357c478bd9Sstevel@tonic-gate 	minor_t		minor;
6367c478bd9Sstevel@tonic-gate 	struct snapshot_id **sidpp;
6377c478bd9Sstevel@tonic-gate 
6387c478bd9Sstevel@tonic-gate 	minor = getminor(dev);
6397c478bd9Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
6407c478bd9Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
6417c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
6427c478bd9Sstevel@tonic-gate 		    "snap_read: could not find state for snapshot %d.", minor);
6437c478bd9Sstevel@tonic-gate 		return (ENXIO);
6447c478bd9Sstevel@tonic-gate 	}
6457c478bd9Sstevel@tonic-gate 	return (physio(snap_strategy, NULL, dev, B_READ, minphys, uiop));
6467c478bd9Sstevel@tonic-gate }
6477c478bd9Sstevel@tonic-gate 
6487c478bd9Sstevel@tonic-gate /*
6497c478bd9Sstevel@tonic-gate  * snap_strategy() - snapshot driver strategy(9E) routine
6507c478bd9Sstevel@tonic-gate  *
6517c478bd9Sstevel@tonic-gate  *    cycles through each chunk in the requested buffer and calls
6527c478bd9Sstevel@tonic-gate  *    snap_getchunk() on each chunk to retrieve it from the appropriate
6537c478bd9Sstevel@tonic-gate  *    place.  Once all of the parts are put together the requested buffer
6547c478bd9Sstevel@tonic-gate  *    is returned.  The snapshot driver is read-only, so a write is invalid.
6557c478bd9Sstevel@tonic-gate  */
6567c478bd9Sstevel@tonic-gate static int
6577c478bd9Sstevel@tonic-gate snap_strategy(struct buf *bp)
6587c478bd9Sstevel@tonic-gate {
6597c478bd9Sstevel@tonic-gate 	struct snapshot_id **sidpp, *sidp;
6607c478bd9Sstevel@tonic-gate 	minor_t		minor;
6617c478bd9Sstevel@tonic-gate 	chunknumber_t	chunk;
6627c478bd9Sstevel@tonic-gate 	int		off, len;
6637c478bd9Sstevel@tonic-gate 	u_longlong_t	reqptr;
6647c478bd9Sstevel@tonic-gate 	int		error = 0;
6657c478bd9Sstevel@tonic-gate 	size_t		chunksz;
6667c478bd9Sstevel@tonic-gate 	caddr_t		buf;
6677c478bd9Sstevel@tonic-gate 
6687c478bd9Sstevel@tonic-gate 	/* snapshot device is read-only */
6697c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_WRITE) {
6707c478bd9Sstevel@tonic-gate 		bioerror(bp, EROFS);
6717c478bd9Sstevel@tonic-gate 		bp->b_resid = bp->b_bcount;
6727c478bd9Sstevel@tonic-gate 		biodone(bp);
6737c478bd9Sstevel@tonic-gate 		return (0);
6747c478bd9Sstevel@tonic-gate 	}
6757c478bd9Sstevel@tonic-gate 
6767c478bd9Sstevel@tonic-gate 	minor = getminor(bp->b_edev);
6777c478bd9Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
6787c478bd9Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
6797c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
6807c478bd9Sstevel@tonic-gate 		    "snap_strategy: could not find state for snapshot %d.",
6817c478bd9Sstevel@tonic-gate 		    minor);
6827c478bd9Sstevel@tonic-gate 		bioerror(bp, ENXIO);
6837c478bd9Sstevel@tonic-gate 		bp->b_resid = bp->b_bcount;
6847c478bd9Sstevel@tonic-gate 		biodone(bp);
6857c478bd9Sstevel@tonic-gate 		return (0);
6867c478bd9Sstevel@tonic-gate 	}
6877c478bd9Sstevel@tonic-gate 	sidp = *sidpp;
6887c478bd9Sstevel@tonic-gate 	ASSERT(sidp);
6897c478bd9Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_READER);
6907c478bd9Sstevel@tonic-gate 
6917c478bd9Sstevel@tonic-gate 	if (SID_INACTIVE(sidp)) {
6927c478bd9Sstevel@tonic-gate 		bioerror(bp, ENXIO);
6937c478bd9Sstevel@tonic-gate 		bp->b_resid = bp->b_bcount;
6947c478bd9Sstevel@tonic-gate 		biodone(bp);
6957c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
6967c478bd9Sstevel@tonic-gate 		return (0);
6977c478bd9Sstevel@tonic-gate 	}
6987c478bd9Sstevel@tonic-gate 
6997c478bd9Sstevel@tonic-gate 	if (bp->b_flags & (B_PAGEIO|B_PHYS))
7007c478bd9Sstevel@tonic-gate 		bp_mapin(bp);
7017c478bd9Sstevel@tonic-gate 
7027c478bd9Sstevel@tonic-gate 	bp->b_resid = bp->b_bcount;
7037c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_un.b_addr);
7047c478bd9Sstevel@tonic-gate 	buf = bp->b_un.b_addr;
7057c478bd9Sstevel@tonic-gate 
7067c478bd9Sstevel@tonic-gate 	chunksz = sidp->sid_cowinfo->cow_map.cmap_chunksz;
7077c478bd9Sstevel@tonic-gate 
7087c478bd9Sstevel@tonic-gate 	/* reqptr is the current DEV_BSIZE offset into the device */
7097c478bd9Sstevel@tonic-gate 	/* chunk is the chunk containing reqptr */
7107c478bd9Sstevel@tonic-gate 	/* len is the length of the request (in the current chunk) in bytes */
7117c478bd9Sstevel@tonic-gate 	/* off is the byte offset into the current chunk */
7127c478bd9Sstevel@tonic-gate 	reqptr = bp->b_lblkno;
7137c478bd9Sstevel@tonic-gate 	while (bp->b_resid > 0) {
7147c478bd9Sstevel@tonic-gate 		chunk = dbtocowchunk(&sidp->sid_cowinfo->cow_map, reqptr);
7157c478bd9Sstevel@tonic-gate 		off = (reqptr % (chunksz >> DEV_BSHIFT)) << DEV_BSHIFT;
7167c478bd9Sstevel@tonic-gate 		len = min(chunksz - off, bp->b_resid);
7177c478bd9Sstevel@tonic-gate 		ASSERT((off + len) <= chunksz);
7187c478bd9Sstevel@tonic-gate 
7197c478bd9Sstevel@tonic-gate 		if ((error = snap_getchunk(sidp, chunk, off, len, buf)) != 0) {
7207c478bd9Sstevel@tonic-gate 			/*
7217c478bd9Sstevel@tonic-gate 			 * EINVAL means the user tried to go out of range.
7227c478bd9Sstevel@tonic-gate 			 * Anything else means it's likely that we're
7237c478bd9Sstevel@tonic-gate 			 * confused.
7247c478bd9Sstevel@tonic-gate 			 */
7257c478bd9Sstevel@tonic-gate 			if (error != EINVAL) {
7267c478bd9Sstevel@tonic-gate 				cmn_err(CE_WARN, "snap_strategy: error "
7277c478bd9Sstevel@tonic-gate 				    "calling snap_getchunk, chunk = %llu, "
7287c478bd9Sstevel@tonic-gate 				    "offset = %d, len = %d, resid = %lu, "
7297c478bd9Sstevel@tonic-gate 				    "error = %d.",
7307c478bd9Sstevel@tonic-gate 				    chunk, off, len, bp->b_resid, error);
7317c478bd9Sstevel@tonic-gate 			}
7327c478bd9Sstevel@tonic-gate 			bioerror(bp, error);
7337c478bd9Sstevel@tonic-gate 			biodone(bp);
7347c478bd9Sstevel@tonic-gate 			rw_exit(&sidp->sid_rwlock);
7357c478bd9Sstevel@tonic-gate 			return (0);
7367c478bd9Sstevel@tonic-gate 		}
7377c478bd9Sstevel@tonic-gate 		bp->b_resid -= len;
7387c478bd9Sstevel@tonic-gate 		reqptr += (len >> DEV_BSHIFT);
7397c478bd9Sstevel@tonic-gate 		buf += len;
7407c478bd9Sstevel@tonic-gate 	}
7417c478bd9Sstevel@tonic-gate 
7427c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_resid == 0);
7437c478bd9Sstevel@tonic-gate 	biodone(bp);
7447c478bd9Sstevel@tonic-gate 
7457c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
7467c478bd9Sstevel@tonic-gate 	return (0);
7477c478bd9Sstevel@tonic-gate }
7487c478bd9Sstevel@tonic-gate 
7497c478bd9Sstevel@tonic-gate /*
7507c478bd9Sstevel@tonic-gate  * snap_getchunk() - helper function for snap_strategy()
7517c478bd9Sstevel@tonic-gate  *
7527c478bd9Sstevel@tonic-gate  *    gets the requested data from the appropriate place and fills in the
7537c478bd9Sstevel@tonic-gate  *    buffer.  chunk is the chunk number of the request, offset is the
7547c478bd9Sstevel@tonic-gate  *    offset into that chunk and must be less than the chunk size.  len is
7557c478bd9Sstevel@tonic-gate  *    the length of the request starting at offset, and must not exceed a
7567c478bd9Sstevel@tonic-gate  *    chunk boundary.  buffer is the address to copy the data to.  len
7577c478bd9Sstevel@tonic-gate  *    bytes are copied into the buffer starting at the location specified.
7587c478bd9Sstevel@tonic-gate  *
7597c478bd9Sstevel@tonic-gate  *    A chunk is located according to the following algorithm:
7607c478bd9Sstevel@tonic-gate  *        - If the chunk does not have a translation or is not a candidate
7617c478bd9Sstevel@tonic-gate  *          for translation, it is read straight from the master device.
7627c478bd9Sstevel@tonic-gate  *        - If the chunk does have a translation, then it is either on
7637c478bd9Sstevel@tonic-gate  *          disk or in memory:
7647c478bd9Sstevel@tonic-gate  *            o If it is in memory the requested data is simply copied out
7657c478bd9Sstevel@tonic-gate  *              of the in-memory buffer.
7667c478bd9Sstevel@tonic-gate  *            o If it is in the backing store, it is read from there.
7677c478bd9Sstevel@tonic-gate  *
7687c478bd9Sstevel@tonic-gate  *    This function does the real work of the snapshot driver.
7697c478bd9Sstevel@tonic-gate  */
7707c478bd9Sstevel@tonic-gate static int
7717c478bd9Sstevel@tonic-gate snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk, int offset,
7727c478bd9Sstevel@tonic-gate     int len, char *buffer)
7737c478bd9Sstevel@tonic-gate {
7747c478bd9Sstevel@tonic-gate 	cow_map_t	*cmap = &sidp->sid_cowinfo->cow_map;
7757c478bd9Sstevel@tonic-gate 	cow_map_node_t	*cmn;
7767c478bd9Sstevel@tonic-gate 	struct buf	*snapbuf;
7777c478bd9Sstevel@tonic-gate 	int		error = 0;
7787c478bd9Sstevel@tonic-gate 	char		*newbuffer;
7797c478bd9Sstevel@tonic-gate 	int		newlen = 0;
7807c478bd9Sstevel@tonic-gate 	int		partial = 0;
7817c478bd9Sstevel@tonic-gate 
7827c478bd9Sstevel@tonic-gate 	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
7837c478bd9Sstevel@tonic-gate 	ASSERT(offset + len <= cmap->cmap_chunksz);
7847c478bd9Sstevel@tonic-gate 
7857c478bd9Sstevel@tonic-gate 	/*
7867c478bd9Sstevel@tonic-gate 	 * Check if the chunk number is out of range and if so bail out
7877c478bd9Sstevel@tonic-gate 	 */
7887c478bd9Sstevel@tonic-gate 	if (chunk >= (cmap->cmap_bmsize * NBBY)) {
7897c478bd9Sstevel@tonic-gate 		return (EINVAL);
7907c478bd9Sstevel@tonic-gate 	}
7917c478bd9Sstevel@tonic-gate 
7927c478bd9Sstevel@tonic-gate 	/*
7937c478bd9Sstevel@tonic-gate 	 * If the chunk is not a candidate for translation, then the chunk
7947c478bd9Sstevel@tonic-gate 	 * was not allocated when the snapshot was taken.  Since it does
7957c478bd9Sstevel@tonic-gate 	 * not contain data associated with this snapshot, just return a
7967c478bd9Sstevel@tonic-gate 	 * zero buffer instead.
7977c478bd9Sstevel@tonic-gate 	 */
7987c478bd9Sstevel@tonic-gate 	if (isclr(cmap->cmap_candidate, chunk)) {
7997c478bd9Sstevel@tonic-gate 		bzero(buffer, len);
8007c478bd9Sstevel@tonic-gate 		return (0);
8017c478bd9Sstevel@tonic-gate 	}
8027c478bd9Sstevel@tonic-gate 
8037c478bd9Sstevel@tonic-gate 	/*
8047c478bd9Sstevel@tonic-gate 	 * if the chunk is a candidate for translation but a
8057c478bd9Sstevel@tonic-gate 	 * translation does not exist, then read through to the
8067c478bd9Sstevel@tonic-gate 	 * original file system.  The rwlock is held until the read
8077c478bd9Sstevel@tonic-gate 	 * completes if it hasn't been translated to make sure the
8087c478bd9Sstevel@tonic-gate 	 * file system does not translate the block before we
8097c478bd9Sstevel@tonic-gate 	 * access it. If it has already been translated we don't
8107c478bd9Sstevel@tonic-gate 	 * need the lock, because the translation will never go away.
8117c478bd9Sstevel@tonic-gate 	 */
8127c478bd9Sstevel@tonic-gate 	rw_enter(&cmap->cmap_rwlock, RW_READER);
8137c478bd9Sstevel@tonic-gate 	if (isclr(cmap->cmap_hastrans, chunk)) {
8147c478bd9Sstevel@tonic-gate 		snapbuf = getrbuf(KM_SLEEP);
8157c478bd9Sstevel@tonic-gate 		/*
8167c478bd9Sstevel@tonic-gate 		 * Reading into the buffer saves having to do a copy,
8177c478bd9Sstevel@tonic-gate 		 * but gets tricky if the request size is not a
8187c478bd9Sstevel@tonic-gate 		 * multiple of DEV_BSIZE.  However, we are filling the
8197c478bd9Sstevel@tonic-gate 		 * buffer left to right, so future reads will write
8207c478bd9Sstevel@tonic-gate 		 * over any extra data we might have read.
8217c478bd9Sstevel@tonic-gate 		 */
8227c478bd9Sstevel@tonic-gate 
8237c478bd9Sstevel@tonic-gate 		partial = len % DEV_BSIZE;
8247c478bd9Sstevel@tonic-gate 
8257c478bd9Sstevel@tonic-gate 		snapbuf->b_bcount = len;
8267c478bd9Sstevel@tonic-gate 		snapbuf->b_lblkno = lbtodb(chunk * cmap->cmap_chunksz + offset);
8277c478bd9Sstevel@tonic-gate 		snapbuf->b_un.b_addr = buffer;
8287c478bd9Sstevel@tonic-gate 
8297c478bd9Sstevel@tonic-gate 		snapbuf->b_iodone = NULL;
8307c478bd9Sstevel@tonic-gate 		snapbuf->b_proc = NULL;		/* i.e. the kernel */
8317c478bd9Sstevel@tonic-gate 		snapbuf->b_flags = B_READ | B_BUSY;
8327c478bd9Sstevel@tonic-gate 		snapbuf->b_edev = sidp->sid_fvp->v_vfsp->vfs_dev;
8337c478bd9Sstevel@tonic-gate 
8347c478bd9Sstevel@tonic-gate 		if (partial) {
8357c478bd9Sstevel@tonic-gate 			/*
8367c478bd9Sstevel@tonic-gate 			 * Partial block read in progress.
8377c478bd9Sstevel@tonic-gate 			 * This is bad as modules further down the line
8387c478bd9Sstevel@tonic-gate 			 * assume buf's are exact multiples of DEV_BSIZE
8397c478bd9Sstevel@tonic-gate 			 * and we end up with fewer, or zero, bytes read.
8407c478bd9Sstevel@tonic-gate 			 * To get round this we need to round up to the
8417c478bd9Sstevel@tonic-gate 			 * nearest full block read and then return only
8427c478bd9Sstevel@tonic-gate 			 * len bytes.
8437c478bd9Sstevel@tonic-gate 			 */
8447c478bd9Sstevel@tonic-gate 			newlen = (len - partial) + DEV_BSIZE;
8457c478bd9Sstevel@tonic-gate 			newbuffer = kmem_alloc(newlen, KM_SLEEP);
8467c478bd9Sstevel@tonic-gate 
8477c478bd9Sstevel@tonic-gate 			snapbuf->b_bcount = newlen;
8487c478bd9Sstevel@tonic-gate 			snapbuf->b_un.b_addr = newbuffer;
8497c478bd9Sstevel@tonic-gate 		}
8507c478bd9Sstevel@tonic-gate 
8517c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(snapbuf);
8527c478bd9Sstevel@tonic-gate 		(void) biowait(snapbuf);
8537c478bd9Sstevel@tonic-gate 
8547c478bd9Sstevel@tonic-gate 		error = geterror(snapbuf);
8557c478bd9Sstevel@tonic-gate 
8567c478bd9Sstevel@tonic-gate 		if (partial) {
8577c478bd9Sstevel@tonic-gate 			/*
8587c478bd9Sstevel@tonic-gate 			 * Partial block read. Now we need to bcopy the
8597c478bd9Sstevel@tonic-gate 			 * correct number of bytes back into the
8607c478bd9Sstevel@tonic-gate 			 * supplied buffer, and tidy up our temp
8617c478bd9Sstevel@tonic-gate 			 * buffer.
8627c478bd9Sstevel@tonic-gate 			 */
8637c478bd9Sstevel@tonic-gate 			bcopy(newbuffer, buffer, len);
8647c478bd9Sstevel@tonic-gate 			kmem_free(newbuffer, newlen);
8657c478bd9Sstevel@tonic-gate 		}
8667c478bd9Sstevel@tonic-gate 
8677c478bd9Sstevel@tonic-gate 		freerbuf(snapbuf);
8687c478bd9Sstevel@tonic-gate 		rw_exit(&cmap->cmap_rwlock);
8697c478bd9Sstevel@tonic-gate 
8707c478bd9Sstevel@tonic-gate 		return (error);
8717c478bd9Sstevel@tonic-gate 	}
8727c478bd9Sstevel@tonic-gate 
8737c478bd9Sstevel@tonic-gate 	/*
8747c478bd9Sstevel@tonic-gate 	 * finally, if the chunk is a candidate for translation and it
8757c478bd9Sstevel@tonic-gate 	 * has been translated, then we clone the chunk of the buffer
8767c478bd9Sstevel@tonic-gate 	 * that was copied aside by the file system.
8777c478bd9Sstevel@tonic-gate 	 * The cmap_rwlock does not need to be held after we know the
8787c478bd9Sstevel@tonic-gate 	 * data has already been copied. Once a chunk has been copied
8797c478bd9Sstevel@tonic-gate 	 * to the backing file, it is stable read only data.
8807c478bd9Sstevel@tonic-gate 	 */
8817c478bd9Sstevel@tonic-gate 	cmn = transtbl_get(cmap, chunk);
8827c478bd9Sstevel@tonic-gate 
8837c478bd9Sstevel@tonic-gate 	/* check whether the data is in memory or in the backing file */
8847c478bd9Sstevel@tonic-gate 	if (cmn != NULL) {
8857c478bd9Sstevel@tonic-gate 		ASSERT(cmn->cmn_buf);
8867c478bd9Sstevel@tonic-gate 		/* already in memory */
8877c478bd9Sstevel@tonic-gate 		bcopy(cmn->cmn_buf + offset, buffer, len);
8887c478bd9Sstevel@tonic-gate 		rw_exit(&cmap->cmap_rwlock);
8897c478bd9Sstevel@tonic-gate 	} else {
8907c478bd9Sstevel@tonic-gate 		ssize_t resid = len;
8917c478bd9Sstevel@tonic-gate 		int	bf_index;
8927c478bd9Sstevel@tonic-gate 		/*
8937c478bd9Sstevel@tonic-gate 		 * can cause deadlock with writer if we don't drop the
8947c478bd9Sstevel@tonic-gate 		 * cmap_rwlock before trying to get the backing store file
8957c478bd9Sstevel@tonic-gate 		 * vnode rwlock.
8967c478bd9Sstevel@tonic-gate 		 */
8977c478bd9Sstevel@tonic-gate 		rw_exit(&cmap->cmap_rwlock);
8987c478bd9Sstevel@tonic-gate 
8997c478bd9Sstevel@tonic-gate 		bf_index = chunk / cmap->cmap_chunksperbf;
9007c478bd9Sstevel@tonic-gate 
9017c478bd9Sstevel@tonic-gate 		/* read buffer from backing file */
9027c478bd9Sstevel@tonic-gate 		error = vn_rdwr(UIO_READ,
9037c478bd9Sstevel@tonic-gate 		    (sidp->sid_cowinfo->cow_backfile_array)[bf_index],
9047c478bd9Sstevel@tonic-gate 		    buffer, len, ((chunk % cmap->cmap_chunksperbf) *
9057c478bd9Sstevel@tonic-gate 		    cmap->cmap_chunksz) + offset, UIO_SYSSPACE, 0,
9067c478bd9Sstevel@tonic-gate 		    RLIM64_INFINITY, kcred, &resid);
9077c478bd9Sstevel@tonic-gate 	}
9087c478bd9Sstevel@tonic-gate 
9097c478bd9Sstevel@tonic-gate 	return (error);
9107c478bd9Sstevel@tonic-gate }
9117c478bd9Sstevel@tonic-gate 
9127c478bd9Sstevel@tonic-gate /*
9137c478bd9Sstevel@tonic-gate  * snap_print() - snapshot driver print(9E) routine
9147c478bd9Sstevel@tonic-gate  *
9157c478bd9Sstevel@tonic-gate  *    prints the device identification string.
9167c478bd9Sstevel@tonic-gate  */
9177c478bd9Sstevel@tonic-gate static int
9187c478bd9Sstevel@tonic-gate snap_print(dev_t dev, char *str)
9197c478bd9Sstevel@tonic-gate {
9207c478bd9Sstevel@tonic-gate 	struct snapshot_id **sidpp;
9217c478bd9Sstevel@tonic-gate 	minor_t		minor;
9227c478bd9Sstevel@tonic-gate 
9237c478bd9Sstevel@tonic-gate 	minor = getminor(dev);
9247c478bd9Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
9257c478bd9Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
9267c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
9277c478bd9Sstevel@tonic-gate 		    "snap_print: could not find state for snapshot %d.", minor);
9287c478bd9Sstevel@tonic-gate 		return (ENXIO);
9297c478bd9Sstevel@tonic-gate 	}
9307c478bd9Sstevel@tonic-gate 
9317c478bd9Sstevel@tonic-gate 	cmn_err(CE_NOTE, "snap_print: snapshot %d: %s",  minor, str);
9327c478bd9Sstevel@tonic-gate 
9337c478bd9Sstevel@tonic-gate 	return (0);
9347c478bd9Sstevel@tonic-gate }
9357c478bd9Sstevel@tonic-gate 
9367c478bd9Sstevel@tonic-gate /*
9377c478bd9Sstevel@tonic-gate  * snap_prop_op() - snapshot driver prop_op(9E) routine
9387c478bd9Sstevel@tonic-gate  *
9397c478bd9Sstevel@tonic-gate  *    get 32-bit and 64-bit values for size (character driver) and nblocks
9407c478bd9Sstevel@tonic-gate  *    (block driver).
9417c478bd9Sstevel@tonic-gate  */
9427c478bd9Sstevel@tonic-gate static int
9437c478bd9Sstevel@tonic-gate snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
9447c478bd9Sstevel@tonic-gate     int flags, char *name, caddr_t valuep, int *lengthp)
9457c478bd9Sstevel@tonic-gate {
9467c478bd9Sstevel@tonic-gate 	struct snapshot_id **sidpp;
9477c478bd9Sstevel@tonic-gate 	int		length, km_flags;
9487c478bd9Sstevel@tonic-gate 	int		nblocks, size;
9497c478bd9Sstevel@tonic-gate 	uint64_t	Size, Nblocks;
9507c478bd9Sstevel@tonic-gate 	caddr_t		buffer;
9517c478bd9Sstevel@tonic-gate 	int		minor;
9527c478bd9Sstevel@tonic-gate 	dev_t		mdev;
9537c478bd9Sstevel@tonic-gate 
9547c478bd9Sstevel@tonic-gate 	minor = getminor(dev);
9557c478bd9Sstevel@tonic-gate 	length = *lengthp;		/* Get callers length */
9567c478bd9Sstevel@tonic-gate 
9577c478bd9Sstevel@tonic-gate 	/* if this is the control device just check for .conf properties */
9587c478bd9Sstevel@tonic-gate 	if (minor == SNAP_CTL_MINOR)
9597c478bd9Sstevel@tonic-gate 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
9607c478bd9Sstevel@tonic-gate 			valuep, lengthp));
9617c478bd9Sstevel@tonic-gate 	/* check to see if there is a master device plumbed */
9627c478bd9Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
9637c478bd9Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
9647c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
9657c478bd9Sstevel@tonic-gate 		    "snap_prop_op: could not find state for "
9667c478bd9Sstevel@tonic-gate 		    "snapshot %d.", minor);
9677c478bd9Sstevel@tonic-gate 		return (DDI_PROP_NOT_FOUND);
9687c478bd9Sstevel@tonic-gate 	}
9697c478bd9Sstevel@tonic-gate 
9707c478bd9Sstevel@tonic-gate 	if (((*sidpp)->sid_fvp == NULL) || ((*sidpp)->sid_fvp->v_vfsp == NULL))
9717c478bd9Sstevel@tonic-gate 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
9727c478bd9Sstevel@tonic-gate 			valuep, lengthp));
9737c478bd9Sstevel@tonic-gate 	mdev = (*sidpp)->sid_fvp->v_vfsp->vfs_dev;
9747c478bd9Sstevel@tonic-gate 
9757c478bd9Sstevel@tonic-gate 	/* get size information from the master device. */
9767c478bd9Sstevel@tonic-gate 
9777c478bd9Sstevel@tonic-gate 	if (strcmp(name, "nblocks") == 0) {
9787c478bd9Sstevel@tonic-gate 		nblocks = bdev_size(mdev);
9797c478bd9Sstevel@tonic-gate 		*lengthp = sizeof (nblocks);	/* Set callers length */
9807c478bd9Sstevel@tonic-gate 	} else if (strcmp(name, "Nblocks") == 0) {
9817c478bd9Sstevel@tonic-gate 		Nblocks = bdev_Size(mdev);
9827c478bd9Sstevel@tonic-gate 		*lengthp = sizeof (Nblocks);	/* Set callers length */
9837c478bd9Sstevel@tonic-gate 	} else if (strcmp(name, "size") == 0) {
9847c478bd9Sstevel@tonic-gate 		size = cdev_size(mdev);
9857c478bd9Sstevel@tonic-gate 		*lengthp = sizeof (size);	/* Set callers length */
9867c478bd9Sstevel@tonic-gate 	} else if (strcmp(name, "Size") == 0) {
9877c478bd9Sstevel@tonic-gate 		Size = cdev_Size(mdev);
9887c478bd9Sstevel@tonic-gate 		*lengthp = sizeof (Size);	/* Set callers length */
9897c478bd9Sstevel@tonic-gate 	} else {	/* not for us */
9907c478bd9Sstevel@tonic-gate 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
9917c478bd9Sstevel@tonic-gate 		    valuep, lengthp));
9927c478bd9Sstevel@tonic-gate 	}
9937c478bd9Sstevel@tonic-gate 
9947c478bd9Sstevel@tonic-gate 	/*
9957c478bd9Sstevel@tonic-gate 	 * If length only request, just return the length.
9967c478bd9Sstevel@tonic-gate 	 */
9977c478bd9Sstevel@tonic-gate 	if (prop_op == PROP_LEN)  {
9987c478bd9Sstevel@tonic-gate 		return (DDI_PROP_SUCCESS);
9997c478bd9Sstevel@tonic-gate 	}
10007c478bd9Sstevel@tonic-gate 
10017c478bd9Sstevel@tonic-gate 	/*
10027c478bd9Sstevel@tonic-gate 	 * Allocate buffer, if required.  Either way, set `buffer' variable.
10037c478bd9Sstevel@tonic-gate 	 */
10047c478bd9Sstevel@tonic-gate 	switch (prop_op)  {
10057c478bd9Sstevel@tonic-gate 	case PROP_LEN_AND_VAL_ALLOC:
10067c478bd9Sstevel@tonic-gate 
10077c478bd9Sstevel@tonic-gate 		km_flags = KM_NOSLEEP;
10087c478bd9Sstevel@tonic-gate 
10097c478bd9Sstevel@tonic-gate 		if (flags & DDI_PROP_CANSLEEP)
10107c478bd9Sstevel@tonic-gate 			km_flags = KM_SLEEP;
10117c478bd9Sstevel@tonic-gate 
10127c478bd9Sstevel@tonic-gate 		buffer = kmem_alloc(*lengthp, km_flags);
10137c478bd9Sstevel@tonic-gate 		if (buffer == NULL)  {
10147c478bd9Sstevel@tonic-gate 			cmn_err(CE_WARN, "snap_get_prop: no mem for "
10157c478bd9Sstevel@tonic-gate 			"property %s.", name);
10167c478bd9Sstevel@tonic-gate 			return (DDI_PROP_NO_MEMORY);
10177c478bd9Sstevel@tonic-gate 		}
10187c478bd9Sstevel@tonic-gate 		*(caddr_t *)valuep = buffer; /* Set callers buf ptr */
10197c478bd9Sstevel@tonic-gate 		break;
10207c478bd9Sstevel@tonic-gate 
10217c478bd9Sstevel@tonic-gate 	case PROP_LEN_AND_VAL_BUF:
10227c478bd9Sstevel@tonic-gate 
10237c478bd9Sstevel@tonic-gate 		if (*lengthp > length)
10247c478bd9Sstevel@tonic-gate 			return (DDI_PROP_BUF_TOO_SMALL);
10257c478bd9Sstevel@tonic-gate 
10267c478bd9Sstevel@tonic-gate 		buffer = valuep; /* get callers buf ptr */
10277c478bd9Sstevel@tonic-gate 		break;
10287c478bd9Sstevel@tonic-gate 	}
10297c478bd9Sstevel@tonic-gate 
10307c478bd9Sstevel@tonic-gate 	if (strcmp(name, "nblocks") == 0) {
10317c478bd9Sstevel@tonic-gate 		*((uint_t *)buffer) = nblocks;
10327c478bd9Sstevel@tonic-gate 	} else if (strcmp(name, "Nblocks") == 0) {
10337c478bd9Sstevel@tonic-gate 		*((uint64_t *)buffer) = Nblocks;
10347c478bd9Sstevel@tonic-gate 	} else if (strcmp(name, "size") == 0) {
10357c478bd9Sstevel@tonic-gate 		*((uint_t *)buffer) = size;
10367c478bd9Sstevel@tonic-gate 	} else if (strcmp(name, "Size") == 0) {
10377c478bd9Sstevel@tonic-gate 		*((uint64_t *)buffer) = Size;
10387c478bd9Sstevel@tonic-gate 	}
10397c478bd9Sstevel@tonic-gate 
10407c478bd9Sstevel@tonic-gate 	return (DDI_PROP_SUCCESS);
10417c478bd9Sstevel@tonic-gate }
10427c478bd9Sstevel@tonic-gate 
10437c478bd9Sstevel@tonic-gate /*
10447c478bd9Sstevel@tonic-gate  * snap_ioctl() - snapshot driver ioctl(9E) routine
10457c478bd9Sstevel@tonic-gate  *
10467c478bd9Sstevel@tonic-gate  *    only applies to the control device.  The control device accepts two
10477c478bd9Sstevel@tonic-gate  *    ioctl requests: create a snapshot or delete a snapshot.  In either
10487c478bd9Sstevel@tonic-gate  *    case, the vnode for the requested file system is extracted, and the
10497c478bd9Sstevel@tonic-gate  *    request is passed on to the file system via the same ioctl.  The file
10507c478bd9Sstevel@tonic-gate  *    system is responsible for doing the things necessary for creating or
10517c478bd9Sstevel@tonic-gate  *    destroying a snapshot, including any file system specific operations
10527c478bd9Sstevel@tonic-gate  *    that must be performed as well as setting up and deleting the snapshot
10537c478bd9Sstevel@tonic-gate  *    state through the fssnap interfaces.
10547c478bd9Sstevel@tonic-gate  */
10557c478bd9Sstevel@tonic-gate static int
10567c478bd9Sstevel@tonic-gate snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
10577c478bd9Sstevel@tonic-gate int *rvalp)
10587c478bd9Sstevel@tonic-gate {
10597c478bd9Sstevel@tonic-gate 	minor_t	minor;
10607c478bd9Sstevel@tonic-gate 	int error = 0;
10617c478bd9Sstevel@tonic-gate 
10627c478bd9Sstevel@tonic-gate 	minor = getminor(dev);
10637c478bd9Sstevel@tonic-gate 
10647c478bd9Sstevel@tonic-gate 	if (minor != SNAP_CTL_MINOR) {
10657c478bd9Sstevel@tonic-gate 		return (EINVAL);
10667c478bd9Sstevel@tonic-gate 	}
10677c478bd9Sstevel@tonic-gate 
10687c478bd9Sstevel@tonic-gate 	switch (cmd) {
10697c478bd9Sstevel@tonic-gate 	case _FIOSNAPSHOTCREATE:
10707c478bd9Sstevel@tonic-gate 	{
10717c478bd9Sstevel@tonic-gate 		struct fiosnapcreate	fc;
10727c478bd9Sstevel@tonic-gate 		struct file		*fp;
10737c478bd9Sstevel@tonic-gate 		struct vnode		*vp;
10747c478bd9Sstevel@tonic-gate 
10757c478bd9Sstevel@tonic-gate 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
10767c478bd9Sstevel@tonic-gate 			return (EFAULT);
10777c478bd9Sstevel@tonic-gate 
10787c478bd9Sstevel@tonic-gate 		/* get vnode for file system mount point */
10797c478bd9Sstevel@tonic-gate 		if ((fp = getf(fc.rootfiledesc)) == NULL)
10807c478bd9Sstevel@tonic-gate 			return (EBADF);
10817c478bd9Sstevel@tonic-gate 
10827c478bd9Sstevel@tonic-gate 		ASSERT(fp->f_vnode);
10837c478bd9Sstevel@tonic-gate 		vp = fp->f_vnode;
10847c478bd9Sstevel@tonic-gate 		VN_HOLD(vp);
10857c478bd9Sstevel@tonic-gate 		releasef(fc.rootfiledesc);
10867c478bd9Sstevel@tonic-gate 
10877c478bd9Sstevel@tonic-gate 		/* pass ioctl request to file system */
10887c478bd9Sstevel@tonic-gate 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp);
10897c478bd9Sstevel@tonic-gate 		VN_RELE(vp);
10907c478bd9Sstevel@tonic-gate 		break;
10917c478bd9Sstevel@tonic-gate 	}
10927c478bd9Sstevel@tonic-gate 	case _FIOSNAPSHOTCREATE_MULTI:
10937c478bd9Sstevel@tonic-gate 	{
10947c478bd9Sstevel@tonic-gate 		struct fiosnapcreate_multi	fc;
10957c478bd9Sstevel@tonic-gate 		struct file		*fp;
10967c478bd9Sstevel@tonic-gate 		struct vnode		*vp;
10977c478bd9Sstevel@tonic-gate 
10987c478bd9Sstevel@tonic-gate 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
10997c478bd9Sstevel@tonic-gate 			return (EFAULT);
11007c478bd9Sstevel@tonic-gate 
11017c478bd9Sstevel@tonic-gate 		/* get vnode for file system mount point */
11027c478bd9Sstevel@tonic-gate 		if ((fp = getf(fc.rootfiledesc)) == NULL)
11037c478bd9Sstevel@tonic-gate 			return (EBADF);
11047c478bd9Sstevel@tonic-gate 
11057c478bd9Sstevel@tonic-gate 		ASSERT(fp->f_vnode);
11067c478bd9Sstevel@tonic-gate 		vp = fp->f_vnode;
11077c478bd9Sstevel@tonic-gate 		VN_HOLD(vp);
11087c478bd9Sstevel@tonic-gate 		releasef(fc.rootfiledesc);
11097c478bd9Sstevel@tonic-gate 
11107c478bd9Sstevel@tonic-gate 		/* pass ioctl request to file system */
11117c478bd9Sstevel@tonic-gate 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp);
11127c478bd9Sstevel@tonic-gate 		VN_RELE(vp);
11137c478bd9Sstevel@tonic-gate 		break;
11147c478bd9Sstevel@tonic-gate 	}
11157c478bd9Sstevel@tonic-gate 	case _FIOSNAPSHOTDELETE:
11167c478bd9Sstevel@tonic-gate 	{
11177c478bd9Sstevel@tonic-gate 		major_t			major;
11187c478bd9Sstevel@tonic-gate 		struct fiosnapdelete	fc;
11197c478bd9Sstevel@tonic-gate 		snapshot_id_t		*sidp = NULL;
11207c478bd9Sstevel@tonic-gate 		snapshot_id_t		*sidnextp = NULL;
11217c478bd9Sstevel@tonic-gate 		struct file		*fp = NULL;
11227c478bd9Sstevel@tonic-gate 		struct vnode		*vp = NULL;
11237c478bd9Sstevel@tonic-gate 		struct vfs 		*vfsp = NULL;
11247c478bd9Sstevel@tonic-gate 		vfsops_t		*vfsops = EIO_vfsops;
11257c478bd9Sstevel@tonic-gate 
11267c478bd9Sstevel@tonic-gate 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
11277c478bd9Sstevel@tonic-gate 			return (EFAULT);
11287c478bd9Sstevel@tonic-gate 
11297c478bd9Sstevel@tonic-gate 		/* get vnode for file system mount point */
11307c478bd9Sstevel@tonic-gate 		if ((fp = getf(fc.rootfiledesc)) == NULL)
11317c478bd9Sstevel@tonic-gate 			return (EBADF);
11327c478bd9Sstevel@tonic-gate 
11337c478bd9Sstevel@tonic-gate 		ASSERT(fp->f_vnode);
11347c478bd9Sstevel@tonic-gate 		vp = fp->f_vnode;
11357c478bd9Sstevel@tonic-gate 		VN_HOLD(vp);
11367c478bd9Sstevel@tonic-gate 		releasef(fc.rootfiledesc);
11377c478bd9Sstevel@tonic-gate 		/*
11387c478bd9Sstevel@tonic-gate 		 * Test for two formats of delete and set correct minor/vp:
11397c478bd9Sstevel@tonic-gate 		 * pseudo device:
11407c478bd9Sstevel@tonic-gate 		 * fssnap -d [/dev/fssnap/x]
11417c478bd9Sstevel@tonic-gate 		 * or
11427c478bd9Sstevel@tonic-gate 		 * mount point:
11437c478bd9Sstevel@tonic-gate 		 * fssnap -d [/mntpt]
11447c478bd9Sstevel@tonic-gate 		 * Note that minor is verified to be equal to SNAP_CTL_MINOR
11457c478bd9Sstevel@tonic-gate 		 * at this point which is an invalid minor number.
11467c478bd9Sstevel@tonic-gate 		 */
11477c478bd9Sstevel@tonic-gate 		ASSERT(fssnap_dip != NULL);
11487c478bd9Sstevel@tonic-gate 		major = ddi_driver_major(fssnap_dip);
11497c478bd9Sstevel@tonic-gate 		mutex_enter(&snapshot_mutex);
11507c478bd9Sstevel@tonic-gate 		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
11517c478bd9Sstevel@tonic-gate 			rw_enter(&sidp->sid_rwlock, RW_READER);
11527c478bd9Sstevel@tonic-gate 			sidnextp = sidp->sid_next;
11537c478bd9Sstevel@tonic-gate 			/* pseudo device: */
11547c478bd9Sstevel@tonic-gate 			if (major == getmajor(vp->v_rdev)) {
11557c478bd9Sstevel@tonic-gate 				minor = getminor(vp->v_rdev);
11567c478bd9Sstevel@tonic-gate 				if (sidp->sid_snapnumber == (uint_t)minor &&
11577c478bd9Sstevel@tonic-gate 				    sidp->sid_fvp) {
11587c478bd9Sstevel@tonic-gate 					VN_RELE(vp);
11597c478bd9Sstevel@tonic-gate 					vp = sidp->sid_fvp;
11607c478bd9Sstevel@tonic-gate 					VN_HOLD(vp);
11617c478bd9Sstevel@tonic-gate 					rw_exit(&sidp->sid_rwlock);
11627c478bd9Sstevel@tonic-gate 					break;
11637c478bd9Sstevel@tonic-gate 				}
11647c478bd9Sstevel@tonic-gate 			/* Mount point: */
11657c478bd9Sstevel@tonic-gate 			} else {
11667c478bd9Sstevel@tonic-gate 				if (sidp->sid_fvp == vp) {
11677c478bd9Sstevel@tonic-gate 					minor = sidp->sid_snapnumber;
11687c478bd9Sstevel@tonic-gate 					rw_exit(&sidp->sid_rwlock);
11697c478bd9Sstevel@tonic-gate 					break;
11707c478bd9Sstevel@tonic-gate 				}
11717c478bd9Sstevel@tonic-gate 			}
11727c478bd9Sstevel@tonic-gate 			rw_exit(&sidp->sid_rwlock);
11737c478bd9Sstevel@tonic-gate 		}
11747c478bd9Sstevel@tonic-gate 		mutex_exit(&snapshot_mutex);
11757c478bd9Sstevel@tonic-gate 		/* Verify minor got set correctly above */
11767c478bd9Sstevel@tonic-gate 		if (minor == SNAP_CTL_MINOR) {
11777c478bd9Sstevel@tonic-gate 			VN_RELE(vp);
11787c478bd9Sstevel@tonic-gate 			return (EINVAL);
11797c478bd9Sstevel@tonic-gate 		}
11807c478bd9Sstevel@tonic-gate 		dev = makedevice(major, minor);
11817c478bd9Sstevel@tonic-gate 		/*
11827c478bd9Sstevel@tonic-gate 		 * Create dummy vfs entry
11837c478bd9Sstevel@tonic-gate 		 * to use as a locking semaphore across the IOCTL
11847c478bd9Sstevel@tonic-gate 		 * for mount in progress cases...
11857c478bd9Sstevel@tonic-gate 		 */
11867c478bd9Sstevel@tonic-gate 		vfsp = kmem_alloc(sizeof (vfs_t), KM_SLEEP);
11877c478bd9Sstevel@tonic-gate 		VFS_INIT(vfsp, vfsops, NULL);
1188*ddfcde86Srsb 		VFS_HOLD(vfsp);
11897c478bd9Sstevel@tonic-gate 		vfs_addmip(dev, vfsp);
11907c478bd9Sstevel@tonic-gate 		if ((vfs_devmounting(dev, vfsp)) ||
11917c478bd9Sstevel@tonic-gate 		    (vfs_devismounted(dev))) {
11927c478bd9Sstevel@tonic-gate 			vfs_delmip(vfsp);
1193*ddfcde86Srsb 			VFS_RELE(vfsp);
11947c478bd9Sstevel@tonic-gate 			VN_RELE(vp);
11957c478bd9Sstevel@tonic-gate 			return (EBUSY);
11967c478bd9Sstevel@tonic-gate 		}
11977c478bd9Sstevel@tonic-gate 		/*
11987c478bd9Sstevel@tonic-gate 		 * Nobody mounted but do not release mount in progress lock
11997c478bd9Sstevel@tonic-gate 		 * until IOCTL complete to prohibit a mount sneaking
12007c478bd9Sstevel@tonic-gate 		 * in
12017c478bd9Sstevel@tonic-gate 		 */
12027c478bd9Sstevel@tonic-gate 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp);
12037c478bd9Sstevel@tonic-gate 		vfs_delmip(vfsp);
1204*ddfcde86Srsb 		VFS_RELE(vfsp);
12057c478bd9Sstevel@tonic-gate 		VN_RELE(vp);
12067c478bd9Sstevel@tonic-gate 		break;
12077c478bd9Sstevel@tonic-gate 	}
12087c478bd9Sstevel@tonic-gate 	default:
12097c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_ioctl: Invalid ioctl cmd %d, minor %d.",
12107c478bd9Sstevel@tonic-gate 		    cmd, minor);
12117c478bd9Sstevel@tonic-gate 		return (EINVAL);
12127c478bd9Sstevel@tonic-gate 	}
12137c478bd9Sstevel@tonic-gate 
12147c478bd9Sstevel@tonic-gate 	return (error);
12157c478bd9Sstevel@tonic-gate }
12167c478bd9Sstevel@tonic-gate 
12177c478bd9Sstevel@tonic-gate 
12187c478bd9Sstevel@tonic-gate /* ************************************************************************ */
12197c478bd9Sstevel@tonic-gate 
12207c478bd9Sstevel@tonic-gate /*
12217c478bd9Sstevel@tonic-gate  * Translation Table Routines
12227c478bd9Sstevel@tonic-gate  *
12237c478bd9Sstevel@tonic-gate  *    These support routines implement a simple doubly linked list
12247c478bd9Sstevel@tonic-gate  *    to keep track of chunks that are currently in memory.  The maximum
12257c478bd9Sstevel@tonic-gate  *    size of the list is determined by the fssnap_max_mem_chunks variable.
12267c478bd9Sstevel@tonic-gate  *    The cmap_rwlock is used to protect the linkage of the list.
12277c478bd9Sstevel@tonic-gate  */
12287c478bd9Sstevel@tonic-gate 
12297c478bd9Sstevel@tonic-gate /*
12307c478bd9Sstevel@tonic-gate  * transtbl_add() - add a node to the translation table
12317c478bd9Sstevel@tonic-gate  *
12327c478bd9Sstevel@tonic-gate  *    allocates a new node and points it at the buffer passed in.  The node
12337c478bd9Sstevel@tonic-gate  *    is added to the beginning of the doubly linked list and the head of
12347c478bd9Sstevel@tonic-gate  *    the list is moved.  The cmap_rwlock must be held as a writer through
12357c478bd9Sstevel@tonic-gate  *    this operation.
12367c478bd9Sstevel@tonic-gate  */
12377c478bd9Sstevel@tonic-gate static cow_map_node_t *
12387c478bd9Sstevel@tonic-gate transtbl_add(cow_map_t *cmap, chunknumber_t chunk, caddr_t buf)
12397c478bd9Sstevel@tonic-gate {
12407c478bd9Sstevel@tonic-gate 	cow_map_node_t	*cmnode;
12417c478bd9Sstevel@tonic-gate 
12427c478bd9Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
12437c478bd9Sstevel@tonic-gate 
12447c478bd9Sstevel@tonic-gate 	cmnode = kmem_alloc(sizeof (cow_map_node_t), KM_SLEEP);
12457c478bd9Sstevel@tonic-gate 
12467c478bd9Sstevel@tonic-gate 	/*
12477c478bd9Sstevel@tonic-gate 	 * insert new translations at the beginning so cmn_table is always
12487c478bd9Sstevel@tonic-gate 	 * the first node.
12497c478bd9Sstevel@tonic-gate 	 */
12507c478bd9Sstevel@tonic-gate 	cmnode->cmn_chunk = chunk;
12517c478bd9Sstevel@tonic-gate 	cmnode->cmn_buf = buf;
12527c478bd9Sstevel@tonic-gate 	cmnode->cmn_prev = NULL;
12537c478bd9Sstevel@tonic-gate 	cmnode->cmn_next = cmap->cmap_table;
12547c478bd9Sstevel@tonic-gate 	if (cmnode->cmn_next)
12557c478bd9Sstevel@tonic-gate 		cmnode->cmn_next->cmn_prev = cmnode;
12567c478bd9Sstevel@tonic-gate 	cmap->cmap_table = cmnode;
12577c478bd9Sstevel@tonic-gate 
12587c478bd9Sstevel@tonic-gate 	return (cmnode);
12597c478bd9Sstevel@tonic-gate }
12607c478bd9Sstevel@tonic-gate 
12617c478bd9Sstevel@tonic-gate /*
12627c478bd9Sstevel@tonic-gate  * transtbl_get() - look up a node in the translation table
12637c478bd9Sstevel@tonic-gate  *
12647c478bd9Sstevel@tonic-gate  *    called by the snapshot driver to find data that has been translated.
12657c478bd9Sstevel@tonic-gate  *    The lookup is done by the chunk number, and the node is returned.
12667c478bd9Sstevel@tonic-gate  *    If the node was not found, NULL is returned.
12677c478bd9Sstevel@tonic-gate  */
12687c478bd9Sstevel@tonic-gate static cow_map_node_t *
12697c478bd9Sstevel@tonic-gate transtbl_get(cow_map_t *cmap, chunknumber_t chunk)
12707c478bd9Sstevel@tonic-gate {
12717c478bd9Sstevel@tonic-gate 	cow_map_node_t *cmn;
12727c478bd9Sstevel@tonic-gate 
12737c478bd9Sstevel@tonic-gate 	ASSERT(RW_READ_HELD(&cmap->cmap_rwlock));
12747c478bd9Sstevel@tonic-gate 	ASSERT(cmap);
12757c478bd9Sstevel@tonic-gate 
12767c478bd9Sstevel@tonic-gate 	/* search the translation table */
12777c478bd9Sstevel@tonic-gate 	for (cmn = cmap->cmap_table; cmn != NULL; cmn = cmn->cmn_next) {
12787c478bd9Sstevel@tonic-gate 		if (cmn->cmn_chunk == chunk)
12797c478bd9Sstevel@tonic-gate 			return (cmn);
12807c478bd9Sstevel@tonic-gate 	}
12817c478bd9Sstevel@tonic-gate 
12827c478bd9Sstevel@tonic-gate 	/* not found */
12837c478bd9Sstevel@tonic-gate 	return (NULL);
12847c478bd9Sstevel@tonic-gate }
12857c478bd9Sstevel@tonic-gate 
12867c478bd9Sstevel@tonic-gate /*
12877c478bd9Sstevel@tonic-gate  * transtbl_delete() - delete a node from the translation table
12887c478bd9Sstevel@tonic-gate  *
12897c478bd9Sstevel@tonic-gate  *    called when a node's data has been written out to disk.  The
12907c478bd9Sstevel@tonic-gate  *    cmap_rwlock must be held as a writer for this operation.  If the node
12917c478bd9Sstevel@tonic-gate  *    being deleted is the head of the list, then the head is moved to the
12927c478bd9Sstevel@tonic-gate  *    next node.  Both the node's data and the node itself are freed.
12937c478bd9Sstevel@tonic-gate  */
12947c478bd9Sstevel@tonic-gate static void
12957c478bd9Sstevel@tonic-gate transtbl_delete(cow_map_t *cmap, cow_map_node_t *cmn)
12967c478bd9Sstevel@tonic-gate {
12977c478bd9Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
12987c478bd9Sstevel@tonic-gate 	ASSERT(cmn);
12997c478bd9Sstevel@tonic-gate 	ASSERT(cmap->cmap_table);
13007c478bd9Sstevel@tonic-gate 
13017c478bd9Sstevel@tonic-gate 	/* if the head of the list is being deleted, then move the head up */
13027c478bd9Sstevel@tonic-gate 	if (cmap->cmap_table == cmn) {
13037c478bd9Sstevel@tonic-gate 		ASSERT(cmn->cmn_prev == NULL);
13047c478bd9Sstevel@tonic-gate 		cmap->cmap_table = cmn->cmn_next;
13057c478bd9Sstevel@tonic-gate 	}
13067c478bd9Sstevel@tonic-gate 
13077c478bd9Sstevel@tonic-gate 
13087c478bd9Sstevel@tonic-gate 	/* make previous node's next pointer skip over current node */
13097c478bd9Sstevel@tonic-gate 	if (cmn->cmn_prev != NULL) {
13107c478bd9Sstevel@tonic-gate 		ASSERT(cmn->cmn_prev->cmn_next == cmn);
13117c478bd9Sstevel@tonic-gate 		cmn->cmn_prev->cmn_next = cmn->cmn_next;
13127c478bd9Sstevel@tonic-gate 	}
13137c478bd9Sstevel@tonic-gate 
13147c478bd9Sstevel@tonic-gate 	/* make next node's previous pointer skip over current node */
13157c478bd9Sstevel@tonic-gate 	if (cmn->cmn_next != NULL) {
13167c478bd9Sstevel@tonic-gate 		ASSERT(cmn->cmn_next->cmn_prev == cmn);
13177c478bd9Sstevel@tonic-gate 		cmn->cmn_next->cmn_prev = cmn->cmn_prev;
13187c478bd9Sstevel@tonic-gate 	}
13197c478bd9Sstevel@tonic-gate 
13207c478bd9Sstevel@tonic-gate 	/* free the data and the node */
13217c478bd9Sstevel@tonic-gate 	ASSERT(cmn->cmn_buf);
13227c478bd9Sstevel@tonic-gate 	kmem_free(cmn->cmn_buf, cmap->cmap_chunksz);
13237c478bd9Sstevel@tonic-gate 	kmem_free(cmn, sizeof (cow_map_node_t));
13247c478bd9Sstevel@tonic-gate }
13257c478bd9Sstevel@tonic-gate 
13267c478bd9Sstevel@tonic-gate /*
13277c478bd9Sstevel@tonic-gate  * transtbl_free() - free the entire translation table
13287c478bd9Sstevel@tonic-gate  *
13297c478bd9Sstevel@tonic-gate  *    called when the snapshot is deleted.  This frees all of the nodes in
13307c478bd9Sstevel@tonic-gate  *    the translation table (but not the bitmaps).
13317c478bd9Sstevel@tonic-gate  */
13327c478bd9Sstevel@tonic-gate static void
13337c478bd9Sstevel@tonic-gate transtbl_free(cow_map_t *cmap)
13347c478bd9Sstevel@tonic-gate {
13357c478bd9Sstevel@tonic-gate 	cow_map_node_t	*curnode;
13367c478bd9Sstevel@tonic-gate 	cow_map_node_t	*tempnode;
13377c478bd9Sstevel@tonic-gate 
13387c478bd9Sstevel@tonic-gate 	for (curnode = cmap->cmap_table; curnode != NULL; curnode = tempnode) {
13397c478bd9Sstevel@tonic-gate 		tempnode = curnode->cmn_next;
13407c478bd9Sstevel@tonic-gate 
13417c478bd9Sstevel@tonic-gate 		kmem_free(curnode->cmn_buf, cmap->cmap_chunksz);
13427c478bd9Sstevel@tonic-gate 		kmem_free(curnode, sizeof (cow_map_node_t));
13437c478bd9Sstevel@tonic-gate 	}
13447c478bd9Sstevel@tonic-gate }
13457c478bd9Sstevel@tonic-gate 
13467c478bd9Sstevel@tonic-gate 
13477c478bd9Sstevel@tonic-gate /* ************************************************************************ */
13487c478bd9Sstevel@tonic-gate 
13497c478bd9Sstevel@tonic-gate /*
13507c478bd9Sstevel@tonic-gate  * Interface Implementation Routines
13517c478bd9Sstevel@tonic-gate  *
13527c478bd9Sstevel@tonic-gate  * The following functions implement snapshot interface routines that are
13537c478bd9Sstevel@tonic-gate  * called by the file system to create, delete, and use a snapshot.  The
13547c478bd9Sstevel@tonic-gate  * interfaces are defined in fssnap_if.c and are filled in by this driver
13557c478bd9Sstevel@tonic-gate  * when it is loaded.  This technique allows the file system to depend on
13567c478bd9Sstevel@tonic-gate  * the interface module without having to load the full implementation and
13577c478bd9Sstevel@tonic-gate  * snapshot device drivers.
13587c478bd9Sstevel@tonic-gate  */
13597c478bd9Sstevel@tonic-gate 
13607c478bd9Sstevel@tonic-gate /*
13617c478bd9Sstevel@tonic-gate  * fssnap_strategy_impl() - strategy routine called by the file system
13627c478bd9Sstevel@tonic-gate  *
13637c478bd9Sstevel@tonic-gate  *    called by the file system to handle copy-on-write when necessary.  All
13647c478bd9Sstevel@tonic-gate  *    reads and writes that the file system performs should go through this
13657c478bd9Sstevel@tonic-gate  *    function.  If the file system calls the underlying device's strategy
13667c478bd9Sstevel@tonic-gate  *    routine without going through fssnap_strategy() (eg. by calling
13677c478bd9Sstevel@tonic-gate  *    bdev_strategy()), the snapshot may not be consistent.
13687c478bd9Sstevel@tonic-gate  *
13697c478bd9Sstevel@tonic-gate  *    This function starts by doing significant sanity checking to insure
13707c478bd9Sstevel@tonic-gate  *    the snapshot was not deleted out from under it or deleted and then
13717c478bd9Sstevel@tonic-gate  *    recreated.  To do this, it checks the actual pointer passed into it
13727c478bd9Sstevel@tonic-gate  *    (ie. the handle held by the file system).  NOTE that the parameter is
13737c478bd9Sstevel@tonic-gate  *    a POINTER TO A POINTER to the snapshot id.  Once the snapshot id is
13747c478bd9Sstevel@tonic-gate  *    locked, it knows things are ok and that this snapshot is really for
13757c478bd9Sstevel@tonic-gate  *    this file system.
13767c478bd9Sstevel@tonic-gate  *
13777c478bd9Sstevel@tonic-gate  *    If the request is a write, fssnap_translate() is called to determine
13787c478bd9Sstevel@tonic-gate  *    whether a copy-on-write is required.  If it is a read, the read is
13797c478bd9Sstevel@tonic-gate  *    simply passed on to the underlying device.
13807c478bd9Sstevel@tonic-gate  */
13817c478bd9Sstevel@tonic-gate static void
13827c478bd9Sstevel@tonic-gate fssnap_strategy_impl(void *snapshot_id, buf_t *bp)
13837c478bd9Sstevel@tonic-gate {
13847c478bd9Sstevel@tonic-gate 	struct snapshot_id **sidpp;
13857c478bd9Sstevel@tonic-gate 	struct snapshot_id *sidp;
13867c478bd9Sstevel@tonic-gate 	int error;
13877c478bd9Sstevel@tonic-gate 
13887c478bd9Sstevel@tonic-gate 	/* read requests are always passed through */
13897c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_READ) {
13907c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(bp);
13917c478bd9Sstevel@tonic-gate 		return;
13927c478bd9Sstevel@tonic-gate 	}
13937c478bd9Sstevel@tonic-gate 
13947c478bd9Sstevel@tonic-gate 	/*
13957c478bd9Sstevel@tonic-gate 	 * Because we were not able to take the snapshot read lock BEFORE
13967c478bd9Sstevel@tonic-gate 	 * checking for a snapshot back in the file system, things may have
13977c478bd9Sstevel@tonic-gate 	 * drastically changed out from under us.  For instance, the snapshot
13987c478bd9Sstevel@tonic-gate 	 * may have been deleted, deleted and recreated, or worse yet, deleted
13997c478bd9Sstevel@tonic-gate 	 * for this file system but now the snapshot number is in use by another
14007c478bd9Sstevel@tonic-gate 	 * file system.
14017c478bd9Sstevel@tonic-gate 	 *
14027c478bd9Sstevel@tonic-gate 	 * Having a pointer to the file system's snapshot id pointer allows us
14037c478bd9Sstevel@tonic-gate 	 * to sanity check most of this, though it assumes the file system is
14047c478bd9Sstevel@tonic-gate 	 * keeping track of a pointer to the snapshot_id somewhere.
14057c478bd9Sstevel@tonic-gate 	 */
14067c478bd9Sstevel@tonic-gate 	sidpp = (struct snapshot_id **)snapshot_id;
14077c478bd9Sstevel@tonic-gate 	sidp = *sidpp;
14087c478bd9Sstevel@tonic-gate 
14097c478bd9Sstevel@tonic-gate 	/*
14107c478bd9Sstevel@tonic-gate 	 * if this file system's snapshot was disabled, just pass the
14117c478bd9Sstevel@tonic-gate 	 * request through.
14127c478bd9Sstevel@tonic-gate 	 */
14137c478bd9Sstevel@tonic-gate 	if (sidp == NULL) {
14147c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(bp);
14157c478bd9Sstevel@tonic-gate 		return;
14167c478bd9Sstevel@tonic-gate 	}
14177c478bd9Sstevel@tonic-gate 
14187c478bd9Sstevel@tonic-gate 	/*
14197c478bd9Sstevel@tonic-gate 	 * Once we have the reader lock the snapshot will not magically go
14207c478bd9Sstevel@tonic-gate 	 * away.  But things may have changed on us before this so double check.
14217c478bd9Sstevel@tonic-gate 	 */
14227c478bd9Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_READER);
14237c478bd9Sstevel@tonic-gate 
14247c478bd9Sstevel@tonic-gate 	/*
14257c478bd9Sstevel@tonic-gate 	 * if an error was founds somewhere the DELETE flag will be
14267c478bd9Sstevel@tonic-gate 	 * set to indicate the snapshot should be deleted and no new
14277c478bd9Sstevel@tonic-gate 	 * translations should occur.
14287c478bd9Sstevel@tonic-gate 	 */
14297c478bd9Sstevel@tonic-gate 	if (sidp->sid_flags & SID_DELETE) {
14307c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
14317c478bd9Sstevel@tonic-gate 		(void) fssnap_delete_impl(sidpp);
14327c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(bp);
14337c478bd9Sstevel@tonic-gate 		return;
14347c478bd9Sstevel@tonic-gate 	}
14357c478bd9Sstevel@tonic-gate 
14367c478bd9Sstevel@tonic-gate 	/*
14377c478bd9Sstevel@tonic-gate 	 * If the file system is no longer pointing to the snapshot we were
14387c478bd9Sstevel@tonic-gate 	 * called with, then it should not attempt to translate this buffer as
14397c478bd9Sstevel@tonic-gate 	 * it may be going to a snapshot for a different file system.
14407c478bd9Sstevel@tonic-gate 	 * Even if the file system snapshot pointer is still the same, the
14417c478bd9Sstevel@tonic-gate 	 * snapshot may have been disabled before we got the reader lock.
14427c478bd9Sstevel@tonic-gate 	 */
14437c478bd9Sstevel@tonic-gate 	if (sidp != *sidpp || SID_INACTIVE(sidp)) {
14447c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
14457c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(bp);
14467c478bd9Sstevel@tonic-gate 		return;
14477c478bd9Sstevel@tonic-gate 	}
14487c478bd9Sstevel@tonic-gate 
14497c478bd9Sstevel@tonic-gate 	/*
14507c478bd9Sstevel@tonic-gate 	 * At this point we're sure the snapshot will not go away while the
14517c478bd9Sstevel@tonic-gate 	 * reader lock is held, and we are reasonably certain that we are
14527c478bd9Sstevel@tonic-gate 	 * writing to the correct snapshot.
14537c478bd9Sstevel@tonic-gate 	 */
14547c478bd9Sstevel@tonic-gate 	if ((error = fssnap_translate(sidpp, bp)) != 0) {
14557c478bd9Sstevel@tonic-gate 		/*
14567c478bd9Sstevel@tonic-gate 		 * fssnap_translate can release the reader lock if it
14577c478bd9Sstevel@tonic-gate 		 * has to wait for a semaphore.  In this case it is possible
14587c478bd9Sstevel@tonic-gate 		 * for the snapshot to be deleted in this time frame.  If this
14597c478bd9Sstevel@tonic-gate 		 * happens just sent the buf thru to the filesystems device.
14607c478bd9Sstevel@tonic-gate 		 */
14617c478bd9Sstevel@tonic-gate 		if (sidp != *sidpp || SID_INACTIVE(sidp)) {
14627c478bd9Sstevel@tonic-gate 			rw_exit(&sidp->sid_rwlock);
14637c478bd9Sstevel@tonic-gate 			(void) bdev_strategy(bp);
14647c478bd9Sstevel@tonic-gate 			return;
14657c478bd9Sstevel@tonic-gate 		}
14667c478bd9Sstevel@tonic-gate 		bioerror(bp, error);
14677c478bd9Sstevel@tonic-gate 		biodone(bp);
14687c478bd9Sstevel@tonic-gate 	}
14697c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
14707c478bd9Sstevel@tonic-gate }
14717c478bd9Sstevel@tonic-gate 
14727c478bd9Sstevel@tonic-gate /*
14737c478bd9Sstevel@tonic-gate  * fssnap_translate() - helper function for fssnap_strategy()
14747c478bd9Sstevel@tonic-gate  *
14757c478bd9Sstevel@tonic-gate  *    performs the actual copy-on-write for write requests, if required.
14767c478bd9Sstevel@tonic-gate  *    This function does the real work of the file system side of things.
14777c478bd9Sstevel@tonic-gate  *
14787c478bd9Sstevel@tonic-gate  *    It first checks the candidate bitmap to quickly determine whether any
14797c478bd9Sstevel@tonic-gate  *    action is necessary.  If the candidate bitmap indicates the chunk was
14807c478bd9Sstevel@tonic-gate  *    allocated when the snapshot was created, then it checks to see whether
14817c478bd9Sstevel@tonic-gate  *    a translation already exists.  If a translation already exists then no
14827c478bd9Sstevel@tonic-gate  *    action is required.  If the chunk is a candidate for copy-on-write,
14837c478bd9Sstevel@tonic-gate  *    and a translation does not already exist, then the chunk is read in
14847c478bd9Sstevel@tonic-gate  *    and a node is added to the translation table.
14857c478bd9Sstevel@tonic-gate  *
14867c478bd9Sstevel@tonic-gate  *    Once all of the chunks in the request range have been copied (if they
14877c478bd9Sstevel@tonic-gate  *    needed to be), then the original request can be satisfied and the old
14887c478bd9Sstevel@tonic-gate  *    data can be overwritten.
14897c478bd9Sstevel@tonic-gate  */
14907c478bd9Sstevel@tonic-gate static int
14917c478bd9Sstevel@tonic-gate fssnap_translate(struct snapshot_id **sidpp, struct buf *wbp)
14927c478bd9Sstevel@tonic-gate {
14937c478bd9Sstevel@tonic-gate 	snapshot_id_t	*sidp = *sidpp;
14947c478bd9Sstevel@tonic-gate 	struct buf	*oldbp;	/* buffer to store old data in */
14957c478bd9Sstevel@tonic-gate 	struct cow_info	*cowp = sidp->sid_cowinfo;
14967c478bd9Sstevel@tonic-gate 	cow_map_t	*cmap = &cowp->cow_map;
14977c478bd9Sstevel@tonic-gate 	cow_map_node_t	*cmn;
14987c478bd9Sstevel@tonic-gate 	chunknumber_t	cowchunk, startchunk, endchunk;
14997c478bd9Sstevel@tonic-gate 	int		error;
15007c478bd9Sstevel@tonic-gate 	int	throttle_write = 0;
15017c478bd9Sstevel@tonic-gate 
15027c478bd9Sstevel@tonic-gate 	/* make sure the snapshot is active */
15037c478bd9Sstevel@tonic-gate 	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
15047c478bd9Sstevel@tonic-gate 
15057c478bd9Sstevel@tonic-gate 	startchunk = dbtocowchunk(cmap, wbp->b_lblkno);
15067c478bd9Sstevel@tonic-gate 	endchunk   = dbtocowchunk(cmap, wbp->b_lblkno +
15077c478bd9Sstevel@tonic-gate 	    ((wbp->b_bcount-1) >> DEV_BSHIFT));
15087c478bd9Sstevel@tonic-gate 
15097c478bd9Sstevel@tonic-gate 	/*
15107c478bd9Sstevel@tonic-gate 	 * Do not throttle the writes of the fssnap taskq thread and
15117c478bd9Sstevel@tonic-gate 	 * the log roll (trans_roll) thread. Furthermore the writes to
15127c478bd9Sstevel@tonic-gate 	 * the on-disk log are also not subject to throttling.
15137c478bd9Sstevel@tonic-gate 	 * The fssnap_write_taskq thread's write can block on the throttling
15147c478bd9Sstevel@tonic-gate 	 * semaphore which leads to self-deadlock as this same thread
15157c478bd9Sstevel@tonic-gate 	 * releases the throttling semaphore after completing the IO.
15167c478bd9Sstevel@tonic-gate 	 * If the trans_roll thread's write is throttled then we can deadlock
15177c478bd9Sstevel@tonic-gate 	 * because the fssnap_taskq_thread which releases the throttling
15187c478bd9Sstevel@tonic-gate 	 * semaphore can block waiting for log space which can only be
15197c478bd9Sstevel@tonic-gate 	 * released by the trans_roll thread.
15207c478bd9Sstevel@tonic-gate 	 */
15217c478bd9Sstevel@tonic-gate 
15227c478bd9Sstevel@tonic-gate 	throttle_write = !(taskq_member(cowp->cow_taskq, curthread) ||
15237c478bd9Sstevel@tonic-gate 				    tsd_get(bypass_snapshot_throttle_key));
15247c478bd9Sstevel@tonic-gate 
15257c478bd9Sstevel@tonic-gate 	/*
15267c478bd9Sstevel@tonic-gate 	 * Iterate through all chunks covered by this write and perform the
15277c478bd9Sstevel@tonic-gate 	 * copy-aside if necessary.  Once all chunks have been safely
15287c478bd9Sstevel@tonic-gate 	 * stowed away, the new data may be written in a single sweep.
15297c478bd9Sstevel@tonic-gate 	 *
15307c478bd9Sstevel@tonic-gate 	 * For each chunk in the range, the following sequence is performed:
15317c478bd9Sstevel@tonic-gate 	 *	- Is the chunk a candidate for translation?
15327c478bd9Sstevel@tonic-gate 	 *		o If not, then no translation is necessary, continue
15337c478bd9Sstevel@tonic-gate 	 *	- If it is a candidate, then does it already have a translation?
15347c478bd9Sstevel@tonic-gate 	 *		o If so, then no translation is necessary, continue
15357c478bd9Sstevel@tonic-gate 	 *	- If it is a candidate, but does not yet have a translation,
15367c478bd9Sstevel@tonic-gate 	 *	  then read the old data and schedule an asynchronous taskq
15377c478bd9Sstevel@tonic-gate 	 *	  to write the old data to the backing file.
15387c478bd9Sstevel@tonic-gate 	 *
15397c478bd9Sstevel@tonic-gate 	 * Once this has been performed over the entire range of chunks, then
15407c478bd9Sstevel@tonic-gate 	 * it is safe to overwrite the data that is there.
15417c478bd9Sstevel@tonic-gate 	 *
15427c478bd9Sstevel@tonic-gate 	 * Note that no lock is required to check the candidate bitmap because
15437c478bd9Sstevel@tonic-gate 	 * it never changes once the snapshot is created.  The reader lock is
15447c478bd9Sstevel@tonic-gate 	 * taken to check the hastrans bitmap since it may change.  If it
15457c478bd9Sstevel@tonic-gate 	 * turns out a copy is required, then the lock is upgraded to a
15467c478bd9Sstevel@tonic-gate 	 * writer, and the bitmap is re-checked as it may have changed while
15477c478bd9Sstevel@tonic-gate 	 * the lock was released.  Finally, the write lock is held while
15487c478bd9Sstevel@tonic-gate 	 * reading the old data to make sure it is not translated out from
15497c478bd9Sstevel@tonic-gate 	 * under us.
15507c478bd9Sstevel@tonic-gate 	 *
15517c478bd9Sstevel@tonic-gate 	 * This locking mechanism should be sufficient to handle multiple
15527c478bd9Sstevel@tonic-gate 	 * threads writing to overlapping chunks simultaneously.
15537c478bd9Sstevel@tonic-gate 	 */
15547c478bd9Sstevel@tonic-gate 	for (cowchunk = startchunk; cowchunk <= endchunk; cowchunk++) {
15557c478bd9Sstevel@tonic-gate 		/*
15567c478bd9Sstevel@tonic-gate 		 * If the cowchunk is outside of the range of our
15577c478bd9Sstevel@tonic-gate 		 * candidate maps, then simply break out of the
15587c478bd9Sstevel@tonic-gate 		 * loop and pass the I/O through to bdev_strategy.
15597c478bd9Sstevel@tonic-gate 		 * This would occur if the file system has grown
15607c478bd9Sstevel@tonic-gate 		 * larger since the snapshot was taken.
15617c478bd9Sstevel@tonic-gate 		 */
15627c478bd9Sstevel@tonic-gate 		if (cowchunk >= (cmap->cmap_bmsize * NBBY))
15637c478bd9Sstevel@tonic-gate 			break;
15647c478bd9Sstevel@tonic-gate 
15657c478bd9Sstevel@tonic-gate 		/*
15667c478bd9Sstevel@tonic-gate 		 * If no disk blocks were allocated in this chunk when the
15677c478bd9Sstevel@tonic-gate 		 * snapshot was created then no copy-on-write will be
15687c478bd9Sstevel@tonic-gate 		 * required.  Since this bitmap is read-only no locks are
15697c478bd9Sstevel@tonic-gate 		 * necessary.
15707c478bd9Sstevel@tonic-gate 		 */
15717c478bd9Sstevel@tonic-gate 		if (isclr(cmap->cmap_candidate, cowchunk)) {
15727c478bd9Sstevel@tonic-gate 			continue;
15737c478bd9Sstevel@tonic-gate 		}
15747c478bd9Sstevel@tonic-gate 
15757c478bd9Sstevel@tonic-gate 		/*
15767c478bd9Sstevel@tonic-gate 		 * If a translation already exists, the data can be written
15777c478bd9Sstevel@tonic-gate 		 * through since the old data has already been saved off.
15787c478bd9Sstevel@tonic-gate 		 */
15797c478bd9Sstevel@tonic-gate 		if (isset(cmap->cmap_hastrans, cowchunk)) {
15807c478bd9Sstevel@tonic-gate 			continue;
15817c478bd9Sstevel@tonic-gate 		}
15827c478bd9Sstevel@tonic-gate 
15837c478bd9Sstevel@tonic-gate 
15847c478bd9Sstevel@tonic-gate 		/*
15857c478bd9Sstevel@tonic-gate 		 * Throttle translations if there are too many outstanding
15867c478bd9Sstevel@tonic-gate 		 * chunks in memory.  The semaphore is sema_v'd by the taskq.
15877c478bd9Sstevel@tonic-gate 		 *
15887c478bd9Sstevel@tonic-gate 		 * You can't keep the sid_rwlock if you would go to sleep.
15897c478bd9Sstevel@tonic-gate 		 * This will result in deadlock when someone tries to delete
15907c478bd9Sstevel@tonic-gate 		 * the snapshot (wants the sid_rwlock as a writer, but can't
15917c478bd9Sstevel@tonic-gate 		 * get it).
15927c478bd9Sstevel@tonic-gate 		 */
15937c478bd9Sstevel@tonic-gate 		if (throttle_write) {
15947c478bd9Sstevel@tonic-gate 			if (sema_tryp(&cmap->cmap_throttle_sem) == 0) {
15957c478bd9Sstevel@tonic-gate 				rw_exit(&sidp->sid_rwlock);
15967c478bd9Sstevel@tonic-gate 				atomic_add_32(&cmap->cmap_waiters, 1);
15977c478bd9Sstevel@tonic-gate 				sema_p(&cmap->cmap_throttle_sem);
15987c478bd9Sstevel@tonic-gate 				atomic_add_32(&cmap->cmap_waiters, -1);
15997c478bd9Sstevel@tonic-gate 				rw_enter(&sidp->sid_rwlock, RW_READER);
16007c478bd9Sstevel@tonic-gate 
16017c478bd9Sstevel@tonic-gate 			/*
16027c478bd9Sstevel@tonic-gate 			 * Now since we released the sid_rwlock the state may
16037c478bd9Sstevel@tonic-gate 			 * have transitioned underneath us. so check that again.
16047c478bd9Sstevel@tonic-gate 			 */
16057c478bd9Sstevel@tonic-gate 				if (sidp != *sidpp || SID_INACTIVE(sidp)) {
16067c478bd9Sstevel@tonic-gate 					sema_v(&cmap->cmap_throttle_sem);
16077c478bd9Sstevel@tonic-gate 					return (ENXIO);
16087c478bd9Sstevel@tonic-gate 				}
16097c478bd9Sstevel@tonic-gate 			}
16107c478bd9Sstevel@tonic-gate 		}
16117c478bd9Sstevel@tonic-gate 
16127c478bd9Sstevel@tonic-gate 		/*
16137c478bd9Sstevel@tonic-gate 		 * Acquire the lock as a writer and check to see if a
16147c478bd9Sstevel@tonic-gate 		 * translation has been added in the meantime.
16157c478bd9Sstevel@tonic-gate 		 */
16167c478bd9Sstevel@tonic-gate 		rw_enter(&cmap->cmap_rwlock, RW_WRITER);
16177c478bd9Sstevel@tonic-gate 		if (isset(cmap->cmap_hastrans, cowchunk)) {
16187c478bd9Sstevel@tonic-gate 			if (throttle_write)
16197c478bd9Sstevel@tonic-gate 				sema_v(&cmap->cmap_throttle_sem);
16207c478bd9Sstevel@tonic-gate 			rw_exit(&cmap->cmap_rwlock);
16217c478bd9Sstevel@tonic-gate 			continue; /* go to the next chunk */
16227c478bd9Sstevel@tonic-gate 		}
16237c478bd9Sstevel@tonic-gate 
16247c478bd9Sstevel@tonic-gate 		/*
16257c478bd9Sstevel@tonic-gate 		 * read a full chunk of data from the requested offset rounded
16267c478bd9Sstevel@tonic-gate 		 * down to the nearest chunk size.
16277c478bd9Sstevel@tonic-gate 		 */
16287c478bd9Sstevel@tonic-gate 		oldbp = getrbuf(KM_SLEEP);
16297c478bd9Sstevel@tonic-gate 		oldbp->b_lblkno = cowchunktodb(cmap, cowchunk);
16307c478bd9Sstevel@tonic-gate 		oldbp->b_edev = wbp->b_edev;
16317c478bd9Sstevel@tonic-gate 		oldbp->b_bcount = cmap->cmap_chunksz;
16327c478bd9Sstevel@tonic-gate 		oldbp->b_bufsize = cmap->cmap_chunksz;
16337c478bd9Sstevel@tonic-gate 		oldbp->b_iodone = NULL;
16347c478bd9Sstevel@tonic-gate 		oldbp->b_proc = NULL;
16357c478bd9Sstevel@tonic-gate 		oldbp->b_flags = B_READ;
16367c478bd9Sstevel@tonic-gate 		oldbp->b_un.b_addr = kmem_alloc(cmap->cmap_chunksz, KM_SLEEP);
16377c478bd9Sstevel@tonic-gate 
16387c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(oldbp);
16397c478bd9Sstevel@tonic-gate 		(void) biowait(oldbp);
16407c478bd9Sstevel@tonic-gate 
16417c478bd9Sstevel@tonic-gate 		/*
16427c478bd9Sstevel@tonic-gate 		 * It's ok to bail in the middle of translating the range
16437c478bd9Sstevel@tonic-gate 		 * because the extra copy-asides will not hurt anything
16447c478bd9Sstevel@tonic-gate 		 * (except by using extra space in the backing store).
16457c478bd9Sstevel@tonic-gate 		 */
16467c478bd9Sstevel@tonic-gate 		if ((error = geterror(oldbp)) != 0) {
16477c478bd9Sstevel@tonic-gate 			cmn_err(CE_WARN, "fssnap_translate: error reading "
16487c478bd9Sstevel@tonic-gate 			    "old data for snapshot %d, chunk %llu, disk block "
16497c478bd9Sstevel@tonic-gate 			    "%lld, size %lu, error %d.", sidp->sid_snapnumber,
16507c478bd9Sstevel@tonic-gate 			    cowchunk, oldbp->b_lblkno, oldbp->b_bcount, error);
16517c478bd9Sstevel@tonic-gate 			kmem_free(oldbp->b_un.b_addr, cmap->cmap_chunksz);
16527c478bd9Sstevel@tonic-gate 			freerbuf(oldbp);
16537c478bd9Sstevel@tonic-gate 			rw_exit(&cmap->cmap_rwlock);
16547c478bd9Sstevel@tonic-gate 			if (throttle_write)
16557c478bd9Sstevel@tonic-gate 				sema_v(&cmap->cmap_throttle_sem);
16567c478bd9Sstevel@tonic-gate 			return (error);
16577c478bd9Sstevel@tonic-gate 		}
16587c478bd9Sstevel@tonic-gate 
16597c478bd9Sstevel@tonic-gate 		/*
16607c478bd9Sstevel@tonic-gate 		 * add the node to the translation table and save a reference
16617c478bd9Sstevel@tonic-gate 		 * to pass to the taskq for writing out to the backing file
16627c478bd9Sstevel@tonic-gate 		 */
16637c478bd9Sstevel@tonic-gate 		cmn = transtbl_add(cmap, cowchunk, oldbp->b_un.b_addr);
16647c478bd9Sstevel@tonic-gate 		freerbuf(oldbp);
16657c478bd9Sstevel@tonic-gate 
16667c478bd9Sstevel@tonic-gate 		/*
16677c478bd9Sstevel@tonic-gate 		 * Add a reference to the snapshot id so the lower level
16687c478bd9Sstevel@tonic-gate 		 * processing (ie. the taskq) can get back to the state
16697c478bd9Sstevel@tonic-gate 		 * information.
16707c478bd9Sstevel@tonic-gate 		 */
16717c478bd9Sstevel@tonic-gate 		cmn->cmn_sid = sidp;
16727c478bd9Sstevel@tonic-gate 		cmn->release_sem = throttle_write;
16737c478bd9Sstevel@tonic-gate 		setbit(cmap->cmap_hastrans, cowchunk);
16747c478bd9Sstevel@tonic-gate 
16757c478bd9Sstevel@tonic-gate 		rw_exit(&cmap->cmap_rwlock);
16767c478bd9Sstevel@tonic-gate 
16777c478bd9Sstevel@tonic-gate 		/*
16787c478bd9Sstevel@tonic-gate 		 * schedule the asynchronous write to the backing file
16797c478bd9Sstevel@tonic-gate 		 */
16807c478bd9Sstevel@tonic-gate 		if (cowp->cow_backfile_array != NULL)
16817c478bd9Sstevel@tonic-gate 			(void) taskq_dispatch(cowp->cow_taskq,
16827c478bd9Sstevel@tonic-gate 			    fssnap_write_taskq, cmn, TQ_SLEEP);
16837c478bd9Sstevel@tonic-gate 	}
16847c478bd9Sstevel@tonic-gate 
16857c478bd9Sstevel@tonic-gate 	/*
16867c478bd9Sstevel@tonic-gate 	 * Write new data in place of the old data.  At this point all of the
16877c478bd9Sstevel@tonic-gate 	 * chunks touched by this write have been copied aside and so the new
16887c478bd9Sstevel@tonic-gate 	 * data can be written out all at once.
16897c478bd9Sstevel@tonic-gate 	 */
16907c478bd9Sstevel@tonic-gate 	(void) bdev_strategy(wbp);
16917c478bd9Sstevel@tonic-gate 
16927c478bd9Sstevel@tonic-gate 	return (0);
16937c478bd9Sstevel@tonic-gate }
16947c478bd9Sstevel@tonic-gate 
16957c478bd9Sstevel@tonic-gate /*
16967c478bd9Sstevel@tonic-gate  * fssnap_write_taskq() - write in-memory translations to the backing file
16977c478bd9Sstevel@tonic-gate  *
16987c478bd9Sstevel@tonic-gate  *    writes in-memory translations to the backing file asynchronously.  A
16997c478bd9Sstevel@tonic-gate  *    task is dispatched each time a new translation is created.  The task
17007c478bd9Sstevel@tonic-gate  *    writes the data to the backing file and removes it from the memory
17017c478bd9Sstevel@tonic-gate  *    list. The throttling semaphore is released only if the particular
17027c478bd9Sstevel@tonic-gate  *    translation was throttled in fssnap_translate.
17037c478bd9Sstevel@tonic-gate  */
17047c478bd9Sstevel@tonic-gate static void
17057c478bd9Sstevel@tonic-gate fssnap_write_taskq(void *arg)
17067c478bd9Sstevel@tonic-gate {
17077c478bd9Sstevel@tonic-gate 	cow_map_node_t	*cmn = (cow_map_node_t *)arg;
17087c478bd9Sstevel@tonic-gate 	snapshot_id_t	*sidp = cmn->cmn_sid;
17097c478bd9Sstevel@tonic-gate 	cow_info_t	*cowp = sidp->sid_cowinfo;
17107c478bd9Sstevel@tonic-gate 	cow_map_t	*cmap = &cowp->cow_map;
17117c478bd9Sstevel@tonic-gate 	int		error;
17127c478bd9Sstevel@tonic-gate 	int		bf_index;
17137c478bd9Sstevel@tonic-gate 	int		release_sem = cmn->release_sem;
17147c478bd9Sstevel@tonic-gate 
17157c478bd9Sstevel@tonic-gate 	/*
17167c478bd9Sstevel@tonic-gate 	 * The sid_rwlock does not need to be held here because the taskqs
17177c478bd9Sstevel@tonic-gate 	 * are destroyed explicitly by fssnap_delete (with the sid_rwlock
17187c478bd9Sstevel@tonic-gate 	 * held as a writer).  taskq_destroy() will flush all of the tasks
17197c478bd9Sstevel@tonic-gate 	 * out before fssnap_delete frees up all of the structures.
17207c478bd9Sstevel@tonic-gate 	 */
17217c478bd9Sstevel@tonic-gate 
17227c478bd9Sstevel@tonic-gate 	/* if the snapshot was disabled from under us, drop the request. */
17237c478bd9Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_READER);
17247c478bd9Sstevel@tonic-gate 	if (SID_INACTIVE(sidp)) {
17257c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
17267c478bd9Sstevel@tonic-gate 		if (release_sem)
17277c478bd9Sstevel@tonic-gate 			sema_v(&cmap->cmap_throttle_sem);
17287c478bd9Sstevel@tonic-gate 		return;
17297c478bd9Sstevel@tonic-gate 	}
17307c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
17317c478bd9Sstevel@tonic-gate 
17327c478bd9Sstevel@tonic-gate 	atomic_add_64((uint64_t *)&cmap->cmap_nchunks, 1);
17337c478bd9Sstevel@tonic-gate 
17347c478bd9Sstevel@tonic-gate 	if ((cmap->cmap_maxsize != 0) &&
17357c478bd9Sstevel@tonic-gate 	    ((cmap->cmap_nchunks * cmap->cmap_chunksz) > cmap->cmap_maxsize)) {
17367c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_write_taskq: snapshot %d (%s) has "
17377c478bd9Sstevel@tonic-gate 		    "reached the maximum backing file size specified (%llu "
17387c478bd9Sstevel@tonic-gate 		    "bytes) and will be deleted.", sidp->sid_snapnumber,
17397c478bd9Sstevel@tonic-gate 		    (char *)cowp->cow_kstat_mntpt->ks_data,
17407c478bd9Sstevel@tonic-gate 		    cmap->cmap_maxsize);
17417c478bd9Sstevel@tonic-gate 		if (release_sem)
17427c478bd9Sstevel@tonic-gate 			sema_v(&cmap->cmap_throttle_sem);
17437c478bd9Sstevel@tonic-gate 		atomic_or_uint(&sidp->sid_flags, SID_DELETE);
17447c478bd9Sstevel@tonic-gate 		return;
17457c478bd9Sstevel@tonic-gate 	}
17467c478bd9Sstevel@tonic-gate 
17477c478bd9Sstevel@tonic-gate 	/* perform the write */
17487c478bd9Sstevel@tonic-gate 	bf_index = cmn->cmn_chunk / cmap->cmap_chunksperbf;
17497c478bd9Sstevel@tonic-gate 
17507c478bd9Sstevel@tonic-gate 	if (error = vn_rdwr(UIO_WRITE, (cowp->cow_backfile_array)[bf_index],
17517c478bd9Sstevel@tonic-gate 	    cmn->cmn_buf, cmap->cmap_chunksz,
17527c478bd9Sstevel@tonic-gate 	    (cmn->cmn_chunk % cmap->cmap_chunksperbf) * cmap->cmap_chunksz,
17537c478bd9Sstevel@tonic-gate 	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, (ssize_t *)NULL)) {
17547c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_write_taskq: error writing to "
17557c478bd9Sstevel@tonic-gate 		    "backing file.  DELETING SNAPSHOT %d, backing file path "
17567c478bd9Sstevel@tonic-gate 		    "%s, offset %llu bytes, error %d.", sidp->sid_snapnumber,
17577c478bd9Sstevel@tonic-gate 		    (char *)cowp->cow_kstat_bfname->ks_data,
17587c478bd9Sstevel@tonic-gate 		    cmn->cmn_chunk * cmap->cmap_chunksz, error);
17597c478bd9Sstevel@tonic-gate 		if (release_sem)
17607c478bd9Sstevel@tonic-gate 			sema_v(&cmap->cmap_throttle_sem);
17617c478bd9Sstevel@tonic-gate 		atomic_or_uint(&sidp->sid_flags, SID_DELETE);
17627c478bd9Sstevel@tonic-gate 		return;
17637c478bd9Sstevel@tonic-gate 	}
17647c478bd9Sstevel@tonic-gate 
17657c478bd9Sstevel@tonic-gate 	/*
17667c478bd9Sstevel@tonic-gate 	 * now remove the node and buffer from memory
17677c478bd9Sstevel@tonic-gate 	 */
17687c478bd9Sstevel@tonic-gate 	rw_enter(&cmap->cmap_rwlock, RW_WRITER);
17697c478bd9Sstevel@tonic-gate 	transtbl_delete(cmap, cmn);
17707c478bd9Sstevel@tonic-gate 	rw_exit(&cmap->cmap_rwlock);
17717c478bd9Sstevel@tonic-gate 
17727c478bd9Sstevel@tonic-gate 	/* Allow more translations */
17737c478bd9Sstevel@tonic-gate 	if (release_sem)
17747c478bd9Sstevel@tonic-gate 		sema_v(&cmap->cmap_throttle_sem);
17757c478bd9Sstevel@tonic-gate 
17767c478bd9Sstevel@tonic-gate }
17777c478bd9Sstevel@tonic-gate 
17787c478bd9Sstevel@tonic-gate /*
17797c478bd9Sstevel@tonic-gate  * fssnap_create_impl() - called from the file system to create a new snapshot
17807c478bd9Sstevel@tonic-gate  *
17817c478bd9Sstevel@tonic-gate  *    allocates and initializes the structures needed for a new snapshot.
17827c478bd9Sstevel@tonic-gate  *    This is called by the file system when it receives an ioctl request to
17837c478bd9Sstevel@tonic-gate  *    create a new snapshot.  An unused snapshot identifier is either found
17847c478bd9Sstevel@tonic-gate  *    or created, and eventually returned as the opaque handle the file
17857c478bd9Sstevel@tonic-gate  *    system will use to identify this snapshot.  The snapshot number
17867c478bd9Sstevel@tonic-gate  *    associated with the snapshot identifier is the same as the minor
17877c478bd9Sstevel@tonic-gate  *    number for the snapshot device that is used to access that snapshot.
17887c478bd9Sstevel@tonic-gate  *
17897c478bd9Sstevel@tonic-gate  *    The snapshot can not be used until the candidate bitmap is populated
17907c478bd9Sstevel@tonic-gate  *    by the file system (see fssnap_set_candidate_impl()), and the file
17917c478bd9Sstevel@tonic-gate  *    system finishes the setup process by calling fssnap_create_done().
17927c478bd9Sstevel@tonic-gate  *    Nearly all of the snapshot locks are held for the duration of the
17937c478bd9Sstevel@tonic-gate  *    create, and are not released until fssnap_create_done is called().
17947c478bd9Sstevel@tonic-gate  */
17957c478bd9Sstevel@tonic-gate static void *
17967c478bd9Sstevel@tonic-gate fssnap_create_impl(chunknumber_t nchunks, uint_t chunksz, u_offset_t maxsize,
17977c478bd9Sstevel@tonic-gate     struct vnode *fsvp, int backfilecount, struct vnode **bfvpp, char *backpath,
17987c478bd9Sstevel@tonic-gate     u_offset_t max_backfile_size)
17997c478bd9Sstevel@tonic-gate {
18007c478bd9Sstevel@tonic-gate 	refstr_t *mountpoint;
18017c478bd9Sstevel@tonic-gate 	char taskqname[50];
18027c478bd9Sstevel@tonic-gate 	struct cow_info *cowp;
18037c478bd9Sstevel@tonic-gate 	struct cow_map	*cmap;
18047c478bd9Sstevel@tonic-gate 	struct snapshot_id *sidp;
18057c478bd9Sstevel@tonic-gate 	int lastsnap;
18067c478bd9Sstevel@tonic-gate 
18077c478bd9Sstevel@tonic-gate 	/*
18087c478bd9Sstevel@tonic-gate 	 * Sanity check the parameters we care about
18097c478bd9Sstevel@tonic-gate 	 * (we don't care about the informational parameters)
18107c478bd9Sstevel@tonic-gate 	 */
18117c478bd9Sstevel@tonic-gate 	if ((nchunks == 0) ||
18127c478bd9Sstevel@tonic-gate 	    ((chunksz % DEV_BSIZE) != 0) ||
18137c478bd9Sstevel@tonic-gate 	    (bfvpp == NULL)) {
18147c478bd9Sstevel@tonic-gate 		return (NULL);
18157c478bd9Sstevel@tonic-gate 	}
18167c478bd9Sstevel@tonic-gate 
18177c478bd9Sstevel@tonic-gate 	/*
18187c478bd9Sstevel@tonic-gate 	 * Look for unused snapshot identifiers.  Snapshot ids are never
18197c478bd9Sstevel@tonic-gate 	 * freed, but deleted snapshot ids will be recycled as needed.
18207c478bd9Sstevel@tonic-gate 	 */
18217c478bd9Sstevel@tonic-gate 	mutex_enter(&snapshot_mutex);
18227c478bd9Sstevel@tonic-gate 
18237c478bd9Sstevel@tonic-gate findagain:
18247c478bd9Sstevel@tonic-gate 	lastsnap = 0;
18257c478bd9Sstevel@tonic-gate 	for (sidp = snapshot; sidp != NULL; sidp = sidp->sid_next) {
18267c478bd9Sstevel@tonic-gate 		if (sidp->sid_snapnumber > lastsnap)
18277c478bd9Sstevel@tonic-gate 			lastsnap = sidp->sid_snapnumber;
18287c478bd9Sstevel@tonic-gate 
18297c478bd9Sstevel@tonic-gate 		/*
18307c478bd9Sstevel@tonic-gate 		 * The sid_rwlock is taken as a reader initially so that
18317c478bd9Sstevel@tonic-gate 		 * activity on each snapshot is not stalled while searching
18327c478bd9Sstevel@tonic-gate 		 * for a free snapshot id.
18337c478bd9Sstevel@tonic-gate 		 */
18347c478bd9Sstevel@tonic-gate 		rw_enter(&sidp->sid_rwlock, RW_READER);
18357c478bd9Sstevel@tonic-gate 
18367c478bd9Sstevel@tonic-gate 		/*
18377c478bd9Sstevel@tonic-gate 		 * If the snapshot has been deleted and nobody is using the
18387c478bd9Sstevel@tonic-gate 		 * snapshot device than we can reuse this snapshot_id.  If
18397c478bd9Sstevel@tonic-gate 		 * the snapshot is marked to be deleted (SID_DELETE), then
18407c478bd9Sstevel@tonic-gate 		 * it hasn't been deleted yet so don't reuse it.
18417c478bd9Sstevel@tonic-gate 		 */
18427c478bd9Sstevel@tonic-gate 		if (SID_AVAILABLE(sidp))
18437c478bd9Sstevel@tonic-gate 			break; /* This spot is unused, so take it */
18447c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
18457c478bd9Sstevel@tonic-gate 	}
18467c478bd9Sstevel@tonic-gate 
18477c478bd9Sstevel@tonic-gate 	/*
18487c478bd9Sstevel@tonic-gate 	 * add a new snapshot identifier if there are no deleted
18497c478bd9Sstevel@tonic-gate 	 * entries.  Since it doesn't matter what order the entries
18507c478bd9Sstevel@tonic-gate 	 * are in we can just add it to the beginning of the list.
18517c478bd9Sstevel@tonic-gate 	 */
18527c478bd9Sstevel@tonic-gate 	if (sidp) {
18537c478bd9Sstevel@tonic-gate 		if (rw_tryupgrade(&sidp->sid_rwlock) == 0) {
18547c478bd9Sstevel@tonic-gate 			/* someone else grabbed it as a writer, try again */
18557c478bd9Sstevel@tonic-gate 			rw_exit(&sidp->sid_rwlock);
18567c478bd9Sstevel@tonic-gate 			goto findagain;
18577c478bd9Sstevel@tonic-gate 		}
18587c478bd9Sstevel@tonic-gate 	} else {
18597c478bd9Sstevel@tonic-gate 		/* Create a new node if we didn't find an unused one */
18607c478bd9Sstevel@tonic-gate 		sidp = kmem_alloc(sizeof (struct snapshot_id), KM_SLEEP);
18617c478bd9Sstevel@tonic-gate 		rw_init(&sidp->sid_rwlock, NULL, RW_DEFAULT, NULL);
18627c478bd9Sstevel@tonic-gate 		rw_enter(&sidp->sid_rwlock, RW_WRITER);
18637c478bd9Sstevel@tonic-gate 		sidp->sid_snapnumber = (snapshot == NULL) ? 0 : lastsnap + 1;
18647c478bd9Sstevel@tonic-gate 		sidp->sid_cowinfo = NULL;
18657c478bd9Sstevel@tonic-gate 		sidp->sid_flags = 0;
18667c478bd9Sstevel@tonic-gate 		sidp->sid_next = snapshot;
18677c478bd9Sstevel@tonic-gate 		snapshot = sidp;
18687c478bd9Sstevel@tonic-gate 	}
18697c478bd9Sstevel@tonic-gate 
18707c478bd9Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
18717c478bd9Sstevel@tonic-gate 	ASSERT(sidp->sid_cowinfo == NULL);
18727c478bd9Sstevel@tonic-gate 	ASSERT(sidp->sid_snapnumber <= (lastsnap + 1));
18737c478bd9Sstevel@tonic-gate 
18747c478bd9Sstevel@tonic-gate 	sidp->sid_flags |= SID_CREATING;
18757c478bd9Sstevel@tonic-gate 	/* The root vnode is held until snap_delete_impl() is called */
18767c478bd9Sstevel@tonic-gate 	VN_HOLD(fsvp);
18777c478bd9Sstevel@tonic-gate 	sidp->sid_fvp = fsvp;
18787c478bd9Sstevel@tonic-gate 	num_snapshots++;
18797c478bd9Sstevel@tonic-gate 
18807c478bd9Sstevel@tonic-gate 	/* allocate and initialize structures */
18817c478bd9Sstevel@tonic-gate 
18827c478bd9Sstevel@tonic-gate 	cowp = kmem_zalloc(sizeof (struct cow_info), KM_SLEEP);
18837c478bd9Sstevel@tonic-gate 
18847c478bd9Sstevel@tonic-gate 	cowp->cow_backfile_array = bfvpp;
18857c478bd9Sstevel@tonic-gate 	cowp->cow_backcount = backfilecount;
18867c478bd9Sstevel@tonic-gate 	cowp->cow_backfile_sz = max_backfile_size;
18877c478bd9Sstevel@tonic-gate 
18887c478bd9Sstevel@tonic-gate 	/*
18897c478bd9Sstevel@tonic-gate 	 * Initialize task queues for this snapshot.  Only a small number
18907c478bd9Sstevel@tonic-gate 	 * of threads are required because they will be serialized on the
18917c478bd9Sstevel@tonic-gate 	 * backing file's reader/writer lock anyway.
18927c478bd9Sstevel@tonic-gate 	 */
18937c478bd9Sstevel@tonic-gate 	(void) snprintf(taskqname, sizeof (taskqname), "%s_taskq_%d", snapname,
18947c478bd9Sstevel@tonic-gate 	    sidp->sid_snapnumber);
18957c478bd9Sstevel@tonic-gate 	cowp->cow_taskq = taskq_create(taskqname, fssnap_taskq_nthreads,
18967c478bd9Sstevel@tonic-gate 	    minclsyspri, 1,  fssnap_taskq_maxtasks, 0);
18977c478bd9Sstevel@tonic-gate 
18987c478bd9Sstevel@tonic-gate 	/* don't allow tasks to start until after everything is ready */
18997c478bd9Sstevel@tonic-gate 	taskq_suspend(cowp->cow_taskq);
19007c478bd9Sstevel@tonic-gate 
19017c478bd9Sstevel@tonic-gate 	/* initialize translation table */
19027c478bd9Sstevel@tonic-gate 	cmap = &cowp->cow_map;
19037c478bd9Sstevel@tonic-gate 	rw_init(&cmap->cmap_rwlock, NULL, RW_DEFAULT, NULL);
19047c478bd9Sstevel@tonic-gate 	rw_enter(&cmap->cmap_rwlock, RW_WRITER);
19057c478bd9Sstevel@tonic-gate 
19067c478bd9Sstevel@tonic-gate 	sema_init(&cmap->cmap_throttle_sem, fssnap_max_mem_chunks, NULL,
19077c478bd9Sstevel@tonic-gate 	    SEMA_DEFAULT, NULL);
19087c478bd9Sstevel@tonic-gate 
19097c478bd9Sstevel@tonic-gate 	cmap->cmap_chunksz = chunksz;
19107c478bd9Sstevel@tonic-gate 	cmap->cmap_maxsize = maxsize;
19117c478bd9Sstevel@tonic-gate 	cmap->cmap_chunksperbf = max_backfile_size / chunksz;
19127c478bd9Sstevel@tonic-gate 
19137c478bd9Sstevel@tonic-gate 	/*
19147c478bd9Sstevel@tonic-gate 	 * allocate one bit per chunk for the bitmaps, round up
19157c478bd9Sstevel@tonic-gate 	 */
19167c478bd9Sstevel@tonic-gate 	cmap->cmap_bmsize = (nchunks + (NBBY - 1)) / NBBY;
19177c478bd9Sstevel@tonic-gate 	cmap->cmap_hastrans  = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
19187c478bd9Sstevel@tonic-gate 	cmap->cmap_candidate = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
19197c478bd9Sstevel@tonic-gate 
19207c478bd9Sstevel@tonic-gate 	sidp->sid_cowinfo = cowp;
19217c478bd9Sstevel@tonic-gate 
19227c478bd9Sstevel@tonic-gate 	/* initialize kstats for this snapshot */
19237c478bd9Sstevel@tonic-gate 	mountpoint = vfs_getmntpoint(fsvp->v_vfsp);
19247c478bd9Sstevel@tonic-gate 	fssnap_create_kstats(sidp, sidp->sid_snapnumber,
19257c478bd9Sstevel@tonic-gate 	    refstr_value(mountpoint), backpath);
19267c478bd9Sstevel@tonic-gate 	refstr_rele(mountpoint);
19277c478bd9Sstevel@tonic-gate 
19287c478bd9Sstevel@tonic-gate 	mutex_exit(&snapshot_mutex);
19297c478bd9Sstevel@tonic-gate 
19307c478bd9Sstevel@tonic-gate 	/*
19317c478bd9Sstevel@tonic-gate 	 * return with snapshot id rwlock held as a writer until
19327c478bd9Sstevel@tonic-gate 	 * fssnap_create_done is called
19337c478bd9Sstevel@tonic-gate 	 */
19347c478bd9Sstevel@tonic-gate 	return (sidp);
19357c478bd9Sstevel@tonic-gate }
19367c478bd9Sstevel@tonic-gate 
19377c478bd9Sstevel@tonic-gate /*
19387c478bd9Sstevel@tonic-gate  * fssnap_set_candidate_impl() - mark a chunk as a candidate for copy-on-write
19397c478bd9Sstevel@tonic-gate  *
19407c478bd9Sstevel@tonic-gate  *    sets a bit in the candidate bitmap that indicates that a chunk is a
19417c478bd9Sstevel@tonic-gate  *    candidate for copy-on-write.  Typically, chunks that are allocated on
19427c478bd9Sstevel@tonic-gate  *    the file system at the time the snapshot is taken are candidates,
19437c478bd9Sstevel@tonic-gate  *    while chunks that have no allocated data do not need to be copied.
19447c478bd9Sstevel@tonic-gate  *    Chunks containing metadata must be marked as candidates as well.
19457c478bd9Sstevel@tonic-gate  */
19467c478bd9Sstevel@tonic-gate static void
19477c478bd9Sstevel@tonic-gate fssnap_set_candidate_impl(void *snapshot_id, chunknumber_t chunknumber)
19487c478bd9Sstevel@tonic-gate {
19497c478bd9Sstevel@tonic-gate 	struct snapshot_id	*sid = snapshot_id;
19507c478bd9Sstevel@tonic-gate 	struct cow_info *cowp = sid->sid_cowinfo;
19517c478bd9Sstevel@tonic-gate 	struct cow_map	*cmap = &cowp->cow_map;
19527c478bd9Sstevel@tonic-gate 
19537c478bd9Sstevel@tonic-gate 	/* simple bitmap operation for now */
19547c478bd9Sstevel@tonic-gate 	ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
19557c478bd9Sstevel@tonic-gate 	setbit(cmap->cmap_candidate, chunknumber);
19567c478bd9Sstevel@tonic-gate }
19577c478bd9Sstevel@tonic-gate 
19587c478bd9Sstevel@tonic-gate /*
19597c478bd9Sstevel@tonic-gate  * fssnap_is_candidate_impl() - check whether a chunk is a candidate
19607c478bd9Sstevel@tonic-gate  *
19617c478bd9Sstevel@tonic-gate  *    returns 0 if the chunk is not a candidate and 1 if the chunk is a
19627c478bd9Sstevel@tonic-gate  *    candidate.  This can be used by the file system to change behavior for
19637c478bd9Sstevel@tonic-gate  *    chunks that might induce a copy-on-write.  The offset is specified in
19647c478bd9Sstevel@tonic-gate  *    bytes since the chunk size may not be known by the file system.
19657c478bd9Sstevel@tonic-gate  */
19667c478bd9Sstevel@tonic-gate static int
19677c478bd9Sstevel@tonic-gate fssnap_is_candidate_impl(void *snapshot_id, u_offset_t off)
19687c478bd9Sstevel@tonic-gate {
19697c478bd9Sstevel@tonic-gate 	struct snapshot_id	*sid = snapshot_id;
19707c478bd9Sstevel@tonic-gate 	struct cow_info *cowp = sid->sid_cowinfo;
19717c478bd9Sstevel@tonic-gate 	struct cow_map	*cmap = &cowp->cow_map;
19727c478bd9Sstevel@tonic-gate 	ulong_t chunknumber = off / cmap->cmap_chunksz;
19737c478bd9Sstevel@tonic-gate 
19747c478bd9Sstevel@tonic-gate 	/* simple bitmap operation for now */
19757c478bd9Sstevel@tonic-gate 	ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
19767c478bd9Sstevel@tonic-gate 	return (isset(cmap->cmap_candidate, chunknumber));
19777c478bd9Sstevel@tonic-gate }
19787c478bd9Sstevel@tonic-gate 
19797c478bd9Sstevel@tonic-gate /*
19807c478bd9Sstevel@tonic-gate  * fssnap_create_done_impl() - complete the snapshot setup process
19817c478bd9Sstevel@tonic-gate  *
19827c478bd9Sstevel@tonic-gate  *    called when the file system is done populating the candidate bitmap
19837c478bd9Sstevel@tonic-gate  *    and it is ready to start using the snapshot.  This routine releases
19847c478bd9Sstevel@tonic-gate  *    the snapshot locks, allows taskq tasks to start processing, and
19857c478bd9Sstevel@tonic-gate  *    creates the device minor nodes associated with the snapshot.
19867c478bd9Sstevel@tonic-gate  */
19877c478bd9Sstevel@tonic-gate static int
19887c478bd9Sstevel@tonic-gate fssnap_create_done_impl(void *snapshot_id)
19897c478bd9Sstevel@tonic-gate {
19907c478bd9Sstevel@tonic-gate 	struct snapshot_id	**sidpp, *sidp = snapshot_id;
19917c478bd9Sstevel@tonic-gate 	struct cow_info		*cowp;
19927c478bd9Sstevel@tonic-gate 	struct cow_map		*cmap;
19937c478bd9Sstevel@tonic-gate 	int			snapnumber = -1;
19947c478bd9Sstevel@tonic-gate 	char			name[20];
19957c478bd9Sstevel@tonic-gate 
19967c478bd9Sstevel@tonic-gate 	/* sid rwlock and cmap rwlock should be taken from fssnap_create */
19977c478bd9Sstevel@tonic-gate 	ASSERT(sidp);
19987c478bd9Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
19997c478bd9Sstevel@tonic-gate 	ASSERT(sidp->sid_cowinfo);
20007c478bd9Sstevel@tonic-gate 
20017c478bd9Sstevel@tonic-gate 	cowp = sidp->sid_cowinfo;
20027c478bd9Sstevel@tonic-gate 	cmap = &cowp->cow_map;
20037c478bd9Sstevel@tonic-gate 
20047c478bd9Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
20057c478bd9Sstevel@tonic-gate 
20067c478bd9Sstevel@tonic-gate 	sidp->sid_flags &= ~(SID_CREATING | SID_DISABLED);
20077c478bd9Sstevel@tonic-gate 	snapnumber = sidp->sid_snapnumber;
20087c478bd9Sstevel@tonic-gate 
20097c478bd9Sstevel@tonic-gate 	/* allocate state structure and find new snapshot id */
20107c478bd9Sstevel@tonic-gate 	if (ddi_soft_state_zalloc(statep, snapnumber) != DDI_SUCCESS) {
20117c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
20127c478bd9Sstevel@tonic-gate 		    "snap_ioctl: create: could not allocate "
20137c478bd9Sstevel@tonic-gate 		    "state for snapshot %d.", snapnumber);
20147c478bd9Sstevel@tonic-gate 		snapnumber = -1;
20157c478bd9Sstevel@tonic-gate 		goto out;
20167c478bd9Sstevel@tonic-gate 	}
20177c478bd9Sstevel@tonic-gate 
20187c478bd9Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, snapnumber);
20197c478bd9Sstevel@tonic-gate 	*sidpp = sidp;
20207c478bd9Sstevel@tonic-gate 
20217c478bd9Sstevel@tonic-gate 	/* create minor node based on snapshot number */
20227c478bd9Sstevel@tonic-gate 	ASSERT(fssnap_dip != NULL);
20237c478bd9Sstevel@tonic-gate 	(void) snprintf(name, sizeof (name), "%d", snapnumber);
20247c478bd9Sstevel@tonic-gate 	if (ddi_create_minor_node(fssnap_dip, name, S_IFBLK,
20257c478bd9Sstevel@tonic-gate 	    snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
20267c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_ioctl: could not create "
20277c478bd9Sstevel@tonic-gate 		    "block minor node for snapshot %d.", snapnumber);
20287c478bd9Sstevel@tonic-gate 		snapnumber = -1;
20297c478bd9Sstevel@tonic-gate 		goto out;
20307c478bd9Sstevel@tonic-gate 	}
20317c478bd9Sstevel@tonic-gate 
20327c478bd9Sstevel@tonic-gate 	(void) snprintf(name, sizeof (name), "%d,raw", snapnumber);
20337c478bd9Sstevel@tonic-gate 	if (ddi_create_minor_node(fssnap_dip, name, S_IFCHR,
20347c478bd9Sstevel@tonic-gate 	    snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
20357c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_ioctl: could not create "
20367c478bd9Sstevel@tonic-gate 		    "character minor node for snapshot %d.", snapnumber);
20377c478bd9Sstevel@tonic-gate 		snapnumber = -1;
20387c478bd9Sstevel@tonic-gate 	}
20397c478bd9Sstevel@tonic-gate 
20407c478bd9Sstevel@tonic-gate out:
20417c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
20427c478bd9Sstevel@tonic-gate 	rw_exit(&cmap->cmap_rwlock);
20437c478bd9Sstevel@tonic-gate 
20447c478bd9Sstevel@tonic-gate 	/* let the taskq threads start processing */
20457c478bd9Sstevel@tonic-gate 	taskq_resume(cowp->cow_taskq);
20467c478bd9Sstevel@tonic-gate 
20477c478bd9Sstevel@tonic-gate 	return (snapnumber);
20487c478bd9Sstevel@tonic-gate }
20497c478bd9Sstevel@tonic-gate 
20507c478bd9Sstevel@tonic-gate /*
20517c478bd9Sstevel@tonic-gate  * fssnap_delete_impl() - delete a snapshot
20527c478bd9Sstevel@tonic-gate  *
20537c478bd9Sstevel@tonic-gate  *    used when a snapshot is no longer needed.  This is called by the file
20547c478bd9Sstevel@tonic-gate  *    system when it receives an ioctl request to delete a snapshot.  It is
20557c478bd9Sstevel@tonic-gate  *    also called internally when error conditions such as disk full, errors
20567c478bd9Sstevel@tonic-gate  *    writing to the backing file, or backing file maxsize exceeded occur.
20577c478bd9Sstevel@tonic-gate  *    If the snapshot device is busy when the delete request is received,
20587c478bd9Sstevel@tonic-gate  *    all state will be deleted except for the soft state and device files
20597c478bd9Sstevel@tonic-gate  *    associated with the snapshot; they will be deleted when the snapshot
20607c478bd9Sstevel@tonic-gate  *    device is closed.
20617c478bd9Sstevel@tonic-gate  *
20627c478bd9Sstevel@tonic-gate  *    NOTE this function takes a POINTER TO A POINTER to the snapshot id,
20637c478bd9Sstevel@tonic-gate  *    and expects to be able to set the handle held by the file system to
20647c478bd9Sstevel@tonic-gate  *    NULL.  This depends on the file system checking that variable for NULL
20657c478bd9Sstevel@tonic-gate  *    before calling fssnap_strategy().
20667c478bd9Sstevel@tonic-gate  */
20677c478bd9Sstevel@tonic-gate static int
20687c478bd9Sstevel@tonic-gate fssnap_delete_impl(void *snapshot_id)
20697c478bd9Sstevel@tonic-gate {
20707c478bd9Sstevel@tonic-gate 	struct snapshot_id	**sidpp = (struct snapshot_id **)snapshot_id;
20717c478bd9Sstevel@tonic-gate 	struct snapshot_id	*sidp;
20727c478bd9Sstevel@tonic-gate 	struct snapshot_id	**statesidpp;
20737c478bd9Sstevel@tonic-gate 	struct cow_info		*cowp;
20747c478bd9Sstevel@tonic-gate 	struct cow_map		*cmap;
20757c478bd9Sstevel@tonic-gate 	char			name[20];
20767c478bd9Sstevel@tonic-gate 	int			snapnumber = -1;
20777c478bd9Sstevel@tonic-gate 	vnode_t			**vpp;
20787c478bd9Sstevel@tonic-gate 
20797c478bd9Sstevel@tonic-gate 	/*
20807c478bd9Sstevel@tonic-gate 	 * sidp is guaranteed to be valid if sidpp is valid because
20817c478bd9Sstevel@tonic-gate 	 * the snapshot list is append-only.
20827c478bd9Sstevel@tonic-gate 	 */
20837c478bd9Sstevel@tonic-gate 	if (sidpp == NULL) {
20847c478bd9Sstevel@tonic-gate 		return (-1);
20857c478bd9Sstevel@tonic-gate 	}
20867c478bd9Sstevel@tonic-gate 
20877c478bd9Sstevel@tonic-gate 	sidp = *sidpp;
20887c478bd9Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
20897c478bd9Sstevel@tonic-gate 
20907c478bd9Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
20917c478bd9Sstevel@tonic-gate 
20927c478bd9Sstevel@tonic-gate 	/*
20937c478bd9Sstevel@tonic-gate 	 * double check that the snapshot is still valid for THIS file system
20947c478bd9Sstevel@tonic-gate 	 */
20957c478bd9Sstevel@tonic-gate 	if (*sidpp == NULL) {
20967c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
20977c478bd9Sstevel@tonic-gate 		return (-1);
20987c478bd9Sstevel@tonic-gate 	}
20997c478bd9Sstevel@tonic-gate 
21007c478bd9Sstevel@tonic-gate 	/*
21017c478bd9Sstevel@tonic-gate 	 * Now we know the snapshot is still valid and will not go away
21027c478bd9Sstevel@tonic-gate 	 * because we have the write lock.  Once the state is transitioned
21037c478bd9Sstevel@tonic-gate 	 * to "disabling", the sid_rwlock can be released.  Any pending I/O
21047c478bd9Sstevel@tonic-gate 	 * waiting for the lock as a reader will check for this state and
21057c478bd9Sstevel@tonic-gate 	 * abort without touching data that may be getting freed.
21067c478bd9Sstevel@tonic-gate 	 */
21077c478bd9Sstevel@tonic-gate 	sidp->sid_flags |= SID_DISABLING;
21087c478bd9Sstevel@tonic-gate 	if (sidp->sid_flags & SID_DELETE) {
21097c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "Snapshot %d automatically deleted.",
21107c478bd9Sstevel@tonic-gate 		    sidp->sid_snapnumber);
21117c478bd9Sstevel@tonic-gate 		sidp->sid_flags &= ~(SID_DELETE);
21127c478bd9Sstevel@tonic-gate 	}
21137c478bd9Sstevel@tonic-gate 
21147c478bd9Sstevel@tonic-gate 
21157c478bd9Sstevel@tonic-gate 	/*
21167c478bd9Sstevel@tonic-gate 	 * This is pointing into file system specific data!  The assumption is
21177c478bd9Sstevel@tonic-gate 	 * that fssnap_strategy() gets called from the file system based on
21187c478bd9Sstevel@tonic-gate 	 * whether this reference to the snapshot_id is NULL or not.  So
21197c478bd9Sstevel@tonic-gate 	 * setting this to NULL should disable snapshots for the file system.
21207c478bd9Sstevel@tonic-gate 	 */
21217c478bd9Sstevel@tonic-gate 	*sidpp = NULL;
21227c478bd9Sstevel@tonic-gate 
21237c478bd9Sstevel@tonic-gate 	/* remove cowinfo */
21247c478bd9Sstevel@tonic-gate 	cowp = sidp->sid_cowinfo;
21257c478bd9Sstevel@tonic-gate 	if (cowp == NULL) {
21267c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
21277c478bd9Sstevel@tonic-gate 		return (-1);
21287c478bd9Sstevel@tonic-gate 	}
21297c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
21307c478bd9Sstevel@tonic-gate 
21317c478bd9Sstevel@tonic-gate 	/* destroy task queues first so they don't reference freed data. */
21327c478bd9Sstevel@tonic-gate 	if (cowp->cow_taskq) {
21337c478bd9Sstevel@tonic-gate 		taskq_destroy(cowp->cow_taskq);
21347c478bd9Sstevel@tonic-gate 		cowp->cow_taskq = NULL;
21357c478bd9Sstevel@tonic-gate 	}
21367c478bd9Sstevel@tonic-gate 
21377c478bd9Sstevel@tonic-gate 	if (cowp->cow_backfile_array != NULL) {
21387c478bd9Sstevel@tonic-gate 		for (vpp = cowp->cow_backfile_array; *vpp; vpp++)
21397c478bd9Sstevel@tonic-gate 			VN_RELE(*vpp);
21407c478bd9Sstevel@tonic-gate 		kmem_free(cowp->cow_backfile_array,
21417c478bd9Sstevel@tonic-gate 		    (cowp->cow_backcount + 1) * sizeof (vnode_t *));
21427c478bd9Sstevel@tonic-gate 		cowp->cow_backfile_array = NULL;
21437c478bd9Sstevel@tonic-gate 	}
21447c478bd9Sstevel@tonic-gate 
21457c478bd9Sstevel@tonic-gate 	sidp->sid_cowinfo = NULL;
21467c478bd9Sstevel@tonic-gate 
21477c478bd9Sstevel@tonic-gate 	/* remove cmap */
21487c478bd9Sstevel@tonic-gate 	cmap = &cowp->cow_map;
21497c478bd9Sstevel@tonic-gate 	ASSERT(cmap);
21507c478bd9Sstevel@tonic-gate 
21517c478bd9Sstevel@tonic-gate 	if (cmap->cmap_candidate)
21527c478bd9Sstevel@tonic-gate 		kmem_free(cmap->cmap_candidate, cmap->cmap_bmsize);
21537c478bd9Sstevel@tonic-gate 
21547c478bd9Sstevel@tonic-gate 	if (cmap->cmap_hastrans)
21557c478bd9Sstevel@tonic-gate 		kmem_free(cmap->cmap_hastrans, cmap->cmap_bmsize);
21567c478bd9Sstevel@tonic-gate 
21577c478bd9Sstevel@tonic-gate 	if (cmap->cmap_table)
21587c478bd9Sstevel@tonic-gate 		transtbl_free(&cowp->cow_map);
21597c478bd9Sstevel@tonic-gate 
21607c478bd9Sstevel@tonic-gate 	rw_destroy(&cmap->cmap_rwlock);
21617c478bd9Sstevel@tonic-gate 
21627c478bd9Sstevel@tonic-gate 	while (cmap->cmap_waiters) {
21637c478bd9Sstevel@tonic-gate 		sema_p(&cmap->cmap_throttle_sem);
21647c478bd9Sstevel@tonic-gate 		sema_v(&cmap->cmap_throttle_sem);
21657c478bd9Sstevel@tonic-gate 	}
21667c478bd9Sstevel@tonic-gate 	sema_destroy(&cmap->cmap_throttle_sem);
21677c478bd9Sstevel@tonic-gate 
21687c478bd9Sstevel@tonic-gate 	/* remove kstats */
21697c478bd9Sstevel@tonic-gate 	fssnap_delete_kstats(cowp);
21707c478bd9Sstevel@tonic-gate 
21717c478bd9Sstevel@tonic-gate 	kmem_free(cowp, sizeof (struct cow_info));
21727c478bd9Sstevel@tonic-gate 
21737c478bd9Sstevel@tonic-gate 	statesidpp = ddi_get_soft_state(statep, sidp->sid_snapnumber);
21747c478bd9Sstevel@tonic-gate 	if (statesidpp == NULL || *statesidpp == NULL) {
21757c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
21767c478bd9Sstevel@tonic-gate 		    "fssnap_delete_impl: could not find state for snapshot %d.",
21777c478bd9Sstevel@tonic-gate 		    sidp->sid_snapnumber);
21787c478bd9Sstevel@tonic-gate 	}
21797c478bd9Sstevel@tonic-gate 	ASSERT(*statesidpp == sidp);
21807c478bd9Sstevel@tonic-gate 
21817c478bd9Sstevel@tonic-gate 	/*
21827c478bd9Sstevel@tonic-gate 	 * Leave the node in the list marked DISABLED so it can be reused
21837c478bd9Sstevel@tonic-gate 	 * and avoid many race conditions.  Return the snapshot number
21847c478bd9Sstevel@tonic-gate 	 * that was deleted.
21857c478bd9Sstevel@tonic-gate 	 */
21867c478bd9Sstevel@tonic-gate 	mutex_enter(&snapshot_mutex);
21877c478bd9Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
21887c478bd9Sstevel@tonic-gate 	sidp->sid_flags &= ~(SID_DISABLING);
21897c478bd9Sstevel@tonic-gate 	sidp->sid_flags |= SID_DISABLED;
21907c478bd9Sstevel@tonic-gate 	VN_RELE(sidp->sid_fvp);
21917c478bd9Sstevel@tonic-gate 	sidp->sid_fvp = NULL;
21927c478bd9Sstevel@tonic-gate 	snapnumber = sidp->sid_snapnumber;
21937c478bd9Sstevel@tonic-gate 
21947c478bd9Sstevel@tonic-gate 	/*
21957c478bd9Sstevel@tonic-gate 	 * If the snapshot is not busy, free the device info now.  Otherwise
21967c478bd9Sstevel@tonic-gate 	 * the device nodes are freed in snap_close() when the device is
21977c478bd9Sstevel@tonic-gate 	 * closed.  The sid will not be reused until the device is not busy.
21987c478bd9Sstevel@tonic-gate 	 */
21997c478bd9Sstevel@tonic-gate 	if (SID_AVAILABLE(sidp)) {
22007c478bd9Sstevel@tonic-gate 		/* remove the device nodes */
22017c478bd9Sstevel@tonic-gate 		ASSERT(fssnap_dip != NULL);
22027c478bd9Sstevel@tonic-gate 		(void) snprintf(name, sizeof (name), "%d",
22037c478bd9Sstevel@tonic-gate 		    sidp->sid_snapnumber);
22047c478bd9Sstevel@tonic-gate 		ddi_remove_minor_node(fssnap_dip, name);
22057c478bd9Sstevel@tonic-gate 		(void) snprintf(name, sizeof (name), "%d,raw",
22067c478bd9Sstevel@tonic-gate 		    sidp->sid_snapnumber);
22077c478bd9Sstevel@tonic-gate 		ddi_remove_minor_node(fssnap_dip, name);
22087c478bd9Sstevel@tonic-gate 
22097c478bd9Sstevel@tonic-gate 		/* delete the state structure */
22107c478bd9Sstevel@tonic-gate 		ddi_soft_state_free(statep, sidp->sid_snapnumber);
22117c478bd9Sstevel@tonic-gate 		num_snapshots--;
22127c478bd9Sstevel@tonic-gate 	}
22137c478bd9Sstevel@tonic-gate 
22147c478bd9Sstevel@tonic-gate 	mutex_exit(&snapshot_mutex);
22157c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
22167c478bd9Sstevel@tonic-gate 
22177c478bd9Sstevel@tonic-gate 	return (snapnumber);
22187c478bd9Sstevel@tonic-gate }
22197c478bd9Sstevel@tonic-gate 
22207c478bd9Sstevel@tonic-gate /*
22217c478bd9Sstevel@tonic-gate  * fssnap_create_kstats() - allocate and initialize snapshot kstats
22227c478bd9Sstevel@tonic-gate  *
22237c478bd9Sstevel@tonic-gate  */
22247c478bd9Sstevel@tonic-gate static void
22257c478bd9Sstevel@tonic-gate fssnap_create_kstats(snapshot_id_t *sidp, int snapnum,
22267c478bd9Sstevel@tonic-gate     const char *mountpoint, const char *backfilename)
22277c478bd9Sstevel@tonic-gate {
22287c478bd9Sstevel@tonic-gate 	kstat_t *num, *mntpoint, *bfname;
22297c478bd9Sstevel@tonic-gate 	kstat_named_t *hw;
22307c478bd9Sstevel@tonic-gate 	struct cow_info *cowp = sidp->sid_cowinfo;
22317c478bd9Sstevel@tonic-gate 	struct cow_kstat_num *stats;
22327c478bd9Sstevel@tonic-gate 
22337c478bd9Sstevel@tonic-gate 	/* update the high water mark */
22347c478bd9Sstevel@tonic-gate 	if (fssnap_highwater_kstat == NULL) {
22357c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_create_kstats: failed to lookup "
22367c478bd9Sstevel@tonic-gate 		    "high water mark kstat.");
22377c478bd9Sstevel@tonic-gate 		return;
22387c478bd9Sstevel@tonic-gate 	}
22397c478bd9Sstevel@tonic-gate 
22407c478bd9Sstevel@tonic-gate 	hw = (kstat_named_t *)fssnap_highwater_kstat->ks_data;
22417c478bd9Sstevel@tonic-gate 	if (hw->value.ui32 < snapnum)
22427c478bd9Sstevel@tonic-gate 		hw->value.ui32 = snapnum;
22437c478bd9Sstevel@tonic-gate 
22447c478bd9Sstevel@tonic-gate 	/* initialize the mount point kstat */
22457c478bd9Sstevel@tonic-gate 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_MNTPT);
22467c478bd9Sstevel@tonic-gate 
22477c478bd9Sstevel@tonic-gate 	if (mountpoint != NULL) {
22487c478bd9Sstevel@tonic-gate 		mntpoint = kstat_create(snapname, snapnum, FSSNAP_KSTAT_MNTPT,
22497c478bd9Sstevel@tonic-gate 		    "misc", KSTAT_TYPE_RAW, strlen(mountpoint) + 1, 0);
22507c478bd9Sstevel@tonic-gate 		if (mntpoint == NULL) {
22517c478bd9Sstevel@tonic-gate 			cowp->cow_kstat_mntpt = NULL;
22527c478bd9Sstevel@tonic-gate 			cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
22537c478bd9Sstevel@tonic-gate 			    "create mount point kstat");
22547c478bd9Sstevel@tonic-gate 		} else {
22557c478bd9Sstevel@tonic-gate 			(void) strncpy(mntpoint->ks_data, mountpoint,
22567c478bd9Sstevel@tonic-gate 			    strlen(mountpoint));
22577c478bd9Sstevel@tonic-gate 			cowp->cow_kstat_mntpt = mntpoint;
22587c478bd9Sstevel@tonic-gate 			kstat_install(mntpoint);
22597c478bd9Sstevel@tonic-gate 		}
22607c478bd9Sstevel@tonic-gate 	} else {
22617c478bd9Sstevel@tonic-gate 		cowp->cow_kstat_mntpt = NULL;
22627c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_create_kstats: mount point not "
22637c478bd9Sstevel@tonic-gate 		    "specified.");
22647c478bd9Sstevel@tonic-gate 	}
22657c478bd9Sstevel@tonic-gate 
22667c478bd9Sstevel@tonic-gate 	/* initialize the backing file kstat */
22677c478bd9Sstevel@tonic-gate 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_BFNAME);
22687c478bd9Sstevel@tonic-gate 
22697c478bd9Sstevel@tonic-gate 	if (backfilename == NULL) {
22707c478bd9Sstevel@tonic-gate 		cowp->cow_kstat_bfname = NULL;
22717c478bd9Sstevel@tonic-gate 	} else {
22727c478bd9Sstevel@tonic-gate 		bfname = kstat_create(snapname, snapnum, FSSNAP_KSTAT_BFNAME,
22737c478bd9Sstevel@tonic-gate 		    "misc", KSTAT_TYPE_RAW, strlen(backfilename) + 1, 0);
22747c478bd9Sstevel@tonic-gate 		if (bfname != NULL) {
22757c478bd9Sstevel@tonic-gate 			(void) strncpy(bfname->ks_data, backfilename,
22767c478bd9Sstevel@tonic-gate 			    strlen(backfilename));
22777c478bd9Sstevel@tonic-gate 			cowp->cow_kstat_bfname = bfname;
22787c478bd9Sstevel@tonic-gate 			kstat_install(bfname);
22797c478bd9Sstevel@tonic-gate 		} else {
22807c478bd9Sstevel@tonic-gate 			cowp->cow_kstat_bfname = NULL;
22817c478bd9Sstevel@tonic-gate 			cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
22827c478bd9Sstevel@tonic-gate 			    "create backing file name kstat");
22837c478bd9Sstevel@tonic-gate 		}
22847c478bd9Sstevel@tonic-gate 	}
22857c478bd9Sstevel@tonic-gate 
22867c478bd9Sstevel@tonic-gate 	/* initialize numeric kstats */
22877c478bd9Sstevel@tonic-gate 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_NUM);
22887c478bd9Sstevel@tonic-gate 
22897c478bd9Sstevel@tonic-gate 	num = kstat_create(snapname, snapnum, FSSNAP_KSTAT_NUM,
22907c478bd9Sstevel@tonic-gate 	    "misc", KSTAT_TYPE_NAMED,
22917c478bd9Sstevel@tonic-gate 	    sizeof (struct cow_kstat_num) / sizeof (kstat_named_t),
22927c478bd9Sstevel@tonic-gate 	    0);
22937c478bd9Sstevel@tonic-gate 	if (num == NULL) {
22947c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_create_kstats: failed to create "
22957c478bd9Sstevel@tonic-gate 		    "numeric kstats");
22967c478bd9Sstevel@tonic-gate 		cowp->cow_kstat_num = NULL;
22977c478bd9Sstevel@tonic-gate 		return;
22987c478bd9Sstevel@tonic-gate 	}
22997c478bd9Sstevel@tonic-gate 
23007c478bd9Sstevel@tonic-gate 	cowp->cow_kstat_num = num;
23017c478bd9Sstevel@tonic-gate 	stats = num->ks_data;
23027c478bd9Sstevel@tonic-gate 	num->ks_update = fssnap_update_kstat_num;
23037c478bd9Sstevel@tonic-gate 	num->ks_private = sidp;
23047c478bd9Sstevel@tonic-gate 
23057c478bd9Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_state, FSSNAP_KSTAT_NUM_STATE,
23067c478bd9Sstevel@tonic-gate 	    KSTAT_DATA_INT32);
23077c478bd9Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_bfsize, FSSNAP_KSTAT_NUM_BFSIZE,
23087c478bd9Sstevel@tonic-gate 	    KSTAT_DATA_UINT64);
23097c478bd9Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_maxsize, FSSNAP_KSTAT_NUM_MAXSIZE,
23107c478bd9Sstevel@tonic-gate 	    KSTAT_DATA_UINT64);
23117c478bd9Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_createtime, FSSNAP_KSTAT_NUM_CREATETIME,
23127c478bd9Sstevel@tonic-gate 	    KSTAT_DATA_LONG);
23137c478bd9Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_chunksize, FSSNAP_KSTAT_NUM_CHUNKSIZE,
23147c478bd9Sstevel@tonic-gate 	    KSTAT_DATA_UINT32);
23157c478bd9Sstevel@tonic-gate 
23167c478bd9Sstevel@tonic-gate 	/* initialize the static kstats */
23177c478bd9Sstevel@tonic-gate 	stats->ckn_chunksize.value.ui32 = cowp->cow_map.cmap_chunksz;
23187c478bd9Sstevel@tonic-gate 	stats->ckn_maxsize.value.ui64 = cowp->cow_map.cmap_maxsize;
23197c478bd9Sstevel@tonic-gate 	stats->ckn_createtime.value.l = gethrestime_sec();
23207c478bd9Sstevel@tonic-gate 
23217c478bd9Sstevel@tonic-gate 	kstat_install(num);
23227c478bd9Sstevel@tonic-gate }
23237c478bd9Sstevel@tonic-gate 
23247c478bd9Sstevel@tonic-gate /*
23257c478bd9Sstevel@tonic-gate  * fssnap_update_kstat_num() - update a numerical snapshot kstat value
23267c478bd9Sstevel@tonic-gate  *
23277c478bd9Sstevel@tonic-gate  */
23287c478bd9Sstevel@tonic-gate int
23297c478bd9Sstevel@tonic-gate fssnap_update_kstat_num(kstat_t *ksp, int rw)
23307c478bd9Sstevel@tonic-gate {
23317c478bd9Sstevel@tonic-gate 	snapshot_id_t *sidp = (snapshot_id_t *)ksp->ks_private;
23327c478bd9Sstevel@tonic-gate 	struct cow_info *cowp = sidp->sid_cowinfo;
23337c478bd9Sstevel@tonic-gate 	struct cow_kstat_num *stats = ksp->ks_data;
23347c478bd9Sstevel@tonic-gate 
23357c478bd9Sstevel@tonic-gate 	if (rw == KSTAT_WRITE)
23367c478bd9Sstevel@tonic-gate 		return (EACCES);
23377c478bd9Sstevel@tonic-gate 
23387c478bd9Sstevel@tonic-gate 	/* state */
23397c478bd9Sstevel@tonic-gate 	if (sidp->sid_flags & SID_CREATING)
23407c478bd9Sstevel@tonic-gate 		stats->ckn_state.value.i32 = COWSTATE_CREATING;
23417c478bd9Sstevel@tonic-gate 	else if (SID_INACTIVE(sidp))
23427c478bd9Sstevel@tonic-gate 		stats->ckn_state.value.i32 = COWSTATE_DISABLED;
23437c478bd9Sstevel@tonic-gate 	else if (SID_BUSY(sidp))
23447c478bd9Sstevel@tonic-gate 		stats->ckn_state.value.i32 = COWSTATE_ACTIVE;
23457c478bd9Sstevel@tonic-gate 	else
23467c478bd9Sstevel@tonic-gate 		stats->ckn_state.value.i32 = COWSTATE_IDLE;
23477c478bd9Sstevel@tonic-gate 
23487c478bd9Sstevel@tonic-gate 	/* bfsize */
23497c478bd9Sstevel@tonic-gate 	stats->ckn_bfsize.value.ui64 = cowp->cow_map.cmap_nchunks *
23507c478bd9Sstevel@tonic-gate 	    cowp->cow_map.cmap_chunksz;
23517c478bd9Sstevel@tonic-gate 
23527c478bd9Sstevel@tonic-gate 	return (0);
23537c478bd9Sstevel@tonic-gate }
23547c478bd9Sstevel@tonic-gate 
23557c478bd9Sstevel@tonic-gate /*
23567c478bd9Sstevel@tonic-gate  * fssnap_delete_kstats() - deallocate snapshot kstats
23577c478bd9Sstevel@tonic-gate  *
23587c478bd9Sstevel@tonic-gate  */
23597c478bd9Sstevel@tonic-gate void
23607c478bd9Sstevel@tonic-gate fssnap_delete_kstats(struct cow_info *cowp)
23617c478bd9Sstevel@tonic-gate {
23627c478bd9Sstevel@tonic-gate 	if (cowp->cow_kstat_num != NULL) {
23637c478bd9Sstevel@tonic-gate 		kstat_delete(cowp->cow_kstat_num);
23647c478bd9Sstevel@tonic-gate 		cowp->cow_kstat_num = NULL;
23657c478bd9Sstevel@tonic-gate 	}
23667c478bd9Sstevel@tonic-gate 	if (cowp->cow_kstat_mntpt != NULL) {
23677c478bd9Sstevel@tonic-gate 		kstat_delete(cowp->cow_kstat_mntpt);
23687c478bd9Sstevel@tonic-gate 		cowp->cow_kstat_mntpt = NULL;
23697c478bd9Sstevel@tonic-gate 	}
23707c478bd9Sstevel@tonic-gate 	if (cowp->cow_kstat_bfname != NULL) {
23717c478bd9Sstevel@tonic-gate 		kstat_delete(cowp->cow_kstat_bfname);
23727c478bd9Sstevel@tonic-gate 		cowp->cow_kstat_bfname = NULL;
23737c478bd9Sstevel@tonic-gate 	}
23747c478bd9Sstevel@tonic-gate }
2375