xref: /titanic_54/usr/src/uts/common/io/fssnap.c (revision 7c478bd95313f5f23a4c958a745db2134aa03244)
1*7c478bd9Sstevel@tonic-gate /*
2*7c478bd9Sstevel@tonic-gate  * CDDL HEADER START
3*7c478bd9Sstevel@tonic-gate  *
4*7c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*7c478bd9Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
6*7c478bd9Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
7*7c478bd9Sstevel@tonic-gate  * with the License.
8*7c478bd9Sstevel@tonic-gate  *
9*7c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*7c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
11*7c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
12*7c478bd9Sstevel@tonic-gate  * and limitations under the License.
13*7c478bd9Sstevel@tonic-gate  *
14*7c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
15*7c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*7c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
17*7c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
18*7c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
19*7c478bd9Sstevel@tonic-gate  *
20*7c478bd9Sstevel@tonic-gate  * CDDL HEADER END
21*7c478bd9Sstevel@tonic-gate  */
22*7c478bd9Sstevel@tonic-gate /*
23*7c478bd9Sstevel@tonic-gate  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24*7c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
25*7c478bd9Sstevel@tonic-gate  */
26*7c478bd9Sstevel@tonic-gate 
27*7c478bd9Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*7c478bd9Sstevel@tonic-gate 
29*7c478bd9Sstevel@tonic-gate #include <sys/debug.h>
30*7c478bd9Sstevel@tonic-gate #include <sys/types.h>
31*7c478bd9Sstevel@tonic-gate #include <sys/file.h>
32*7c478bd9Sstevel@tonic-gate #include <sys/errno.h>
33*7c478bd9Sstevel@tonic-gate #include <sys/uio.h>
34*7c478bd9Sstevel@tonic-gate #include <sys/open.h>
35*7c478bd9Sstevel@tonic-gate #include <sys/cred.h>
36*7c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
37*7c478bd9Sstevel@tonic-gate #include <sys/conf.h>
38*7c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
39*7c478bd9Sstevel@tonic-gate #include <sys/modctl.h>
40*7c478bd9Sstevel@tonic-gate #include <sys/disp.h>
41*7c478bd9Sstevel@tonic-gate #include <sys/atomic.h>
42*7c478bd9Sstevel@tonic-gate #include <sys/filio.h>
43*7c478bd9Sstevel@tonic-gate #include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */
44*7c478bd9Sstevel@tonic-gate #include <sys/kstat.h>
45*7c478bd9Sstevel@tonic-gate 
46*7c478bd9Sstevel@tonic-gate #include <sys/ddi.h>
47*7c478bd9Sstevel@tonic-gate #include <sys/devops.h>
48*7c478bd9Sstevel@tonic-gate #include <sys/sunddi.h>
49*7c478bd9Sstevel@tonic-gate #include <sys/priv_names.h>
50*7c478bd9Sstevel@tonic-gate 
51*7c478bd9Sstevel@tonic-gate #include <sys/fssnap.h>
52*7c478bd9Sstevel@tonic-gate #include <sys/fssnap_if.h>
53*7c478bd9Sstevel@tonic-gate 
54*7c478bd9Sstevel@tonic-gate /*
55*7c478bd9Sstevel@tonic-gate  * This module implements the file system snapshot code, which provides a
56*7c478bd9Sstevel@tonic-gate  * point-in-time image of a file system for the purposes of online backup.
57*7c478bd9Sstevel@tonic-gate  * There are essentially two parts to this project: the driver half and the
58*7c478bd9Sstevel@tonic-gate  * file system half.  The driver half is a pseudo device driver called
59*7c478bd9Sstevel@tonic-gate  * "fssnap" that represents the snapshot.  Each snapshot is assigned a
60*7c478bd9Sstevel@tonic-gate  * number that corresponds to the minor number of the device, and a control
61*7c478bd9Sstevel@tonic-gate  * device with a high minor number is used to initiate snapshot creation and
62*7c478bd9Sstevel@tonic-gate  * deletion.  For all practical purposes the driver half acts like a
63*7c478bd9Sstevel@tonic-gate  * read-only disk device whose contents are exactly the same as the master
64*7c478bd9Sstevel@tonic-gate  * file system at the time the snapshot was created.
65*7c478bd9Sstevel@tonic-gate  *
66*7c478bd9Sstevel@tonic-gate  * The file system half provides interfaces necessary for performing the
67*7c478bd9Sstevel@tonic-gate  * file system dependent operations required to create and delete snapshots
68*7c478bd9Sstevel@tonic-gate  * and a special driver strategy routine that must always be used by the file
69*7c478bd9Sstevel@tonic-gate  * system for snapshots to work correctly.
70*7c478bd9Sstevel@tonic-gate  *
71*7c478bd9Sstevel@tonic-gate  * When a snapshot is to be created, the user utility will send an ioctl to
72*7c478bd9Sstevel@tonic-gate  * the control device of the driver half specifying the file system to be
73*7c478bd9Sstevel@tonic-gate  * snapshotted, the file descriptor of a backing-store file which is used to
74*7c478bd9Sstevel@tonic-gate  * hold old data before it is overwritten, and other snapshot parameters.
75*7c478bd9Sstevel@tonic-gate  * This ioctl is passed on to the file system specified in the original
76*7c478bd9Sstevel@tonic-gate  * ioctl request.  The file system is expected to be able to flush
77*7c478bd9Sstevel@tonic-gate  * everything out to make the file system consistent and lock it to ensure
78*7c478bd9Sstevel@tonic-gate  * no changes occur while the snapshot is being created.  It then calls
79*7c478bd9Sstevel@tonic-gate  * fssnap_create() to create state for a new snapshot, from which an opaque
80*7c478bd9Sstevel@tonic-gate  * handle is returned with the snapshot locked.  Next, the file system must
81*7c478bd9Sstevel@tonic-gate  * populate the "candidate bitmap", which tells the snapshot code which
82*7c478bd9Sstevel@tonic-gate  * "chunks" should be considered for copy-on-write (a chunk is the unit of
83*7c478bd9Sstevel@tonic-gate  * granularity used for copy-on-write, which is independent of the device
84*7c478bd9Sstevel@tonic-gate  * and file system block sizes).  This is typically done by scanning the
85*7c478bd9Sstevel@tonic-gate  * file system allocation bitmaps to determine which chunks contain
86*7c478bd9Sstevel@tonic-gate  * allocated blocks in the file system at the time the snapshot was created.
87*7c478bd9Sstevel@tonic-gate  * If a chunk has no allocated blocks, it does not need to be copied before
88*7c478bd9Sstevel@tonic-gate  * being written to.  Once the candidate bitmap is populated with
89*7c478bd9Sstevel@tonic-gate  * fssnap_set_candidate(), the file system calls fssnap_create_done() to
90*7c478bd9Sstevel@tonic-gate  * complete the snapshot creation and unlock the snapshot.  The file system
91*7c478bd9Sstevel@tonic-gate  * may now be unlocked and modifications to it resumed.
92*7c478bd9Sstevel@tonic-gate  *
93*7c478bd9Sstevel@tonic-gate  * Once a snapshot is created, the file system must perform all writes
94*7c478bd9Sstevel@tonic-gate  * through a special strategy routine, fssnap_strategy().  This strategy
95*7c478bd9Sstevel@tonic-gate  * routine determines whether the chunks contained by the write must be
96*7c478bd9Sstevel@tonic-gate  * copied before being overwritten by consulting the candidate bitmap
97*7c478bd9Sstevel@tonic-gate  * described above, and the "hastrans bitmap" which tells it whether the chunk
98*7c478bd9Sstevel@tonic-gate  * has been copied already or not.  If the chunk is a candidate but has not
99*7c478bd9Sstevel@tonic-gate  * been copied, it reads the old data in and adds it to a queue.  The
100*7c478bd9Sstevel@tonic-gate  * old data can then be overwritten with the new data.  An asynchronous
101*7c478bd9Sstevel@tonic-gate  * task queue is dispatched for each old chunk read in which writes the old
102*7c478bd9Sstevel@tonic-gate  * data to the backing file specified at snapshot creation time.  The
103*7c478bd9Sstevel@tonic-gate  * backing file is a sparse file the same size as the file system that
104*7c478bd9Sstevel@tonic-gate  * contains the old data at the offset that data originally had in the
105*7c478bd9Sstevel@tonic-gate  * file system.  If the queue containing in-memory chunks gets too large,
106*7c478bd9Sstevel@tonic-gate  * writes to the file system may be throttled by a semaphore until the
107*7c478bd9Sstevel@tonic-gate  * task queues have a chance to push some of the chunks to the backing file.
108*7c478bd9Sstevel@tonic-gate  *
109*7c478bd9Sstevel@tonic-gate  * With the candidate bitmap, the hastrans bitmap, the data on the master
110*7c478bd9Sstevel@tonic-gate  * file system, and the old data in memory and in the backing file, the
111*7c478bd9Sstevel@tonic-gate  * snapshot pseudo-driver can piece together the original file system
112*7c478bd9Sstevel@tonic-gate  * information to satisfy read requests.  If the requested chunk is not a
113*7c478bd9Sstevel@tonic-gate  * candidate, it returns a zeroed buffer.  If the chunk is a candidate but
114*7c478bd9Sstevel@tonic-gate  * has not been copied it reads it from the master file system.  If it is a
115*7c478bd9Sstevel@tonic-gate  * candidate and has been copied, it either copies the data from the
116*7c478bd9Sstevel@tonic-gate  * in-memory queue or it reads it in from the backing file.  The result is
117*7c478bd9Sstevel@tonic-gate  * a replication of the original file system that can be backed up, mounted,
118*7c478bd9Sstevel@tonic-gate  * or manipulated by other file system utilities that work on a read-only
119*7c478bd9Sstevel@tonic-gate  * device.
120*7c478bd9Sstevel@tonic-gate  *
121*7c478bd9Sstevel@tonic-gate  * This module is divided into three roughly logical sections:
122*7c478bd9Sstevel@tonic-gate  *
123*7c478bd9Sstevel@tonic-gate  *     - The snapshot driver, which is a character/block driver
124*7c478bd9Sstevel@tonic-gate  *       representing the snapshot itself.  These routines are
125*7c478bd9Sstevel@tonic-gate  *       prefixed with "snap_".
126*7c478bd9Sstevel@tonic-gate  *
127*7c478bd9Sstevel@tonic-gate  *     - The library routines that are defined in fssnap_if.h that
128*7c478bd9Sstevel@tonic-gate  *       are used by file systems that use this snapshot implementation.
129*7c478bd9Sstevel@tonic-gate  *       These functions are prefixed with "fssnap_" and are called through
130*7c478bd9Sstevel@tonic-gate  *       a function vector from the file system.
131*7c478bd9Sstevel@tonic-gate  *
132*7c478bd9Sstevel@tonic-gate  *     - The helper routines used by the snapshot driver and the fssnap
133*7c478bd9Sstevel@tonic-gate  *       library routines for managing the translation table and other
134*7c478bd9Sstevel@tonic-gate  *       useful functions.  These routines are all static and are
135*7c478bd9Sstevel@tonic-gate  *       prefixed with either "fssnap_" or "transtbl_" if they
136*7c478bd9Sstevel@tonic-gate  *       are specifically used for translation table activities.
137*7c478bd9Sstevel@tonic-gate  */
138*7c478bd9Sstevel@tonic-gate 
139*7c478bd9Sstevel@tonic-gate static dev_info_t		*fssnap_dip = NULL;
140*7c478bd9Sstevel@tonic-gate static struct snapshot_id	*snapshot = NULL;
141*7c478bd9Sstevel@tonic-gate static struct snapshot_id	snap_ctl;
142*7c478bd9Sstevel@tonic-gate static int			num_snapshots = 0;
143*7c478bd9Sstevel@tonic-gate static kmutex_t			snapshot_mutex;
144*7c478bd9Sstevel@tonic-gate static char			snapname[] = SNAP_NAME;
145*7c478bd9Sstevel@tonic-gate 
146*7c478bd9Sstevel@tonic-gate /* "tunable" parameters */
147*7c478bd9Sstevel@tonic-gate static int		fssnap_taskq_nthreads = FSSNAP_TASKQ_THREADS;
148*7c478bd9Sstevel@tonic-gate static uint_t		fssnap_max_mem_chunks = FSSNAP_MAX_MEM_CHUNKS;
149*7c478bd9Sstevel@tonic-gate static int		fssnap_taskq_maxtasks = FSSNAP_TASKQ_MAXTASKS;
150*7c478bd9Sstevel@tonic-gate 
151*7c478bd9Sstevel@tonic-gate /* static function prototypes */
152*7c478bd9Sstevel@tonic-gate 
153*7c478bd9Sstevel@tonic-gate /* snapshot driver */
154*7c478bd9Sstevel@tonic-gate static int snap_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
155*7c478bd9Sstevel@tonic-gate static int snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
156*7c478bd9Sstevel@tonic-gate static int snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
157*7c478bd9Sstevel@tonic-gate static int snap_open(dev_t *devp, int flag, int otyp, cred_t *cred);
158*7c478bd9Sstevel@tonic-gate static int snap_close(dev_t dev, int flag, int otyp, cred_t *cred);
159*7c478bd9Sstevel@tonic-gate static int snap_strategy(struct buf *bp);
160*7c478bd9Sstevel@tonic-gate static int snap_read(dev_t dev, struct uio *uiop, cred_t *credp);
161*7c478bd9Sstevel@tonic-gate static int snap_print(dev_t dev, char *str);
162*7c478bd9Sstevel@tonic-gate static int snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
163*7c478bd9Sstevel@tonic-gate     cred_t *credp, int *rvalp);
164*7c478bd9Sstevel@tonic-gate static int snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
165*7c478bd9Sstevel@tonic-gate     int flags, char *name, caddr_t valuep, int *lengthp);
166*7c478bd9Sstevel@tonic-gate static int snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk,
167*7c478bd9Sstevel@tonic-gate     int offset, int len, char *buffer);
168*7c478bd9Sstevel@tonic-gate 
169*7c478bd9Sstevel@tonic-gate 
170*7c478bd9Sstevel@tonic-gate /* fssnap interface implementations (see fssnap_if.h) */
171*7c478bd9Sstevel@tonic-gate static void fssnap_strategy_impl(void *, struct buf *);
172*7c478bd9Sstevel@tonic-gate static void *fssnap_create_impl(chunknumber_t, uint_t, u_offset_t,
173*7c478bd9Sstevel@tonic-gate     struct vnode *, int, struct vnode **, char *, u_offset_t);
174*7c478bd9Sstevel@tonic-gate static void fssnap_set_candidate_impl(void *, chunknumber_t);
175*7c478bd9Sstevel@tonic-gate static int fssnap_is_candidate_impl(void *, u_offset_t);
176*7c478bd9Sstevel@tonic-gate static int fssnap_create_done_impl(void *);
177*7c478bd9Sstevel@tonic-gate static int fssnap_delete_impl(void *);
178*7c478bd9Sstevel@tonic-gate 
179*7c478bd9Sstevel@tonic-gate /* fssnap interface support routines */
180*7c478bd9Sstevel@tonic-gate static int  fssnap_translate(struct snapshot_id **, struct buf *);
181*7c478bd9Sstevel@tonic-gate static void fssnap_write_taskq(void *);
182*7c478bd9Sstevel@tonic-gate static void fssnap_create_kstats(snapshot_id_t *, int, const char *,
183*7c478bd9Sstevel@tonic-gate     const char *);
184*7c478bd9Sstevel@tonic-gate static int  fssnap_update_kstat_num(kstat_t *, int);
185*7c478bd9Sstevel@tonic-gate static void fssnap_delete_kstats(struct cow_info *);
186*7c478bd9Sstevel@tonic-gate 
187*7c478bd9Sstevel@tonic-gate /* translation table prototypes */
188*7c478bd9Sstevel@tonic-gate static cow_map_node_t *transtbl_add(cow_map_t *, chunknumber_t, caddr_t);
189*7c478bd9Sstevel@tonic-gate static cow_map_node_t *transtbl_get(cow_map_t *, chunknumber_t);
190*7c478bd9Sstevel@tonic-gate static void transtbl_delete(cow_map_t *, cow_map_node_t *);
191*7c478bd9Sstevel@tonic-gate static void transtbl_free(cow_map_t *);
192*7c478bd9Sstevel@tonic-gate 
193*7c478bd9Sstevel@tonic-gate static kstat_t *fssnap_highwater_kstat;
194*7c478bd9Sstevel@tonic-gate 
195*7c478bd9Sstevel@tonic-gate /* ************************************************************************ */
196*7c478bd9Sstevel@tonic-gate 
197*7c478bd9Sstevel@tonic-gate /* Device and Module Structures */
198*7c478bd9Sstevel@tonic-gate 
199*7c478bd9Sstevel@tonic-gate static struct cb_ops snap_cb_ops = {
200*7c478bd9Sstevel@tonic-gate 	snap_open,
201*7c478bd9Sstevel@tonic-gate 	snap_close,
202*7c478bd9Sstevel@tonic-gate 	snap_strategy,
203*7c478bd9Sstevel@tonic-gate 	snap_print,
204*7c478bd9Sstevel@tonic-gate 	nodev,		/* no snap_dump */
205*7c478bd9Sstevel@tonic-gate 	snap_read,
206*7c478bd9Sstevel@tonic-gate 	nodev,		/* no snap_write */
207*7c478bd9Sstevel@tonic-gate 	snap_ioctl,
208*7c478bd9Sstevel@tonic-gate 	nodev,		/* no snap_devmap */
209*7c478bd9Sstevel@tonic-gate 	nodev,		/* no snap_mmap   */
210*7c478bd9Sstevel@tonic-gate 	nodev,		/* no snap_segmap */
211*7c478bd9Sstevel@tonic-gate 	nochpoll,
212*7c478bd9Sstevel@tonic-gate 	snap_prop_op,
213*7c478bd9Sstevel@tonic-gate 	NULL,		/* streamtab */
214*7c478bd9Sstevel@tonic-gate 	D_64BIT | D_NEW | D_MP, /* driver compatibility */
215*7c478bd9Sstevel@tonic-gate 	CB_REV,
216*7c478bd9Sstevel@tonic-gate 	nodev,		/* async I/O read entry point */
217*7c478bd9Sstevel@tonic-gate 	nodev		/* async I/O write entry point */
218*7c478bd9Sstevel@tonic-gate };
219*7c478bd9Sstevel@tonic-gate 
220*7c478bd9Sstevel@tonic-gate static struct dev_ops snap_ops = {
221*7c478bd9Sstevel@tonic-gate 	DEVO_REV,
222*7c478bd9Sstevel@tonic-gate 	0,			/* ref count */
223*7c478bd9Sstevel@tonic-gate 	snap_getinfo,
224*7c478bd9Sstevel@tonic-gate 	nulldev,		/* snap_identify obsolete */
225*7c478bd9Sstevel@tonic-gate 	nulldev,		/* no snap_probe */
226*7c478bd9Sstevel@tonic-gate 	snap_attach,
227*7c478bd9Sstevel@tonic-gate 	snap_detach,
228*7c478bd9Sstevel@tonic-gate 	nodev,			/* no snap_reset */
229*7c478bd9Sstevel@tonic-gate 	&snap_cb_ops,
230*7c478bd9Sstevel@tonic-gate 	(struct bus_ops *)NULL,
231*7c478bd9Sstevel@tonic-gate 	nulldev			/* no snap_power() */
232*7c478bd9Sstevel@tonic-gate };
233*7c478bd9Sstevel@tonic-gate 
234*7c478bd9Sstevel@tonic-gate extern struct mod_ops mod_driverops;
235*7c478bd9Sstevel@tonic-gate 
236*7c478bd9Sstevel@tonic-gate static struct modldrv md = {
237*7c478bd9Sstevel@tonic-gate 	&mod_driverops, /* Type of module. This is a driver */
238*7c478bd9Sstevel@tonic-gate 	"snapshot driver %I%", 	/* Name of the module */
239*7c478bd9Sstevel@tonic-gate 	&snap_ops,
240*7c478bd9Sstevel@tonic-gate };
241*7c478bd9Sstevel@tonic-gate 
242*7c478bd9Sstevel@tonic-gate static struct modlinkage ml = {
243*7c478bd9Sstevel@tonic-gate 	MODREV_1,
244*7c478bd9Sstevel@tonic-gate 	&md,
245*7c478bd9Sstevel@tonic-gate 	NULL
246*7c478bd9Sstevel@tonic-gate };
247*7c478bd9Sstevel@tonic-gate 
248*7c478bd9Sstevel@tonic-gate static void *statep;
249*7c478bd9Sstevel@tonic-gate 
250*7c478bd9Sstevel@tonic-gate int
251*7c478bd9Sstevel@tonic-gate _init(void)
252*7c478bd9Sstevel@tonic-gate {
253*7c478bd9Sstevel@tonic-gate 	int	error;
254*7c478bd9Sstevel@tonic-gate 	kstat_t	*ksp;
255*7c478bd9Sstevel@tonic-gate 	kstat_named_t	*ksdata;
256*7c478bd9Sstevel@tonic-gate 
257*7c478bd9Sstevel@tonic-gate 	error = ddi_soft_state_init(&statep, sizeof (struct snapshot_id *), 1);
258*7c478bd9Sstevel@tonic-gate 	if (error) {
259*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "_init: failed to init ddi_soft_state.");
260*7c478bd9Sstevel@tonic-gate 		return (error);
261*7c478bd9Sstevel@tonic-gate 	}
262*7c478bd9Sstevel@tonic-gate 
263*7c478bd9Sstevel@tonic-gate 	error = mod_install(&ml);
264*7c478bd9Sstevel@tonic-gate 
265*7c478bd9Sstevel@tonic-gate 	if (error) {
266*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "_init: failed to mod_install.");
267*7c478bd9Sstevel@tonic-gate 		ddi_soft_state_fini(&statep);
268*7c478bd9Sstevel@tonic-gate 		return (error);
269*7c478bd9Sstevel@tonic-gate 	}
270*7c478bd9Sstevel@tonic-gate 
271*7c478bd9Sstevel@tonic-gate 	/*
272*7c478bd9Sstevel@tonic-gate 	 * Fill in the snapshot operations vector for file systems
273*7c478bd9Sstevel@tonic-gate 	 * (defined in fssnap_if.c)
274*7c478bd9Sstevel@tonic-gate 	 */
275*7c478bd9Sstevel@tonic-gate 
276*7c478bd9Sstevel@tonic-gate 	snapops.fssnap_create = fssnap_create_impl;
277*7c478bd9Sstevel@tonic-gate 	snapops.fssnap_set_candidate = fssnap_set_candidate_impl;
278*7c478bd9Sstevel@tonic-gate 	snapops.fssnap_is_candidate = fssnap_is_candidate_impl;
279*7c478bd9Sstevel@tonic-gate 	snapops.fssnap_create_done = fssnap_create_done_impl;
280*7c478bd9Sstevel@tonic-gate 	snapops.fssnap_delete = fssnap_delete_impl;
281*7c478bd9Sstevel@tonic-gate 	snapops.fssnap_strategy = fssnap_strategy_impl;
282*7c478bd9Sstevel@tonic-gate 
283*7c478bd9Sstevel@tonic-gate 	mutex_init(&snapshot_mutex, NULL, MUTEX_DEFAULT, NULL);
284*7c478bd9Sstevel@tonic-gate 
285*7c478bd9Sstevel@tonic-gate 	/*
286*7c478bd9Sstevel@tonic-gate 	 * Initialize the fssnap highwater kstat
287*7c478bd9Sstevel@tonic-gate 	 */
288*7c478bd9Sstevel@tonic-gate 	ksp = kstat_create(snapname, 0, FSSNAP_KSTAT_HIGHWATER, "misc",
289*7c478bd9Sstevel@tonic-gate 	    KSTAT_TYPE_NAMED, 1, 0);
290*7c478bd9Sstevel@tonic-gate 	if (ksp != NULL) {
291*7c478bd9Sstevel@tonic-gate 		ksdata = (kstat_named_t *)ksp->ks_data;
292*7c478bd9Sstevel@tonic-gate 		kstat_named_init(ksdata, FSSNAP_KSTAT_HIGHWATER,
293*7c478bd9Sstevel@tonic-gate 		    KSTAT_DATA_UINT32);
294*7c478bd9Sstevel@tonic-gate 		ksdata->value.ui32 = 0;
295*7c478bd9Sstevel@tonic-gate 		kstat_install(ksp);
296*7c478bd9Sstevel@tonic-gate 	} else {
297*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "_init: failed to create highwater kstat.");
298*7c478bd9Sstevel@tonic-gate 	}
299*7c478bd9Sstevel@tonic-gate 	fssnap_highwater_kstat = ksp;
300*7c478bd9Sstevel@tonic-gate 
301*7c478bd9Sstevel@tonic-gate 	return (0);
302*7c478bd9Sstevel@tonic-gate }
303*7c478bd9Sstevel@tonic-gate 
304*7c478bd9Sstevel@tonic-gate int
305*7c478bd9Sstevel@tonic-gate _info(struct modinfo *modinfop)
306*7c478bd9Sstevel@tonic-gate {
307*7c478bd9Sstevel@tonic-gate 	return (mod_info(&ml, modinfop));
308*7c478bd9Sstevel@tonic-gate }
309*7c478bd9Sstevel@tonic-gate 
310*7c478bd9Sstevel@tonic-gate int
311*7c478bd9Sstevel@tonic-gate _fini(void)
312*7c478bd9Sstevel@tonic-gate {
313*7c478bd9Sstevel@tonic-gate 	int	error;
314*7c478bd9Sstevel@tonic-gate 
315*7c478bd9Sstevel@tonic-gate 	error = mod_remove(&ml);
316*7c478bd9Sstevel@tonic-gate 	if (error)
317*7c478bd9Sstevel@tonic-gate 		return (error);
318*7c478bd9Sstevel@tonic-gate 	ddi_soft_state_fini(&statep);
319*7c478bd9Sstevel@tonic-gate 
320*7c478bd9Sstevel@tonic-gate 	/*
321*7c478bd9Sstevel@tonic-gate 	 * delete the fssnap highwater kstat
322*7c478bd9Sstevel@tonic-gate 	 */
323*7c478bd9Sstevel@tonic-gate 	kstat_delete(fssnap_highwater_kstat);
324*7c478bd9Sstevel@tonic-gate 
325*7c478bd9Sstevel@tonic-gate 	mutex_destroy(&snapshot_mutex);
326*7c478bd9Sstevel@tonic-gate 
327*7c478bd9Sstevel@tonic-gate 	/* Clear out the file system operations vector */
328*7c478bd9Sstevel@tonic-gate 	snapops.fssnap_create = NULL;
329*7c478bd9Sstevel@tonic-gate 	snapops.fssnap_set_candidate = NULL;
330*7c478bd9Sstevel@tonic-gate 	snapops.fssnap_create_done = NULL;
331*7c478bd9Sstevel@tonic-gate 	snapops.fssnap_delete = NULL;
332*7c478bd9Sstevel@tonic-gate 	snapops.fssnap_strategy = NULL;
333*7c478bd9Sstevel@tonic-gate 
334*7c478bd9Sstevel@tonic-gate 	return (0);
335*7c478bd9Sstevel@tonic-gate }
336*7c478bd9Sstevel@tonic-gate 
337*7c478bd9Sstevel@tonic-gate /* ************************************************************************ */
338*7c478bd9Sstevel@tonic-gate 
339*7c478bd9Sstevel@tonic-gate /*
340*7c478bd9Sstevel@tonic-gate  * Snapshot Driver Routines
341*7c478bd9Sstevel@tonic-gate  *
 * This section implements the snapshot character and block drivers.  The
 * device will appear to be a consistent read-only file system to
 * applications that wish to back it up or mount it.  The snapshot driver
 * communicates with the file system through the translation table, which
 * tells the snapshot driver where to find the data necessary to piece
 * together the frozen file system.  The data may either be on the master
 * device (no translation exists), in memory (a translation exists but has
 * not been flushed to the backing store), or in the backing store file.
 * The read request may require the snapshot driver to retrieve data from
 * several different places and piece it together to look like a single
 * contiguous read.
353*7c478bd9Sstevel@tonic-gate  *
354*7c478bd9Sstevel@tonic-gate  * The device minor number corresponds to the snapshot number in the list of
355*7c478bd9Sstevel@tonic-gate  * snapshot identifiers.  The soft state for each minor number is simply a
356*7c478bd9Sstevel@tonic-gate  * pointer to the snapshot id, which holds all of the snapshot state.  One
357*7c478bd9Sstevel@tonic-gate  * minor number is designated as the control device.  All snapshot create
358*7c478bd9Sstevel@tonic-gate  * and delete requests go through the control device to ensure this module
359*7c478bd9Sstevel@tonic-gate  * is properly loaded and attached before the file system starts calling
360*7c478bd9Sstevel@tonic-gate  * routines defined here.
361*7c478bd9Sstevel@tonic-gate  */
362*7c478bd9Sstevel@tonic-gate 
363*7c478bd9Sstevel@tonic-gate 
364*7c478bd9Sstevel@tonic-gate /*
365*7c478bd9Sstevel@tonic-gate  * snap_getinfo() - snapshot driver getinfo(9E) routine
366*7c478bd9Sstevel@tonic-gate  *
367*7c478bd9Sstevel@tonic-gate  */
368*7c478bd9Sstevel@tonic-gate /*ARGSUSED*/
369*7c478bd9Sstevel@tonic-gate static int
370*7c478bd9Sstevel@tonic-gate snap_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
371*7c478bd9Sstevel@tonic-gate {
372*7c478bd9Sstevel@tonic-gate 	switch (infocmd) {
373*7c478bd9Sstevel@tonic-gate 	case DDI_INFO_DEVT2DEVINFO:
374*7c478bd9Sstevel@tonic-gate 		*result = fssnap_dip;
375*7c478bd9Sstevel@tonic-gate 		return (DDI_SUCCESS);
376*7c478bd9Sstevel@tonic-gate 	case DDI_INFO_DEVT2INSTANCE:
377*7c478bd9Sstevel@tonic-gate 		*result = 0;	/* we only have one instance */
378*7c478bd9Sstevel@tonic-gate 		return (DDI_SUCCESS);
379*7c478bd9Sstevel@tonic-gate 	}
380*7c478bd9Sstevel@tonic-gate 	return (DDI_FAILURE);
381*7c478bd9Sstevel@tonic-gate }
382*7c478bd9Sstevel@tonic-gate 
383*7c478bd9Sstevel@tonic-gate /*
384*7c478bd9Sstevel@tonic-gate  * snap_attach() - snapshot driver attach(9E) routine
385*7c478bd9Sstevel@tonic-gate  *
386*7c478bd9Sstevel@tonic-gate  *    sets up snapshot control device and control state.  The control state
387*7c478bd9Sstevel@tonic-gate  *    is a pointer to an "anonymous" snapshot_id for tracking opens and closes
388*7c478bd9Sstevel@tonic-gate  */
389*7c478bd9Sstevel@tonic-gate static int
390*7c478bd9Sstevel@tonic-gate snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
391*7c478bd9Sstevel@tonic-gate {
392*7c478bd9Sstevel@tonic-gate 	int			error;
393*7c478bd9Sstevel@tonic-gate 
394*7c478bd9Sstevel@tonic-gate 	switch (cmd) {
395*7c478bd9Sstevel@tonic-gate 	case DDI_ATTACH:
396*7c478bd9Sstevel@tonic-gate 		/* create the control device */
397*7c478bd9Sstevel@tonic-gate 		error = ddi_create_priv_minor_node(dip, SNAP_CTL_NODE, S_IFCHR,
398*7c478bd9Sstevel@tonic-gate 		    SNAP_CTL_MINOR, DDI_PSEUDO, PRIVONLY_DEV,
399*7c478bd9Sstevel@tonic-gate 		    PRIV_SYS_CONFIG, PRIV_SYS_CONFIG, 0666);
400*7c478bd9Sstevel@tonic-gate 		if (error == DDI_FAILURE) {
401*7c478bd9Sstevel@tonic-gate 			return (DDI_FAILURE);
402*7c478bd9Sstevel@tonic-gate 		}
403*7c478bd9Sstevel@tonic-gate 
404*7c478bd9Sstevel@tonic-gate 		rw_init(&snap_ctl.sid_rwlock, NULL, RW_DEFAULT, NULL);
405*7c478bd9Sstevel@tonic-gate 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
406*7c478bd9Sstevel@tonic-gate 		fssnap_dip = dip;
407*7c478bd9Sstevel@tonic-gate 		snap_ctl.sid_snapnumber = SNAP_CTL_MINOR;
408*7c478bd9Sstevel@tonic-gate 		/* the control sid is not linked into the snapshot list */
409*7c478bd9Sstevel@tonic-gate 		snap_ctl.sid_next = NULL;
410*7c478bd9Sstevel@tonic-gate 		snap_ctl.sid_cowinfo = NULL;
411*7c478bd9Sstevel@tonic-gate 		snap_ctl.sid_flags = 0;
412*7c478bd9Sstevel@tonic-gate 		rw_exit(&snap_ctl.sid_rwlock);
413*7c478bd9Sstevel@tonic-gate 		ddi_report_dev(dip);
414*7c478bd9Sstevel@tonic-gate 
415*7c478bd9Sstevel@tonic-gate 		return (DDI_SUCCESS);
416*7c478bd9Sstevel@tonic-gate 	case DDI_PM_RESUME:
417*7c478bd9Sstevel@tonic-gate 		return (DDI_SUCCESS);
418*7c478bd9Sstevel@tonic-gate 
419*7c478bd9Sstevel@tonic-gate 	case DDI_RESUME:
420*7c478bd9Sstevel@tonic-gate 		return (DDI_SUCCESS);
421*7c478bd9Sstevel@tonic-gate 
422*7c478bd9Sstevel@tonic-gate 	default:
423*7c478bd9Sstevel@tonic-gate 		return (DDI_FAILURE);
424*7c478bd9Sstevel@tonic-gate 	}
425*7c478bd9Sstevel@tonic-gate }
426*7c478bd9Sstevel@tonic-gate 
427*7c478bd9Sstevel@tonic-gate /*
428*7c478bd9Sstevel@tonic-gate  * snap_detach() - snapshot driver detach(9E) routine
429*7c478bd9Sstevel@tonic-gate  *
430*7c478bd9Sstevel@tonic-gate  *    destroys snapshot control device and control state.  If any snapshots
431*7c478bd9Sstevel@tonic-gate  *    are active (ie. num_snapshots != 0), the device will refuse to detach.
432*7c478bd9Sstevel@tonic-gate  */
433*7c478bd9Sstevel@tonic-gate static int
434*7c478bd9Sstevel@tonic-gate snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
435*7c478bd9Sstevel@tonic-gate {
436*7c478bd9Sstevel@tonic-gate 	struct snapshot_id *sidp, *sidnextp;
437*7c478bd9Sstevel@tonic-gate 
438*7c478bd9Sstevel@tonic-gate 	switch (cmd) {
439*7c478bd9Sstevel@tonic-gate 	case DDI_DETACH:
440*7c478bd9Sstevel@tonic-gate 		/* do not detach if the device is active */
441*7c478bd9Sstevel@tonic-gate 		mutex_enter(&snapshot_mutex);
442*7c478bd9Sstevel@tonic-gate 		if ((num_snapshots != 0) ||
443*7c478bd9Sstevel@tonic-gate 		    ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0)) {
444*7c478bd9Sstevel@tonic-gate 			mutex_exit(&snapshot_mutex);
445*7c478bd9Sstevel@tonic-gate 			return (DDI_FAILURE);
446*7c478bd9Sstevel@tonic-gate 		}
447*7c478bd9Sstevel@tonic-gate 
448*7c478bd9Sstevel@tonic-gate 		/* free up the snapshot list */
449*7c478bd9Sstevel@tonic-gate 		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
450*7c478bd9Sstevel@tonic-gate 			ASSERT(SID_AVAILABLE(sidp) &&
451*7c478bd9Sstevel@tonic-gate 			    !RW_LOCK_HELD(&sidp->sid_rwlock));
452*7c478bd9Sstevel@tonic-gate 			sidnextp = sidp->sid_next;
453*7c478bd9Sstevel@tonic-gate 			rw_destroy(&sidp->sid_rwlock);
454*7c478bd9Sstevel@tonic-gate 			kmem_free(sidp, sizeof (struct snapshot_id));
455*7c478bd9Sstevel@tonic-gate 		}
456*7c478bd9Sstevel@tonic-gate 		snapshot = NULL;
457*7c478bd9Sstevel@tonic-gate 
458*7c478bd9Sstevel@tonic-gate 		/* delete the control device */
459*7c478bd9Sstevel@tonic-gate 		ddi_remove_minor_node(dip, SNAP_CTL_NODE);
460*7c478bd9Sstevel@tonic-gate 		fssnap_dip = NULL;
461*7c478bd9Sstevel@tonic-gate 
462*7c478bd9Sstevel@tonic-gate 		ASSERT((snap_ctl.sid_flags & SID_CHAR_BUSY) == 0);
463*7c478bd9Sstevel@tonic-gate 		rw_destroy(&snap_ctl.sid_rwlock);
464*7c478bd9Sstevel@tonic-gate 		mutex_exit(&snapshot_mutex);
465*7c478bd9Sstevel@tonic-gate 
466*7c478bd9Sstevel@tonic-gate 		return (DDI_SUCCESS);
467*7c478bd9Sstevel@tonic-gate 
468*7c478bd9Sstevel@tonic-gate 	default:
469*7c478bd9Sstevel@tonic-gate 		return (DDI_FAILURE);
470*7c478bd9Sstevel@tonic-gate 	}
471*7c478bd9Sstevel@tonic-gate }
472*7c478bd9Sstevel@tonic-gate 
473*7c478bd9Sstevel@tonic-gate /*
474*7c478bd9Sstevel@tonic-gate  * snap_open() - snapshot driver open(9E) routine
475*7c478bd9Sstevel@tonic-gate  *
476*7c478bd9Sstevel@tonic-gate  *     marks the snapshot id as busy so it will not be recycled when deleted
477*7c478bd9Sstevel@tonic-gate  *     until the snapshot is closed.
478*7c478bd9Sstevel@tonic-gate  */
479*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
480*7c478bd9Sstevel@tonic-gate static int
481*7c478bd9Sstevel@tonic-gate snap_open(dev_t *devp, int flag, int otyp, cred_t *cred)
482*7c478bd9Sstevel@tonic-gate {
483*7c478bd9Sstevel@tonic-gate 	minor_t	minor;
484*7c478bd9Sstevel@tonic-gate 	struct snapshot_id **sidpp, *sidp;
485*7c478bd9Sstevel@tonic-gate 
486*7c478bd9Sstevel@tonic-gate 	/* snapshots are read-only */
487*7c478bd9Sstevel@tonic-gate 	if (flag & FWRITE)
488*7c478bd9Sstevel@tonic-gate 		return (EROFS);
489*7c478bd9Sstevel@tonic-gate 
490*7c478bd9Sstevel@tonic-gate 	minor = getminor(*devp);
491*7c478bd9Sstevel@tonic-gate 
492*7c478bd9Sstevel@tonic-gate 	if (minor == SNAP_CTL_MINOR) {
493*7c478bd9Sstevel@tonic-gate 		/* control device must be opened exclusively */
494*7c478bd9Sstevel@tonic-gate 		if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR))
495*7c478bd9Sstevel@tonic-gate 			return (EINVAL);
496*7c478bd9Sstevel@tonic-gate 
497*7c478bd9Sstevel@tonic-gate 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
498*7c478bd9Sstevel@tonic-gate 		if ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0) {
499*7c478bd9Sstevel@tonic-gate 			rw_exit(&snap_ctl.sid_rwlock);
500*7c478bd9Sstevel@tonic-gate 			return (EBUSY);
501*7c478bd9Sstevel@tonic-gate 		}
502*7c478bd9Sstevel@tonic-gate 
503*7c478bd9Sstevel@tonic-gate 		snap_ctl.sid_flags |= SID_CHAR_BUSY;
504*7c478bd9Sstevel@tonic-gate 		rw_exit(&snap_ctl.sid_rwlock);
505*7c478bd9Sstevel@tonic-gate 
506*7c478bd9Sstevel@tonic-gate 		return (0);
507*7c478bd9Sstevel@tonic-gate 	}
508*7c478bd9Sstevel@tonic-gate 
509*7c478bd9Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
510*7c478bd9Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL)
511*7c478bd9Sstevel@tonic-gate 		return (ENXIO);
512*7c478bd9Sstevel@tonic-gate 	sidp = *sidpp;
513*7c478bd9Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
514*7c478bd9Sstevel@tonic-gate 
515*7c478bd9Sstevel@tonic-gate 	if ((flag & FEXCL) && SID_BUSY(sidp)) {
516*7c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
517*7c478bd9Sstevel@tonic-gate 		return (EAGAIN);
518*7c478bd9Sstevel@tonic-gate 	}
519*7c478bd9Sstevel@tonic-gate 
520*7c478bd9Sstevel@tonic-gate 	ASSERT(sidpp != NULL && sidp != NULL);
521*7c478bd9Sstevel@tonic-gate 	/* check to see if this snapshot has been killed on us */
522*7c478bd9Sstevel@tonic-gate 	if (SID_INACTIVE(sidp)) {
523*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_open: snapshot %d does not exist.",
524*7c478bd9Sstevel@tonic-gate 		    minor);
525*7c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
526*7c478bd9Sstevel@tonic-gate 		return (ENXIO);
527*7c478bd9Sstevel@tonic-gate 	}
528*7c478bd9Sstevel@tonic-gate 
529*7c478bd9Sstevel@tonic-gate 	switch (otyp) {
530*7c478bd9Sstevel@tonic-gate 	case OTYP_CHR:
531*7c478bd9Sstevel@tonic-gate 		sidp->sid_flags |= SID_CHAR_BUSY;
532*7c478bd9Sstevel@tonic-gate 		break;
533*7c478bd9Sstevel@tonic-gate 	case OTYP_BLK:
534*7c478bd9Sstevel@tonic-gate 		sidp->sid_flags |= SID_BLOCK_BUSY;
535*7c478bd9Sstevel@tonic-gate 		break;
536*7c478bd9Sstevel@tonic-gate 	default:
537*7c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
538*7c478bd9Sstevel@tonic-gate 		return (EINVAL);
539*7c478bd9Sstevel@tonic-gate 	}
540*7c478bd9Sstevel@tonic-gate 
541*7c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
542*7c478bd9Sstevel@tonic-gate 
543*7c478bd9Sstevel@tonic-gate 	/*
544*7c478bd9Sstevel@tonic-gate 	 * at this point if a valid snapshot was found then it has
545*7c478bd9Sstevel@tonic-gate 	 * been marked busy and we can use it.
546*7c478bd9Sstevel@tonic-gate 	 */
547*7c478bd9Sstevel@tonic-gate 	return (0);
548*7c478bd9Sstevel@tonic-gate }
549*7c478bd9Sstevel@tonic-gate 
550*7c478bd9Sstevel@tonic-gate /*
551*7c478bd9Sstevel@tonic-gate  * snap_close() - snapshot driver close(9E) routine
552*7c478bd9Sstevel@tonic-gate  *
553*7c478bd9Sstevel@tonic-gate  *    unsets the busy bits in the snapshot id.  If the snapshot has been
554*7c478bd9Sstevel@tonic-gate  *    deleted while the snapshot device was open, the close call will clean
555*7c478bd9Sstevel@tonic-gate  *    up the remaining state information.
556*7c478bd9Sstevel@tonic-gate  */
557*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
558*7c478bd9Sstevel@tonic-gate static int
559*7c478bd9Sstevel@tonic-gate snap_close(dev_t dev, int flag, int otyp, cred_t *cred)
560*7c478bd9Sstevel@tonic-gate {
561*7c478bd9Sstevel@tonic-gate 	struct snapshot_id	**sidpp, *sidp;
562*7c478bd9Sstevel@tonic-gate 	minor_t			minor;
563*7c478bd9Sstevel@tonic-gate 	char			name[20];
564*7c478bd9Sstevel@tonic-gate 
565*7c478bd9Sstevel@tonic-gate 	minor = getminor(dev);
566*7c478bd9Sstevel@tonic-gate 
567*7c478bd9Sstevel@tonic-gate 	/* if this is the control device, close it and return */
568*7c478bd9Sstevel@tonic-gate 	if (minor == SNAP_CTL_MINOR) {
569*7c478bd9Sstevel@tonic-gate 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
570*7c478bd9Sstevel@tonic-gate 		snap_ctl.sid_flags &= ~(SID_CHAR_BUSY);
571*7c478bd9Sstevel@tonic-gate 		rw_exit(&snap_ctl.sid_rwlock);
572*7c478bd9Sstevel@tonic-gate 		return (0);
573*7c478bd9Sstevel@tonic-gate 	}
574*7c478bd9Sstevel@tonic-gate 
575*7c478bd9Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
576*7c478bd9Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
577*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_close: could not find state for "
578*7c478bd9Sstevel@tonic-gate 		    "snapshot %d.", minor);
579*7c478bd9Sstevel@tonic-gate 		return (ENXIO);
580*7c478bd9Sstevel@tonic-gate 	}
581*7c478bd9Sstevel@tonic-gate 	sidp = *sidpp;
582*7c478bd9Sstevel@tonic-gate 	mutex_enter(&snapshot_mutex);
583*7c478bd9Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
584*7c478bd9Sstevel@tonic-gate 
585*7c478bd9Sstevel@tonic-gate 	/* Mark the snapshot as not being busy anymore */
586*7c478bd9Sstevel@tonic-gate 	switch (otyp) {
587*7c478bd9Sstevel@tonic-gate 	case OTYP_CHR:
588*7c478bd9Sstevel@tonic-gate 		sidp->sid_flags &= ~(SID_CHAR_BUSY);
589*7c478bd9Sstevel@tonic-gate 		break;
590*7c478bd9Sstevel@tonic-gate 	case OTYP_BLK:
591*7c478bd9Sstevel@tonic-gate 		sidp->sid_flags &= ~(SID_BLOCK_BUSY);
592*7c478bd9Sstevel@tonic-gate 		break;
593*7c478bd9Sstevel@tonic-gate 	default:
594*7c478bd9Sstevel@tonic-gate 		mutex_exit(&snapshot_mutex);
595*7c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
596*7c478bd9Sstevel@tonic-gate 		return (EINVAL);
597*7c478bd9Sstevel@tonic-gate 	}
598*7c478bd9Sstevel@tonic-gate 
599*7c478bd9Sstevel@tonic-gate 	if (SID_AVAILABLE(sidp)) {
600*7c478bd9Sstevel@tonic-gate 		/*
601*7c478bd9Sstevel@tonic-gate 		 * if this is the last close on a snapshot that has been
602*7c478bd9Sstevel@tonic-gate 		 * deleted, then free up the soft state.  The snapdelete
603*7c478bd9Sstevel@tonic-gate 		 * ioctl does not free this when the device is in use so
604*7c478bd9Sstevel@tonic-gate 		 * we do it here after the last reference goes away.
605*7c478bd9Sstevel@tonic-gate 		 */
606*7c478bd9Sstevel@tonic-gate 
607*7c478bd9Sstevel@tonic-gate 		/* remove the device nodes */
608*7c478bd9Sstevel@tonic-gate 		ASSERT(fssnap_dip != NULL);
609*7c478bd9Sstevel@tonic-gate 		(void) snprintf(name, sizeof (name), "%d",
610*7c478bd9Sstevel@tonic-gate 		    sidp->sid_snapnumber);
611*7c478bd9Sstevel@tonic-gate 		ddi_remove_minor_node(fssnap_dip, name);
612*7c478bd9Sstevel@tonic-gate 		(void) snprintf(name, sizeof (name), "%d,raw",
613*7c478bd9Sstevel@tonic-gate 		    sidp->sid_snapnumber);
614*7c478bd9Sstevel@tonic-gate 		ddi_remove_minor_node(fssnap_dip, name);
615*7c478bd9Sstevel@tonic-gate 
616*7c478bd9Sstevel@tonic-gate 		/* delete the state structure */
617*7c478bd9Sstevel@tonic-gate 		ddi_soft_state_free(statep, sidp->sid_snapnumber);
618*7c478bd9Sstevel@tonic-gate 		num_snapshots--;
619*7c478bd9Sstevel@tonic-gate 	}
620*7c478bd9Sstevel@tonic-gate 
621*7c478bd9Sstevel@tonic-gate 	mutex_exit(&snapshot_mutex);
622*7c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
623*7c478bd9Sstevel@tonic-gate 
624*7c478bd9Sstevel@tonic-gate 	return (0);
625*7c478bd9Sstevel@tonic-gate }
626*7c478bd9Sstevel@tonic-gate 
627*7c478bd9Sstevel@tonic-gate /*
628*7c478bd9Sstevel@tonic-gate  * snap_read() - snapshot driver read(9E) routine
629*7c478bd9Sstevel@tonic-gate  *
630*7c478bd9Sstevel@tonic-gate  *    reads data from the snapshot by calling snap_strategy() through physio()
631*7c478bd9Sstevel@tonic-gate  */
632*7c478bd9Sstevel@tonic-gate /* ARGSUSED */
633*7c478bd9Sstevel@tonic-gate static int
634*7c478bd9Sstevel@tonic-gate snap_read(dev_t dev, struct uio *uiop, cred_t *credp)
635*7c478bd9Sstevel@tonic-gate {
636*7c478bd9Sstevel@tonic-gate 	minor_t		minor;
637*7c478bd9Sstevel@tonic-gate 	struct snapshot_id **sidpp;
638*7c478bd9Sstevel@tonic-gate 
639*7c478bd9Sstevel@tonic-gate 	minor = getminor(dev);
640*7c478bd9Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
641*7c478bd9Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
642*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
643*7c478bd9Sstevel@tonic-gate 		    "snap_read: could not find state for snapshot %d.", minor);
644*7c478bd9Sstevel@tonic-gate 		return (ENXIO);
645*7c478bd9Sstevel@tonic-gate 	}
646*7c478bd9Sstevel@tonic-gate 	return (physio(snap_strategy, NULL, dev, B_READ, minphys, uiop));
647*7c478bd9Sstevel@tonic-gate }
648*7c478bd9Sstevel@tonic-gate 
649*7c478bd9Sstevel@tonic-gate /*
650*7c478bd9Sstevel@tonic-gate  * snap_strategy() - snapshot driver strategy(9E) routine
651*7c478bd9Sstevel@tonic-gate  *
652*7c478bd9Sstevel@tonic-gate  *    cycles through each chunk in the requested buffer and calls
653*7c478bd9Sstevel@tonic-gate  *    snap_getchunk() on each chunk to retrieve it from the appropriate
654*7c478bd9Sstevel@tonic-gate  *    place.  Once all of the parts are put together the requested buffer
655*7c478bd9Sstevel@tonic-gate  *    is returned.  The snapshot driver is read-only, so a write is invalid.
656*7c478bd9Sstevel@tonic-gate  */
657*7c478bd9Sstevel@tonic-gate static int
658*7c478bd9Sstevel@tonic-gate snap_strategy(struct buf *bp)
659*7c478bd9Sstevel@tonic-gate {
660*7c478bd9Sstevel@tonic-gate 	struct snapshot_id **sidpp, *sidp;
661*7c478bd9Sstevel@tonic-gate 	minor_t		minor;
662*7c478bd9Sstevel@tonic-gate 	chunknumber_t	chunk;
663*7c478bd9Sstevel@tonic-gate 	int		off, len;
664*7c478bd9Sstevel@tonic-gate 	u_longlong_t	reqptr;
665*7c478bd9Sstevel@tonic-gate 	int		error = 0;
666*7c478bd9Sstevel@tonic-gate 	size_t		chunksz;
667*7c478bd9Sstevel@tonic-gate 	caddr_t		buf;
668*7c478bd9Sstevel@tonic-gate 
669*7c478bd9Sstevel@tonic-gate 	/* snapshot device is read-only */
670*7c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_WRITE) {
671*7c478bd9Sstevel@tonic-gate 		bioerror(bp, EROFS);
672*7c478bd9Sstevel@tonic-gate 		bp->b_resid = bp->b_bcount;
673*7c478bd9Sstevel@tonic-gate 		biodone(bp);
674*7c478bd9Sstevel@tonic-gate 		return (0);
675*7c478bd9Sstevel@tonic-gate 	}
676*7c478bd9Sstevel@tonic-gate 
677*7c478bd9Sstevel@tonic-gate 	minor = getminor(bp->b_edev);
678*7c478bd9Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
679*7c478bd9Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
680*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
681*7c478bd9Sstevel@tonic-gate 		    "snap_strategy: could not find state for snapshot %d.",
682*7c478bd9Sstevel@tonic-gate 		    minor);
683*7c478bd9Sstevel@tonic-gate 		bioerror(bp, ENXIO);
684*7c478bd9Sstevel@tonic-gate 		bp->b_resid = bp->b_bcount;
685*7c478bd9Sstevel@tonic-gate 		biodone(bp);
686*7c478bd9Sstevel@tonic-gate 		return (0);
687*7c478bd9Sstevel@tonic-gate 	}
688*7c478bd9Sstevel@tonic-gate 	sidp = *sidpp;
689*7c478bd9Sstevel@tonic-gate 	ASSERT(sidp);
690*7c478bd9Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_READER);
691*7c478bd9Sstevel@tonic-gate 
692*7c478bd9Sstevel@tonic-gate 	if (SID_INACTIVE(sidp)) {
693*7c478bd9Sstevel@tonic-gate 		bioerror(bp, ENXIO);
694*7c478bd9Sstevel@tonic-gate 		bp->b_resid = bp->b_bcount;
695*7c478bd9Sstevel@tonic-gate 		biodone(bp);
696*7c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
697*7c478bd9Sstevel@tonic-gate 		return (0);
698*7c478bd9Sstevel@tonic-gate 	}
699*7c478bd9Sstevel@tonic-gate 
700*7c478bd9Sstevel@tonic-gate 	if (bp->b_flags & (B_PAGEIO|B_PHYS))
701*7c478bd9Sstevel@tonic-gate 		bp_mapin(bp);
702*7c478bd9Sstevel@tonic-gate 
703*7c478bd9Sstevel@tonic-gate 	bp->b_resid = bp->b_bcount;
704*7c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_un.b_addr);
705*7c478bd9Sstevel@tonic-gate 	buf = bp->b_un.b_addr;
706*7c478bd9Sstevel@tonic-gate 
707*7c478bd9Sstevel@tonic-gate 	chunksz = sidp->sid_cowinfo->cow_map.cmap_chunksz;
708*7c478bd9Sstevel@tonic-gate 
709*7c478bd9Sstevel@tonic-gate 	/* reqptr is the current DEV_BSIZE offset into the device */
710*7c478bd9Sstevel@tonic-gate 	/* chunk is the chunk containing reqptr */
711*7c478bd9Sstevel@tonic-gate 	/* len is the length of the request (in the current chunk) in bytes */
712*7c478bd9Sstevel@tonic-gate 	/* off is the byte offset into the current chunk */
713*7c478bd9Sstevel@tonic-gate 	reqptr = bp->b_lblkno;
714*7c478bd9Sstevel@tonic-gate 	while (bp->b_resid > 0) {
715*7c478bd9Sstevel@tonic-gate 		chunk = dbtocowchunk(&sidp->sid_cowinfo->cow_map, reqptr);
716*7c478bd9Sstevel@tonic-gate 		off = (reqptr % (chunksz >> DEV_BSHIFT)) << DEV_BSHIFT;
717*7c478bd9Sstevel@tonic-gate 		len = min(chunksz - off, bp->b_resid);
718*7c478bd9Sstevel@tonic-gate 		ASSERT((off + len) <= chunksz);
719*7c478bd9Sstevel@tonic-gate 
720*7c478bd9Sstevel@tonic-gate 		if ((error = snap_getchunk(sidp, chunk, off, len, buf)) != 0) {
721*7c478bd9Sstevel@tonic-gate 			/*
722*7c478bd9Sstevel@tonic-gate 			 * EINVAL means the user tried to go out of range.
723*7c478bd9Sstevel@tonic-gate 			 * Anything else means it's likely that we're
724*7c478bd9Sstevel@tonic-gate 			 * confused.
725*7c478bd9Sstevel@tonic-gate 			 */
726*7c478bd9Sstevel@tonic-gate 			if (error != EINVAL) {
727*7c478bd9Sstevel@tonic-gate 				cmn_err(CE_WARN, "snap_strategy: error "
728*7c478bd9Sstevel@tonic-gate 				    "calling snap_getchunk, chunk = %llu, "
729*7c478bd9Sstevel@tonic-gate 				    "offset = %d, len = %d, resid = %lu, "
730*7c478bd9Sstevel@tonic-gate 				    "error = %d.",
731*7c478bd9Sstevel@tonic-gate 				    chunk, off, len, bp->b_resid, error);
732*7c478bd9Sstevel@tonic-gate 			}
733*7c478bd9Sstevel@tonic-gate 			bioerror(bp, error);
734*7c478bd9Sstevel@tonic-gate 			biodone(bp);
735*7c478bd9Sstevel@tonic-gate 			rw_exit(&sidp->sid_rwlock);
736*7c478bd9Sstevel@tonic-gate 			return (0);
737*7c478bd9Sstevel@tonic-gate 		}
738*7c478bd9Sstevel@tonic-gate 		bp->b_resid -= len;
739*7c478bd9Sstevel@tonic-gate 		reqptr += (len >> DEV_BSHIFT);
740*7c478bd9Sstevel@tonic-gate 		buf += len;
741*7c478bd9Sstevel@tonic-gate 	}
742*7c478bd9Sstevel@tonic-gate 
743*7c478bd9Sstevel@tonic-gate 	ASSERT(bp->b_resid == 0);
744*7c478bd9Sstevel@tonic-gate 	biodone(bp);
745*7c478bd9Sstevel@tonic-gate 
746*7c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
747*7c478bd9Sstevel@tonic-gate 	return (0);
748*7c478bd9Sstevel@tonic-gate }
749*7c478bd9Sstevel@tonic-gate 
750*7c478bd9Sstevel@tonic-gate /*
751*7c478bd9Sstevel@tonic-gate  * snap_getchunk() - helper function for snap_strategy()
752*7c478bd9Sstevel@tonic-gate  *
753*7c478bd9Sstevel@tonic-gate  *    gets the requested data from the appropriate place and fills in the
754*7c478bd9Sstevel@tonic-gate  *    buffer.  chunk is the chunk number of the request, offset is the
755*7c478bd9Sstevel@tonic-gate  *    offset into that chunk and must be less than the chunk size.  len is
756*7c478bd9Sstevel@tonic-gate  *    the length of the request starting at offset, and must not exceed a
757*7c478bd9Sstevel@tonic-gate  *    chunk boundary.  buffer is the address to copy the data to.  len
758*7c478bd9Sstevel@tonic-gate  *    bytes are copied into the buffer starting at the location specified.
759*7c478bd9Sstevel@tonic-gate  *
760*7c478bd9Sstevel@tonic-gate  *    A chunk is located according to the following algorithm:
761*7c478bd9Sstevel@tonic-gate  *        - If the chunk does not have a translation or is not a candidate
762*7c478bd9Sstevel@tonic-gate  *          for translation, it is read straight from the master device.
763*7c478bd9Sstevel@tonic-gate  *        - If the chunk does have a translation, then it is either on
764*7c478bd9Sstevel@tonic-gate  *          disk or in memory:
765*7c478bd9Sstevel@tonic-gate  *            o If it is in memory the requested data is simply copied out
766*7c478bd9Sstevel@tonic-gate  *              of the in-memory buffer.
767*7c478bd9Sstevel@tonic-gate  *            o If it is in the backing store, it is read from there.
768*7c478bd9Sstevel@tonic-gate  *
769*7c478bd9Sstevel@tonic-gate  *    This function does the real work of the snapshot driver.
770*7c478bd9Sstevel@tonic-gate  */
771*7c478bd9Sstevel@tonic-gate static int
772*7c478bd9Sstevel@tonic-gate snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk, int offset,
773*7c478bd9Sstevel@tonic-gate     int len, char *buffer)
774*7c478bd9Sstevel@tonic-gate {
775*7c478bd9Sstevel@tonic-gate 	cow_map_t	*cmap = &sidp->sid_cowinfo->cow_map;
776*7c478bd9Sstevel@tonic-gate 	cow_map_node_t	*cmn;
777*7c478bd9Sstevel@tonic-gate 	struct buf	*snapbuf;
778*7c478bd9Sstevel@tonic-gate 	int		error = 0;
779*7c478bd9Sstevel@tonic-gate 	char		*newbuffer;
780*7c478bd9Sstevel@tonic-gate 	int		newlen = 0;
781*7c478bd9Sstevel@tonic-gate 	int		partial = 0;
782*7c478bd9Sstevel@tonic-gate 
783*7c478bd9Sstevel@tonic-gate 	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
784*7c478bd9Sstevel@tonic-gate 	ASSERT(offset + len <= cmap->cmap_chunksz);
785*7c478bd9Sstevel@tonic-gate 
786*7c478bd9Sstevel@tonic-gate 	/*
787*7c478bd9Sstevel@tonic-gate 	 * Check if the chunk number is out of range and if so bail out
788*7c478bd9Sstevel@tonic-gate 	 */
789*7c478bd9Sstevel@tonic-gate 	if (chunk >= (cmap->cmap_bmsize * NBBY)) {
790*7c478bd9Sstevel@tonic-gate 		return (EINVAL);
791*7c478bd9Sstevel@tonic-gate 	}
792*7c478bd9Sstevel@tonic-gate 
793*7c478bd9Sstevel@tonic-gate 	/*
794*7c478bd9Sstevel@tonic-gate 	 * If the chunk is not a candidate for translation, then the chunk
795*7c478bd9Sstevel@tonic-gate 	 * was not allocated when the snapshot was taken.  Since it does
796*7c478bd9Sstevel@tonic-gate 	 * not contain data associated with this snapshot, just return a
797*7c478bd9Sstevel@tonic-gate 	 * zero buffer instead.
798*7c478bd9Sstevel@tonic-gate 	 */
799*7c478bd9Sstevel@tonic-gate 	if (isclr(cmap->cmap_candidate, chunk)) {
800*7c478bd9Sstevel@tonic-gate 		bzero(buffer, len);
801*7c478bd9Sstevel@tonic-gate 		return (0);
802*7c478bd9Sstevel@tonic-gate 	}
803*7c478bd9Sstevel@tonic-gate 
804*7c478bd9Sstevel@tonic-gate 	/*
805*7c478bd9Sstevel@tonic-gate 	 * if the chunk is a candidate for translation but a
806*7c478bd9Sstevel@tonic-gate 	 * translation does not exist, then read through to the
807*7c478bd9Sstevel@tonic-gate 	 * original file system.  The rwlock is held until the read
808*7c478bd9Sstevel@tonic-gate 	 * completes if it hasn't been translated to make sure the
809*7c478bd9Sstevel@tonic-gate 	 * file system does not translate the block before we
810*7c478bd9Sstevel@tonic-gate 	 * access it. If it has already been translated we don't
811*7c478bd9Sstevel@tonic-gate 	 * need the lock, because the translation will never go away.
812*7c478bd9Sstevel@tonic-gate 	 */
813*7c478bd9Sstevel@tonic-gate 	rw_enter(&cmap->cmap_rwlock, RW_READER);
814*7c478bd9Sstevel@tonic-gate 	if (isclr(cmap->cmap_hastrans, chunk)) {
815*7c478bd9Sstevel@tonic-gate 		snapbuf = getrbuf(KM_SLEEP);
816*7c478bd9Sstevel@tonic-gate 		/*
817*7c478bd9Sstevel@tonic-gate 		 * Reading into the buffer saves having to do a copy,
818*7c478bd9Sstevel@tonic-gate 		 * but gets tricky if the request size is not a
819*7c478bd9Sstevel@tonic-gate 		 * multiple of DEV_BSIZE.  However, we are filling the
820*7c478bd9Sstevel@tonic-gate 		 * buffer left to right, so future reads will write
821*7c478bd9Sstevel@tonic-gate 		 * over any extra data we might have read.
822*7c478bd9Sstevel@tonic-gate 		 */
823*7c478bd9Sstevel@tonic-gate 
824*7c478bd9Sstevel@tonic-gate 		partial = len % DEV_BSIZE;
825*7c478bd9Sstevel@tonic-gate 
826*7c478bd9Sstevel@tonic-gate 		snapbuf->b_bcount = len;
827*7c478bd9Sstevel@tonic-gate 		snapbuf->b_lblkno = lbtodb(chunk * cmap->cmap_chunksz + offset);
828*7c478bd9Sstevel@tonic-gate 		snapbuf->b_un.b_addr = buffer;
829*7c478bd9Sstevel@tonic-gate 
830*7c478bd9Sstevel@tonic-gate 		snapbuf->b_iodone = NULL;
831*7c478bd9Sstevel@tonic-gate 		snapbuf->b_proc = NULL;		/* i.e. the kernel */
832*7c478bd9Sstevel@tonic-gate 		snapbuf->b_flags = B_READ | B_BUSY;
833*7c478bd9Sstevel@tonic-gate 		snapbuf->b_edev = sidp->sid_fvp->v_vfsp->vfs_dev;
834*7c478bd9Sstevel@tonic-gate 
835*7c478bd9Sstevel@tonic-gate 		if (partial) {
836*7c478bd9Sstevel@tonic-gate 			/*
837*7c478bd9Sstevel@tonic-gate 			 * Partial block read in progress.
838*7c478bd9Sstevel@tonic-gate 			 * This is bad as modules further down the line
839*7c478bd9Sstevel@tonic-gate 			 * assume buf's are exact multiples of DEV_BSIZE
840*7c478bd9Sstevel@tonic-gate 			 * and we end up with fewer, or zero, bytes read.
841*7c478bd9Sstevel@tonic-gate 			 * To get round this we need to round up to the
842*7c478bd9Sstevel@tonic-gate 			 * nearest full block read and then return only
843*7c478bd9Sstevel@tonic-gate 			 * len bytes.
844*7c478bd9Sstevel@tonic-gate 			 */
845*7c478bd9Sstevel@tonic-gate 			newlen = (len - partial) + DEV_BSIZE;
846*7c478bd9Sstevel@tonic-gate 			newbuffer = kmem_alloc(newlen, KM_SLEEP);
847*7c478bd9Sstevel@tonic-gate 
848*7c478bd9Sstevel@tonic-gate 			snapbuf->b_bcount = newlen;
849*7c478bd9Sstevel@tonic-gate 			snapbuf->b_un.b_addr = newbuffer;
850*7c478bd9Sstevel@tonic-gate 		}
851*7c478bd9Sstevel@tonic-gate 
852*7c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(snapbuf);
853*7c478bd9Sstevel@tonic-gate 		(void) biowait(snapbuf);
854*7c478bd9Sstevel@tonic-gate 
855*7c478bd9Sstevel@tonic-gate 		error = geterror(snapbuf);
856*7c478bd9Sstevel@tonic-gate 
857*7c478bd9Sstevel@tonic-gate 		if (partial) {
858*7c478bd9Sstevel@tonic-gate 			/*
859*7c478bd9Sstevel@tonic-gate 			 * Partial block read. Now we need to bcopy the
860*7c478bd9Sstevel@tonic-gate 			 * correct number of bytes back into the
861*7c478bd9Sstevel@tonic-gate 			 * supplied buffer, and tidy up our temp
862*7c478bd9Sstevel@tonic-gate 			 * buffer.
863*7c478bd9Sstevel@tonic-gate 			 */
864*7c478bd9Sstevel@tonic-gate 			bcopy(newbuffer, buffer, len);
865*7c478bd9Sstevel@tonic-gate 			kmem_free(newbuffer, newlen);
866*7c478bd9Sstevel@tonic-gate 		}
867*7c478bd9Sstevel@tonic-gate 
868*7c478bd9Sstevel@tonic-gate 		freerbuf(snapbuf);
869*7c478bd9Sstevel@tonic-gate 		rw_exit(&cmap->cmap_rwlock);
870*7c478bd9Sstevel@tonic-gate 
871*7c478bd9Sstevel@tonic-gate 		return (error);
872*7c478bd9Sstevel@tonic-gate 	}
873*7c478bd9Sstevel@tonic-gate 
874*7c478bd9Sstevel@tonic-gate 	/*
875*7c478bd9Sstevel@tonic-gate 	 * finally, if the chunk is a candidate for translation and it
876*7c478bd9Sstevel@tonic-gate 	 * has been translated, then we clone the chunk of the buffer
877*7c478bd9Sstevel@tonic-gate 	 * that was copied aside by the file system.
878*7c478bd9Sstevel@tonic-gate 	 * The cmap_rwlock does not need to be held after we know the
879*7c478bd9Sstevel@tonic-gate 	 * data has already been copied. Once a chunk has been copied
880*7c478bd9Sstevel@tonic-gate 	 * to the backing file, it is stable read only data.
881*7c478bd9Sstevel@tonic-gate 	 */
882*7c478bd9Sstevel@tonic-gate 	cmn = transtbl_get(cmap, chunk);
883*7c478bd9Sstevel@tonic-gate 
884*7c478bd9Sstevel@tonic-gate 	/* check whether the data is in memory or in the backing file */
885*7c478bd9Sstevel@tonic-gate 	if (cmn != NULL) {
886*7c478bd9Sstevel@tonic-gate 		ASSERT(cmn->cmn_buf);
887*7c478bd9Sstevel@tonic-gate 		/* already in memory */
888*7c478bd9Sstevel@tonic-gate 		bcopy(cmn->cmn_buf + offset, buffer, len);
889*7c478bd9Sstevel@tonic-gate 		rw_exit(&cmap->cmap_rwlock);
890*7c478bd9Sstevel@tonic-gate 	} else {
891*7c478bd9Sstevel@tonic-gate 		ssize_t resid = len;
892*7c478bd9Sstevel@tonic-gate 		int	bf_index;
893*7c478bd9Sstevel@tonic-gate 		/*
894*7c478bd9Sstevel@tonic-gate 		 * can cause deadlock with writer if we don't drop the
895*7c478bd9Sstevel@tonic-gate 		 * cmap_rwlock before trying to get the backing store file
896*7c478bd9Sstevel@tonic-gate 		 * vnode rwlock.
897*7c478bd9Sstevel@tonic-gate 		 */
898*7c478bd9Sstevel@tonic-gate 		rw_exit(&cmap->cmap_rwlock);
899*7c478bd9Sstevel@tonic-gate 
900*7c478bd9Sstevel@tonic-gate 		bf_index = chunk / cmap->cmap_chunksperbf;
901*7c478bd9Sstevel@tonic-gate 
902*7c478bd9Sstevel@tonic-gate 		/* read buffer from backing file */
903*7c478bd9Sstevel@tonic-gate 		error = vn_rdwr(UIO_READ,
904*7c478bd9Sstevel@tonic-gate 		    (sidp->sid_cowinfo->cow_backfile_array)[bf_index],
905*7c478bd9Sstevel@tonic-gate 		    buffer, len, ((chunk % cmap->cmap_chunksperbf) *
906*7c478bd9Sstevel@tonic-gate 		    cmap->cmap_chunksz) + offset, UIO_SYSSPACE, 0,
907*7c478bd9Sstevel@tonic-gate 		    RLIM64_INFINITY, kcred, &resid);
908*7c478bd9Sstevel@tonic-gate 	}
909*7c478bd9Sstevel@tonic-gate 
910*7c478bd9Sstevel@tonic-gate 	return (error);
911*7c478bd9Sstevel@tonic-gate }
912*7c478bd9Sstevel@tonic-gate 
913*7c478bd9Sstevel@tonic-gate /*
914*7c478bd9Sstevel@tonic-gate  * snap_print() - snapshot driver print(9E) routine
915*7c478bd9Sstevel@tonic-gate  *
916*7c478bd9Sstevel@tonic-gate  *    prints the device identification string.
917*7c478bd9Sstevel@tonic-gate  */
918*7c478bd9Sstevel@tonic-gate static int
919*7c478bd9Sstevel@tonic-gate snap_print(dev_t dev, char *str)
920*7c478bd9Sstevel@tonic-gate {
921*7c478bd9Sstevel@tonic-gate 	struct snapshot_id **sidpp;
922*7c478bd9Sstevel@tonic-gate 	minor_t		minor;
923*7c478bd9Sstevel@tonic-gate 
924*7c478bd9Sstevel@tonic-gate 	minor = getminor(dev);
925*7c478bd9Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
926*7c478bd9Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
927*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
928*7c478bd9Sstevel@tonic-gate 		    "snap_print: could not find state for snapshot %d.", minor);
929*7c478bd9Sstevel@tonic-gate 		return (ENXIO);
930*7c478bd9Sstevel@tonic-gate 	}
931*7c478bd9Sstevel@tonic-gate 
932*7c478bd9Sstevel@tonic-gate 	cmn_err(CE_NOTE, "snap_print: snapshot %d: %s",  minor, str);
933*7c478bd9Sstevel@tonic-gate 
934*7c478bd9Sstevel@tonic-gate 	return (0);
935*7c478bd9Sstevel@tonic-gate }
936*7c478bd9Sstevel@tonic-gate 
937*7c478bd9Sstevel@tonic-gate /*
938*7c478bd9Sstevel@tonic-gate  * snap_prop_op() - snapshot driver prop_op(9E) routine
939*7c478bd9Sstevel@tonic-gate  *
940*7c478bd9Sstevel@tonic-gate  *    get 32-bit and 64-bit values for size (character driver) and nblocks
941*7c478bd9Sstevel@tonic-gate  *    (block driver).
942*7c478bd9Sstevel@tonic-gate  */
943*7c478bd9Sstevel@tonic-gate static int
944*7c478bd9Sstevel@tonic-gate snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
945*7c478bd9Sstevel@tonic-gate     int flags, char *name, caddr_t valuep, int *lengthp)
946*7c478bd9Sstevel@tonic-gate {
947*7c478bd9Sstevel@tonic-gate 	struct snapshot_id **sidpp;
948*7c478bd9Sstevel@tonic-gate 	int		length, km_flags;
949*7c478bd9Sstevel@tonic-gate 	int		nblocks, size;
950*7c478bd9Sstevel@tonic-gate 	uint64_t	Size, Nblocks;
951*7c478bd9Sstevel@tonic-gate 	caddr_t		buffer;
952*7c478bd9Sstevel@tonic-gate 	int		minor;
953*7c478bd9Sstevel@tonic-gate 	dev_t		mdev;
954*7c478bd9Sstevel@tonic-gate 
955*7c478bd9Sstevel@tonic-gate 	minor = getminor(dev);
956*7c478bd9Sstevel@tonic-gate 	length = *lengthp;		/* Get callers length */
957*7c478bd9Sstevel@tonic-gate 
958*7c478bd9Sstevel@tonic-gate 	/* if this is the control device just check for .conf properties */
959*7c478bd9Sstevel@tonic-gate 	if (minor == SNAP_CTL_MINOR)
960*7c478bd9Sstevel@tonic-gate 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
961*7c478bd9Sstevel@tonic-gate 			valuep, lengthp));
962*7c478bd9Sstevel@tonic-gate 	/* check to see if there is a master device plumbed */
963*7c478bd9Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, minor);
964*7c478bd9Sstevel@tonic-gate 	if (sidpp == NULL || *sidpp == NULL) {
965*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
966*7c478bd9Sstevel@tonic-gate 		    "snap_prop_op: could not find state for "
967*7c478bd9Sstevel@tonic-gate 		    "snapshot %d.", minor);
968*7c478bd9Sstevel@tonic-gate 		return (DDI_PROP_NOT_FOUND);
969*7c478bd9Sstevel@tonic-gate 	}
970*7c478bd9Sstevel@tonic-gate 
971*7c478bd9Sstevel@tonic-gate 	if (((*sidpp)->sid_fvp == NULL) || ((*sidpp)->sid_fvp->v_vfsp == NULL))
972*7c478bd9Sstevel@tonic-gate 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
973*7c478bd9Sstevel@tonic-gate 			valuep, lengthp));
974*7c478bd9Sstevel@tonic-gate 	mdev = (*sidpp)->sid_fvp->v_vfsp->vfs_dev;
975*7c478bd9Sstevel@tonic-gate 
976*7c478bd9Sstevel@tonic-gate 	/* get size information from the master device. */
977*7c478bd9Sstevel@tonic-gate 
978*7c478bd9Sstevel@tonic-gate 	if (strcmp(name, "nblocks") == 0) {
979*7c478bd9Sstevel@tonic-gate 		nblocks = bdev_size(mdev);
980*7c478bd9Sstevel@tonic-gate 		*lengthp = sizeof (nblocks);	/* Set callers length */
981*7c478bd9Sstevel@tonic-gate 	} else if (strcmp(name, "Nblocks") == 0) {
982*7c478bd9Sstevel@tonic-gate 		Nblocks = bdev_Size(mdev);
983*7c478bd9Sstevel@tonic-gate 		*lengthp = sizeof (Nblocks);	/* Set callers length */
984*7c478bd9Sstevel@tonic-gate 	} else if (strcmp(name, "size") == 0) {
985*7c478bd9Sstevel@tonic-gate 		size = cdev_size(mdev);
986*7c478bd9Sstevel@tonic-gate 		*lengthp = sizeof (size);	/* Set callers length */
987*7c478bd9Sstevel@tonic-gate 	} else if (strcmp(name, "Size") == 0) {
988*7c478bd9Sstevel@tonic-gate 		Size = cdev_Size(mdev);
989*7c478bd9Sstevel@tonic-gate 		*lengthp = sizeof (Size);	/* Set callers length */
990*7c478bd9Sstevel@tonic-gate 	} else {	/* not for us */
991*7c478bd9Sstevel@tonic-gate 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
992*7c478bd9Sstevel@tonic-gate 		    valuep, lengthp));
993*7c478bd9Sstevel@tonic-gate 	}
994*7c478bd9Sstevel@tonic-gate 
995*7c478bd9Sstevel@tonic-gate 	/*
996*7c478bd9Sstevel@tonic-gate 	 * If length only request, just return the length.
997*7c478bd9Sstevel@tonic-gate 	 */
998*7c478bd9Sstevel@tonic-gate 	if (prop_op == PROP_LEN)  {
999*7c478bd9Sstevel@tonic-gate 		return (DDI_PROP_SUCCESS);
1000*7c478bd9Sstevel@tonic-gate 	}
1001*7c478bd9Sstevel@tonic-gate 
1002*7c478bd9Sstevel@tonic-gate 	/*
1003*7c478bd9Sstevel@tonic-gate 	 * Allocate buffer, if required.  Either way, set `buffer' variable.
1004*7c478bd9Sstevel@tonic-gate 	 */
1005*7c478bd9Sstevel@tonic-gate 	switch (prop_op)  {
1006*7c478bd9Sstevel@tonic-gate 	case PROP_LEN_AND_VAL_ALLOC:
1007*7c478bd9Sstevel@tonic-gate 
1008*7c478bd9Sstevel@tonic-gate 		km_flags = KM_NOSLEEP;
1009*7c478bd9Sstevel@tonic-gate 
1010*7c478bd9Sstevel@tonic-gate 		if (flags & DDI_PROP_CANSLEEP)
1011*7c478bd9Sstevel@tonic-gate 			km_flags = KM_SLEEP;
1012*7c478bd9Sstevel@tonic-gate 
1013*7c478bd9Sstevel@tonic-gate 		buffer = kmem_alloc(*lengthp, km_flags);
1014*7c478bd9Sstevel@tonic-gate 		if (buffer == NULL)  {
1015*7c478bd9Sstevel@tonic-gate 			cmn_err(CE_WARN, "snap_get_prop: no mem for "
1016*7c478bd9Sstevel@tonic-gate 			"property %s.", name);
1017*7c478bd9Sstevel@tonic-gate 			return (DDI_PROP_NO_MEMORY);
1018*7c478bd9Sstevel@tonic-gate 		}
1019*7c478bd9Sstevel@tonic-gate 		*(caddr_t *)valuep = buffer; /* Set callers buf ptr */
1020*7c478bd9Sstevel@tonic-gate 		break;
1021*7c478bd9Sstevel@tonic-gate 
1022*7c478bd9Sstevel@tonic-gate 	case PROP_LEN_AND_VAL_BUF:
1023*7c478bd9Sstevel@tonic-gate 
1024*7c478bd9Sstevel@tonic-gate 		if (*lengthp > length)
1025*7c478bd9Sstevel@tonic-gate 			return (DDI_PROP_BUF_TOO_SMALL);
1026*7c478bd9Sstevel@tonic-gate 
1027*7c478bd9Sstevel@tonic-gate 		buffer = valuep; /* get callers buf ptr */
1028*7c478bd9Sstevel@tonic-gate 		break;
1029*7c478bd9Sstevel@tonic-gate 	}
1030*7c478bd9Sstevel@tonic-gate 
1031*7c478bd9Sstevel@tonic-gate 	if (strcmp(name, "nblocks") == 0) {
1032*7c478bd9Sstevel@tonic-gate 		*((uint_t *)buffer) = nblocks;
1033*7c478bd9Sstevel@tonic-gate 	} else if (strcmp(name, "Nblocks") == 0) {
1034*7c478bd9Sstevel@tonic-gate 		*((uint64_t *)buffer) = Nblocks;
1035*7c478bd9Sstevel@tonic-gate 	} else if (strcmp(name, "size") == 0) {
1036*7c478bd9Sstevel@tonic-gate 		*((uint_t *)buffer) = size;
1037*7c478bd9Sstevel@tonic-gate 	} else if (strcmp(name, "Size") == 0) {
1038*7c478bd9Sstevel@tonic-gate 		*((uint64_t *)buffer) = Size;
1039*7c478bd9Sstevel@tonic-gate 	}
1040*7c478bd9Sstevel@tonic-gate 
1041*7c478bd9Sstevel@tonic-gate 	return (DDI_PROP_SUCCESS);
1042*7c478bd9Sstevel@tonic-gate }
1043*7c478bd9Sstevel@tonic-gate 
1044*7c478bd9Sstevel@tonic-gate /*
1045*7c478bd9Sstevel@tonic-gate  * snap_ioctl() - snapshot driver ioctl(9E) routine
1046*7c478bd9Sstevel@tonic-gate  *
1047*7c478bd9Sstevel@tonic-gate  *    only applies to the control device.  The control device accepts two
1048*7c478bd9Sstevel@tonic-gate  *    ioctl requests: create a snapshot or delete a snapshot.  In either
1049*7c478bd9Sstevel@tonic-gate  *    case, the vnode for the requested file system is extracted, and the
1050*7c478bd9Sstevel@tonic-gate  *    request is passed on to the file system via the same ioctl.  The file
1051*7c478bd9Sstevel@tonic-gate  *    system is responsible for doing the things necessary for creating or
1052*7c478bd9Sstevel@tonic-gate  *    destroying a snapshot, including any file system specific operations
1053*7c478bd9Sstevel@tonic-gate  *    that must be performed as well as setting up and deleting the snapshot
1054*7c478bd9Sstevel@tonic-gate  *    state through the fssnap interfaces.
1055*7c478bd9Sstevel@tonic-gate  */
1056*7c478bd9Sstevel@tonic-gate static int
1057*7c478bd9Sstevel@tonic-gate snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1058*7c478bd9Sstevel@tonic-gate int *rvalp)
1059*7c478bd9Sstevel@tonic-gate {
1060*7c478bd9Sstevel@tonic-gate 	minor_t	minor;
1061*7c478bd9Sstevel@tonic-gate 	int error = 0;
1062*7c478bd9Sstevel@tonic-gate 
1063*7c478bd9Sstevel@tonic-gate 	minor = getminor(dev);
1064*7c478bd9Sstevel@tonic-gate 
1065*7c478bd9Sstevel@tonic-gate 	if (minor != SNAP_CTL_MINOR) {
1066*7c478bd9Sstevel@tonic-gate 		return (EINVAL);
1067*7c478bd9Sstevel@tonic-gate 	}
1068*7c478bd9Sstevel@tonic-gate 
1069*7c478bd9Sstevel@tonic-gate 	switch (cmd) {
1070*7c478bd9Sstevel@tonic-gate 	case _FIOSNAPSHOTCREATE:
1071*7c478bd9Sstevel@tonic-gate 	{
1072*7c478bd9Sstevel@tonic-gate 		struct fiosnapcreate	fc;
1073*7c478bd9Sstevel@tonic-gate 		struct file		*fp;
1074*7c478bd9Sstevel@tonic-gate 		struct vnode		*vp;
1075*7c478bd9Sstevel@tonic-gate 
1076*7c478bd9Sstevel@tonic-gate 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1077*7c478bd9Sstevel@tonic-gate 			return (EFAULT);
1078*7c478bd9Sstevel@tonic-gate 
1079*7c478bd9Sstevel@tonic-gate 		/* get vnode for file system mount point */
1080*7c478bd9Sstevel@tonic-gate 		if ((fp = getf(fc.rootfiledesc)) == NULL)
1081*7c478bd9Sstevel@tonic-gate 			return (EBADF);
1082*7c478bd9Sstevel@tonic-gate 
1083*7c478bd9Sstevel@tonic-gate 		ASSERT(fp->f_vnode);
1084*7c478bd9Sstevel@tonic-gate 		vp = fp->f_vnode;
1085*7c478bd9Sstevel@tonic-gate 		VN_HOLD(vp);
1086*7c478bd9Sstevel@tonic-gate 		releasef(fc.rootfiledesc);
1087*7c478bd9Sstevel@tonic-gate 
1088*7c478bd9Sstevel@tonic-gate 		/* pass ioctl request to file system */
1089*7c478bd9Sstevel@tonic-gate 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp);
1090*7c478bd9Sstevel@tonic-gate 		VN_RELE(vp);
1091*7c478bd9Sstevel@tonic-gate 		break;
1092*7c478bd9Sstevel@tonic-gate 	}
1093*7c478bd9Sstevel@tonic-gate 	case _FIOSNAPSHOTCREATE_MULTI:
1094*7c478bd9Sstevel@tonic-gate 	{
1095*7c478bd9Sstevel@tonic-gate 		struct fiosnapcreate_multi	fc;
1096*7c478bd9Sstevel@tonic-gate 		struct file		*fp;
1097*7c478bd9Sstevel@tonic-gate 		struct vnode		*vp;
1098*7c478bd9Sstevel@tonic-gate 
1099*7c478bd9Sstevel@tonic-gate 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1100*7c478bd9Sstevel@tonic-gate 			return (EFAULT);
1101*7c478bd9Sstevel@tonic-gate 
1102*7c478bd9Sstevel@tonic-gate 		/* get vnode for file system mount point */
1103*7c478bd9Sstevel@tonic-gate 		if ((fp = getf(fc.rootfiledesc)) == NULL)
1104*7c478bd9Sstevel@tonic-gate 			return (EBADF);
1105*7c478bd9Sstevel@tonic-gate 
1106*7c478bd9Sstevel@tonic-gate 		ASSERT(fp->f_vnode);
1107*7c478bd9Sstevel@tonic-gate 		vp = fp->f_vnode;
1108*7c478bd9Sstevel@tonic-gate 		VN_HOLD(vp);
1109*7c478bd9Sstevel@tonic-gate 		releasef(fc.rootfiledesc);
1110*7c478bd9Sstevel@tonic-gate 
1111*7c478bd9Sstevel@tonic-gate 		/* pass ioctl request to file system */
1112*7c478bd9Sstevel@tonic-gate 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp);
1113*7c478bd9Sstevel@tonic-gate 		VN_RELE(vp);
1114*7c478bd9Sstevel@tonic-gate 		break;
1115*7c478bd9Sstevel@tonic-gate 	}
1116*7c478bd9Sstevel@tonic-gate 	case _FIOSNAPSHOTDELETE:
1117*7c478bd9Sstevel@tonic-gate 	{
1118*7c478bd9Sstevel@tonic-gate 		major_t			major;
1119*7c478bd9Sstevel@tonic-gate 		struct fiosnapdelete	fc;
1120*7c478bd9Sstevel@tonic-gate 		snapshot_id_t		*sidp = NULL;
1121*7c478bd9Sstevel@tonic-gate 		snapshot_id_t		*sidnextp = NULL;
1122*7c478bd9Sstevel@tonic-gate 		struct file		*fp = NULL;
1123*7c478bd9Sstevel@tonic-gate 		struct vnode		*vp = NULL;
1124*7c478bd9Sstevel@tonic-gate 		struct vfs 		*vfsp = NULL;
1125*7c478bd9Sstevel@tonic-gate 		vfsops_t		*vfsops = EIO_vfsops;
1126*7c478bd9Sstevel@tonic-gate 
1127*7c478bd9Sstevel@tonic-gate 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1128*7c478bd9Sstevel@tonic-gate 			return (EFAULT);
1129*7c478bd9Sstevel@tonic-gate 
1130*7c478bd9Sstevel@tonic-gate 		/* get vnode for file system mount point */
1131*7c478bd9Sstevel@tonic-gate 		if ((fp = getf(fc.rootfiledesc)) == NULL)
1132*7c478bd9Sstevel@tonic-gate 			return (EBADF);
1133*7c478bd9Sstevel@tonic-gate 
1134*7c478bd9Sstevel@tonic-gate 		ASSERT(fp->f_vnode);
1135*7c478bd9Sstevel@tonic-gate 		vp = fp->f_vnode;
1136*7c478bd9Sstevel@tonic-gate 		VN_HOLD(vp);
1137*7c478bd9Sstevel@tonic-gate 		releasef(fc.rootfiledesc);
1138*7c478bd9Sstevel@tonic-gate 		/*
1139*7c478bd9Sstevel@tonic-gate 		 * Test for two formats of delete and set correct minor/vp:
1140*7c478bd9Sstevel@tonic-gate 		 * pseudo device:
1141*7c478bd9Sstevel@tonic-gate 		 * fssnap -d [/dev/fssnap/x]
1142*7c478bd9Sstevel@tonic-gate 		 * or
1143*7c478bd9Sstevel@tonic-gate 		 * mount point:
1144*7c478bd9Sstevel@tonic-gate 		 * fssnap -d [/mntpt]
1145*7c478bd9Sstevel@tonic-gate 		 * Note that minor is verified to be equal to SNAP_CTL_MINOR
1146*7c478bd9Sstevel@tonic-gate 		 * at this point which is an invalid minor number.
1147*7c478bd9Sstevel@tonic-gate 		 */
1148*7c478bd9Sstevel@tonic-gate 		ASSERT(fssnap_dip != NULL);
1149*7c478bd9Sstevel@tonic-gate 		major = ddi_driver_major(fssnap_dip);
1150*7c478bd9Sstevel@tonic-gate 		mutex_enter(&snapshot_mutex);
1151*7c478bd9Sstevel@tonic-gate 		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
1152*7c478bd9Sstevel@tonic-gate 			rw_enter(&sidp->sid_rwlock, RW_READER);
1153*7c478bd9Sstevel@tonic-gate 			sidnextp = sidp->sid_next;
1154*7c478bd9Sstevel@tonic-gate 			/* pseudo device: */
1155*7c478bd9Sstevel@tonic-gate 			if (major == getmajor(vp->v_rdev)) {
1156*7c478bd9Sstevel@tonic-gate 				minor = getminor(vp->v_rdev);
1157*7c478bd9Sstevel@tonic-gate 				if (sidp->sid_snapnumber == (uint_t)minor &&
1158*7c478bd9Sstevel@tonic-gate 				    sidp->sid_fvp) {
1159*7c478bd9Sstevel@tonic-gate 					VN_RELE(vp);
1160*7c478bd9Sstevel@tonic-gate 					vp = sidp->sid_fvp;
1161*7c478bd9Sstevel@tonic-gate 					VN_HOLD(vp);
1162*7c478bd9Sstevel@tonic-gate 					rw_exit(&sidp->sid_rwlock);
1163*7c478bd9Sstevel@tonic-gate 					break;
1164*7c478bd9Sstevel@tonic-gate 				}
1165*7c478bd9Sstevel@tonic-gate 			/* Mount point: */
1166*7c478bd9Sstevel@tonic-gate 			} else {
1167*7c478bd9Sstevel@tonic-gate 				if (sidp->sid_fvp == vp) {
1168*7c478bd9Sstevel@tonic-gate 					minor = sidp->sid_snapnumber;
1169*7c478bd9Sstevel@tonic-gate 					rw_exit(&sidp->sid_rwlock);
1170*7c478bd9Sstevel@tonic-gate 					break;
1171*7c478bd9Sstevel@tonic-gate 				}
1172*7c478bd9Sstevel@tonic-gate 			}
1173*7c478bd9Sstevel@tonic-gate 			rw_exit(&sidp->sid_rwlock);
1174*7c478bd9Sstevel@tonic-gate 		}
1175*7c478bd9Sstevel@tonic-gate 		mutex_exit(&snapshot_mutex);
1176*7c478bd9Sstevel@tonic-gate 		/* Verify minor got set correctly above */
1177*7c478bd9Sstevel@tonic-gate 		if (minor == SNAP_CTL_MINOR) {
1178*7c478bd9Sstevel@tonic-gate 			VN_RELE(vp);
1179*7c478bd9Sstevel@tonic-gate 			return (EINVAL);
1180*7c478bd9Sstevel@tonic-gate 		}
1181*7c478bd9Sstevel@tonic-gate 		dev = makedevice(major, minor);
1182*7c478bd9Sstevel@tonic-gate 		/*
1183*7c478bd9Sstevel@tonic-gate 		 * Create dummy vfs entry
1184*7c478bd9Sstevel@tonic-gate 		 * to use as a locking semaphore across the IOCTL
1185*7c478bd9Sstevel@tonic-gate 		 * for mount in progress cases...
1186*7c478bd9Sstevel@tonic-gate 		 */
1187*7c478bd9Sstevel@tonic-gate 		vfsp = kmem_alloc(sizeof (vfs_t), KM_SLEEP);
1188*7c478bd9Sstevel@tonic-gate 		VFS_INIT(vfsp, vfsops, NULL);
1189*7c478bd9Sstevel@tonic-gate 		vfs_addmip(dev, vfsp);
1190*7c478bd9Sstevel@tonic-gate 		if ((vfs_devmounting(dev, vfsp)) ||
1191*7c478bd9Sstevel@tonic-gate 		    (vfs_devismounted(dev))) {
1192*7c478bd9Sstevel@tonic-gate 			vfs_delmip(vfsp);
1193*7c478bd9Sstevel@tonic-gate 			kmem_free(vfsp, sizeof (struct vfs));
1194*7c478bd9Sstevel@tonic-gate 			VN_RELE(vp);
1195*7c478bd9Sstevel@tonic-gate 			return (EBUSY);
1196*7c478bd9Sstevel@tonic-gate 		}
1197*7c478bd9Sstevel@tonic-gate 		/*
1198*7c478bd9Sstevel@tonic-gate 		 * Nobody mounted but do not release mount in progress lock
1199*7c478bd9Sstevel@tonic-gate 		 * until IOCTL complete to prohibit a mount sneaking
1200*7c478bd9Sstevel@tonic-gate 		 * in
1201*7c478bd9Sstevel@tonic-gate 		 */
1202*7c478bd9Sstevel@tonic-gate 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp);
1203*7c478bd9Sstevel@tonic-gate 		vfs_delmip(vfsp);
1204*7c478bd9Sstevel@tonic-gate 		kmem_free(vfsp, sizeof (struct vfs));
1205*7c478bd9Sstevel@tonic-gate 		VN_RELE(vp);
1206*7c478bd9Sstevel@tonic-gate 		break;
1207*7c478bd9Sstevel@tonic-gate 	}
1208*7c478bd9Sstevel@tonic-gate 	default:
1209*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_ioctl: Invalid ioctl cmd %d, minor %d.",
1210*7c478bd9Sstevel@tonic-gate 		    cmd, minor);
1211*7c478bd9Sstevel@tonic-gate 		return (EINVAL);
1212*7c478bd9Sstevel@tonic-gate 	}
1213*7c478bd9Sstevel@tonic-gate 
1214*7c478bd9Sstevel@tonic-gate 	return (error);
1215*7c478bd9Sstevel@tonic-gate }
1216*7c478bd9Sstevel@tonic-gate 
1217*7c478bd9Sstevel@tonic-gate 
1218*7c478bd9Sstevel@tonic-gate /* ************************************************************************ */
1219*7c478bd9Sstevel@tonic-gate 
1220*7c478bd9Sstevel@tonic-gate /*
1221*7c478bd9Sstevel@tonic-gate  * Translation Table Routines
1222*7c478bd9Sstevel@tonic-gate  *
1223*7c478bd9Sstevel@tonic-gate  *    These support routines implement a simple doubly linked list
1224*7c478bd9Sstevel@tonic-gate  *    to keep track of chunks that are currently in memory.  The maximum
1225*7c478bd9Sstevel@tonic-gate  *    size of the list is determined by the fssnap_max_mem_chunks variable.
1226*7c478bd9Sstevel@tonic-gate  *    The cmap_rwlock is used to protect the linkage of the list.
1227*7c478bd9Sstevel@tonic-gate  */
1228*7c478bd9Sstevel@tonic-gate 
1229*7c478bd9Sstevel@tonic-gate /*
1230*7c478bd9Sstevel@tonic-gate  * transtbl_add() - add a node to the translation table
1231*7c478bd9Sstevel@tonic-gate  *
1232*7c478bd9Sstevel@tonic-gate  *    allocates a new node and points it at the buffer passed in.  The node
1233*7c478bd9Sstevel@tonic-gate  *    is added to the beginning of the doubly linked list and the head of
1234*7c478bd9Sstevel@tonic-gate  *    the list is moved.  The cmap_rwlock must be held as a writer through
1235*7c478bd9Sstevel@tonic-gate  *    this operation.
1236*7c478bd9Sstevel@tonic-gate  */
1237*7c478bd9Sstevel@tonic-gate static cow_map_node_t *
1238*7c478bd9Sstevel@tonic-gate transtbl_add(cow_map_t *cmap, chunknumber_t chunk, caddr_t buf)
1239*7c478bd9Sstevel@tonic-gate {
1240*7c478bd9Sstevel@tonic-gate 	cow_map_node_t	*cmnode;
1241*7c478bd9Sstevel@tonic-gate 
1242*7c478bd9Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1243*7c478bd9Sstevel@tonic-gate 
1244*7c478bd9Sstevel@tonic-gate 	cmnode = kmem_alloc(sizeof (cow_map_node_t), KM_SLEEP);
1245*7c478bd9Sstevel@tonic-gate 
1246*7c478bd9Sstevel@tonic-gate 	/*
1247*7c478bd9Sstevel@tonic-gate 	 * insert new translations at the beginning so cmn_table is always
1248*7c478bd9Sstevel@tonic-gate 	 * the first node.
1249*7c478bd9Sstevel@tonic-gate 	 */
1250*7c478bd9Sstevel@tonic-gate 	cmnode->cmn_chunk = chunk;
1251*7c478bd9Sstevel@tonic-gate 	cmnode->cmn_buf = buf;
1252*7c478bd9Sstevel@tonic-gate 	cmnode->cmn_prev = NULL;
1253*7c478bd9Sstevel@tonic-gate 	cmnode->cmn_next = cmap->cmap_table;
1254*7c478bd9Sstevel@tonic-gate 	if (cmnode->cmn_next)
1255*7c478bd9Sstevel@tonic-gate 		cmnode->cmn_next->cmn_prev = cmnode;
1256*7c478bd9Sstevel@tonic-gate 	cmap->cmap_table = cmnode;
1257*7c478bd9Sstevel@tonic-gate 
1258*7c478bd9Sstevel@tonic-gate 	return (cmnode);
1259*7c478bd9Sstevel@tonic-gate }
1260*7c478bd9Sstevel@tonic-gate 
1261*7c478bd9Sstevel@tonic-gate /*
1262*7c478bd9Sstevel@tonic-gate  * transtbl_get() - look up a node in the translation table
1263*7c478bd9Sstevel@tonic-gate  *
1264*7c478bd9Sstevel@tonic-gate  *    called by the snapshot driver to find data that has been translated.
1265*7c478bd9Sstevel@tonic-gate  *    The lookup is done by the chunk number, and the node is returned.
1266*7c478bd9Sstevel@tonic-gate  *    If the node was not found, NULL is returned.
1267*7c478bd9Sstevel@tonic-gate  */
1268*7c478bd9Sstevel@tonic-gate static cow_map_node_t *
1269*7c478bd9Sstevel@tonic-gate transtbl_get(cow_map_t *cmap, chunknumber_t chunk)
1270*7c478bd9Sstevel@tonic-gate {
1271*7c478bd9Sstevel@tonic-gate 	cow_map_node_t *cmn;
1272*7c478bd9Sstevel@tonic-gate 
1273*7c478bd9Sstevel@tonic-gate 	ASSERT(RW_READ_HELD(&cmap->cmap_rwlock));
1274*7c478bd9Sstevel@tonic-gate 	ASSERT(cmap);
1275*7c478bd9Sstevel@tonic-gate 
1276*7c478bd9Sstevel@tonic-gate 	/* search the translation table */
1277*7c478bd9Sstevel@tonic-gate 	for (cmn = cmap->cmap_table; cmn != NULL; cmn = cmn->cmn_next) {
1278*7c478bd9Sstevel@tonic-gate 		if (cmn->cmn_chunk == chunk)
1279*7c478bd9Sstevel@tonic-gate 			return (cmn);
1280*7c478bd9Sstevel@tonic-gate 	}
1281*7c478bd9Sstevel@tonic-gate 
1282*7c478bd9Sstevel@tonic-gate 	/* not found */
1283*7c478bd9Sstevel@tonic-gate 	return (NULL);
1284*7c478bd9Sstevel@tonic-gate }
1285*7c478bd9Sstevel@tonic-gate 
1286*7c478bd9Sstevel@tonic-gate /*
1287*7c478bd9Sstevel@tonic-gate  * transtbl_delete() - delete a node from the translation table
1288*7c478bd9Sstevel@tonic-gate  *
1289*7c478bd9Sstevel@tonic-gate  *    called when a node's data has been written out to disk.  The
1290*7c478bd9Sstevel@tonic-gate  *    cmap_rwlock must be held as a writer for this operation.  If the node
1291*7c478bd9Sstevel@tonic-gate  *    being deleted is the head of the list, then the head is moved to the
1292*7c478bd9Sstevel@tonic-gate  *    next node.  Both the node's data and the node itself are freed.
1293*7c478bd9Sstevel@tonic-gate  */
1294*7c478bd9Sstevel@tonic-gate static void
1295*7c478bd9Sstevel@tonic-gate transtbl_delete(cow_map_t *cmap, cow_map_node_t *cmn)
1296*7c478bd9Sstevel@tonic-gate {
1297*7c478bd9Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1298*7c478bd9Sstevel@tonic-gate 	ASSERT(cmn);
1299*7c478bd9Sstevel@tonic-gate 	ASSERT(cmap->cmap_table);
1300*7c478bd9Sstevel@tonic-gate 
1301*7c478bd9Sstevel@tonic-gate 	/* if the head of the list is being deleted, then move the head up */
1302*7c478bd9Sstevel@tonic-gate 	if (cmap->cmap_table == cmn) {
1303*7c478bd9Sstevel@tonic-gate 		ASSERT(cmn->cmn_prev == NULL);
1304*7c478bd9Sstevel@tonic-gate 		cmap->cmap_table = cmn->cmn_next;
1305*7c478bd9Sstevel@tonic-gate 	}
1306*7c478bd9Sstevel@tonic-gate 
1307*7c478bd9Sstevel@tonic-gate 
1308*7c478bd9Sstevel@tonic-gate 	/* make previous node's next pointer skip over current node */
1309*7c478bd9Sstevel@tonic-gate 	if (cmn->cmn_prev != NULL) {
1310*7c478bd9Sstevel@tonic-gate 		ASSERT(cmn->cmn_prev->cmn_next == cmn);
1311*7c478bd9Sstevel@tonic-gate 		cmn->cmn_prev->cmn_next = cmn->cmn_next;
1312*7c478bd9Sstevel@tonic-gate 	}
1313*7c478bd9Sstevel@tonic-gate 
1314*7c478bd9Sstevel@tonic-gate 	/* make next node's previous pointer skip over current node */
1315*7c478bd9Sstevel@tonic-gate 	if (cmn->cmn_next != NULL) {
1316*7c478bd9Sstevel@tonic-gate 		ASSERT(cmn->cmn_next->cmn_prev == cmn);
1317*7c478bd9Sstevel@tonic-gate 		cmn->cmn_next->cmn_prev = cmn->cmn_prev;
1318*7c478bd9Sstevel@tonic-gate 	}
1319*7c478bd9Sstevel@tonic-gate 
1320*7c478bd9Sstevel@tonic-gate 	/* free the data and the node */
1321*7c478bd9Sstevel@tonic-gate 	ASSERT(cmn->cmn_buf);
1322*7c478bd9Sstevel@tonic-gate 	kmem_free(cmn->cmn_buf, cmap->cmap_chunksz);
1323*7c478bd9Sstevel@tonic-gate 	kmem_free(cmn, sizeof (cow_map_node_t));
1324*7c478bd9Sstevel@tonic-gate }
1325*7c478bd9Sstevel@tonic-gate 
1326*7c478bd9Sstevel@tonic-gate /*
1327*7c478bd9Sstevel@tonic-gate  * transtbl_free() - free the entire translation table
1328*7c478bd9Sstevel@tonic-gate  *
1329*7c478bd9Sstevel@tonic-gate  *    called when the snapshot is deleted.  This frees all of the nodes in
1330*7c478bd9Sstevel@tonic-gate  *    the translation table (but not the bitmaps).
1331*7c478bd9Sstevel@tonic-gate  */
1332*7c478bd9Sstevel@tonic-gate static void
1333*7c478bd9Sstevel@tonic-gate transtbl_free(cow_map_t *cmap)
1334*7c478bd9Sstevel@tonic-gate {
1335*7c478bd9Sstevel@tonic-gate 	cow_map_node_t	*curnode;
1336*7c478bd9Sstevel@tonic-gate 	cow_map_node_t	*tempnode;
1337*7c478bd9Sstevel@tonic-gate 
1338*7c478bd9Sstevel@tonic-gate 	for (curnode = cmap->cmap_table; curnode != NULL; curnode = tempnode) {
1339*7c478bd9Sstevel@tonic-gate 		tempnode = curnode->cmn_next;
1340*7c478bd9Sstevel@tonic-gate 
1341*7c478bd9Sstevel@tonic-gate 		kmem_free(curnode->cmn_buf, cmap->cmap_chunksz);
1342*7c478bd9Sstevel@tonic-gate 		kmem_free(curnode, sizeof (cow_map_node_t));
1343*7c478bd9Sstevel@tonic-gate 	}
1344*7c478bd9Sstevel@tonic-gate }
1345*7c478bd9Sstevel@tonic-gate 
1346*7c478bd9Sstevel@tonic-gate 
1347*7c478bd9Sstevel@tonic-gate /* ************************************************************************ */
1348*7c478bd9Sstevel@tonic-gate 
1349*7c478bd9Sstevel@tonic-gate /*
1350*7c478bd9Sstevel@tonic-gate  * Interface Implementation Routines
1351*7c478bd9Sstevel@tonic-gate  *
1352*7c478bd9Sstevel@tonic-gate  * The following functions implement snapshot interface routines that are
1353*7c478bd9Sstevel@tonic-gate  * called by the file system to create, delete, and use a snapshot.  The
1354*7c478bd9Sstevel@tonic-gate  * interfaces are defined in fssnap_if.c and are filled in by this driver
1355*7c478bd9Sstevel@tonic-gate  * when it is loaded.  This technique allows the file system to depend on
1356*7c478bd9Sstevel@tonic-gate  * the interface module without having to load the full implementation and
1357*7c478bd9Sstevel@tonic-gate  * snapshot device drivers.
1358*7c478bd9Sstevel@tonic-gate  */
1359*7c478bd9Sstevel@tonic-gate 
1360*7c478bd9Sstevel@tonic-gate /*
1361*7c478bd9Sstevel@tonic-gate  * fssnap_strategy_impl() - strategy routine called by the file system
1362*7c478bd9Sstevel@tonic-gate  *
1363*7c478bd9Sstevel@tonic-gate  *    called by the file system to handle copy-on-write when necessary.  All
1364*7c478bd9Sstevel@tonic-gate  *    reads and writes that the file system performs should go through this
1365*7c478bd9Sstevel@tonic-gate  *    function.  If the file system calls the underlying device's strategy
1366*7c478bd9Sstevel@tonic-gate  *    routine without going through fssnap_strategy() (eg. by calling
1367*7c478bd9Sstevel@tonic-gate  *    bdev_strategy()), the snapshot may not be consistent.
1368*7c478bd9Sstevel@tonic-gate  *
1369*7c478bd9Sstevel@tonic-gate  *    This function starts by doing significant sanity checking to insure
1370*7c478bd9Sstevel@tonic-gate  *    the snapshot was not deleted out from under it or deleted and then
1371*7c478bd9Sstevel@tonic-gate  *    recreated.  To do this, it checks the actual pointer passed into it
1372*7c478bd9Sstevel@tonic-gate  *    (ie. the handle held by the file system).  NOTE that the parameter is
1373*7c478bd9Sstevel@tonic-gate  *    a POINTER TO A POINTER to the snapshot id.  Once the snapshot id is
1374*7c478bd9Sstevel@tonic-gate  *    locked, it knows things are ok and that this snapshot is really for
1375*7c478bd9Sstevel@tonic-gate  *    this file system.
1376*7c478bd9Sstevel@tonic-gate  *
1377*7c478bd9Sstevel@tonic-gate  *    If the request is a write, fssnap_translate() is called to determine
1378*7c478bd9Sstevel@tonic-gate  *    whether a copy-on-write is required.  If it is a read, the read is
1379*7c478bd9Sstevel@tonic-gate  *    simply passed on to the underlying device.
1380*7c478bd9Sstevel@tonic-gate  */
1381*7c478bd9Sstevel@tonic-gate static void
1382*7c478bd9Sstevel@tonic-gate fssnap_strategy_impl(void *snapshot_id, buf_t *bp)
1383*7c478bd9Sstevel@tonic-gate {
1384*7c478bd9Sstevel@tonic-gate 	struct snapshot_id **sidpp;
1385*7c478bd9Sstevel@tonic-gate 	struct snapshot_id *sidp;
1386*7c478bd9Sstevel@tonic-gate 	int error;
1387*7c478bd9Sstevel@tonic-gate 
1388*7c478bd9Sstevel@tonic-gate 	/* read requests are always passed through */
1389*7c478bd9Sstevel@tonic-gate 	if (bp->b_flags & B_READ) {
1390*7c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(bp);
1391*7c478bd9Sstevel@tonic-gate 		return;
1392*7c478bd9Sstevel@tonic-gate 	}
1393*7c478bd9Sstevel@tonic-gate 
1394*7c478bd9Sstevel@tonic-gate 	/*
1395*7c478bd9Sstevel@tonic-gate 	 * Because we were not able to take the snapshot read lock BEFORE
1396*7c478bd9Sstevel@tonic-gate 	 * checking for a snapshot back in the file system, things may have
1397*7c478bd9Sstevel@tonic-gate 	 * drastically changed out from under us.  For instance, the snapshot
1398*7c478bd9Sstevel@tonic-gate 	 * may have been deleted, deleted and recreated, or worse yet, deleted
1399*7c478bd9Sstevel@tonic-gate 	 * for this file system but now the snapshot number is in use by another
1400*7c478bd9Sstevel@tonic-gate 	 * file system.
1401*7c478bd9Sstevel@tonic-gate 	 *
1402*7c478bd9Sstevel@tonic-gate 	 * Having a pointer to the file system's snapshot id pointer allows us
1403*7c478bd9Sstevel@tonic-gate 	 * to sanity check most of this, though it assumes the file system is
1404*7c478bd9Sstevel@tonic-gate 	 * keeping track of a pointer to the snapshot_id somewhere.
1405*7c478bd9Sstevel@tonic-gate 	 */
1406*7c478bd9Sstevel@tonic-gate 	sidpp = (struct snapshot_id **)snapshot_id;
1407*7c478bd9Sstevel@tonic-gate 	sidp = *sidpp;
1408*7c478bd9Sstevel@tonic-gate 
1409*7c478bd9Sstevel@tonic-gate 	/*
1410*7c478bd9Sstevel@tonic-gate 	 * if this file system's snapshot was disabled, just pass the
1411*7c478bd9Sstevel@tonic-gate 	 * request through.
1412*7c478bd9Sstevel@tonic-gate 	 */
1413*7c478bd9Sstevel@tonic-gate 	if (sidp == NULL) {
1414*7c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(bp);
1415*7c478bd9Sstevel@tonic-gate 		return;
1416*7c478bd9Sstevel@tonic-gate 	}
1417*7c478bd9Sstevel@tonic-gate 
1418*7c478bd9Sstevel@tonic-gate 	/*
1419*7c478bd9Sstevel@tonic-gate 	 * Once we have the reader lock the snapshot will not magically go
1420*7c478bd9Sstevel@tonic-gate 	 * away.  But things may have changed on us before this so double check.
1421*7c478bd9Sstevel@tonic-gate 	 */
1422*7c478bd9Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_READER);
1423*7c478bd9Sstevel@tonic-gate 
1424*7c478bd9Sstevel@tonic-gate 	/*
1425*7c478bd9Sstevel@tonic-gate 	 * if an error was founds somewhere the DELETE flag will be
1426*7c478bd9Sstevel@tonic-gate 	 * set to indicate the snapshot should be deleted and no new
1427*7c478bd9Sstevel@tonic-gate 	 * translations should occur.
1428*7c478bd9Sstevel@tonic-gate 	 */
1429*7c478bd9Sstevel@tonic-gate 	if (sidp->sid_flags & SID_DELETE) {
1430*7c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
1431*7c478bd9Sstevel@tonic-gate 		(void) fssnap_delete_impl(sidpp);
1432*7c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(bp);
1433*7c478bd9Sstevel@tonic-gate 		return;
1434*7c478bd9Sstevel@tonic-gate 	}
1435*7c478bd9Sstevel@tonic-gate 
1436*7c478bd9Sstevel@tonic-gate 	/*
1437*7c478bd9Sstevel@tonic-gate 	 * If the file system is no longer pointing to the snapshot we were
1438*7c478bd9Sstevel@tonic-gate 	 * called with, then it should not attempt to translate this buffer as
1439*7c478bd9Sstevel@tonic-gate 	 * it may be going to a snapshot for a different file system.
1440*7c478bd9Sstevel@tonic-gate 	 * Even if the file system snapshot pointer is still the same, the
1441*7c478bd9Sstevel@tonic-gate 	 * snapshot may have been disabled before we got the reader lock.
1442*7c478bd9Sstevel@tonic-gate 	 */
1443*7c478bd9Sstevel@tonic-gate 	if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1444*7c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
1445*7c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(bp);
1446*7c478bd9Sstevel@tonic-gate 		return;
1447*7c478bd9Sstevel@tonic-gate 	}
1448*7c478bd9Sstevel@tonic-gate 
1449*7c478bd9Sstevel@tonic-gate 	/*
1450*7c478bd9Sstevel@tonic-gate 	 * At this point we're sure the snapshot will not go away while the
1451*7c478bd9Sstevel@tonic-gate 	 * reader lock is held, and we are reasonably certain that we are
1452*7c478bd9Sstevel@tonic-gate 	 * writing to the correct snapshot.
1453*7c478bd9Sstevel@tonic-gate 	 */
1454*7c478bd9Sstevel@tonic-gate 	if ((error = fssnap_translate(sidpp, bp)) != 0) {
1455*7c478bd9Sstevel@tonic-gate 		/*
1456*7c478bd9Sstevel@tonic-gate 		 * fssnap_translate can release the reader lock if it
1457*7c478bd9Sstevel@tonic-gate 		 * has to wait for a semaphore.  In this case it is possible
1458*7c478bd9Sstevel@tonic-gate 		 * for the snapshot to be deleted in this time frame.  If this
1459*7c478bd9Sstevel@tonic-gate 		 * happens just sent the buf thru to the filesystems device.
1460*7c478bd9Sstevel@tonic-gate 		 */
1461*7c478bd9Sstevel@tonic-gate 		if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1462*7c478bd9Sstevel@tonic-gate 			rw_exit(&sidp->sid_rwlock);
1463*7c478bd9Sstevel@tonic-gate 			(void) bdev_strategy(bp);
1464*7c478bd9Sstevel@tonic-gate 			return;
1465*7c478bd9Sstevel@tonic-gate 		}
1466*7c478bd9Sstevel@tonic-gate 		bioerror(bp, error);
1467*7c478bd9Sstevel@tonic-gate 		biodone(bp);
1468*7c478bd9Sstevel@tonic-gate 	}
1469*7c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
1470*7c478bd9Sstevel@tonic-gate }
1471*7c478bd9Sstevel@tonic-gate 
1472*7c478bd9Sstevel@tonic-gate /*
1473*7c478bd9Sstevel@tonic-gate  * fssnap_translate() - helper function for fssnap_strategy()
1474*7c478bd9Sstevel@tonic-gate  *
1475*7c478bd9Sstevel@tonic-gate  *    performs the actual copy-on-write for write requests, if required.
1476*7c478bd9Sstevel@tonic-gate  *    This function does the real work of the file system side of things.
1477*7c478bd9Sstevel@tonic-gate  *
1478*7c478bd9Sstevel@tonic-gate  *    It first checks the candidate bitmap to quickly determine whether any
1479*7c478bd9Sstevel@tonic-gate  *    action is necessary.  If the candidate bitmap indicates the chunk was
1480*7c478bd9Sstevel@tonic-gate  *    allocated when the snapshot was created, then it checks to see whether
1481*7c478bd9Sstevel@tonic-gate  *    a translation already exists.  If a translation already exists then no
1482*7c478bd9Sstevel@tonic-gate  *    action is required.  If the chunk is a candidate for copy-on-write,
1483*7c478bd9Sstevel@tonic-gate  *    and a translation does not already exist, then the chunk is read in
1484*7c478bd9Sstevel@tonic-gate  *    and a node is added to the translation table.
1485*7c478bd9Sstevel@tonic-gate  *
1486*7c478bd9Sstevel@tonic-gate  *    Once all of the chunks in the request range have been copied (if they
1487*7c478bd9Sstevel@tonic-gate  *    needed to be), then the original request can be satisfied and the old
1488*7c478bd9Sstevel@tonic-gate  *    data can be overwritten.
1489*7c478bd9Sstevel@tonic-gate  */
1490*7c478bd9Sstevel@tonic-gate static int
1491*7c478bd9Sstevel@tonic-gate fssnap_translate(struct snapshot_id **sidpp, struct buf *wbp)
1492*7c478bd9Sstevel@tonic-gate {
1493*7c478bd9Sstevel@tonic-gate 	snapshot_id_t	*sidp = *sidpp;
1494*7c478bd9Sstevel@tonic-gate 	struct buf	*oldbp;	/* buffer to store old data in */
1495*7c478bd9Sstevel@tonic-gate 	struct cow_info	*cowp = sidp->sid_cowinfo;
1496*7c478bd9Sstevel@tonic-gate 	cow_map_t	*cmap = &cowp->cow_map;
1497*7c478bd9Sstevel@tonic-gate 	cow_map_node_t	*cmn;
1498*7c478bd9Sstevel@tonic-gate 	chunknumber_t	cowchunk, startchunk, endchunk;
1499*7c478bd9Sstevel@tonic-gate 	int		error;
1500*7c478bd9Sstevel@tonic-gate 	int	throttle_write = 0;
1501*7c478bd9Sstevel@tonic-gate 
1502*7c478bd9Sstevel@tonic-gate 	/* make sure the snapshot is active */
1503*7c478bd9Sstevel@tonic-gate 	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
1504*7c478bd9Sstevel@tonic-gate 
1505*7c478bd9Sstevel@tonic-gate 	startchunk = dbtocowchunk(cmap, wbp->b_lblkno);
1506*7c478bd9Sstevel@tonic-gate 	endchunk   = dbtocowchunk(cmap, wbp->b_lblkno +
1507*7c478bd9Sstevel@tonic-gate 	    ((wbp->b_bcount-1) >> DEV_BSHIFT));
1508*7c478bd9Sstevel@tonic-gate 
1509*7c478bd9Sstevel@tonic-gate 	/*
1510*7c478bd9Sstevel@tonic-gate 	 * Do not throttle the writes of the fssnap taskq thread and
1511*7c478bd9Sstevel@tonic-gate 	 * the log roll (trans_roll) thread. Furthermore the writes to
1512*7c478bd9Sstevel@tonic-gate 	 * the on-disk log are also not subject to throttling.
1513*7c478bd9Sstevel@tonic-gate 	 * The fssnap_write_taskq thread's write can block on the throttling
1514*7c478bd9Sstevel@tonic-gate 	 * semaphore which leads to self-deadlock as this same thread
1515*7c478bd9Sstevel@tonic-gate 	 * releases the throttling semaphore after completing the IO.
1516*7c478bd9Sstevel@tonic-gate 	 * If the trans_roll thread's write is throttled then we can deadlock
1517*7c478bd9Sstevel@tonic-gate 	 * because the fssnap_taskq_thread which releases the throttling
1518*7c478bd9Sstevel@tonic-gate 	 * semaphore can block waiting for log space which can only be
1519*7c478bd9Sstevel@tonic-gate 	 * released by the trans_roll thread.
1520*7c478bd9Sstevel@tonic-gate 	 */
1521*7c478bd9Sstevel@tonic-gate 
1522*7c478bd9Sstevel@tonic-gate 	throttle_write = !(taskq_member(cowp->cow_taskq, curthread) ||
1523*7c478bd9Sstevel@tonic-gate 				    tsd_get(bypass_snapshot_throttle_key));
1524*7c478bd9Sstevel@tonic-gate 
1525*7c478bd9Sstevel@tonic-gate 	/*
1526*7c478bd9Sstevel@tonic-gate 	 * Iterate through all chunks covered by this write and perform the
1527*7c478bd9Sstevel@tonic-gate 	 * copy-aside if necessary.  Once all chunks have been safely
1528*7c478bd9Sstevel@tonic-gate 	 * stowed away, the new data may be written in a single sweep.
1529*7c478bd9Sstevel@tonic-gate 	 *
1530*7c478bd9Sstevel@tonic-gate 	 * For each chunk in the range, the following sequence is performed:
1531*7c478bd9Sstevel@tonic-gate 	 *	- Is the chunk a candidate for translation?
1532*7c478bd9Sstevel@tonic-gate 	 *		o If not, then no translation is necessary, continue
1533*7c478bd9Sstevel@tonic-gate 	 *	- If it is a candidate, then does it already have a translation?
1534*7c478bd9Sstevel@tonic-gate 	 *		o If so, then no translation is necessary, continue
1535*7c478bd9Sstevel@tonic-gate 	 *	- If it is a candidate, but does not yet have a translation,
1536*7c478bd9Sstevel@tonic-gate 	 *	  then read the old data and schedule an asynchronous taskq
1537*7c478bd9Sstevel@tonic-gate 	 *	  to write the old data to the backing file.
1538*7c478bd9Sstevel@tonic-gate 	 *
1539*7c478bd9Sstevel@tonic-gate 	 * Once this has been performed over the entire range of chunks, then
1540*7c478bd9Sstevel@tonic-gate 	 * it is safe to overwrite the data that is there.
1541*7c478bd9Sstevel@tonic-gate 	 *
1542*7c478bd9Sstevel@tonic-gate 	 * Note that no lock is required to check the candidate bitmap because
1543*7c478bd9Sstevel@tonic-gate 	 * it never changes once the snapshot is created.  The reader lock is
1544*7c478bd9Sstevel@tonic-gate 	 * taken to check the hastrans bitmap since it may change.  If it
1545*7c478bd9Sstevel@tonic-gate 	 * turns out a copy is required, then the lock is upgraded to a
1546*7c478bd9Sstevel@tonic-gate 	 * writer, and the bitmap is re-checked as it may have changed while
1547*7c478bd9Sstevel@tonic-gate 	 * the lock was released.  Finally, the write lock is held while
1548*7c478bd9Sstevel@tonic-gate 	 * reading the old data to make sure it is not translated out from
1549*7c478bd9Sstevel@tonic-gate 	 * under us.
1550*7c478bd9Sstevel@tonic-gate 	 *
1551*7c478bd9Sstevel@tonic-gate 	 * This locking mechanism should be sufficient to handle multiple
1552*7c478bd9Sstevel@tonic-gate 	 * threads writing to overlapping chunks simultaneously.
1553*7c478bd9Sstevel@tonic-gate 	 */
1554*7c478bd9Sstevel@tonic-gate 	for (cowchunk = startchunk; cowchunk <= endchunk; cowchunk++) {
1555*7c478bd9Sstevel@tonic-gate 		/*
1556*7c478bd9Sstevel@tonic-gate 		 * If the cowchunk is outside of the range of our
1557*7c478bd9Sstevel@tonic-gate 		 * candidate maps, then simply break out of the
1558*7c478bd9Sstevel@tonic-gate 		 * loop and pass the I/O through to bdev_strategy.
1559*7c478bd9Sstevel@tonic-gate 		 * This would occur if the file system has grown
1560*7c478bd9Sstevel@tonic-gate 		 * larger since the snapshot was taken.
1561*7c478bd9Sstevel@tonic-gate 		 */
1562*7c478bd9Sstevel@tonic-gate 		if (cowchunk >= (cmap->cmap_bmsize * NBBY))
1563*7c478bd9Sstevel@tonic-gate 			break;
1564*7c478bd9Sstevel@tonic-gate 
1565*7c478bd9Sstevel@tonic-gate 		/*
1566*7c478bd9Sstevel@tonic-gate 		 * If no disk blocks were allocated in this chunk when the
1567*7c478bd9Sstevel@tonic-gate 		 * snapshot was created then no copy-on-write will be
1568*7c478bd9Sstevel@tonic-gate 		 * required.  Since this bitmap is read-only no locks are
1569*7c478bd9Sstevel@tonic-gate 		 * necessary.
1570*7c478bd9Sstevel@tonic-gate 		 */
1571*7c478bd9Sstevel@tonic-gate 		if (isclr(cmap->cmap_candidate, cowchunk)) {
1572*7c478bd9Sstevel@tonic-gate 			continue;
1573*7c478bd9Sstevel@tonic-gate 		}
1574*7c478bd9Sstevel@tonic-gate 
1575*7c478bd9Sstevel@tonic-gate 		/*
1576*7c478bd9Sstevel@tonic-gate 		 * If a translation already exists, the data can be written
1577*7c478bd9Sstevel@tonic-gate 		 * through since the old data has already been saved off.
1578*7c478bd9Sstevel@tonic-gate 		 */
1579*7c478bd9Sstevel@tonic-gate 		if (isset(cmap->cmap_hastrans, cowchunk)) {
1580*7c478bd9Sstevel@tonic-gate 			continue;
1581*7c478bd9Sstevel@tonic-gate 		}
1582*7c478bd9Sstevel@tonic-gate 
1583*7c478bd9Sstevel@tonic-gate 
1584*7c478bd9Sstevel@tonic-gate 		/*
1585*7c478bd9Sstevel@tonic-gate 		 * Throttle translations if there are too many outstanding
1586*7c478bd9Sstevel@tonic-gate 		 * chunks in memory.  The semaphore is sema_v'd by the taskq.
1587*7c478bd9Sstevel@tonic-gate 		 *
1588*7c478bd9Sstevel@tonic-gate 		 * You can't keep the sid_rwlock if you would go to sleep.
1589*7c478bd9Sstevel@tonic-gate 		 * This will result in deadlock when someone tries to delete
1590*7c478bd9Sstevel@tonic-gate 		 * the snapshot (wants the sid_rwlock as a writer, but can't
1591*7c478bd9Sstevel@tonic-gate 		 * get it).
1592*7c478bd9Sstevel@tonic-gate 		 */
1593*7c478bd9Sstevel@tonic-gate 		if (throttle_write) {
1594*7c478bd9Sstevel@tonic-gate 			if (sema_tryp(&cmap->cmap_throttle_sem) == 0) {
1595*7c478bd9Sstevel@tonic-gate 				rw_exit(&sidp->sid_rwlock);
1596*7c478bd9Sstevel@tonic-gate 				atomic_add_32(&cmap->cmap_waiters, 1);
1597*7c478bd9Sstevel@tonic-gate 				sema_p(&cmap->cmap_throttle_sem);
1598*7c478bd9Sstevel@tonic-gate 				atomic_add_32(&cmap->cmap_waiters, -1);
1599*7c478bd9Sstevel@tonic-gate 				rw_enter(&sidp->sid_rwlock, RW_READER);
1600*7c478bd9Sstevel@tonic-gate 
1601*7c478bd9Sstevel@tonic-gate 			/*
1602*7c478bd9Sstevel@tonic-gate 			 * Now since we released the sid_rwlock the state may
1603*7c478bd9Sstevel@tonic-gate 			 * have transitioned underneath us. so check that again.
1604*7c478bd9Sstevel@tonic-gate 			 */
1605*7c478bd9Sstevel@tonic-gate 				if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1606*7c478bd9Sstevel@tonic-gate 					sema_v(&cmap->cmap_throttle_sem);
1607*7c478bd9Sstevel@tonic-gate 					return (ENXIO);
1608*7c478bd9Sstevel@tonic-gate 				}
1609*7c478bd9Sstevel@tonic-gate 			}
1610*7c478bd9Sstevel@tonic-gate 		}
1611*7c478bd9Sstevel@tonic-gate 
1612*7c478bd9Sstevel@tonic-gate 		/*
1613*7c478bd9Sstevel@tonic-gate 		 * Acquire the lock as a writer and check to see if a
1614*7c478bd9Sstevel@tonic-gate 		 * translation has been added in the meantime.
1615*7c478bd9Sstevel@tonic-gate 		 */
1616*7c478bd9Sstevel@tonic-gate 		rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1617*7c478bd9Sstevel@tonic-gate 		if (isset(cmap->cmap_hastrans, cowchunk)) {
1618*7c478bd9Sstevel@tonic-gate 			if (throttle_write)
1619*7c478bd9Sstevel@tonic-gate 				sema_v(&cmap->cmap_throttle_sem);
1620*7c478bd9Sstevel@tonic-gate 			rw_exit(&cmap->cmap_rwlock);
1621*7c478bd9Sstevel@tonic-gate 			continue; /* go to the next chunk */
1622*7c478bd9Sstevel@tonic-gate 		}
1623*7c478bd9Sstevel@tonic-gate 
1624*7c478bd9Sstevel@tonic-gate 		/*
1625*7c478bd9Sstevel@tonic-gate 		 * read a full chunk of data from the requested offset rounded
1626*7c478bd9Sstevel@tonic-gate 		 * down to the nearest chunk size.
1627*7c478bd9Sstevel@tonic-gate 		 */
1628*7c478bd9Sstevel@tonic-gate 		oldbp = getrbuf(KM_SLEEP);
1629*7c478bd9Sstevel@tonic-gate 		oldbp->b_lblkno = cowchunktodb(cmap, cowchunk);
1630*7c478bd9Sstevel@tonic-gate 		oldbp->b_edev = wbp->b_edev;
1631*7c478bd9Sstevel@tonic-gate 		oldbp->b_bcount = cmap->cmap_chunksz;
1632*7c478bd9Sstevel@tonic-gate 		oldbp->b_bufsize = cmap->cmap_chunksz;
1633*7c478bd9Sstevel@tonic-gate 		oldbp->b_iodone = NULL;
1634*7c478bd9Sstevel@tonic-gate 		oldbp->b_proc = NULL;
1635*7c478bd9Sstevel@tonic-gate 		oldbp->b_flags = B_READ;
1636*7c478bd9Sstevel@tonic-gate 		oldbp->b_un.b_addr = kmem_alloc(cmap->cmap_chunksz, KM_SLEEP);
1637*7c478bd9Sstevel@tonic-gate 
1638*7c478bd9Sstevel@tonic-gate 		(void) bdev_strategy(oldbp);
1639*7c478bd9Sstevel@tonic-gate 		(void) biowait(oldbp);
1640*7c478bd9Sstevel@tonic-gate 
1641*7c478bd9Sstevel@tonic-gate 		/*
1642*7c478bd9Sstevel@tonic-gate 		 * It's ok to bail in the middle of translating the range
1643*7c478bd9Sstevel@tonic-gate 		 * because the extra copy-asides will not hurt anything
1644*7c478bd9Sstevel@tonic-gate 		 * (except by using extra space in the backing store).
1645*7c478bd9Sstevel@tonic-gate 		 */
1646*7c478bd9Sstevel@tonic-gate 		if ((error = geterror(oldbp)) != 0) {
1647*7c478bd9Sstevel@tonic-gate 			cmn_err(CE_WARN, "fssnap_translate: error reading "
1648*7c478bd9Sstevel@tonic-gate 			    "old data for snapshot %d, chunk %llu, disk block "
1649*7c478bd9Sstevel@tonic-gate 			    "%lld, size %lu, error %d.", sidp->sid_snapnumber,
1650*7c478bd9Sstevel@tonic-gate 			    cowchunk, oldbp->b_lblkno, oldbp->b_bcount, error);
1651*7c478bd9Sstevel@tonic-gate 			kmem_free(oldbp->b_un.b_addr, cmap->cmap_chunksz);
1652*7c478bd9Sstevel@tonic-gate 			freerbuf(oldbp);
1653*7c478bd9Sstevel@tonic-gate 			rw_exit(&cmap->cmap_rwlock);
1654*7c478bd9Sstevel@tonic-gate 			if (throttle_write)
1655*7c478bd9Sstevel@tonic-gate 				sema_v(&cmap->cmap_throttle_sem);
1656*7c478bd9Sstevel@tonic-gate 			return (error);
1657*7c478bd9Sstevel@tonic-gate 		}
1658*7c478bd9Sstevel@tonic-gate 
1659*7c478bd9Sstevel@tonic-gate 		/*
1660*7c478bd9Sstevel@tonic-gate 		 * add the node to the translation table and save a reference
1661*7c478bd9Sstevel@tonic-gate 		 * to pass to the taskq for writing out to the backing file
1662*7c478bd9Sstevel@tonic-gate 		 */
1663*7c478bd9Sstevel@tonic-gate 		cmn = transtbl_add(cmap, cowchunk, oldbp->b_un.b_addr);
1664*7c478bd9Sstevel@tonic-gate 		freerbuf(oldbp);
1665*7c478bd9Sstevel@tonic-gate 
1666*7c478bd9Sstevel@tonic-gate 		/*
1667*7c478bd9Sstevel@tonic-gate 		 * Add a reference to the snapshot id so the lower level
1668*7c478bd9Sstevel@tonic-gate 		 * processing (ie. the taskq) can get back to the state
1669*7c478bd9Sstevel@tonic-gate 		 * information.
1670*7c478bd9Sstevel@tonic-gate 		 */
1671*7c478bd9Sstevel@tonic-gate 		cmn->cmn_sid = sidp;
1672*7c478bd9Sstevel@tonic-gate 		cmn->release_sem = throttle_write;
1673*7c478bd9Sstevel@tonic-gate 		setbit(cmap->cmap_hastrans, cowchunk);
1674*7c478bd9Sstevel@tonic-gate 
1675*7c478bd9Sstevel@tonic-gate 		rw_exit(&cmap->cmap_rwlock);
1676*7c478bd9Sstevel@tonic-gate 
1677*7c478bd9Sstevel@tonic-gate 		/*
1678*7c478bd9Sstevel@tonic-gate 		 * schedule the asynchronous write to the backing file
1679*7c478bd9Sstevel@tonic-gate 		 */
1680*7c478bd9Sstevel@tonic-gate 		if (cowp->cow_backfile_array != NULL)
1681*7c478bd9Sstevel@tonic-gate 			(void) taskq_dispatch(cowp->cow_taskq,
1682*7c478bd9Sstevel@tonic-gate 			    fssnap_write_taskq, cmn, TQ_SLEEP);
1683*7c478bd9Sstevel@tonic-gate 	}
1684*7c478bd9Sstevel@tonic-gate 
1685*7c478bd9Sstevel@tonic-gate 	/*
1686*7c478bd9Sstevel@tonic-gate 	 * Write new data in place of the old data.  At this point all of the
1687*7c478bd9Sstevel@tonic-gate 	 * chunks touched by this write have been copied aside and so the new
1688*7c478bd9Sstevel@tonic-gate 	 * data can be written out all at once.
1689*7c478bd9Sstevel@tonic-gate 	 */
1690*7c478bd9Sstevel@tonic-gate 	(void) bdev_strategy(wbp);
1691*7c478bd9Sstevel@tonic-gate 
1692*7c478bd9Sstevel@tonic-gate 	return (0);
1693*7c478bd9Sstevel@tonic-gate }
1694*7c478bd9Sstevel@tonic-gate 
1695*7c478bd9Sstevel@tonic-gate /*
1696*7c478bd9Sstevel@tonic-gate  * fssnap_write_taskq() - write in-memory translations to the backing file
1697*7c478bd9Sstevel@tonic-gate  *
1698*7c478bd9Sstevel@tonic-gate  *    writes in-memory translations to the backing file asynchronously.  A
1699*7c478bd9Sstevel@tonic-gate  *    task is dispatched each time a new translation is created.  The task
1700*7c478bd9Sstevel@tonic-gate  *    writes the data to the backing file and removes it from the memory
1701*7c478bd9Sstevel@tonic-gate  *    list. The throttling semaphore is released only if the particular
1702*7c478bd9Sstevel@tonic-gate  *    translation was throttled in fssnap_translate.
1703*7c478bd9Sstevel@tonic-gate  */
1704*7c478bd9Sstevel@tonic-gate static void
1705*7c478bd9Sstevel@tonic-gate fssnap_write_taskq(void *arg)
1706*7c478bd9Sstevel@tonic-gate {
1707*7c478bd9Sstevel@tonic-gate 	cow_map_node_t	*cmn = (cow_map_node_t *)arg;
1708*7c478bd9Sstevel@tonic-gate 	snapshot_id_t	*sidp = cmn->cmn_sid;
1709*7c478bd9Sstevel@tonic-gate 	cow_info_t	*cowp = sidp->sid_cowinfo;
1710*7c478bd9Sstevel@tonic-gate 	cow_map_t	*cmap = &cowp->cow_map;
1711*7c478bd9Sstevel@tonic-gate 	int		error;
1712*7c478bd9Sstevel@tonic-gate 	int		bf_index;
1713*7c478bd9Sstevel@tonic-gate 	int		release_sem = cmn->release_sem;
1714*7c478bd9Sstevel@tonic-gate 
1715*7c478bd9Sstevel@tonic-gate 	/*
1716*7c478bd9Sstevel@tonic-gate 	 * The sid_rwlock does not need to be held here because the taskqs
1717*7c478bd9Sstevel@tonic-gate 	 * are destroyed explicitly by fssnap_delete (with the sid_rwlock
1718*7c478bd9Sstevel@tonic-gate 	 * held as a writer).  taskq_destroy() will flush all of the tasks
1719*7c478bd9Sstevel@tonic-gate 	 * out before fssnap_delete frees up all of the structures.
1720*7c478bd9Sstevel@tonic-gate 	 */
1721*7c478bd9Sstevel@tonic-gate 
1722*7c478bd9Sstevel@tonic-gate 	/* if the snapshot was disabled from under us, drop the request. */
1723*7c478bd9Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_READER);
1724*7c478bd9Sstevel@tonic-gate 	if (SID_INACTIVE(sidp)) {
1725*7c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
1726*7c478bd9Sstevel@tonic-gate 		if (release_sem)
1727*7c478bd9Sstevel@tonic-gate 			sema_v(&cmap->cmap_throttle_sem);
1728*7c478bd9Sstevel@tonic-gate 		return;
1729*7c478bd9Sstevel@tonic-gate 	}
1730*7c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
1731*7c478bd9Sstevel@tonic-gate 
1732*7c478bd9Sstevel@tonic-gate 	atomic_add_64((uint64_t *)&cmap->cmap_nchunks, 1);
1733*7c478bd9Sstevel@tonic-gate 
1734*7c478bd9Sstevel@tonic-gate 	if ((cmap->cmap_maxsize != 0) &&
1735*7c478bd9Sstevel@tonic-gate 	    ((cmap->cmap_nchunks * cmap->cmap_chunksz) > cmap->cmap_maxsize)) {
1736*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_write_taskq: snapshot %d (%s) has "
1737*7c478bd9Sstevel@tonic-gate 		    "reached the maximum backing file size specified (%llu "
1738*7c478bd9Sstevel@tonic-gate 		    "bytes) and will be deleted.", sidp->sid_snapnumber,
1739*7c478bd9Sstevel@tonic-gate 		    (char *)cowp->cow_kstat_mntpt->ks_data,
1740*7c478bd9Sstevel@tonic-gate 		    cmap->cmap_maxsize);
1741*7c478bd9Sstevel@tonic-gate 		if (release_sem)
1742*7c478bd9Sstevel@tonic-gate 			sema_v(&cmap->cmap_throttle_sem);
1743*7c478bd9Sstevel@tonic-gate 		atomic_or_uint(&sidp->sid_flags, SID_DELETE);
1744*7c478bd9Sstevel@tonic-gate 		return;
1745*7c478bd9Sstevel@tonic-gate 	}
1746*7c478bd9Sstevel@tonic-gate 
1747*7c478bd9Sstevel@tonic-gate 	/* perform the write */
1748*7c478bd9Sstevel@tonic-gate 	bf_index = cmn->cmn_chunk / cmap->cmap_chunksperbf;
1749*7c478bd9Sstevel@tonic-gate 
1750*7c478bd9Sstevel@tonic-gate 	if (error = vn_rdwr(UIO_WRITE, (cowp->cow_backfile_array)[bf_index],
1751*7c478bd9Sstevel@tonic-gate 	    cmn->cmn_buf, cmap->cmap_chunksz,
1752*7c478bd9Sstevel@tonic-gate 	    (cmn->cmn_chunk % cmap->cmap_chunksperbf) * cmap->cmap_chunksz,
1753*7c478bd9Sstevel@tonic-gate 	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, (ssize_t *)NULL)) {
1754*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_write_taskq: error writing to "
1755*7c478bd9Sstevel@tonic-gate 		    "backing file.  DELETING SNAPSHOT %d, backing file path "
1756*7c478bd9Sstevel@tonic-gate 		    "%s, offset %llu bytes, error %d.", sidp->sid_snapnumber,
1757*7c478bd9Sstevel@tonic-gate 		    (char *)cowp->cow_kstat_bfname->ks_data,
1758*7c478bd9Sstevel@tonic-gate 		    cmn->cmn_chunk * cmap->cmap_chunksz, error);
1759*7c478bd9Sstevel@tonic-gate 		if (release_sem)
1760*7c478bd9Sstevel@tonic-gate 			sema_v(&cmap->cmap_throttle_sem);
1761*7c478bd9Sstevel@tonic-gate 		atomic_or_uint(&sidp->sid_flags, SID_DELETE);
1762*7c478bd9Sstevel@tonic-gate 		return;
1763*7c478bd9Sstevel@tonic-gate 	}
1764*7c478bd9Sstevel@tonic-gate 
1765*7c478bd9Sstevel@tonic-gate 	/*
1766*7c478bd9Sstevel@tonic-gate 	 * now remove the node and buffer from memory
1767*7c478bd9Sstevel@tonic-gate 	 */
1768*7c478bd9Sstevel@tonic-gate 	rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1769*7c478bd9Sstevel@tonic-gate 	transtbl_delete(cmap, cmn);
1770*7c478bd9Sstevel@tonic-gate 	rw_exit(&cmap->cmap_rwlock);
1771*7c478bd9Sstevel@tonic-gate 
1772*7c478bd9Sstevel@tonic-gate 	/* Allow more translations */
1773*7c478bd9Sstevel@tonic-gate 	if (release_sem)
1774*7c478bd9Sstevel@tonic-gate 		sema_v(&cmap->cmap_throttle_sem);
1775*7c478bd9Sstevel@tonic-gate 
1776*7c478bd9Sstevel@tonic-gate }
1777*7c478bd9Sstevel@tonic-gate 
1778*7c478bd9Sstevel@tonic-gate /*
1779*7c478bd9Sstevel@tonic-gate  * fssnap_create_impl() - called from the file system to create a new snapshot
1780*7c478bd9Sstevel@tonic-gate  *
1781*7c478bd9Sstevel@tonic-gate  *    allocates and initializes the structures needed for a new snapshot.
1782*7c478bd9Sstevel@tonic-gate  *    This is called by the file system when it receives an ioctl request to
1783*7c478bd9Sstevel@tonic-gate  *    create a new snapshot.  An unused snapshot identifier is either found
1784*7c478bd9Sstevel@tonic-gate  *    or created, and eventually returned as the opaque handle the file
1785*7c478bd9Sstevel@tonic-gate  *    system will use to identify this snapshot.  The snapshot number
1786*7c478bd9Sstevel@tonic-gate  *    associated with the snapshot identifier is the same as the minor
1787*7c478bd9Sstevel@tonic-gate  *    number for the snapshot device that is used to access that snapshot.
1788*7c478bd9Sstevel@tonic-gate  *
1789*7c478bd9Sstevel@tonic-gate  *    The snapshot can not be used until the candidate bitmap is populated
1790*7c478bd9Sstevel@tonic-gate  *    by the file system (see fssnap_set_candidate_impl()), and the file
1791*7c478bd9Sstevel@tonic-gate  *    system finishes the setup process by calling fssnap_create_done().
1792*7c478bd9Sstevel@tonic-gate  *    Nearly all of the snapshot locks are held for the duration of the
1793*7c478bd9Sstevel@tonic-gate  *    create, and are not released until fssnap_create_done is called().
1794*7c478bd9Sstevel@tonic-gate  */
1795*7c478bd9Sstevel@tonic-gate static void *
1796*7c478bd9Sstevel@tonic-gate fssnap_create_impl(chunknumber_t nchunks, uint_t chunksz, u_offset_t maxsize,
1797*7c478bd9Sstevel@tonic-gate     struct vnode *fsvp, int backfilecount, struct vnode **bfvpp, char *backpath,
1798*7c478bd9Sstevel@tonic-gate     u_offset_t max_backfile_size)
1799*7c478bd9Sstevel@tonic-gate {
1800*7c478bd9Sstevel@tonic-gate 	refstr_t *mountpoint;
1801*7c478bd9Sstevel@tonic-gate 	char taskqname[50];
1802*7c478bd9Sstevel@tonic-gate 	struct cow_info *cowp;
1803*7c478bd9Sstevel@tonic-gate 	struct cow_map	*cmap;
1804*7c478bd9Sstevel@tonic-gate 	struct snapshot_id *sidp;
1805*7c478bd9Sstevel@tonic-gate 	int lastsnap;
1806*7c478bd9Sstevel@tonic-gate 
1807*7c478bd9Sstevel@tonic-gate 	/*
1808*7c478bd9Sstevel@tonic-gate 	 * Sanity check the parameters we care about
1809*7c478bd9Sstevel@tonic-gate 	 * (we don't care about the informational parameters)
1810*7c478bd9Sstevel@tonic-gate 	 */
1811*7c478bd9Sstevel@tonic-gate 	if ((nchunks == 0) ||
1812*7c478bd9Sstevel@tonic-gate 	    ((chunksz % DEV_BSIZE) != 0) ||
1813*7c478bd9Sstevel@tonic-gate 	    (bfvpp == NULL)) {
1814*7c478bd9Sstevel@tonic-gate 		return (NULL);
1815*7c478bd9Sstevel@tonic-gate 	}
1816*7c478bd9Sstevel@tonic-gate 
1817*7c478bd9Sstevel@tonic-gate 	/*
1818*7c478bd9Sstevel@tonic-gate 	 * Look for unused snapshot identifiers.  Snapshot ids are never
1819*7c478bd9Sstevel@tonic-gate 	 * freed, but deleted snapshot ids will be recycled as needed.
1820*7c478bd9Sstevel@tonic-gate 	 */
1821*7c478bd9Sstevel@tonic-gate 	mutex_enter(&snapshot_mutex);
1822*7c478bd9Sstevel@tonic-gate 
1823*7c478bd9Sstevel@tonic-gate findagain:
1824*7c478bd9Sstevel@tonic-gate 	lastsnap = 0;
1825*7c478bd9Sstevel@tonic-gate 	for (sidp = snapshot; sidp != NULL; sidp = sidp->sid_next) {
1826*7c478bd9Sstevel@tonic-gate 		if (sidp->sid_snapnumber > lastsnap)
1827*7c478bd9Sstevel@tonic-gate 			lastsnap = sidp->sid_snapnumber;
1828*7c478bd9Sstevel@tonic-gate 
1829*7c478bd9Sstevel@tonic-gate 		/*
1830*7c478bd9Sstevel@tonic-gate 		 * The sid_rwlock is taken as a reader initially so that
1831*7c478bd9Sstevel@tonic-gate 		 * activity on each snapshot is not stalled while searching
1832*7c478bd9Sstevel@tonic-gate 		 * for a free snapshot id.
1833*7c478bd9Sstevel@tonic-gate 		 */
1834*7c478bd9Sstevel@tonic-gate 		rw_enter(&sidp->sid_rwlock, RW_READER);
1835*7c478bd9Sstevel@tonic-gate 
1836*7c478bd9Sstevel@tonic-gate 		/*
1837*7c478bd9Sstevel@tonic-gate 		 * If the snapshot has been deleted and nobody is using the
1838*7c478bd9Sstevel@tonic-gate 		 * snapshot device than we can reuse this snapshot_id.  If
1839*7c478bd9Sstevel@tonic-gate 		 * the snapshot is marked to be deleted (SID_DELETE), then
1840*7c478bd9Sstevel@tonic-gate 		 * it hasn't been deleted yet so don't reuse it.
1841*7c478bd9Sstevel@tonic-gate 		 */
1842*7c478bd9Sstevel@tonic-gate 		if (SID_AVAILABLE(sidp))
1843*7c478bd9Sstevel@tonic-gate 			break; /* This spot is unused, so take it */
1844*7c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
1845*7c478bd9Sstevel@tonic-gate 	}
1846*7c478bd9Sstevel@tonic-gate 
1847*7c478bd9Sstevel@tonic-gate 	/*
1848*7c478bd9Sstevel@tonic-gate 	 * add a new snapshot identifier if there are no deleted
1849*7c478bd9Sstevel@tonic-gate 	 * entries.  Since it doesn't matter what order the entries
1850*7c478bd9Sstevel@tonic-gate 	 * are in we can just add it to the beginning of the list.
1851*7c478bd9Sstevel@tonic-gate 	 */
1852*7c478bd9Sstevel@tonic-gate 	if (sidp) {
1853*7c478bd9Sstevel@tonic-gate 		if (rw_tryupgrade(&sidp->sid_rwlock) == 0) {
1854*7c478bd9Sstevel@tonic-gate 			/* someone else grabbed it as a writer, try again */
1855*7c478bd9Sstevel@tonic-gate 			rw_exit(&sidp->sid_rwlock);
1856*7c478bd9Sstevel@tonic-gate 			goto findagain;
1857*7c478bd9Sstevel@tonic-gate 		}
1858*7c478bd9Sstevel@tonic-gate 	} else {
1859*7c478bd9Sstevel@tonic-gate 		/* Create a new node if we didn't find an unused one */
1860*7c478bd9Sstevel@tonic-gate 		sidp = kmem_alloc(sizeof (struct snapshot_id), KM_SLEEP);
1861*7c478bd9Sstevel@tonic-gate 		rw_init(&sidp->sid_rwlock, NULL, RW_DEFAULT, NULL);
1862*7c478bd9Sstevel@tonic-gate 		rw_enter(&sidp->sid_rwlock, RW_WRITER);
1863*7c478bd9Sstevel@tonic-gate 		sidp->sid_snapnumber = (snapshot == NULL) ? 0 : lastsnap + 1;
1864*7c478bd9Sstevel@tonic-gate 		sidp->sid_cowinfo = NULL;
1865*7c478bd9Sstevel@tonic-gate 		sidp->sid_flags = 0;
1866*7c478bd9Sstevel@tonic-gate 		sidp->sid_next = snapshot;
1867*7c478bd9Sstevel@tonic-gate 		snapshot = sidp;
1868*7c478bd9Sstevel@tonic-gate 	}
1869*7c478bd9Sstevel@tonic-gate 
1870*7c478bd9Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
1871*7c478bd9Sstevel@tonic-gate 	ASSERT(sidp->sid_cowinfo == NULL);
1872*7c478bd9Sstevel@tonic-gate 	ASSERT(sidp->sid_snapnumber <= (lastsnap + 1));
1873*7c478bd9Sstevel@tonic-gate 
1874*7c478bd9Sstevel@tonic-gate 	sidp->sid_flags |= SID_CREATING;
1875*7c478bd9Sstevel@tonic-gate 	/* The root vnode is held until snap_delete_impl() is called */
1876*7c478bd9Sstevel@tonic-gate 	VN_HOLD(fsvp);
1877*7c478bd9Sstevel@tonic-gate 	sidp->sid_fvp = fsvp;
1878*7c478bd9Sstevel@tonic-gate 	num_snapshots++;
1879*7c478bd9Sstevel@tonic-gate 
1880*7c478bd9Sstevel@tonic-gate 	/* allocate and initialize structures */
1881*7c478bd9Sstevel@tonic-gate 
1882*7c478bd9Sstevel@tonic-gate 	cowp = kmem_zalloc(sizeof (struct cow_info), KM_SLEEP);
1883*7c478bd9Sstevel@tonic-gate 
1884*7c478bd9Sstevel@tonic-gate 	cowp->cow_backfile_array = bfvpp;
1885*7c478bd9Sstevel@tonic-gate 	cowp->cow_backcount = backfilecount;
1886*7c478bd9Sstevel@tonic-gate 	cowp->cow_backfile_sz = max_backfile_size;
1887*7c478bd9Sstevel@tonic-gate 
1888*7c478bd9Sstevel@tonic-gate 	/*
1889*7c478bd9Sstevel@tonic-gate 	 * Initialize task queues for this snapshot.  Only a small number
1890*7c478bd9Sstevel@tonic-gate 	 * of threads are required because they will be serialized on the
1891*7c478bd9Sstevel@tonic-gate 	 * backing file's reader/writer lock anyway.
1892*7c478bd9Sstevel@tonic-gate 	 */
1893*7c478bd9Sstevel@tonic-gate 	(void) snprintf(taskqname, sizeof (taskqname), "%s_taskq_%d", snapname,
1894*7c478bd9Sstevel@tonic-gate 	    sidp->sid_snapnumber);
1895*7c478bd9Sstevel@tonic-gate 	cowp->cow_taskq = taskq_create(taskqname, fssnap_taskq_nthreads,
1896*7c478bd9Sstevel@tonic-gate 	    minclsyspri, 1,  fssnap_taskq_maxtasks, 0);
1897*7c478bd9Sstevel@tonic-gate 
1898*7c478bd9Sstevel@tonic-gate 	/* don't allow tasks to start until after everything is ready */
1899*7c478bd9Sstevel@tonic-gate 	taskq_suspend(cowp->cow_taskq);
1900*7c478bd9Sstevel@tonic-gate 
1901*7c478bd9Sstevel@tonic-gate 	/* initialize translation table */
1902*7c478bd9Sstevel@tonic-gate 	cmap = &cowp->cow_map;
1903*7c478bd9Sstevel@tonic-gate 	rw_init(&cmap->cmap_rwlock, NULL, RW_DEFAULT, NULL);
1904*7c478bd9Sstevel@tonic-gate 	rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1905*7c478bd9Sstevel@tonic-gate 
1906*7c478bd9Sstevel@tonic-gate 	sema_init(&cmap->cmap_throttle_sem, fssnap_max_mem_chunks, NULL,
1907*7c478bd9Sstevel@tonic-gate 	    SEMA_DEFAULT, NULL);
1908*7c478bd9Sstevel@tonic-gate 
1909*7c478bd9Sstevel@tonic-gate 	cmap->cmap_chunksz = chunksz;
1910*7c478bd9Sstevel@tonic-gate 	cmap->cmap_maxsize = maxsize;
1911*7c478bd9Sstevel@tonic-gate 	cmap->cmap_chunksperbf = max_backfile_size / chunksz;
1912*7c478bd9Sstevel@tonic-gate 
1913*7c478bd9Sstevel@tonic-gate 	/*
1914*7c478bd9Sstevel@tonic-gate 	 * allocate one bit per chunk for the bitmaps, round up
1915*7c478bd9Sstevel@tonic-gate 	 */
1916*7c478bd9Sstevel@tonic-gate 	cmap->cmap_bmsize = (nchunks + (NBBY - 1)) / NBBY;
1917*7c478bd9Sstevel@tonic-gate 	cmap->cmap_hastrans  = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
1918*7c478bd9Sstevel@tonic-gate 	cmap->cmap_candidate = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
1919*7c478bd9Sstevel@tonic-gate 
1920*7c478bd9Sstevel@tonic-gate 	sidp->sid_cowinfo = cowp;
1921*7c478bd9Sstevel@tonic-gate 
1922*7c478bd9Sstevel@tonic-gate 	/* initialize kstats for this snapshot */
1923*7c478bd9Sstevel@tonic-gate 	mountpoint = vfs_getmntpoint(fsvp->v_vfsp);
1924*7c478bd9Sstevel@tonic-gate 	fssnap_create_kstats(sidp, sidp->sid_snapnumber,
1925*7c478bd9Sstevel@tonic-gate 	    refstr_value(mountpoint), backpath);
1926*7c478bd9Sstevel@tonic-gate 	refstr_rele(mountpoint);
1927*7c478bd9Sstevel@tonic-gate 
1928*7c478bd9Sstevel@tonic-gate 	mutex_exit(&snapshot_mutex);
1929*7c478bd9Sstevel@tonic-gate 
1930*7c478bd9Sstevel@tonic-gate 	/*
1931*7c478bd9Sstevel@tonic-gate 	 * return with snapshot id rwlock held as a writer until
1932*7c478bd9Sstevel@tonic-gate 	 * fssnap_create_done is called
1933*7c478bd9Sstevel@tonic-gate 	 */
1934*7c478bd9Sstevel@tonic-gate 	return (sidp);
1935*7c478bd9Sstevel@tonic-gate }
1936*7c478bd9Sstevel@tonic-gate 
1937*7c478bd9Sstevel@tonic-gate /*
1938*7c478bd9Sstevel@tonic-gate  * fssnap_set_candidate_impl() - mark a chunk as a candidate for copy-on-write
1939*7c478bd9Sstevel@tonic-gate  *
1940*7c478bd9Sstevel@tonic-gate  *    sets a bit in the candidate bitmap that indicates that a chunk is a
1941*7c478bd9Sstevel@tonic-gate  *    candidate for copy-on-write.  Typically, chunks that are allocated on
1942*7c478bd9Sstevel@tonic-gate  *    the file system at the time the snapshot is taken are candidates,
1943*7c478bd9Sstevel@tonic-gate  *    while chunks that have no allocated data do not need to be copied.
1944*7c478bd9Sstevel@tonic-gate  *    Chunks containing metadata must be marked as candidates as well.
1945*7c478bd9Sstevel@tonic-gate  */
1946*7c478bd9Sstevel@tonic-gate static void
1947*7c478bd9Sstevel@tonic-gate fssnap_set_candidate_impl(void *snapshot_id, chunknumber_t chunknumber)
1948*7c478bd9Sstevel@tonic-gate {
1949*7c478bd9Sstevel@tonic-gate 	struct snapshot_id	*sid = snapshot_id;
1950*7c478bd9Sstevel@tonic-gate 	struct cow_info *cowp = sid->sid_cowinfo;
1951*7c478bd9Sstevel@tonic-gate 	struct cow_map	*cmap = &cowp->cow_map;
1952*7c478bd9Sstevel@tonic-gate 
1953*7c478bd9Sstevel@tonic-gate 	/* simple bitmap operation for now */
1954*7c478bd9Sstevel@tonic-gate 	ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
1955*7c478bd9Sstevel@tonic-gate 	setbit(cmap->cmap_candidate, chunknumber);
1956*7c478bd9Sstevel@tonic-gate }
1957*7c478bd9Sstevel@tonic-gate 
1958*7c478bd9Sstevel@tonic-gate /*
1959*7c478bd9Sstevel@tonic-gate  * fssnap_is_candidate_impl() - check whether a chunk is a candidate
1960*7c478bd9Sstevel@tonic-gate  *
1961*7c478bd9Sstevel@tonic-gate  *    returns 0 if the chunk is not a candidate and 1 if the chunk is a
1962*7c478bd9Sstevel@tonic-gate  *    candidate.  This can be used by the file system to change behavior for
1963*7c478bd9Sstevel@tonic-gate  *    chunks that might induce a copy-on-write.  The offset is specified in
1964*7c478bd9Sstevel@tonic-gate  *    bytes since the chunk size may not be known by the file system.
1965*7c478bd9Sstevel@tonic-gate  */
1966*7c478bd9Sstevel@tonic-gate static int
1967*7c478bd9Sstevel@tonic-gate fssnap_is_candidate_impl(void *snapshot_id, u_offset_t off)
1968*7c478bd9Sstevel@tonic-gate {
1969*7c478bd9Sstevel@tonic-gate 	struct snapshot_id	*sid = snapshot_id;
1970*7c478bd9Sstevel@tonic-gate 	struct cow_info *cowp = sid->sid_cowinfo;
1971*7c478bd9Sstevel@tonic-gate 	struct cow_map	*cmap = &cowp->cow_map;
1972*7c478bd9Sstevel@tonic-gate 	ulong_t chunknumber = off / cmap->cmap_chunksz;
1973*7c478bd9Sstevel@tonic-gate 
1974*7c478bd9Sstevel@tonic-gate 	/* simple bitmap operation for now */
1975*7c478bd9Sstevel@tonic-gate 	ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
1976*7c478bd9Sstevel@tonic-gate 	return (isset(cmap->cmap_candidate, chunknumber));
1977*7c478bd9Sstevel@tonic-gate }
1978*7c478bd9Sstevel@tonic-gate 
1979*7c478bd9Sstevel@tonic-gate /*
1980*7c478bd9Sstevel@tonic-gate  * fssnap_create_done_impl() - complete the snapshot setup process
1981*7c478bd9Sstevel@tonic-gate  *
1982*7c478bd9Sstevel@tonic-gate  *    called when the file system is done populating the candidate bitmap
1983*7c478bd9Sstevel@tonic-gate  *    and it is ready to start using the snapshot.  This routine releases
1984*7c478bd9Sstevel@tonic-gate  *    the snapshot locks, allows taskq tasks to start processing, and
1985*7c478bd9Sstevel@tonic-gate  *    creates the device minor nodes associated with the snapshot.
1986*7c478bd9Sstevel@tonic-gate  */
1987*7c478bd9Sstevel@tonic-gate static int
1988*7c478bd9Sstevel@tonic-gate fssnap_create_done_impl(void *snapshot_id)
1989*7c478bd9Sstevel@tonic-gate {
1990*7c478bd9Sstevel@tonic-gate 	struct snapshot_id	**sidpp, *sidp = snapshot_id;
1991*7c478bd9Sstevel@tonic-gate 	struct cow_info		*cowp;
1992*7c478bd9Sstevel@tonic-gate 	struct cow_map		*cmap;
1993*7c478bd9Sstevel@tonic-gate 	int			snapnumber = -1;
1994*7c478bd9Sstevel@tonic-gate 	char			name[20];
1995*7c478bd9Sstevel@tonic-gate 
1996*7c478bd9Sstevel@tonic-gate 	/* sid rwlock and cmap rwlock should be taken from fssnap_create */
1997*7c478bd9Sstevel@tonic-gate 	ASSERT(sidp);
1998*7c478bd9Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
1999*7c478bd9Sstevel@tonic-gate 	ASSERT(sidp->sid_cowinfo);
2000*7c478bd9Sstevel@tonic-gate 
2001*7c478bd9Sstevel@tonic-gate 	cowp = sidp->sid_cowinfo;
2002*7c478bd9Sstevel@tonic-gate 	cmap = &cowp->cow_map;
2003*7c478bd9Sstevel@tonic-gate 
2004*7c478bd9Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
2005*7c478bd9Sstevel@tonic-gate 
2006*7c478bd9Sstevel@tonic-gate 	sidp->sid_flags &= ~(SID_CREATING | SID_DISABLED);
2007*7c478bd9Sstevel@tonic-gate 	snapnumber = sidp->sid_snapnumber;
2008*7c478bd9Sstevel@tonic-gate 
2009*7c478bd9Sstevel@tonic-gate 	/* allocate state structure and find new snapshot id */
2010*7c478bd9Sstevel@tonic-gate 	if (ddi_soft_state_zalloc(statep, snapnumber) != DDI_SUCCESS) {
2011*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
2012*7c478bd9Sstevel@tonic-gate 		    "snap_ioctl: create: could not allocate "
2013*7c478bd9Sstevel@tonic-gate 		    "state for snapshot %d.", snapnumber);
2014*7c478bd9Sstevel@tonic-gate 		snapnumber = -1;
2015*7c478bd9Sstevel@tonic-gate 		goto out;
2016*7c478bd9Sstevel@tonic-gate 	}
2017*7c478bd9Sstevel@tonic-gate 
2018*7c478bd9Sstevel@tonic-gate 	sidpp = ddi_get_soft_state(statep, snapnumber);
2019*7c478bd9Sstevel@tonic-gate 	*sidpp = sidp;
2020*7c478bd9Sstevel@tonic-gate 
2021*7c478bd9Sstevel@tonic-gate 	/* create minor node based on snapshot number */
2022*7c478bd9Sstevel@tonic-gate 	ASSERT(fssnap_dip != NULL);
2023*7c478bd9Sstevel@tonic-gate 	(void) snprintf(name, sizeof (name), "%d", snapnumber);
2024*7c478bd9Sstevel@tonic-gate 	if (ddi_create_minor_node(fssnap_dip, name, S_IFBLK,
2025*7c478bd9Sstevel@tonic-gate 	    snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
2026*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_ioctl: could not create "
2027*7c478bd9Sstevel@tonic-gate 		    "block minor node for snapshot %d.", snapnumber);
2028*7c478bd9Sstevel@tonic-gate 		snapnumber = -1;
2029*7c478bd9Sstevel@tonic-gate 		goto out;
2030*7c478bd9Sstevel@tonic-gate 	}
2031*7c478bd9Sstevel@tonic-gate 
2032*7c478bd9Sstevel@tonic-gate 	(void) snprintf(name, sizeof (name), "%d,raw", snapnumber);
2033*7c478bd9Sstevel@tonic-gate 	if (ddi_create_minor_node(fssnap_dip, name, S_IFCHR,
2034*7c478bd9Sstevel@tonic-gate 	    snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
2035*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "snap_ioctl: could not create "
2036*7c478bd9Sstevel@tonic-gate 		    "character minor node for snapshot %d.", snapnumber);
2037*7c478bd9Sstevel@tonic-gate 		snapnumber = -1;
2038*7c478bd9Sstevel@tonic-gate 	}
2039*7c478bd9Sstevel@tonic-gate 
2040*7c478bd9Sstevel@tonic-gate out:
2041*7c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
2042*7c478bd9Sstevel@tonic-gate 	rw_exit(&cmap->cmap_rwlock);
2043*7c478bd9Sstevel@tonic-gate 
2044*7c478bd9Sstevel@tonic-gate 	/* let the taskq threads start processing */
2045*7c478bd9Sstevel@tonic-gate 	taskq_resume(cowp->cow_taskq);
2046*7c478bd9Sstevel@tonic-gate 
2047*7c478bd9Sstevel@tonic-gate 	return (snapnumber);
2048*7c478bd9Sstevel@tonic-gate }
2049*7c478bd9Sstevel@tonic-gate 
2050*7c478bd9Sstevel@tonic-gate /*
2051*7c478bd9Sstevel@tonic-gate  * fssnap_delete_impl() - delete a snapshot
2052*7c478bd9Sstevel@tonic-gate  *
2053*7c478bd9Sstevel@tonic-gate  *    used when a snapshot is no longer needed.  This is called by the file
2054*7c478bd9Sstevel@tonic-gate  *    system when it receives an ioctl request to delete a snapshot.  It is
2055*7c478bd9Sstevel@tonic-gate  *    also called internally when error conditions such as disk full, errors
2056*7c478bd9Sstevel@tonic-gate  *    writing to the backing file, or backing file maxsize exceeded occur.
2057*7c478bd9Sstevel@tonic-gate  *    If the snapshot device is busy when the delete request is received,
2058*7c478bd9Sstevel@tonic-gate  *    all state will be deleted except for the soft state and device files
2059*7c478bd9Sstevel@tonic-gate  *    associated with the snapshot; they will be deleted when the snapshot
2060*7c478bd9Sstevel@tonic-gate  *    device is closed.
2061*7c478bd9Sstevel@tonic-gate  *
2062*7c478bd9Sstevel@tonic-gate  *    NOTE this function takes a POINTER TO A POINTER to the snapshot id,
2063*7c478bd9Sstevel@tonic-gate  *    and expects to be able to set the handle held by the file system to
2064*7c478bd9Sstevel@tonic-gate  *    NULL.  This depends on the file system checking that variable for NULL
2065*7c478bd9Sstevel@tonic-gate  *    before calling fssnap_strategy().
2066*7c478bd9Sstevel@tonic-gate  */
2067*7c478bd9Sstevel@tonic-gate static int
2068*7c478bd9Sstevel@tonic-gate fssnap_delete_impl(void *snapshot_id)
2069*7c478bd9Sstevel@tonic-gate {
2070*7c478bd9Sstevel@tonic-gate 	struct snapshot_id	**sidpp = (struct snapshot_id **)snapshot_id;
2071*7c478bd9Sstevel@tonic-gate 	struct snapshot_id	*sidp;
2072*7c478bd9Sstevel@tonic-gate 	struct snapshot_id	**statesidpp;
2073*7c478bd9Sstevel@tonic-gate 	struct cow_info		*cowp;
2074*7c478bd9Sstevel@tonic-gate 	struct cow_map		*cmap;
2075*7c478bd9Sstevel@tonic-gate 	char			name[20];
2076*7c478bd9Sstevel@tonic-gate 	int			snapnumber = -1;
2077*7c478bd9Sstevel@tonic-gate 	vnode_t			**vpp;
2078*7c478bd9Sstevel@tonic-gate 
2079*7c478bd9Sstevel@tonic-gate 	/*
2080*7c478bd9Sstevel@tonic-gate 	 * sidp is guaranteed to be valid if sidpp is valid because
2081*7c478bd9Sstevel@tonic-gate 	 * the snapshot list is append-only.
2082*7c478bd9Sstevel@tonic-gate 	 */
2083*7c478bd9Sstevel@tonic-gate 	if (sidpp == NULL) {
2084*7c478bd9Sstevel@tonic-gate 		return (-1);
2085*7c478bd9Sstevel@tonic-gate 	}
2086*7c478bd9Sstevel@tonic-gate 
2087*7c478bd9Sstevel@tonic-gate 	sidp = *sidpp;
2088*7c478bd9Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
2089*7c478bd9Sstevel@tonic-gate 
2090*7c478bd9Sstevel@tonic-gate 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
2091*7c478bd9Sstevel@tonic-gate 
2092*7c478bd9Sstevel@tonic-gate 	/*
2093*7c478bd9Sstevel@tonic-gate 	 * double check that the snapshot is still valid for THIS file system
2094*7c478bd9Sstevel@tonic-gate 	 */
2095*7c478bd9Sstevel@tonic-gate 	if (*sidpp == NULL) {
2096*7c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
2097*7c478bd9Sstevel@tonic-gate 		return (-1);
2098*7c478bd9Sstevel@tonic-gate 	}
2099*7c478bd9Sstevel@tonic-gate 
2100*7c478bd9Sstevel@tonic-gate 	/*
2101*7c478bd9Sstevel@tonic-gate 	 * Now we know the snapshot is still valid and will not go away
2102*7c478bd9Sstevel@tonic-gate 	 * because we have the write lock.  Once the state is transitioned
2103*7c478bd9Sstevel@tonic-gate 	 * to "disabling", the sid_rwlock can be released.  Any pending I/O
2104*7c478bd9Sstevel@tonic-gate 	 * waiting for the lock as a reader will check for this state and
2105*7c478bd9Sstevel@tonic-gate 	 * abort without touching data that may be getting freed.
2106*7c478bd9Sstevel@tonic-gate 	 */
2107*7c478bd9Sstevel@tonic-gate 	sidp->sid_flags |= SID_DISABLING;
2108*7c478bd9Sstevel@tonic-gate 	if (sidp->sid_flags & SID_DELETE) {
2109*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "Snapshot %d automatically deleted.",
2110*7c478bd9Sstevel@tonic-gate 		    sidp->sid_snapnumber);
2111*7c478bd9Sstevel@tonic-gate 		sidp->sid_flags &= ~(SID_DELETE);
2112*7c478bd9Sstevel@tonic-gate 	}
2113*7c478bd9Sstevel@tonic-gate 
2114*7c478bd9Sstevel@tonic-gate 
2115*7c478bd9Sstevel@tonic-gate 	/*
2116*7c478bd9Sstevel@tonic-gate 	 * This is pointing into file system specific data!  The assumption is
2117*7c478bd9Sstevel@tonic-gate 	 * that fssnap_strategy() gets called from the file system based on
2118*7c478bd9Sstevel@tonic-gate 	 * whether this reference to the snapshot_id is NULL or not.  So
2119*7c478bd9Sstevel@tonic-gate 	 * setting this to NULL should disable snapshots for the file system.
2120*7c478bd9Sstevel@tonic-gate 	 */
2121*7c478bd9Sstevel@tonic-gate 	*sidpp = NULL;
2122*7c478bd9Sstevel@tonic-gate 
2123*7c478bd9Sstevel@tonic-gate 	/* remove cowinfo */
2124*7c478bd9Sstevel@tonic-gate 	cowp = sidp->sid_cowinfo;
2125*7c478bd9Sstevel@tonic-gate 	if (cowp == NULL) {
2126*7c478bd9Sstevel@tonic-gate 		rw_exit(&sidp->sid_rwlock);
2127*7c478bd9Sstevel@tonic-gate 		return (-1);
2128*7c478bd9Sstevel@tonic-gate 	}
2129*7c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
2130*7c478bd9Sstevel@tonic-gate 
2131*7c478bd9Sstevel@tonic-gate 	/* destroy task queues first so they don't reference freed data. */
2132*7c478bd9Sstevel@tonic-gate 	if (cowp->cow_taskq) {
2133*7c478bd9Sstevel@tonic-gate 		taskq_destroy(cowp->cow_taskq);
2134*7c478bd9Sstevel@tonic-gate 		cowp->cow_taskq = NULL;
2135*7c478bd9Sstevel@tonic-gate 	}
2136*7c478bd9Sstevel@tonic-gate 
2137*7c478bd9Sstevel@tonic-gate 	if (cowp->cow_backfile_array != NULL) {
2138*7c478bd9Sstevel@tonic-gate 		for (vpp = cowp->cow_backfile_array; *vpp; vpp++)
2139*7c478bd9Sstevel@tonic-gate 			VN_RELE(*vpp);
2140*7c478bd9Sstevel@tonic-gate 		kmem_free(cowp->cow_backfile_array,
2141*7c478bd9Sstevel@tonic-gate 		    (cowp->cow_backcount + 1) * sizeof (vnode_t *));
2142*7c478bd9Sstevel@tonic-gate 		cowp->cow_backfile_array = NULL;
2143*7c478bd9Sstevel@tonic-gate 	}
2144*7c478bd9Sstevel@tonic-gate 
2145*7c478bd9Sstevel@tonic-gate 	sidp->sid_cowinfo = NULL;
2146*7c478bd9Sstevel@tonic-gate 
2147*7c478bd9Sstevel@tonic-gate 	/* remove cmap */
2148*7c478bd9Sstevel@tonic-gate 	cmap = &cowp->cow_map;
2149*7c478bd9Sstevel@tonic-gate 	ASSERT(cmap);
2150*7c478bd9Sstevel@tonic-gate 
2151*7c478bd9Sstevel@tonic-gate 	if (cmap->cmap_candidate)
2152*7c478bd9Sstevel@tonic-gate 		kmem_free(cmap->cmap_candidate, cmap->cmap_bmsize);
2153*7c478bd9Sstevel@tonic-gate 
2154*7c478bd9Sstevel@tonic-gate 	if (cmap->cmap_hastrans)
2155*7c478bd9Sstevel@tonic-gate 		kmem_free(cmap->cmap_hastrans, cmap->cmap_bmsize);
2156*7c478bd9Sstevel@tonic-gate 
2157*7c478bd9Sstevel@tonic-gate 	if (cmap->cmap_table)
2158*7c478bd9Sstevel@tonic-gate 		transtbl_free(&cowp->cow_map);
2159*7c478bd9Sstevel@tonic-gate 
2160*7c478bd9Sstevel@tonic-gate 	rw_destroy(&cmap->cmap_rwlock);
2161*7c478bd9Sstevel@tonic-gate 
2162*7c478bd9Sstevel@tonic-gate 	while (cmap->cmap_waiters) {
2163*7c478bd9Sstevel@tonic-gate 		sema_p(&cmap->cmap_throttle_sem);
2164*7c478bd9Sstevel@tonic-gate 		sema_v(&cmap->cmap_throttle_sem);
2165*7c478bd9Sstevel@tonic-gate 	}
2166*7c478bd9Sstevel@tonic-gate 	sema_destroy(&cmap->cmap_throttle_sem);
2167*7c478bd9Sstevel@tonic-gate 
2168*7c478bd9Sstevel@tonic-gate 	/* remove kstats */
2169*7c478bd9Sstevel@tonic-gate 	fssnap_delete_kstats(cowp);
2170*7c478bd9Sstevel@tonic-gate 
2171*7c478bd9Sstevel@tonic-gate 	kmem_free(cowp, sizeof (struct cow_info));
2172*7c478bd9Sstevel@tonic-gate 
2173*7c478bd9Sstevel@tonic-gate 	statesidpp = ddi_get_soft_state(statep, sidp->sid_snapnumber);
2174*7c478bd9Sstevel@tonic-gate 	if (statesidpp == NULL || *statesidpp == NULL) {
2175*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN,
2176*7c478bd9Sstevel@tonic-gate 		    "fssnap_delete_impl: could not find state for snapshot %d.",
2177*7c478bd9Sstevel@tonic-gate 		    sidp->sid_snapnumber);
2178*7c478bd9Sstevel@tonic-gate 	}
2179*7c478bd9Sstevel@tonic-gate 	ASSERT(*statesidpp == sidp);
2180*7c478bd9Sstevel@tonic-gate 
2181*7c478bd9Sstevel@tonic-gate 	/*
2182*7c478bd9Sstevel@tonic-gate 	 * Leave the node in the list marked DISABLED so it can be reused
2183*7c478bd9Sstevel@tonic-gate 	 * and avoid many race conditions.  Return the snapshot number
2184*7c478bd9Sstevel@tonic-gate 	 * that was deleted.
2185*7c478bd9Sstevel@tonic-gate 	 */
2186*7c478bd9Sstevel@tonic-gate 	mutex_enter(&snapshot_mutex);
2187*7c478bd9Sstevel@tonic-gate 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
2188*7c478bd9Sstevel@tonic-gate 	sidp->sid_flags &= ~(SID_DISABLING);
2189*7c478bd9Sstevel@tonic-gate 	sidp->sid_flags |= SID_DISABLED;
2190*7c478bd9Sstevel@tonic-gate 	VN_RELE(sidp->sid_fvp);
2191*7c478bd9Sstevel@tonic-gate 	sidp->sid_fvp = NULL;
2192*7c478bd9Sstevel@tonic-gate 	snapnumber = sidp->sid_snapnumber;
2193*7c478bd9Sstevel@tonic-gate 
2194*7c478bd9Sstevel@tonic-gate 	/*
2195*7c478bd9Sstevel@tonic-gate 	 * If the snapshot is not busy, free the device info now.  Otherwise
2196*7c478bd9Sstevel@tonic-gate 	 * the device nodes are freed in snap_close() when the device is
2197*7c478bd9Sstevel@tonic-gate 	 * closed.  The sid will not be reused until the device is not busy.
2198*7c478bd9Sstevel@tonic-gate 	 */
2199*7c478bd9Sstevel@tonic-gate 	if (SID_AVAILABLE(sidp)) {
2200*7c478bd9Sstevel@tonic-gate 		/* remove the device nodes */
2201*7c478bd9Sstevel@tonic-gate 		ASSERT(fssnap_dip != NULL);
2202*7c478bd9Sstevel@tonic-gate 		(void) snprintf(name, sizeof (name), "%d",
2203*7c478bd9Sstevel@tonic-gate 		    sidp->sid_snapnumber);
2204*7c478bd9Sstevel@tonic-gate 		ddi_remove_minor_node(fssnap_dip, name);
2205*7c478bd9Sstevel@tonic-gate 		(void) snprintf(name, sizeof (name), "%d,raw",
2206*7c478bd9Sstevel@tonic-gate 		    sidp->sid_snapnumber);
2207*7c478bd9Sstevel@tonic-gate 		ddi_remove_minor_node(fssnap_dip, name);
2208*7c478bd9Sstevel@tonic-gate 
2209*7c478bd9Sstevel@tonic-gate 		/* delete the state structure */
2210*7c478bd9Sstevel@tonic-gate 		ddi_soft_state_free(statep, sidp->sid_snapnumber);
2211*7c478bd9Sstevel@tonic-gate 		num_snapshots--;
2212*7c478bd9Sstevel@tonic-gate 	}
2213*7c478bd9Sstevel@tonic-gate 
2214*7c478bd9Sstevel@tonic-gate 	mutex_exit(&snapshot_mutex);
2215*7c478bd9Sstevel@tonic-gate 	rw_exit(&sidp->sid_rwlock);
2216*7c478bd9Sstevel@tonic-gate 
2217*7c478bd9Sstevel@tonic-gate 	return (snapnumber);
2218*7c478bd9Sstevel@tonic-gate }
2219*7c478bd9Sstevel@tonic-gate 
2220*7c478bd9Sstevel@tonic-gate /*
2221*7c478bd9Sstevel@tonic-gate  * fssnap_create_kstats() - allocate and initialize snapshot kstats
2222*7c478bd9Sstevel@tonic-gate  *
2223*7c478bd9Sstevel@tonic-gate  */
2224*7c478bd9Sstevel@tonic-gate static void
2225*7c478bd9Sstevel@tonic-gate fssnap_create_kstats(snapshot_id_t *sidp, int snapnum,
2226*7c478bd9Sstevel@tonic-gate     const char *mountpoint, const char *backfilename)
2227*7c478bd9Sstevel@tonic-gate {
2228*7c478bd9Sstevel@tonic-gate 	kstat_t *num, *mntpoint, *bfname;
2229*7c478bd9Sstevel@tonic-gate 	kstat_named_t *hw;
2230*7c478bd9Sstevel@tonic-gate 	struct cow_info *cowp = sidp->sid_cowinfo;
2231*7c478bd9Sstevel@tonic-gate 	struct cow_kstat_num *stats;
2232*7c478bd9Sstevel@tonic-gate 
2233*7c478bd9Sstevel@tonic-gate 	/* update the high water mark */
2234*7c478bd9Sstevel@tonic-gate 	if (fssnap_highwater_kstat == NULL) {
2235*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_create_kstats: failed to lookup "
2236*7c478bd9Sstevel@tonic-gate 		    "high water mark kstat.");
2237*7c478bd9Sstevel@tonic-gate 		return;
2238*7c478bd9Sstevel@tonic-gate 	}
2239*7c478bd9Sstevel@tonic-gate 
2240*7c478bd9Sstevel@tonic-gate 	hw = (kstat_named_t *)fssnap_highwater_kstat->ks_data;
2241*7c478bd9Sstevel@tonic-gate 	if (hw->value.ui32 < snapnum)
2242*7c478bd9Sstevel@tonic-gate 		hw->value.ui32 = snapnum;
2243*7c478bd9Sstevel@tonic-gate 
2244*7c478bd9Sstevel@tonic-gate 	/* initialize the mount point kstat */
2245*7c478bd9Sstevel@tonic-gate 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_MNTPT);
2246*7c478bd9Sstevel@tonic-gate 
2247*7c478bd9Sstevel@tonic-gate 	if (mountpoint != NULL) {
2248*7c478bd9Sstevel@tonic-gate 		mntpoint = kstat_create(snapname, snapnum, FSSNAP_KSTAT_MNTPT,
2249*7c478bd9Sstevel@tonic-gate 		    "misc", KSTAT_TYPE_RAW, strlen(mountpoint) + 1, 0);
2250*7c478bd9Sstevel@tonic-gate 		if (mntpoint == NULL) {
2251*7c478bd9Sstevel@tonic-gate 			cowp->cow_kstat_mntpt = NULL;
2252*7c478bd9Sstevel@tonic-gate 			cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
2253*7c478bd9Sstevel@tonic-gate 			    "create mount point kstat");
2254*7c478bd9Sstevel@tonic-gate 		} else {
2255*7c478bd9Sstevel@tonic-gate 			(void) strncpy(mntpoint->ks_data, mountpoint,
2256*7c478bd9Sstevel@tonic-gate 			    strlen(mountpoint));
2257*7c478bd9Sstevel@tonic-gate 			cowp->cow_kstat_mntpt = mntpoint;
2258*7c478bd9Sstevel@tonic-gate 			kstat_install(mntpoint);
2259*7c478bd9Sstevel@tonic-gate 		}
2260*7c478bd9Sstevel@tonic-gate 	} else {
2261*7c478bd9Sstevel@tonic-gate 		cowp->cow_kstat_mntpt = NULL;
2262*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_create_kstats: mount point not "
2263*7c478bd9Sstevel@tonic-gate 		    "specified.");
2264*7c478bd9Sstevel@tonic-gate 	}
2265*7c478bd9Sstevel@tonic-gate 
2266*7c478bd9Sstevel@tonic-gate 	/* initialize the backing file kstat */
2267*7c478bd9Sstevel@tonic-gate 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_BFNAME);
2268*7c478bd9Sstevel@tonic-gate 
2269*7c478bd9Sstevel@tonic-gate 	if (backfilename == NULL) {
2270*7c478bd9Sstevel@tonic-gate 		cowp->cow_kstat_bfname = NULL;
2271*7c478bd9Sstevel@tonic-gate 	} else {
2272*7c478bd9Sstevel@tonic-gate 		bfname = kstat_create(snapname, snapnum, FSSNAP_KSTAT_BFNAME,
2273*7c478bd9Sstevel@tonic-gate 		    "misc", KSTAT_TYPE_RAW, strlen(backfilename) + 1, 0);
2274*7c478bd9Sstevel@tonic-gate 		if (bfname != NULL) {
2275*7c478bd9Sstevel@tonic-gate 			(void) strncpy(bfname->ks_data, backfilename,
2276*7c478bd9Sstevel@tonic-gate 			    strlen(backfilename));
2277*7c478bd9Sstevel@tonic-gate 			cowp->cow_kstat_bfname = bfname;
2278*7c478bd9Sstevel@tonic-gate 			kstat_install(bfname);
2279*7c478bd9Sstevel@tonic-gate 		} else {
2280*7c478bd9Sstevel@tonic-gate 			cowp->cow_kstat_bfname = NULL;
2281*7c478bd9Sstevel@tonic-gate 			cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
2282*7c478bd9Sstevel@tonic-gate 			    "create backing file name kstat");
2283*7c478bd9Sstevel@tonic-gate 		}
2284*7c478bd9Sstevel@tonic-gate 	}
2285*7c478bd9Sstevel@tonic-gate 
2286*7c478bd9Sstevel@tonic-gate 	/* initialize numeric kstats */
2287*7c478bd9Sstevel@tonic-gate 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_NUM);
2288*7c478bd9Sstevel@tonic-gate 
2289*7c478bd9Sstevel@tonic-gate 	num = kstat_create(snapname, snapnum, FSSNAP_KSTAT_NUM,
2290*7c478bd9Sstevel@tonic-gate 	    "misc", KSTAT_TYPE_NAMED,
2291*7c478bd9Sstevel@tonic-gate 	    sizeof (struct cow_kstat_num) / sizeof (kstat_named_t),
2292*7c478bd9Sstevel@tonic-gate 	    0);
2293*7c478bd9Sstevel@tonic-gate 	if (num == NULL) {
2294*7c478bd9Sstevel@tonic-gate 		cmn_err(CE_WARN, "fssnap_create_kstats: failed to create "
2295*7c478bd9Sstevel@tonic-gate 		    "numeric kstats");
2296*7c478bd9Sstevel@tonic-gate 		cowp->cow_kstat_num = NULL;
2297*7c478bd9Sstevel@tonic-gate 		return;
2298*7c478bd9Sstevel@tonic-gate 	}
2299*7c478bd9Sstevel@tonic-gate 
2300*7c478bd9Sstevel@tonic-gate 	cowp->cow_kstat_num = num;
2301*7c478bd9Sstevel@tonic-gate 	stats = num->ks_data;
2302*7c478bd9Sstevel@tonic-gate 	num->ks_update = fssnap_update_kstat_num;
2303*7c478bd9Sstevel@tonic-gate 	num->ks_private = sidp;
2304*7c478bd9Sstevel@tonic-gate 
2305*7c478bd9Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_state, FSSNAP_KSTAT_NUM_STATE,
2306*7c478bd9Sstevel@tonic-gate 	    KSTAT_DATA_INT32);
2307*7c478bd9Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_bfsize, FSSNAP_KSTAT_NUM_BFSIZE,
2308*7c478bd9Sstevel@tonic-gate 	    KSTAT_DATA_UINT64);
2309*7c478bd9Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_maxsize, FSSNAP_KSTAT_NUM_MAXSIZE,
2310*7c478bd9Sstevel@tonic-gate 	    KSTAT_DATA_UINT64);
2311*7c478bd9Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_createtime, FSSNAP_KSTAT_NUM_CREATETIME,
2312*7c478bd9Sstevel@tonic-gate 	    KSTAT_DATA_LONG);
2313*7c478bd9Sstevel@tonic-gate 	kstat_named_init(&stats->ckn_chunksize, FSSNAP_KSTAT_NUM_CHUNKSIZE,
2314*7c478bd9Sstevel@tonic-gate 	    KSTAT_DATA_UINT32);
2315*7c478bd9Sstevel@tonic-gate 
2316*7c478bd9Sstevel@tonic-gate 	/* initialize the static kstats */
2317*7c478bd9Sstevel@tonic-gate 	stats->ckn_chunksize.value.ui32 = cowp->cow_map.cmap_chunksz;
2318*7c478bd9Sstevel@tonic-gate 	stats->ckn_maxsize.value.ui64 = cowp->cow_map.cmap_maxsize;
2319*7c478bd9Sstevel@tonic-gate 	stats->ckn_createtime.value.l = gethrestime_sec();
2320*7c478bd9Sstevel@tonic-gate 
2321*7c478bd9Sstevel@tonic-gate 	kstat_install(num);
2322*7c478bd9Sstevel@tonic-gate }
2323*7c478bd9Sstevel@tonic-gate 
2324*7c478bd9Sstevel@tonic-gate /*
2325*7c478bd9Sstevel@tonic-gate  * fssnap_update_kstat_num() - update a numerical snapshot kstat value
2326*7c478bd9Sstevel@tonic-gate  *
2327*7c478bd9Sstevel@tonic-gate  */
2328*7c478bd9Sstevel@tonic-gate int
2329*7c478bd9Sstevel@tonic-gate fssnap_update_kstat_num(kstat_t *ksp, int rw)
2330*7c478bd9Sstevel@tonic-gate {
2331*7c478bd9Sstevel@tonic-gate 	snapshot_id_t *sidp = (snapshot_id_t *)ksp->ks_private;
2332*7c478bd9Sstevel@tonic-gate 	struct cow_info *cowp = sidp->sid_cowinfo;
2333*7c478bd9Sstevel@tonic-gate 	struct cow_kstat_num *stats = ksp->ks_data;
2334*7c478bd9Sstevel@tonic-gate 
2335*7c478bd9Sstevel@tonic-gate 	if (rw == KSTAT_WRITE)
2336*7c478bd9Sstevel@tonic-gate 		return (EACCES);
2337*7c478bd9Sstevel@tonic-gate 
2338*7c478bd9Sstevel@tonic-gate 	/* state */
2339*7c478bd9Sstevel@tonic-gate 	if (sidp->sid_flags & SID_CREATING)
2340*7c478bd9Sstevel@tonic-gate 		stats->ckn_state.value.i32 = COWSTATE_CREATING;
2341*7c478bd9Sstevel@tonic-gate 	else if (SID_INACTIVE(sidp))
2342*7c478bd9Sstevel@tonic-gate 		stats->ckn_state.value.i32 = COWSTATE_DISABLED;
2343*7c478bd9Sstevel@tonic-gate 	else if (SID_BUSY(sidp))
2344*7c478bd9Sstevel@tonic-gate 		stats->ckn_state.value.i32 = COWSTATE_ACTIVE;
2345*7c478bd9Sstevel@tonic-gate 	else
2346*7c478bd9Sstevel@tonic-gate 		stats->ckn_state.value.i32 = COWSTATE_IDLE;
2347*7c478bd9Sstevel@tonic-gate 
2348*7c478bd9Sstevel@tonic-gate 	/* bfsize */
2349*7c478bd9Sstevel@tonic-gate 	stats->ckn_bfsize.value.ui64 = cowp->cow_map.cmap_nchunks *
2350*7c478bd9Sstevel@tonic-gate 	    cowp->cow_map.cmap_chunksz;
2351*7c478bd9Sstevel@tonic-gate 
2352*7c478bd9Sstevel@tonic-gate 	return (0);
2353*7c478bd9Sstevel@tonic-gate }
2354*7c478bd9Sstevel@tonic-gate 
2355*7c478bd9Sstevel@tonic-gate /*
2356*7c478bd9Sstevel@tonic-gate  * fssnap_delete_kstats() - deallocate snapshot kstats
2357*7c478bd9Sstevel@tonic-gate  *
2358*7c478bd9Sstevel@tonic-gate  */
2359*7c478bd9Sstevel@tonic-gate void
2360*7c478bd9Sstevel@tonic-gate fssnap_delete_kstats(struct cow_info *cowp)
2361*7c478bd9Sstevel@tonic-gate {
2362*7c478bd9Sstevel@tonic-gate 	if (cowp->cow_kstat_num != NULL) {
2363*7c478bd9Sstevel@tonic-gate 		kstat_delete(cowp->cow_kstat_num);
2364*7c478bd9Sstevel@tonic-gate 		cowp->cow_kstat_num = NULL;
2365*7c478bd9Sstevel@tonic-gate 	}
2366*7c478bd9Sstevel@tonic-gate 	if (cowp->cow_kstat_mntpt != NULL) {
2367*7c478bd9Sstevel@tonic-gate 		kstat_delete(cowp->cow_kstat_mntpt);
2368*7c478bd9Sstevel@tonic-gate 		cowp->cow_kstat_mntpt = NULL;
2369*7c478bd9Sstevel@tonic-gate 	}
2370*7c478bd9Sstevel@tonic-gate 	if (cowp->cow_kstat_bfname != NULL) {
2371*7c478bd9Sstevel@tonic-gate 		kstat_delete(cowp->cow_kstat_bfname);
2372*7c478bd9Sstevel@tonic-gate 		cowp->cow_kstat_bfname = NULL;
2373*7c478bd9Sstevel@tonic-gate 	}
2374*7c478bd9Sstevel@tonic-gate }
2375