xref: /titanic_51/usr/src/uts/common/io/fssnap.c (revision 672986541be54a7a471bb088e60780c37e371d7e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/debug.h>
29 #include <sys/types.h>
30 #include <sys/file.h>
31 #include <sys/errno.h>
32 #include <sys/uio.h>
33 #include <sys/open.h>
34 #include <sys/cred.h>
35 #include <sys/kmem.h>
36 #include <sys/conf.h>
37 #include <sys/cmn_err.h>
38 #include <sys/modctl.h>
39 #include <sys/disp.h>
40 #include <sys/atomic.h>
41 #include <sys/filio.h>
42 #include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */
43 #include <sys/kstat.h>
44 
45 #include <sys/ddi.h>
46 #include <sys/devops.h>
47 #include <sys/sunddi.h>
48 #include <sys/priv_names.h>
49 
50 #include <sys/fssnap.h>
51 #include <sys/fssnap_if.h>
52 
53 /*
54  * This module implements the file system snapshot code, which provides a
55  * point-in-time image of a file system for the purposes of online backup.
56  * There are essentially two parts to this project: the driver half and the
57  * file system half.  The driver half is a pseudo device driver called
58  * "fssnap" that represents the snapshot.  Each snapshot is assigned a
59  * number that corresponds to the minor number of the device, and a control
60  * device with a high minor number is used to initiate snapshot creation and
61  * deletion.  For all practical purposes the driver half acts like a
62  * read-only disk device whose contents are exactly the same as the master
63  * file system at the time the snapshot was created.
64  *
65  * The file system half provides interfaces necessary for performing the
66  * file system dependent operations required to create and delete snapshots
67  * and a special driver strategy routine that must always be used by the file
68  * system for snapshots to work correctly.
69  *
70  * When a snapshot is to be created, the user utility will send an ioctl to
71  * the control device of the driver half specifying the file system to be
72  * snapshotted, the file descriptor of a backing-store file which is used to
73  * hold old data before it is overwritten, and other snapshot parameters.
74  * This ioctl is passed on to the file system specified in the original
75  * ioctl request.  The file system is expected to be able to flush
76  * everything out to make the file system consistent and lock it to ensure
77  * no changes occur while the snapshot is being created.  It then calls
78  * fssnap_create() to create state for a new snapshot, from which an opaque
79  * handle is returned with the snapshot locked.  Next, the file system must
80  * populate the "candidate bitmap", which tells the snapshot code which
81  * "chunks" should be considered for copy-on-write (a chunk is the unit of
82  * granularity used for copy-on-write, which is independent of the device
83  * and file system block sizes).  This is typically done by scanning the
84  * file system allocation bitmaps to determine which chunks contain
85  * allocated blocks in the file system at the time the snapshot was created.
86  * If a chunk has no allocated blocks, it does not need to be copied before
87  * being written to.  Once the candidate bitmap is populated with
88  * fssnap_set_candidate(), the file system calls fssnap_create_done() to
89  * complete the snapshot creation and unlock the snapshot.  The file system
90  * may now be unlocked and modifications to it resumed.
91  *
92  * Once a snapshot is created, the file system must perform all writes
93  * through a special strategy routine, fssnap_strategy().  This strategy
94  * routine determines whether the chunks contained by the write must be
95  * copied before being overwritten by consulting the candidate bitmap
96  * described above, and the "hastrans bitmap" which tells it whether the chunk
97  * has been copied already or not.  If the chunk is a candidate but has not
98  * been copied, it reads the old data in and adds it to a queue.  The
99  * old data can then be overwritten with the new data.  An asynchronous
100  * task queue is dispatched for each old chunk read in which writes the old
101  * data to the backing file specified at snapshot creation time.  The
102  * backing file is a sparse file the same size as the file system that
103  * contains the old data at the offset that data originally had in the
104  * file system.  If the queue containing in-memory chunks gets too large,
105  * writes to the file system may be throttled by a semaphore until the
106  * task queues have a chance to push some of the chunks to the backing file.
107  *
108  * With the candidate bitmap, the hastrans bitmap, the data on the master
109  * file system, and the old data in memory and in the backing file, the
110  * snapshot pseudo-driver can piece together the original file system
111  * information to satisfy read requests.  If the requested chunk is not a
112  * candidate, it returns a zeroed buffer.  If the chunk is a candidate but
113  * has not been copied it reads it from the master file system.  If it is a
114  * candidate and has been copied, it either copies the data from the
115  * in-memory queue or it reads it in from the backing file.  The result is
116  * a replication of the original file system that can be backed up, mounted,
117  * or manipulated by other file system utilities that work on a read-only
118  * device.
119  *
120  * This module is divided into three roughly logical sections:
121  *
122  *     - The snapshot driver, which is a character/block driver
123  *       representing the snapshot itself.  These routines are
124  *       prefixed with "snap_".
125  *
126  *     - The library routines that are defined in fssnap_if.h that
127  *       are used by file systems that use this snapshot implementation.
128  *       These functions are prefixed with "fssnap_" and are called through
129  *       a function vector from the file system.
130  *
131  *     - The helper routines used by the snapshot driver and the fssnap
132  *       library routines for managing the translation table and other
133  *       useful functions.  These routines are all static and are
134  *       prefixed with either "fssnap_" or "transtbl_" if they
135  *       are specifically used for translation table activities.
136  */
137 
/* devinfo node of the driver; set in snap_attach(), cleared in snap_detach() */
static dev_info_t		*fssnap_dip = NULL;
/* head of the snapshot id list (linked through sid_next) */
static struct snapshot_id	*snapshot = NULL;
/* state for the control device; it is not linked into the snapshot list */
static struct snapshot_id	snap_ctl;
/* snap_detach() refuses to detach while this is non-zero */
static int			num_snapshots = 0;
/* held by snap_detach() and snap_close() around list/count updates */
static kmutex_t			snapshot_mutex;
/* module name handed to kstat_create() in _init() */
static char			snapname[] = SNAP_NAME;

/*
 * "tunable" parameters
 *
 * Defaults come from the FSSNAP_* constants.  NOTE(review): the consumers
 * of these values are not in this part of the file; confirm their exact
 * meaning before changing them.
 */
static int		fssnap_taskq_nthreads = FSSNAP_TASKQ_THREADS;
static uint_t		fssnap_max_mem_chunks = FSSNAP_MAX_MEM_CHUNKS;
static int		fssnap_taskq_maxtasks = FSSNAP_TASKQ_MAXTASKS;
149 
150 /* static function prototypes */
151 
152 /* snapshot driver */
153 static int snap_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
154 static int snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
155 static int snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
156 static int snap_open(dev_t *devp, int flag, int otyp, cred_t *cred);
157 static int snap_close(dev_t dev, int flag, int otyp, cred_t *cred);
158 static int snap_strategy(struct buf *bp);
159 static int snap_read(dev_t dev, struct uio *uiop, cred_t *credp);
160 static int snap_print(dev_t dev, char *str);
161 static int snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
162     cred_t *credp, int *rvalp);
163 static int snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
164     int flags, char *name, caddr_t valuep, int *lengthp);
165 static int snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk,
166     int offset, int len, char *buffer);
167 
168 
169 /* fssnap interface implementations (see fssnap_if.h) */
170 static void fssnap_strategy_impl(void *, struct buf *);
171 static void *fssnap_create_impl(chunknumber_t, uint_t, u_offset_t,
172     struct vnode *, int, struct vnode **, char *, u_offset_t);
173 static void fssnap_set_candidate_impl(void *, chunknumber_t);
174 static int fssnap_is_candidate_impl(void *, u_offset_t);
175 static int fssnap_create_done_impl(void *);
176 static int fssnap_delete_impl(void *);
177 
178 /* fssnap interface support routines */
179 static int  fssnap_translate(struct snapshot_id **, struct buf *);
180 static void fssnap_write_taskq(void *);
181 static void fssnap_create_kstats(snapshot_id_t *, int, const char *,
182     const char *);
183 static int  fssnap_update_kstat_num(kstat_t *, int);
184 static void fssnap_delete_kstats(struct cow_info *);
185 
186 /* translation table prototypes */
187 static cow_map_node_t *transtbl_add(cow_map_t *, chunknumber_t, caddr_t);
188 static cow_map_node_t *transtbl_get(cow_map_t *, chunknumber_t);
189 static void transtbl_delete(cow_map_t *, cow_map_node_t *);
190 static void transtbl_free(cow_map_t *);
191 
/* highwater kstat created in _init(); NULL if kstat_create() failed */
static kstat_t *fssnap_highwater_kstat;
193 
194 /* ************************************************************************ */
195 
196 /* Device and Module Structures */
197 
/*
 * Character/block entry points for the snapshot device.  Entries are
 * positional; the trailing comments name the cb_ops(9S) slot being filled.
 * The device is read-only, so there is no write entry point.
 */
static struct cb_ops snap_cb_ops = {
	snap_open,	/* cb_open */
	snap_close,	/* cb_close */
	snap_strategy,	/* cb_strategy */
	snap_print,	/* cb_print */
	nodev,		/* no snap_dump */
	snap_read,	/* cb_read */
	nodev,		/* no snap_write */
	snap_ioctl,	/* cb_ioctl */
	nodev,		/* no snap_devmap */
	nodev,		/* no snap_mmap   */
	nodev,		/* no snap_segmap */
	nochpoll,	/* cb_chpoll */
	snap_prop_op,	/* cb_prop_op */
	NULL,		/* streamtab */
	D_64BIT | D_NEW | D_MP, /* driver compatibility */
	CB_REV,		/* cb_rev */
	nodev,		/* async I/O read entry point */
	nodev		/* async I/O write entry point */
};
218 
/*
 * Device operations for the snapshot pseudo driver.  Entries are
 * positional; the trailing comments name the dev_ops(9S) slot being filled.
 */
static struct dev_ops snap_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* ref count */
	snap_getinfo,		/* devo_getinfo */
	nulldev,		/* snap_identify obsolete */
	nulldev,		/* no snap_probe */
	snap_attach,		/* devo_attach */
	snap_detach,		/* devo_detach */
	nodev,			/* no snap_reset */
	&snap_cb_ops,		/* devo_cb_ops */
	(struct bus_ops *)NULL,	/* devo_bus_ops: not a nexus driver */
	nulldev			/* no snap_power() */
};
232 
extern struct mod_ops mod_driverops;

/* driver linkage structure referenced by the module linkage below */
static struct modldrv md = {
	&mod_driverops, /* Type of module. This is a driver */
	"snapshot driver %I%", 	/* Name of the module */
	&snap_ops,	/* driver ops */
};

/* module linkage passed to mod_install(), mod_info() and mod_remove() */
static struct modlinkage ml = {
	MODREV_1,
	&md,
	NULL
};

/*
 * Soft state anchor.  Each soft state entry is a pointer to a snapshot_id
 * (see the size passed to ddi_soft_state_init() in _init()) and is looked
 * up by minor number in the driver entry points.
 */
static void *statep;
248 
249 int
250 _init(void)
251 {
252 	int	error;
253 	kstat_t	*ksp;
254 	kstat_named_t	*ksdata;
255 
256 	error = ddi_soft_state_init(&statep, sizeof (struct snapshot_id *), 1);
257 	if (error) {
258 		cmn_err(CE_WARN, "_init: failed to init ddi_soft_state.");
259 		return (error);
260 	}
261 
262 	error = mod_install(&ml);
263 
264 	if (error) {
265 		cmn_err(CE_WARN, "_init: failed to mod_install.");
266 		ddi_soft_state_fini(&statep);
267 		return (error);
268 	}
269 
270 	/*
271 	 * Fill in the snapshot operations vector for file systems
272 	 * (defined in fssnap_if.c)
273 	 */
274 
275 	snapops.fssnap_create = fssnap_create_impl;
276 	snapops.fssnap_set_candidate = fssnap_set_candidate_impl;
277 	snapops.fssnap_is_candidate = fssnap_is_candidate_impl;
278 	snapops.fssnap_create_done = fssnap_create_done_impl;
279 	snapops.fssnap_delete = fssnap_delete_impl;
280 	snapops.fssnap_strategy = fssnap_strategy_impl;
281 
282 	mutex_init(&snapshot_mutex, NULL, MUTEX_DEFAULT, NULL);
283 
284 	/*
285 	 * Initialize the fssnap highwater kstat
286 	 */
287 	ksp = kstat_create(snapname, 0, FSSNAP_KSTAT_HIGHWATER, "misc",
288 	    KSTAT_TYPE_NAMED, 1, 0);
289 	if (ksp != NULL) {
290 		ksdata = (kstat_named_t *)ksp->ks_data;
291 		kstat_named_init(ksdata, FSSNAP_KSTAT_HIGHWATER,
292 		    KSTAT_DATA_UINT32);
293 		ksdata->value.ui32 = 0;
294 		kstat_install(ksp);
295 	} else {
296 		cmn_err(CE_WARN, "_init: failed to create highwater kstat.");
297 	}
298 	fssnap_highwater_kstat = ksp;
299 
300 	return (0);
301 }
302 
303 int
304 _info(struct modinfo *modinfop)
305 {
306 	return (mod_info(&ml, modinfop));
307 }
308 
309 int
310 _fini(void)
311 {
312 	int	error;
313 
314 	error = mod_remove(&ml);
315 	if (error)
316 		return (error);
317 	ddi_soft_state_fini(&statep);
318 
319 	/*
320 	 * delete the fssnap highwater kstat
321 	 */
322 	kstat_delete(fssnap_highwater_kstat);
323 
324 	mutex_destroy(&snapshot_mutex);
325 
326 	/* Clear out the file system operations vector */
327 	snapops.fssnap_create = NULL;
328 	snapops.fssnap_set_candidate = NULL;
329 	snapops.fssnap_create_done = NULL;
330 	snapops.fssnap_delete = NULL;
331 	snapops.fssnap_strategy = NULL;
332 
333 	return (0);
334 }
335 
336 /* ************************************************************************ */
337 
338 /*
339  * Snapshot Driver Routines
340  *
341  * This section implements the snapshot character and block drivers.  The
342  * device will appear to be a consistent read-only file system to
343  * applications that wish to back it up or mount it.  The snapshot driver
344  * communicates with the file system through the translation table, which
345  * tells the snapshot driver where to find the data necessary to piece
346  * together the frozen file system.  The data may either be on the master
347  * device (no translation exists), in memory (a translation exists but has
348  * not been flushed to the backing store), or in the backing store file.
 * The read request may require the snapshot driver to retrieve data from
350  * several different places and piece it together to look like a single
351  * contiguous read.
352  *
353  * The device minor number corresponds to the snapshot number in the list of
354  * snapshot identifiers.  The soft state for each minor number is simply a
355  * pointer to the snapshot id, which holds all of the snapshot state.  One
356  * minor number is designated as the control device.  All snapshot create
357  * and delete requests go through the control device to ensure this module
358  * is properly loaded and attached before the file system starts calling
359  * routines defined here.
360  */
361 
362 
363 /*
364  * snap_getinfo() - snapshot driver getinfo(9E) routine
365  *
366  */
367 /*ARGSUSED*/
368 static int
369 snap_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
370 {
371 	switch (infocmd) {
372 	case DDI_INFO_DEVT2DEVINFO:
373 		*result = fssnap_dip;
374 		return (DDI_SUCCESS);
375 	case DDI_INFO_DEVT2INSTANCE:
376 		*result = 0;	/* we only have one instance */
377 		return (DDI_SUCCESS);
378 	}
379 	return (DDI_FAILURE);
380 }
381 
382 /*
383  * snap_attach() - snapshot driver attach(9E) routine
384  *
385  *    sets up snapshot control device and control state.  The control state
386  *    is a pointer to an "anonymous" snapshot_id for tracking opens and closes
387  */
388 static int
389 snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
390 {
391 	int			error;
392 
393 	switch (cmd) {
394 	case DDI_ATTACH:
395 		/* create the control device */
396 		error = ddi_create_priv_minor_node(dip, SNAP_CTL_NODE, S_IFCHR,
397 		    SNAP_CTL_MINOR, DDI_PSEUDO, PRIVONLY_DEV,
398 		    PRIV_SYS_CONFIG, PRIV_SYS_CONFIG, 0666);
399 		if (error == DDI_FAILURE) {
400 			return (DDI_FAILURE);
401 		}
402 
403 		rw_init(&snap_ctl.sid_rwlock, NULL, RW_DEFAULT, NULL);
404 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
405 		fssnap_dip = dip;
406 		snap_ctl.sid_snapnumber = SNAP_CTL_MINOR;
407 		/* the control sid is not linked into the snapshot list */
408 		snap_ctl.sid_next = NULL;
409 		snap_ctl.sid_cowinfo = NULL;
410 		snap_ctl.sid_flags = 0;
411 		rw_exit(&snap_ctl.sid_rwlock);
412 		ddi_report_dev(dip);
413 
414 		return (DDI_SUCCESS);
415 	case DDI_PM_RESUME:
416 		return (DDI_SUCCESS);
417 
418 	case DDI_RESUME:
419 		return (DDI_SUCCESS);
420 
421 	default:
422 		return (DDI_FAILURE);
423 	}
424 }
425 
426 /*
427  * snap_detach() - snapshot driver detach(9E) routine
428  *
429  *    destroys snapshot control device and control state.  If any snapshots
430  *    are active (ie. num_snapshots != 0), the device will refuse to detach.
431  */
432 static int
433 snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
434 {
435 	struct snapshot_id *sidp, *sidnextp;
436 
437 	switch (cmd) {
438 	case DDI_DETACH:
439 		/* do not detach if the device is active */
440 		mutex_enter(&snapshot_mutex);
441 		if ((num_snapshots != 0) ||
442 		    ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0)) {
443 			mutex_exit(&snapshot_mutex);
444 			return (DDI_FAILURE);
445 		}
446 
447 		/* free up the snapshot list */
448 		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
449 			ASSERT(SID_AVAILABLE(sidp) &&
450 			    !RW_LOCK_HELD(&sidp->sid_rwlock));
451 			sidnextp = sidp->sid_next;
452 			rw_destroy(&sidp->sid_rwlock);
453 			kmem_free(sidp, sizeof (struct snapshot_id));
454 		}
455 		snapshot = NULL;
456 
457 		/* delete the control device */
458 		ddi_remove_minor_node(dip, SNAP_CTL_NODE);
459 		fssnap_dip = NULL;
460 
461 		ASSERT((snap_ctl.sid_flags & SID_CHAR_BUSY) == 0);
462 		rw_destroy(&snap_ctl.sid_rwlock);
463 		mutex_exit(&snapshot_mutex);
464 
465 		return (DDI_SUCCESS);
466 
467 	default:
468 		return (DDI_FAILURE);
469 	}
470 }
471 
472 /*
473  * snap_open() - snapshot driver open(9E) routine
474  *
475  *     marks the snapshot id as busy so it will not be recycled when deleted
476  *     until the snapshot is closed.
477  */
478 /* ARGSUSED */
479 static int
480 snap_open(dev_t *devp, int flag, int otyp, cred_t *cred)
481 {
482 	minor_t	minor;
483 	struct snapshot_id **sidpp, *sidp;
484 
485 	/* snapshots are read-only */
486 	if (flag & FWRITE)
487 		return (EROFS);
488 
489 	minor = getminor(*devp);
490 
491 	if (minor == SNAP_CTL_MINOR) {
492 		/* control device must be opened exclusively */
493 		if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR))
494 			return (EINVAL);
495 
496 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
497 		if ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0) {
498 			rw_exit(&snap_ctl.sid_rwlock);
499 			return (EBUSY);
500 		}
501 
502 		snap_ctl.sid_flags |= SID_CHAR_BUSY;
503 		rw_exit(&snap_ctl.sid_rwlock);
504 
505 		return (0);
506 	}
507 
508 	sidpp = ddi_get_soft_state(statep, minor);
509 	if (sidpp == NULL || *sidpp == NULL)
510 		return (ENXIO);
511 	sidp = *sidpp;
512 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
513 
514 	if ((flag & FEXCL) && SID_BUSY(sidp)) {
515 		rw_exit(&sidp->sid_rwlock);
516 		return (EAGAIN);
517 	}
518 
519 	ASSERT(sidpp != NULL && sidp != NULL);
520 	/* check to see if this snapshot has been killed on us */
521 	if (SID_INACTIVE(sidp)) {
522 		cmn_err(CE_WARN, "snap_open: snapshot %d does not exist.",
523 		    minor);
524 		rw_exit(&sidp->sid_rwlock);
525 		return (ENXIO);
526 	}
527 
528 	switch (otyp) {
529 	case OTYP_CHR:
530 		sidp->sid_flags |= SID_CHAR_BUSY;
531 		break;
532 	case OTYP_BLK:
533 		sidp->sid_flags |= SID_BLOCK_BUSY;
534 		break;
535 	default:
536 		rw_exit(&sidp->sid_rwlock);
537 		return (EINVAL);
538 	}
539 
540 	rw_exit(&sidp->sid_rwlock);
541 
542 	/*
543 	 * at this point if a valid snapshot was found then it has
544 	 * been marked busy and we can use it.
545 	 */
546 	return (0);
547 }
548 
549 /*
550  * snap_close() - snapshot driver close(9E) routine
551  *
552  *    unsets the busy bits in the snapshot id.  If the snapshot has been
553  *    deleted while the snapshot device was open, the close call will clean
554  *    up the remaining state information.
555  */
556 /* ARGSUSED */
557 static int
558 snap_close(dev_t dev, int flag, int otyp, cred_t *cred)
559 {
560 	struct snapshot_id	**sidpp, *sidp;
561 	minor_t			minor;
562 	char			name[20];
563 
564 	minor = getminor(dev);
565 
566 	/* if this is the control device, close it and return */
567 	if (minor == SNAP_CTL_MINOR) {
568 		rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
569 		snap_ctl.sid_flags &= ~(SID_CHAR_BUSY);
570 		rw_exit(&snap_ctl.sid_rwlock);
571 		return (0);
572 	}
573 
574 	sidpp = ddi_get_soft_state(statep, minor);
575 	if (sidpp == NULL || *sidpp == NULL) {
576 		cmn_err(CE_WARN, "snap_close: could not find state for "
577 		    "snapshot %d.", minor);
578 		return (ENXIO);
579 	}
580 	sidp = *sidpp;
581 	mutex_enter(&snapshot_mutex);
582 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
583 
584 	/* Mark the snapshot as not being busy anymore */
585 	switch (otyp) {
586 	case OTYP_CHR:
587 		sidp->sid_flags &= ~(SID_CHAR_BUSY);
588 		break;
589 	case OTYP_BLK:
590 		sidp->sid_flags &= ~(SID_BLOCK_BUSY);
591 		break;
592 	default:
593 		mutex_exit(&snapshot_mutex);
594 		rw_exit(&sidp->sid_rwlock);
595 		return (EINVAL);
596 	}
597 
598 	if (SID_AVAILABLE(sidp)) {
599 		/*
600 		 * if this is the last close on a snapshot that has been
601 		 * deleted, then free up the soft state.  The snapdelete
602 		 * ioctl does not free this when the device is in use so
603 		 * we do it here after the last reference goes away.
604 		 */
605 
606 		/* remove the device nodes */
607 		ASSERT(fssnap_dip != NULL);
608 		(void) snprintf(name, sizeof (name), "%d",
609 		    sidp->sid_snapnumber);
610 		ddi_remove_minor_node(fssnap_dip, name);
611 		(void) snprintf(name, sizeof (name), "%d,raw",
612 		    sidp->sid_snapnumber);
613 		ddi_remove_minor_node(fssnap_dip, name);
614 
615 		/* delete the state structure */
616 		ddi_soft_state_free(statep, sidp->sid_snapnumber);
617 		num_snapshots--;
618 	}
619 
620 	mutex_exit(&snapshot_mutex);
621 	rw_exit(&sidp->sid_rwlock);
622 
623 	return (0);
624 }
625 
626 /*
627  * snap_read() - snapshot driver read(9E) routine
628  *
629  *    reads data from the snapshot by calling snap_strategy() through physio()
630  */
631 /* ARGSUSED */
632 static int
633 snap_read(dev_t dev, struct uio *uiop, cred_t *credp)
634 {
635 	minor_t		minor;
636 	struct snapshot_id **sidpp;
637 
638 	minor = getminor(dev);
639 	sidpp = ddi_get_soft_state(statep, minor);
640 	if (sidpp == NULL || *sidpp == NULL) {
641 		cmn_err(CE_WARN,
642 		    "snap_read: could not find state for snapshot %d.", minor);
643 		return (ENXIO);
644 	}
645 	return (physio(snap_strategy, NULL, dev, B_READ, minphys, uiop));
646 }
647 
648 /*
649  * snap_strategy() - snapshot driver strategy(9E) routine
650  *
651  *    cycles through each chunk in the requested buffer and calls
652  *    snap_getchunk() on each chunk to retrieve it from the appropriate
653  *    place.  Once all of the parts are put together the requested buffer
654  *    is returned.  The snapshot driver is read-only, so a write is invalid.
655  */
656 static int
657 snap_strategy(struct buf *bp)
658 {
659 	struct snapshot_id **sidpp, *sidp;
660 	minor_t		minor;
661 	chunknumber_t	chunk;
662 	int		off, len;
663 	u_longlong_t	reqptr;
664 	int		error = 0;
665 	size_t		chunksz;
666 	caddr_t		buf;
667 
668 	/* snapshot device is read-only */
669 	if (bp->b_flags & B_WRITE) {
670 		bioerror(bp, EROFS);
671 		bp->b_resid = bp->b_bcount;
672 		biodone(bp);
673 		return (0);
674 	}
675 
676 	minor = getminor(bp->b_edev);
677 	sidpp = ddi_get_soft_state(statep, minor);
678 	if (sidpp == NULL || *sidpp == NULL) {
679 		cmn_err(CE_WARN,
680 		    "snap_strategy: could not find state for snapshot %d.",
681 		    minor);
682 		bioerror(bp, ENXIO);
683 		bp->b_resid = bp->b_bcount;
684 		biodone(bp);
685 		return (0);
686 	}
687 	sidp = *sidpp;
688 	ASSERT(sidp);
689 	rw_enter(&sidp->sid_rwlock, RW_READER);
690 
691 	if (SID_INACTIVE(sidp)) {
692 		bioerror(bp, ENXIO);
693 		bp->b_resid = bp->b_bcount;
694 		biodone(bp);
695 		rw_exit(&sidp->sid_rwlock);
696 		return (0);
697 	}
698 
699 	if (bp->b_flags & (B_PAGEIO|B_PHYS))
700 		bp_mapin(bp);
701 
702 	bp->b_resid = bp->b_bcount;
703 	ASSERT(bp->b_un.b_addr);
704 	buf = bp->b_un.b_addr;
705 
706 	chunksz = sidp->sid_cowinfo->cow_map.cmap_chunksz;
707 
708 	/* reqptr is the current DEV_BSIZE offset into the device */
709 	/* chunk is the chunk containing reqptr */
710 	/* len is the length of the request (in the current chunk) in bytes */
711 	/* off is the byte offset into the current chunk */
712 	reqptr = bp->b_lblkno;
713 	while (bp->b_resid > 0) {
714 		chunk = dbtocowchunk(&sidp->sid_cowinfo->cow_map, reqptr);
715 		off = (reqptr % (chunksz >> DEV_BSHIFT)) << DEV_BSHIFT;
716 		len = min(chunksz - off, bp->b_resid);
717 		ASSERT((off + len) <= chunksz);
718 
719 		if ((error = snap_getchunk(sidp, chunk, off, len, buf)) != 0) {
720 			/*
721 			 * EINVAL means the user tried to go out of range.
722 			 * Anything else means it's likely that we're
723 			 * confused.
724 			 */
725 			if (error != EINVAL) {
726 				cmn_err(CE_WARN, "snap_strategy: error "
727 				    "calling snap_getchunk, chunk = %llu, "
728 				    "offset = %d, len = %d, resid = %lu, "
729 				    "error = %d.",
730 				    chunk, off, len, bp->b_resid, error);
731 			}
732 			bioerror(bp, error);
733 			biodone(bp);
734 			rw_exit(&sidp->sid_rwlock);
735 			return (0);
736 		}
737 		bp->b_resid -= len;
738 		reqptr += (len >> DEV_BSHIFT);
739 		buf += len;
740 	}
741 
742 	ASSERT(bp->b_resid == 0);
743 	biodone(bp);
744 
745 	rw_exit(&sidp->sid_rwlock);
746 	return (0);
747 }
748 
749 /*
750  * snap_getchunk() - helper function for snap_strategy()
751  *
752  *    gets the requested data from the appropriate place and fills in the
753  *    buffer.  chunk is the chunk number of the request, offset is the
754  *    offset into that chunk and must be less than the chunk size.  len is
755  *    the length of the request starting at offset, and must not exceed a
756  *    chunk boundary.  buffer is the address to copy the data to.  len
757  *    bytes are copied into the buffer starting at the location specified.
758  *
759  *    A chunk is located according to the following algorithm:
760  *        - If the chunk does not have a translation or is not a candidate
761  *          for translation, it is read straight from the master device.
762  *        - If the chunk does have a translation, then it is either on
763  *          disk or in memory:
764  *            o If it is in memory the requested data is simply copied out
765  *              of the in-memory buffer.
766  *            o If it is in the backing store, it is read from there.
767  *
768  *    This function does the real work of the snapshot driver.
769  */
770 static int
771 snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk, int offset,
772     int len, char *buffer)
773 {
774 	cow_map_t	*cmap = &sidp->sid_cowinfo->cow_map;
775 	cow_map_node_t	*cmn;
776 	struct buf	*snapbuf;
777 	int		error = 0;
778 	char		*newbuffer;
779 	int		newlen = 0;
780 	int		partial = 0;
781 
782 	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
783 	ASSERT(offset + len <= cmap->cmap_chunksz);
784 
785 	/*
786 	 * Check if the chunk number is out of range and if so bail out
787 	 */
788 	if (chunk >= (cmap->cmap_bmsize * NBBY)) {
789 		return (EINVAL);
790 	}
791 
792 	/*
793 	 * If the chunk is not a candidate for translation, then the chunk
794 	 * was not allocated when the snapshot was taken.  Since it does
795 	 * not contain data associated with this snapshot, just return a
796 	 * zero buffer instead.
797 	 */
798 	if (isclr(cmap->cmap_candidate, chunk)) {
799 		bzero(buffer, len);
800 		return (0);
801 	}
802 
803 	/*
804 	 * if the chunk is a candidate for translation but a
805 	 * translation does not exist, then read through to the
806 	 * original file system.  The rwlock is held until the read
807 	 * completes if it hasn't been translated to make sure the
808 	 * file system does not translate the block before we
809 	 * access it. If it has already been translated we don't
810 	 * need the lock, because the translation will never go away.
811 	 */
812 	rw_enter(&cmap->cmap_rwlock, RW_READER);
813 	if (isclr(cmap->cmap_hastrans, chunk)) {
814 		snapbuf = getrbuf(KM_SLEEP);
815 		/*
816 		 * Reading into the buffer saves having to do a copy,
817 		 * but gets tricky if the request size is not a
818 		 * multiple of DEV_BSIZE.  However, we are filling the
819 		 * buffer left to right, so future reads will write
820 		 * over any extra data we might have read.
821 		 */
822 
823 		partial = len % DEV_BSIZE;
824 
825 		snapbuf->b_bcount = len;
826 		snapbuf->b_lblkno = lbtodb(chunk * cmap->cmap_chunksz + offset);
827 		snapbuf->b_un.b_addr = buffer;
828 
829 		snapbuf->b_iodone = NULL;
830 		snapbuf->b_proc = NULL;		/* i.e. the kernel */
831 		snapbuf->b_flags = B_READ | B_BUSY;
832 		snapbuf->b_edev = sidp->sid_fvp->v_vfsp->vfs_dev;
833 
834 		if (partial) {
835 			/*
836 			 * Partial block read in progress.
837 			 * This is bad as modules further down the line
838 			 * assume buf's are exact multiples of DEV_BSIZE
839 			 * and we end up with fewer, or zero, bytes read.
840 			 * To get round this we need to round up to the
841 			 * nearest full block read and then return only
842 			 * len bytes.
843 			 */
844 			newlen = (len - partial) + DEV_BSIZE;
845 			newbuffer = kmem_alloc(newlen, KM_SLEEP);
846 
847 			snapbuf->b_bcount = newlen;
848 			snapbuf->b_un.b_addr = newbuffer;
849 		}
850 
851 		(void) bdev_strategy(snapbuf);
852 		(void) biowait(snapbuf);
853 
854 		error = geterror(snapbuf);
855 
856 		if (partial) {
857 			/*
858 			 * Partial block read. Now we need to bcopy the
859 			 * correct number of bytes back into the
860 			 * supplied buffer, and tidy up our temp
861 			 * buffer.
862 			 */
863 			bcopy(newbuffer, buffer, len);
864 			kmem_free(newbuffer, newlen);
865 		}
866 
867 		freerbuf(snapbuf);
868 		rw_exit(&cmap->cmap_rwlock);
869 
870 		return (error);
871 	}
872 
873 	/*
874 	 * finally, if the chunk is a candidate for translation and it
875 	 * has been translated, then we clone the chunk of the buffer
876 	 * that was copied aside by the file system.
877 	 * The cmap_rwlock does not need to be held after we know the
878 	 * data has already been copied. Once a chunk has been copied
879 	 * to the backing file, it is stable read only data.
880 	 */
881 	cmn = transtbl_get(cmap, chunk);
882 
883 	/* check whether the data is in memory or in the backing file */
884 	if (cmn != NULL) {
885 		ASSERT(cmn->cmn_buf);
886 		/* already in memory */
887 		bcopy(cmn->cmn_buf + offset, buffer, len);
888 		rw_exit(&cmap->cmap_rwlock);
889 	} else {
890 		ssize_t resid = len;
891 		int	bf_index;
892 		/*
893 		 * can cause deadlock with writer if we don't drop the
894 		 * cmap_rwlock before trying to get the backing store file
895 		 * vnode rwlock.
896 		 */
897 		rw_exit(&cmap->cmap_rwlock);
898 
899 		bf_index = chunk / cmap->cmap_chunksperbf;
900 
901 		/* read buffer from backing file */
902 		error = vn_rdwr(UIO_READ,
903 		    (sidp->sid_cowinfo->cow_backfile_array)[bf_index],
904 		    buffer, len, ((chunk % cmap->cmap_chunksperbf) *
905 		    cmap->cmap_chunksz) + offset, UIO_SYSSPACE, 0,
906 		    RLIM64_INFINITY, kcred, &resid);
907 	}
908 
909 	return (error);
910 }
911 
912 /*
913  * snap_print() - snapshot driver print(9E) routine
914  *
915  *    prints the device identification string.
916  */
917 static int
918 snap_print(dev_t dev, char *str)
919 {
920 	struct snapshot_id **sidpp;
921 	minor_t		minor;
922 
923 	minor = getminor(dev);
924 	sidpp = ddi_get_soft_state(statep, minor);
925 	if (sidpp == NULL || *sidpp == NULL) {
926 		cmn_err(CE_WARN,
927 		    "snap_print: could not find state for snapshot %d.", minor);
928 		return (ENXIO);
929 	}
930 
931 	cmn_err(CE_NOTE, "snap_print: snapshot %d: %s",  minor, str);
932 
933 	return (0);
934 }
935 
936 /*
937  * snap_prop_op() - snapshot driver prop_op(9E) routine
938  *
939  *    get 32-bit and 64-bit values for size (character driver) and nblocks
940  *    (block driver).
941  */
942 static int
943 snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
944     int flags, char *name, caddr_t valuep, int *lengthp)
945 {
946 	struct snapshot_id **sidpp;
947 	int		length, km_flags;
948 	int		nblocks, size;
949 	uint64_t	Size, Nblocks;
950 	caddr_t		buffer;
951 	int		minor;
952 	dev_t		mdev;
953 
954 	minor = getminor(dev);
955 	length = *lengthp;		/* Get callers length */
956 
957 	/* if this is the control device just check for .conf properties */
958 	if (minor == SNAP_CTL_MINOR)
959 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
960 			valuep, lengthp));
961 	/* check to see if there is a master device plumbed */
962 	sidpp = ddi_get_soft_state(statep, minor);
963 	if (sidpp == NULL || *sidpp == NULL) {
964 		cmn_err(CE_WARN,
965 		    "snap_prop_op: could not find state for "
966 		    "snapshot %d.", minor);
967 		return (DDI_PROP_NOT_FOUND);
968 	}
969 
970 	if (((*sidpp)->sid_fvp == NULL) || ((*sidpp)->sid_fvp->v_vfsp == NULL))
971 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
972 			valuep, lengthp));
973 	mdev = (*sidpp)->sid_fvp->v_vfsp->vfs_dev;
974 
975 	/* get size information from the master device. */
976 
977 	if (strcmp(name, "nblocks") == 0) {
978 		nblocks = bdev_size(mdev);
979 		*lengthp = sizeof (nblocks);	/* Set callers length */
980 	} else if (strcmp(name, "Nblocks") == 0) {
981 		Nblocks = bdev_Size(mdev);
982 		*lengthp = sizeof (Nblocks);	/* Set callers length */
983 	} else if (strcmp(name, "size") == 0) {
984 		size = cdev_size(mdev);
985 		*lengthp = sizeof (size);	/* Set callers length */
986 	} else if (strcmp(name, "Size") == 0) {
987 		Size = cdev_Size(mdev);
988 		*lengthp = sizeof (Size);	/* Set callers length */
989 	} else {	/* not for us */
990 		return (ddi_prop_op(dev, dip, prop_op, flags, name,
991 		    valuep, lengthp));
992 	}
993 
994 	/*
995 	 * If length only request, just return the length.
996 	 */
997 	if (prop_op == PROP_LEN)  {
998 		return (DDI_PROP_SUCCESS);
999 	}
1000 
1001 	/*
1002 	 * Allocate buffer, if required.  Either way, set `buffer' variable.
1003 	 */
1004 	switch (prop_op)  {
1005 	case PROP_LEN_AND_VAL_ALLOC:
1006 
1007 		km_flags = KM_NOSLEEP;
1008 
1009 		if (flags & DDI_PROP_CANSLEEP)
1010 			km_flags = KM_SLEEP;
1011 
1012 		buffer = kmem_alloc(*lengthp, km_flags);
1013 		if (buffer == NULL)  {
1014 			cmn_err(CE_WARN, "snap_get_prop: no mem for "
1015 			"property %s.", name);
1016 			return (DDI_PROP_NO_MEMORY);
1017 		}
1018 		*(caddr_t *)valuep = buffer; /* Set callers buf ptr */
1019 		break;
1020 
1021 	case PROP_LEN_AND_VAL_BUF:
1022 
1023 		if (*lengthp > length)
1024 			return (DDI_PROP_BUF_TOO_SMALL);
1025 
1026 		buffer = valuep; /* get callers buf ptr */
1027 		break;
1028 	}
1029 
1030 	if (strcmp(name, "nblocks") == 0) {
1031 		*((uint_t *)buffer) = nblocks;
1032 	} else if (strcmp(name, "Nblocks") == 0) {
1033 		*((uint64_t *)buffer) = Nblocks;
1034 	} else if (strcmp(name, "size") == 0) {
1035 		*((uint_t *)buffer) = size;
1036 	} else if (strcmp(name, "Size") == 0) {
1037 		*((uint64_t *)buffer) = Size;
1038 	}
1039 
1040 	return (DDI_PROP_SUCCESS);
1041 }
1042 
1043 /*
1044  * snap_ioctl() - snapshot driver ioctl(9E) routine
1045  *
1046  *    only applies to the control device.  The control device accepts two
1047  *    ioctl requests: create a snapshot or delete a snapshot.  In either
1048  *    case, the vnode for the requested file system is extracted, and the
1049  *    request is passed on to the file system via the same ioctl.  The file
1050  *    system is responsible for doing the things necessary for creating or
1051  *    destroying a snapshot, including any file system specific operations
1052  *    that must be performed as well as setting up and deleting the snapshot
1053  *    state through the fssnap interfaces.
1054  */
1055 static int
1056 snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1057 int *rvalp)
1058 {
1059 	minor_t	minor;
1060 	int error = 0;
1061 
1062 	minor = getminor(dev);
1063 
1064 	if (minor != SNAP_CTL_MINOR) {
1065 		return (EINVAL);
1066 	}
1067 
1068 	switch (cmd) {
1069 	case _FIOSNAPSHOTCREATE:
1070 	{
1071 		struct fiosnapcreate	fc;
1072 		struct file		*fp;
1073 		struct vnode		*vp;
1074 
1075 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1076 			return (EFAULT);
1077 
1078 		/* get vnode for file system mount point */
1079 		if ((fp = getf(fc.rootfiledesc)) == NULL)
1080 			return (EBADF);
1081 
1082 		ASSERT(fp->f_vnode);
1083 		vp = fp->f_vnode;
1084 		VN_HOLD(vp);
1085 		releasef(fc.rootfiledesc);
1086 
1087 		/* pass ioctl request to file system */
1088 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp);
1089 		VN_RELE(vp);
1090 		break;
1091 	}
1092 	case _FIOSNAPSHOTCREATE_MULTI:
1093 	{
1094 		struct fiosnapcreate_multi	fc;
1095 		struct file		*fp;
1096 		struct vnode		*vp;
1097 
1098 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1099 			return (EFAULT);
1100 
1101 		/* get vnode for file system mount point */
1102 		if ((fp = getf(fc.rootfiledesc)) == NULL)
1103 			return (EBADF);
1104 
1105 		ASSERT(fp->f_vnode);
1106 		vp = fp->f_vnode;
1107 		VN_HOLD(vp);
1108 		releasef(fc.rootfiledesc);
1109 
1110 		/* pass ioctl request to file system */
1111 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp);
1112 		VN_RELE(vp);
1113 		break;
1114 	}
1115 	case _FIOSNAPSHOTDELETE:
1116 	{
1117 		major_t			major;
1118 		struct fiosnapdelete	fc;
1119 		snapshot_id_t		*sidp = NULL;
1120 		snapshot_id_t		*sidnextp = NULL;
1121 		struct file		*fp = NULL;
1122 		struct vnode		*vp = NULL;
1123 		struct vfs 		*vfsp = NULL;
1124 		vfsops_t		*vfsops = EIO_vfsops;
1125 
1126 		if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
1127 			return (EFAULT);
1128 
1129 		/* get vnode for file system mount point */
1130 		if ((fp = getf(fc.rootfiledesc)) == NULL)
1131 			return (EBADF);
1132 
1133 		ASSERT(fp->f_vnode);
1134 		vp = fp->f_vnode;
1135 		VN_HOLD(vp);
1136 		releasef(fc.rootfiledesc);
1137 		/*
1138 		 * Test for two formats of delete and set correct minor/vp:
1139 		 * pseudo device:
1140 		 * fssnap -d [/dev/fssnap/x]
1141 		 * or
1142 		 * mount point:
1143 		 * fssnap -d [/mntpt]
1144 		 * Note that minor is verified to be equal to SNAP_CTL_MINOR
1145 		 * at this point which is an invalid minor number.
1146 		 */
1147 		ASSERT(fssnap_dip != NULL);
1148 		major = ddi_driver_major(fssnap_dip);
1149 		mutex_enter(&snapshot_mutex);
1150 		for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
1151 			rw_enter(&sidp->sid_rwlock, RW_READER);
1152 			sidnextp = sidp->sid_next;
1153 			/* pseudo device: */
1154 			if (major == getmajor(vp->v_rdev)) {
1155 				minor = getminor(vp->v_rdev);
1156 				if (sidp->sid_snapnumber == (uint_t)minor &&
1157 				    sidp->sid_fvp) {
1158 					VN_RELE(vp);
1159 					vp = sidp->sid_fvp;
1160 					VN_HOLD(vp);
1161 					rw_exit(&sidp->sid_rwlock);
1162 					break;
1163 				}
1164 			/* Mount point: */
1165 			} else {
1166 				if (sidp->sid_fvp == vp) {
1167 					minor = sidp->sid_snapnumber;
1168 					rw_exit(&sidp->sid_rwlock);
1169 					break;
1170 				}
1171 			}
1172 			rw_exit(&sidp->sid_rwlock);
1173 		}
1174 		mutex_exit(&snapshot_mutex);
1175 		/* Verify minor got set correctly above */
1176 		if (minor == SNAP_CTL_MINOR) {
1177 			VN_RELE(vp);
1178 			return (EINVAL);
1179 		}
1180 		dev = makedevice(major, minor);
1181 		/*
1182 		 * Create dummy vfs entry
1183 		 * to use as a locking semaphore across the IOCTL
1184 		 * for mount in progress cases...
1185 		 */
1186 		vfsp = kmem_alloc(sizeof (vfs_t), KM_SLEEP);
1187 		VFS_INIT(vfsp, vfsops, NULL);
1188 		VFS_HOLD(vfsp);
1189 		vfs_addmip(dev, vfsp);
1190 		if ((vfs_devmounting(dev, vfsp)) ||
1191 		    (vfs_devismounted(dev))) {
1192 			vfs_delmip(vfsp);
1193 			VFS_RELE(vfsp);
1194 			VN_RELE(vp);
1195 			return (EBUSY);
1196 		}
1197 		/*
1198 		 * Nobody mounted but do not release mount in progress lock
1199 		 * until IOCTL complete to prohibit a mount sneaking
1200 		 * in
1201 		 */
1202 		error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp);
1203 		vfs_delmip(vfsp);
1204 		VFS_RELE(vfsp);
1205 		VN_RELE(vp);
1206 		break;
1207 	}
1208 	default:
1209 		cmn_err(CE_WARN, "snap_ioctl: Invalid ioctl cmd %d, minor %d.",
1210 		    cmd, minor);
1211 		return (EINVAL);
1212 	}
1213 
1214 	return (error);
1215 }
1216 
1217 
1218 /* ************************************************************************ */
1219 
1220 /*
1221  * Translation Table Routines
1222  *
1223  *    These support routines implement a simple doubly linked list
1224  *    to keep track of chunks that are currently in memory.  The maximum
1225  *    size of the list is determined by the fssnap_max_mem_chunks variable.
1226  *    The cmap_rwlock is used to protect the linkage of the list.
1227  */
1228 
1229 /*
1230  * transtbl_add() - add a node to the translation table
1231  *
1232  *    allocates a new node and points it at the buffer passed in.  The node
1233  *    is added to the beginning of the doubly linked list and the head of
1234  *    the list is moved.  The cmap_rwlock must be held as a writer through
1235  *    this operation.
1236  */
1237 static cow_map_node_t *
1238 transtbl_add(cow_map_t *cmap, chunknumber_t chunk, caddr_t buf)
1239 {
1240 	cow_map_node_t	*cmnode;
1241 
1242 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1243 
1244 	cmnode = kmem_alloc(sizeof (cow_map_node_t), KM_SLEEP);
1245 
1246 	/*
1247 	 * insert new translations at the beginning so cmn_table is always
1248 	 * the first node.
1249 	 */
1250 	cmnode->cmn_chunk = chunk;
1251 	cmnode->cmn_buf = buf;
1252 	cmnode->cmn_prev = NULL;
1253 	cmnode->cmn_next = cmap->cmap_table;
1254 	if (cmnode->cmn_next)
1255 		cmnode->cmn_next->cmn_prev = cmnode;
1256 	cmap->cmap_table = cmnode;
1257 
1258 	return (cmnode);
1259 }
1260 
1261 /*
1262  * transtbl_get() - look up a node in the translation table
1263  *
1264  *    called by the snapshot driver to find data that has been translated.
1265  *    The lookup is done by the chunk number, and the node is returned.
1266  *    If the node was not found, NULL is returned.
1267  */
1268 static cow_map_node_t *
1269 transtbl_get(cow_map_t *cmap, chunknumber_t chunk)
1270 {
1271 	cow_map_node_t *cmn;
1272 
1273 	ASSERT(RW_READ_HELD(&cmap->cmap_rwlock));
1274 	ASSERT(cmap);
1275 
1276 	/* search the translation table */
1277 	for (cmn = cmap->cmap_table; cmn != NULL; cmn = cmn->cmn_next) {
1278 		if (cmn->cmn_chunk == chunk)
1279 			return (cmn);
1280 	}
1281 
1282 	/* not found */
1283 	return (NULL);
1284 }
1285 
1286 /*
1287  * transtbl_delete() - delete a node from the translation table
1288  *
1289  *    called when a node's data has been written out to disk.  The
1290  *    cmap_rwlock must be held as a writer for this operation.  If the node
1291  *    being deleted is the head of the list, then the head is moved to the
1292  *    next node.  Both the node's data and the node itself are freed.
1293  */
1294 static void
1295 transtbl_delete(cow_map_t *cmap, cow_map_node_t *cmn)
1296 {
1297 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
1298 	ASSERT(cmn);
1299 	ASSERT(cmap->cmap_table);
1300 
1301 	/* if the head of the list is being deleted, then move the head up */
1302 	if (cmap->cmap_table == cmn) {
1303 		ASSERT(cmn->cmn_prev == NULL);
1304 		cmap->cmap_table = cmn->cmn_next;
1305 	}
1306 
1307 
1308 	/* make previous node's next pointer skip over current node */
1309 	if (cmn->cmn_prev != NULL) {
1310 		ASSERT(cmn->cmn_prev->cmn_next == cmn);
1311 		cmn->cmn_prev->cmn_next = cmn->cmn_next;
1312 	}
1313 
1314 	/* make next node's previous pointer skip over current node */
1315 	if (cmn->cmn_next != NULL) {
1316 		ASSERT(cmn->cmn_next->cmn_prev == cmn);
1317 		cmn->cmn_next->cmn_prev = cmn->cmn_prev;
1318 	}
1319 
1320 	/* free the data and the node */
1321 	ASSERT(cmn->cmn_buf);
1322 	kmem_free(cmn->cmn_buf, cmap->cmap_chunksz);
1323 	kmem_free(cmn, sizeof (cow_map_node_t));
1324 }
1325 
1326 /*
1327  * transtbl_free() - free the entire translation table
1328  *
1329  *    called when the snapshot is deleted.  This frees all of the nodes in
1330  *    the translation table (but not the bitmaps).
1331  */
1332 static void
1333 transtbl_free(cow_map_t *cmap)
1334 {
1335 	cow_map_node_t	*curnode;
1336 	cow_map_node_t	*tempnode;
1337 
1338 	for (curnode = cmap->cmap_table; curnode != NULL; curnode = tempnode) {
1339 		tempnode = curnode->cmn_next;
1340 
1341 		kmem_free(curnode->cmn_buf, cmap->cmap_chunksz);
1342 		kmem_free(curnode, sizeof (cow_map_node_t));
1343 	}
1344 }
1345 
1346 
1347 /* ************************************************************************ */
1348 
1349 /*
1350  * Interface Implementation Routines
1351  *
1352  * The following functions implement snapshot interface routines that are
1353  * called by the file system to create, delete, and use a snapshot.  The
1354  * interfaces are defined in fssnap_if.c and are filled in by this driver
1355  * when it is loaded.  This technique allows the file system to depend on
1356  * the interface module without having to load the full implementation and
1357  * snapshot device drivers.
1358  */
1359 
1360 /*
1361  * fssnap_strategy_impl() - strategy routine called by the file system
1362  *
1363  *    called by the file system to handle copy-on-write when necessary.  All
1364  *    reads and writes that the file system performs should go through this
1365  *    function.  If the file system calls the underlying device's strategy
1366  *    routine without going through fssnap_strategy() (eg. by calling
1367  *    bdev_strategy()), the snapshot may not be consistent.
1368  *
1369  *    This function starts by doing significant sanity checking to insure
1370  *    the snapshot was not deleted out from under it or deleted and then
1371  *    recreated.  To do this, it checks the actual pointer passed into it
1372  *    (ie. the handle held by the file system).  NOTE that the parameter is
1373  *    a POINTER TO A POINTER to the snapshot id.  Once the snapshot id is
1374  *    locked, it knows things are ok and that this snapshot is really for
1375  *    this file system.
1376  *
1377  *    If the request is a write, fssnap_translate() is called to determine
1378  *    whether a copy-on-write is required.  If it is a read, the read is
1379  *    simply passed on to the underlying device.
1380  */
1381 static void
1382 fssnap_strategy_impl(void *snapshot_id, buf_t *bp)
1383 {
1384 	struct snapshot_id **sidpp;
1385 	struct snapshot_id *sidp;
1386 	int error;
1387 
1388 	/* read requests are always passed through */
1389 	if (bp->b_flags & B_READ) {
1390 		(void) bdev_strategy(bp);
1391 		return;
1392 	}
1393 
1394 	/*
1395 	 * Because we were not able to take the snapshot read lock BEFORE
1396 	 * checking for a snapshot back in the file system, things may have
1397 	 * drastically changed out from under us.  For instance, the snapshot
1398 	 * may have been deleted, deleted and recreated, or worse yet, deleted
1399 	 * for this file system but now the snapshot number is in use by another
1400 	 * file system.
1401 	 *
1402 	 * Having a pointer to the file system's snapshot id pointer allows us
1403 	 * to sanity check most of this, though it assumes the file system is
1404 	 * keeping track of a pointer to the snapshot_id somewhere.
1405 	 */
1406 	sidpp = (struct snapshot_id **)snapshot_id;
1407 	sidp = *sidpp;
1408 
1409 	/*
1410 	 * if this file system's snapshot was disabled, just pass the
1411 	 * request through.
1412 	 */
1413 	if (sidp == NULL) {
1414 		(void) bdev_strategy(bp);
1415 		return;
1416 	}
1417 
1418 	/*
1419 	 * Once we have the reader lock the snapshot will not magically go
1420 	 * away.  But things may have changed on us before this so double check.
1421 	 */
1422 	rw_enter(&sidp->sid_rwlock, RW_READER);
1423 
1424 	/*
1425 	 * if an error was founds somewhere the DELETE flag will be
1426 	 * set to indicate the snapshot should be deleted and no new
1427 	 * translations should occur.
1428 	 */
1429 	if (sidp->sid_flags & SID_DELETE) {
1430 		rw_exit(&sidp->sid_rwlock);
1431 		(void) fssnap_delete_impl(sidpp);
1432 		(void) bdev_strategy(bp);
1433 		return;
1434 	}
1435 
1436 	/*
1437 	 * If the file system is no longer pointing to the snapshot we were
1438 	 * called with, then it should not attempt to translate this buffer as
1439 	 * it may be going to a snapshot for a different file system.
1440 	 * Even if the file system snapshot pointer is still the same, the
1441 	 * snapshot may have been disabled before we got the reader lock.
1442 	 */
1443 	if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1444 		rw_exit(&sidp->sid_rwlock);
1445 		(void) bdev_strategy(bp);
1446 		return;
1447 	}
1448 
1449 	/*
1450 	 * At this point we're sure the snapshot will not go away while the
1451 	 * reader lock is held, and we are reasonably certain that we are
1452 	 * writing to the correct snapshot.
1453 	 */
1454 	if ((error = fssnap_translate(sidpp, bp)) != 0) {
1455 		/*
1456 		 * fssnap_translate can release the reader lock if it
1457 		 * has to wait for a semaphore.  In this case it is possible
1458 		 * for the snapshot to be deleted in this time frame.  If this
1459 		 * happens just sent the buf thru to the filesystems device.
1460 		 */
1461 		if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1462 			rw_exit(&sidp->sid_rwlock);
1463 			(void) bdev_strategy(bp);
1464 			return;
1465 		}
1466 		bioerror(bp, error);
1467 		biodone(bp);
1468 	}
1469 	rw_exit(&sidp->sid_rwlock);
1470 }
1471 
1472 /*
1473  * fssnap_translate() - helper function for fssnap_strategy()
1474  *
1475  *    performs the actual copy-on-write for write requests, if required.
1476  *    This function does the real work of the file system side of things.
1477  *
1478  *    It first checks the candidate bitmap to quickly determine whether any
1479  *    action is necessary.  If the candidate bitmap indicates the chunk was
1480  *    allocated when the snapshot was created, then it checks to see whether
1481  *    a translation already exists.  If a translation already exists then no
1482  *    action is required.  If the chunk is a candidate for copy-on-write,
1483  *    and a translation does not already exist, then the chunk is read in
1484  *    and a node is added to the translation table.
1485  *
1486  *    Once all of the chunks in the request range have been copied (if they
1487  *    needed to be), then the original request can be satisfied and the old
1488  *    data can be overwritten.
1489  */
1490 static int
1491 fssnap_translate(struct snapshot_id **sidpp, struct buf *wbp)
1492 {
1493 	snapshot_id_t	*sidp = *sidpp;
1494 	struct buf	*oldbp;	/* buffer to store old data in */
1495 	struct cow_info	*cowp = sidp->sid_cowinfo;
1496 	cow_map_t	*cmap = &cowp->cow_map;
1497 	cow_map_node_t	*cmn;
1498 	chunknumber_t	cowchunk, startchunk, endchunk;
1499 	int		error;
1500 	int	throttle_write = 0;
1501 
1502 	/* make sure the snapshot is active */
1503 	ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
1504 
1505 	startchunk = dbtocowchunk(cmap, wbp->b_lblkno);
1506 	endchunk   = dbtocowchunk(cmap, wbp->b_lblkno +
1507 	    ((wbp->b_bcount-1) >> DEV_BSHIFT));
1508 
1509 	/*
1510 	 * Do not throttle the writes of the fssnap taskq thread and
1511 	 * the log roll (trans_roll) thread. Furthermore the writes to
1512 	 * the on-disk log are also not subject to throttling.
1513 	 * The fssnap_write_taskq thread's write can block on the throttling
1514 	 * semaphore which leads to self-deadlock as this same thread
1515 	 * releases the throttling semaphore after completing the IO.
1516 	 * If the trans_roll thread's write is throttled then we can deadlock
1517 	 * because the fssnap_taskq_thread which releases the throttling
1518 	 * semaphore can block waiting for log space which can only be
1519 	 * released by the trans_roll thread.
1520 	 */
1521 
1522 	throttle_write = !(taskq_member(cowp->cow_taskq, curthread) ||
1523 				    tsd_get(bypass_snapshot_throttle_key));
1524 
1525 	/*
1526 	 * Iterate through all chunks covered by this write and perform the
1527 	 * copy-aside if necessary.  Once all chunks have been safely
1528 	 * stowed away, the new data may be written in a single sweep.
1529 	 *
1530 	 * For each chunk in the range, the following sequence is performed:
1531 	 *	- Is the chunk a candidate for translation?
1532 	 *		o If not, then no translation is necessary, continue
1533 	 *	- If it is a candidate, then does it already have a translation?
1534 	 *		o If so, then no translation is necessary, continue
1535 	 *	- If it is a candidate, but does not yet have a translation,
1536 	 *	  then read the old data and schedule an asynchronous taskq
1537 	 *	  to write the old data to the backing file.
1538 	 *
1539 	 * Once this has been performed over the entire range of chunks, then
1540 	 * it is safe to overwrite the data that is there.
1541 	 *
1542 	 * Note that no lock is required to check the candidate bitmap because
1543 	 * it never changes once the snapshot is created.  The reader lock is
1544 	 * taken to check the hastrans bitmap since it may change.  If it
1545 	 * turns out a copy is required, then the lock is upgraded to a
1546 	 * writer, and the bitmap is re-checked as it may have changed while
1547 	 * the lock was released.  Finally, the write lock is held while
1548 	 * reading the old data to make sure it is not translated out from
1549 	 * under us.
1550 	 *
1551 	 * This locking mechanism should be sufficient to handle multiple
1552 	 * threads writing to overlapping chunks simultaneously.
1553 	 */
1554 	for (cowchunk = startchunk; cowchunk <= endchunk; cowchunk++) {
1555 		/*
1556 		 * If the cowchunk is outside of the range of our
1557 		 * candidate maps, then simply break out of the
1558 		 * loop and pass the I/O through to bdev_strategy.
1559 		 * This would occur if the file system has grown
1560 		 * larger since the snapshot was taken.
1561 		 */
1562 		if (cowchunk >= (cmap->cmap_bmsize * NBBY))
1563 			break;
1564 
1565 		/*
1566 		 * If no disk blocks were allocated in this chunk when the
1567 		 * snapshot was created then no copy-on-write will be
1568 		 * required.  Since this bitmap is read-only no locks are
1569 		 * necessary.
1570 		 */
1571 		if (isclr(cmap->cmap_candidate, cowchunk)) {
1572 			continue;
1573 		}
1574 
1575 		/*
1576 		 * If a translation already exists, the data can be written
1577 		 * through since the old data has already been saved off.
1578 		 */
1579 		if (isset(cmap->cmap_hastrans, cowchunk)) {
1580 			continue;
1581 		}
1582 
1583 
1584 		/*
1585 		 * Throttle translations if there are too many outstanding
1586 		 * chunks in memory.  The semaphore is sema_v'd by the taskq.
1587 		 *
1588 		 * You can't keep the sid_rwlock if you would go to sleep.
1589 		 * This will result in deadlock when someone tries to delete
1590 		 * the snapshot (wants the sid_rwlock as a writer, but can't
1591 		 * get it).
1592 		 */
1593 		if (throttle_write) {
1594 			if (sema_tryp(&cmap->cmap_throttle_sem) == 0) {
1595 				rw_exit(&sidp->sid_rwlock);
1596 				atomic_add_32(&cmap->cmap_waiters, 1);
1597 				sema_p(&cmap->cmap_throttle_sem);
1598 				atomic_add_32(&cmap->cmap_waiters, -1);
1599 				rw_enter(&sidp->sid_rwlock, RW_READER);
1600 
1601 			/*
1602 			 * Now since we released the sid_rwlock the state may
1603 			 * have transitioned underneath us. so check that again.
1604 			 */
1605 				if (sidp != *sidpp || SID_INACTIVE(sidp)) {
1606 					sema_v(&cmap->cmap_throttle_sem);
1607 					return (ENXIO);
1608 				}
1609 			}
1610 		}
1611 
1612 		/*
1613 		 * Acquire the lock as a writer and check to see if a
1614 		 * translation has been added in the meantime.
1615 		 */
1616 		rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1617 		if (isset(cmap->cmap_hastrans, cowchunk)) {
1618 			if (throttle_write)
1619 				sema_v(&cmap->cmap_throttle_sem);
1620 			rw_exit(&cmap->cmap_rwlock);
1621 			continue; /* go to the next chunk */
1622 		}
1623 
1624 		/*
1625 		 * read a full chunk of data from the requested offset rounded
1626 		 * down to the nearest chunk size.
1627 		 */
1628 		oldbp = getrbuf(KM_SLEEP);
1629 		oldbp->b_lblkno = cowchunktodb(cmap, cowchunk);
1630 		oldbp->b_edev = wbp->b_edev;
1631 		oldbp->b_bcount = cmap->cmap_chunksz;
1632 		oldbp->b_bufsize = cmap->cmap_chunksz;
1633 		oldbp->b_iodone = NULL;
1634 		oldbp->b_proc = NULL;
1635 		oldbp->b_flags = B_READ;
1636 		oldbp->b_un.b_addr = kmem_alloc(cmap->cmap_chunksz, KM_SLEEP);
1637 
1638 		(void) bdev_strategy(oldbp);
1639 		(void) biowait(oldbp);
1640 
1641 		/*
1642 		 * It's ok to bail in the middle of translating the range
1643 		 * because the extra copy-asides will not hurt anything
1644 		 * (except by using extra space in the backing store).
1645 		 */
1646 		if ((error = geterror(oldbp)) != 0) {
1647 			cmn_err(CE_WARN, "fssnap_translate: error reading "
1648 			    "old data for snapshot %d, chunk %llu, disk block "
1649 			    "%lld, size %lu, error %d.", sidp->sid_snapnumber,
1650 			    cowchunk, oldbp->b_lblkno, oldbp->b_bcount, error);
1651 			kmem_free(oldbp->b_un.b_addr, cmap->cmap_chunksz);
1652 			freerbuf(oldbp);
1653 			rw_exit(&cmap->cmap_rwlock);
1654 			if (throttle_write)
1655 				sema_v(&cmap->cmap_throttle_sem);
1656 			return (error);
1657 		}
1658 
1659 		/*
1660 		 * add the node to the translation table and save a reference
1661 		 * to pass to the taskq for writing out to the backing file
1662 		 */
1663 		cmn = transtbl_add(cmap, cowchunk, oldbp->b_un.b_addr);
1664 		freerbuf(oldbp);
1665 
1666 		/*
1667 		 * Add a reference to the snapshot id so the lower level
1668 		 * processing (ie. the taskq) can get back to the state
1669 		 * information.
1670 		 */
1671 		cmn->cmn_sid = sidp;
1672 		cmn->release_sem = throttle_write;
1673 		setbit(cmap->cmap_hastrans, cowchunk);
1674 
1675 		rw_exit(&cmap->cmap_rwlock);
1676 
1677 		/*
1678 		 * schedule the asynchronous write to the backing file
1679 		 */
1680 		if (cowp->cow_backfile_array != NULL)
1681 			(void) taskq_dispatch(cowp->cow_taskq,
1682 			    fssnap_write_taskq, cmn, TQ_SLEEP);
1683 	}
1684 
1685 	/*
1686 	 * Write new data in place of the old data.  At this point all of the
1687 	 * chunks touched by this write have been copied aside and so the new
1688 	 * data can be written out all at once.
1689 	 */
1690 	(void) bdev_strategy(wbp);
1691 
1692 	return (0);
1693 }
1694 
1695 /*
1696  * fssnap_write_taskq() - write in-memory translations to the backing file
1697  *
1698  *    writes in-memory translations to the backing file asynchronously.  A
1699  *    task is dispatched each time a new translation is created.  The task
1700  *    writes the data to the backing file and removes it from the memory
1701  *    list. The throttling semaphore is released only if the particular
1702  *    translation was throttled in fssnap_translate.
1703  */
1704 static void
1705 fssnap_write_taskq(void *arg)
1706 {
1707 	cow_map_node_t	*cmn = (cow_map_node_t *)arg;
1708 	snapshot_id_t	*sidp = cmn->cmn_sid;
1709 	cow_info_t	*cowp = sidp->sid_cowinfo;
1710 	cow_map_t	*cmap = &cowp->cow_map;
1711 	int		error;
1712 	int		bf_index;
1713 	int		release_sem = cmn->release_sem;
1714 
1715 	/*
1716 	 * The sid_rwlock does not need to be held here because the taskqs
1717 	 * are destroyed explicitly by fssnap_delete (with the sid_rwlock
1718 	 * held as a writer).  taskq_destroy() will flush all of the tasks
1719 	 * out before fssnap_delete frees up all of the structures.
1720 	 */
1721 
1722 	/* if the snapshot was disabled from under us, drop the request. */
1723 	rw_enter(&sidp->sid_rwlock, RW_READER);
1724 	if (SID_INACTIVE(sidp)) {
1725 		rw_exit(&sidp->sid_rwlock);
1726 		if (release_sem)
1727 			sema_v(&cmap->cmap_throttle_sem);
1728 		return;
1729 	}
1730 	rw_exit(&sidp->sid_rwlock);
1731 
1732 	atomic_add_64((uint64_t *)&cmap->cmap_nchunks, 1);
1733 
1734 	if ((cmap->cmap_maxsize != 0) &&
1735 	    ((cmap->cmap_nchunks * cmap->cmap_chunksz) > cmap->cmap_maxsize)) {
1736 		cmn_err(CE_WARN, "fssnap_write_taskq: snapshot %d (%s) has "
1737 		    "reached the maximum backing file size specified (%llu "
1738 		    "bytes) and will be deleted.", sidp->sid_snapnumber,
1739 		    (char *)cowp->cow_kstat_mntpt->ks_data,
1740 		    cmap->cmap_maxsize);
1741 		if (release_sem)
1742 			sema_v(&cmap->cmap_throttle_sem);
1743 		atomic_or_uint(&sidp->sid_flags, SID_DELETE);
1744 		return;
1745 	}
1746 
1747 	/* perform the write */
1748 	bf_index = cmn->cmn_chunk / cmap->cmap_chunksperbf;
1749 
1750 	if (error = vn_rdwr(UIO_WRITE, (cowp->cow_backfile_array)[bf_index],
1751 	    cmn->cmn_buf, cmap->cmap_chunksz,
1752 	    (cmn->cmn_chunk % cmap->cmap_chunksperbf) * cmap->cmap_chunksz,
1753 	    UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, (ssize_t *)NULL)) {
1754 		cmn_err(CE_WARN, "fssnap_write_taskq: error writing to "
1755 		    "backing file.  DELETING SNAPSHOT %d, backing file path "
1756 		    "%s, offset %llu bytes, error %d.", sidp->sid_snapnumber,
1757 		    (char *)cowp->cow_kstat_bfname->ks_data,
1758 		    cmn->cmn_chunk * cmap->cmap_chunksz, error);
1759 		if (release_sem)
1760 			sema_v(&cmap->cmap_throttle_sem);
1761 		atomic_or_uint(&sidp->sid_flags, SID_DELETE);
1762 		return;
1763 	}
1764 
1765 	/*
1766 	 * now remove the node and buffer from memory
1767 	 */
1768 	rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1769 	transtbl_delete(cmap, cmn);
1770 	rw_exit(&cmap->cmap_rwlock);
1771 
1772 	/* Allow more translations */
1773 	if (release_sem)
1774 		sema_v(&cmap->cmap_throttle_sem);
1775 
1776 }
1777 
1778 /*
1779  * fssnap_create_impl() - called from the file system to create a new snapshot
1780  *
1781  *    allocates and initializes the structures needed for a new snapshot.
1782  *    This is called by the file system when it receives an ioctl request to
1783  *    create a new snapshot.  An unused snapshot identifier is either found
1784  *    or created, and eventually returned as the opaque handle the file
1785  *    system will use to identify this snapshot.  The snapshot number
1786  *    associated with the snapshot identifier is the same as the minor
1787  *    number for the snapshot device that is used to access that snapshot.
1788  *
1789  *    The snapshot can not be used until the candidate bitmap is populated
1790  *    by the file system (see fssnap_set_candidate_impl()), and the file
1791  *    system finishes the setup process by calling fssnap_create_done().
1792  *    Nearly all of the snapshot locks are held for the duration of the
1793  *    create, and are not released until fssnap_create_done is called().
1794  */
1795 static void *
1796 fssnap_create_impl(chunknumber_t nchunks, uint_t chunksz, u_offset_t maxsize,
1797     struct vnode *fsvp, int backfilecount, struct vnode **bfvpp, char *backpath,
1798     u_offset_t max_backfile_size)
1799 {
1800 	refstr_t *mountpoint;
1801 	char taskqname[50];
1802 	struct cow_info *cowp;
1803 	struct cow_map	*cmap;
1804 	struct snapshot_id *sidp;
1805 	int lastsnap;
1806 
1807 	/*
1808 	 * Sanity check the parameters we care about
1809 	 * (we don't care about the informational parameters)
1810 	 */
1811 	if ((nchunks == 0) ||
1812 	    ((chunksz % DEV_BSIZE) != 0) ||
1813 	    (bfvpp == NULL)) {
1814 		return (NULL);
1815 	}
1816 
1817 	/*
1818 	 * Look for unused snapshot identifiers.  Snapshot ids are never
1819 	 * freed, but deleted snapshot ids will be recycled as needed.
1820 	 */
1821 	mutex_enter(&snapshot_mutex);
1822 
1823 findagain:
1824 	lastsnap = 0;
1825 	for (sidp = snapshot; sidp != NULL; sidp = sidp->sid_next) {
1826 		if (sidp->sid_snapnumber > lastsnap)
1827 			lastsnap = sidp->sid_snapnumber;
1828 
1829 		/*
1830 		 * The sid_rwlock is taken as a reader initially so that
1831 		 * activity on each snapshot is not stalled while searching
1832 		 * for a free snapshot id.
1833 		 */
1834 		rw_enter(&sidp->sid_rwlock, RW_READER);
1835 
1836 		/*
1837 		 * If the snapshot has been deleted and nobody is using the
1838 		 * snapshot device than we can reuse this snapshot_id.  If
1839 		 * the snapshot is marked to be deleted (SID_DELETE), then
1840 		 * it hasn't been deleted yet so don't reuse it.
1841 		 */
1842 		if (SID_AVAILABLE(sidp))
1843 			break; /* This spot is unused, so take it */
1844 		rw_exit(&sidp->sid_rwlock);
1845 	}
1846 
1847 	/*
1848 	 * add a new snapshot identifier if there are no deleted
1849 	 * entries.  Since it doesn't matter what order the entries
1850 	 * are in we can just add it to the beginning of the list.
1851 	 */
1852 	if (sidp) {
1853 		if (rw_tryupgrade(&sidp->sid_rwlock) == 0) {
1854 			/* someone else grabbed it as a writer, try again */
1855 			rw_exit(&sidp->sid_rwlock);
1856 			goto findagain;
1857 		}
1858 	} else {
1859 		/* Create a new node if we didn't find an unused one */
1860 		sidp = kmem_alloc(sizeof (struct snapshot_id), KM_SLEEP);
1861 		rw_init(&sidp->sid_rwlock, NULL, RW_DEFAULT, NULL);
1862 		rw_enter(&sidp->sid_rwlock, RW_WRITER);
1863 		sidp->sid_snapnumber = (snapshot == NULL) ? 0 : lastsnap + 1;
1864 		sidp->sid_cowinfo = NULL;
1865 		sidp->sid_flags = 0;
1866 		sidp->sid_next = snapshot;
1867 		snapshot = sidp;
1868 	}
1869 
1870 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
1871 	ASSERT(sidp->sid_cowinfo == NULL);
1872 	ASSERT(sidp->sid_snapnumber <= (lastsnap + 1));
1873 
1874 	sidp->sid_flags |= SID_CREATING;
1875 	/* The root vnode is held until snap_delete_impl() is called */
1876 	VN_HOLD(fsvp);
1877 	sidp->sid_fvp = fsvp;
1878 	num_snapshots++;
1879 
1880 	/* allocate and initialize structures */
1881 
1882 	cowp = kmem_zalloc(sizeof (struct cow_info), KM_SLEEP);
1883 
1884 	cowp->cow_backfile_array = bfvpp;
1885 	cowp->cow_backcount = backfilecount;
1886 	cowp->cow_backfile_sz = max_backfile_size;
1887 
1888 	/*
1889 	 * Initialize task queues for this snapshot.  Only a small number
1890 	 * of threads are required because they will be serialized on the
1891 	 * backing file's reader/writer lock anyway.
1892 	 */
1893 	(void) snprintf(taskqname, sizeof (taskqname), "%s_taskq_%d", snapname,
1894 	    sidp->sid_snapnumber);
1895 	cowp->cow_taskq = taskq_create(taskqname, fssnap_taskq_nthreads,
1896 	    minclsyspri, 1,  fssnap_taskq_maxtasks, 0);
1897 
1898 	/* don't allow tasks to start until after everything is ready */
1899 	taskq_suspend(cowp->cow_taskq);
1900 
1901 	/* initialize translation table */
1902 	cmap = &cowp->cow_map;
1903 	rw_init(&cmap->cmap_rwlock, NULL, RW_DEFAULT, NULL);
1904 	rw_enter(&cmap->cmap_rwlock, RW_WRITER);
1905 
1906 	sema_init(&cmap->cmap_throttle_sem, fssnap_max_mem_chunks, NULL,
1907 	    SEMA_DEFAULT, NULL);
1908 
1909 	cmap->cmap_chunksz = chunksz;
1910 	cmap->cmap_maxsize = maxsize;
1911 	cmap->cmap_chunksperbf = max_backfile_size / chunksz;
1912 
1913 	/*
1914 	 * allocate one bit per chunk for the bitmaps, round up
1915 	 */
1916 	cmap->cmap_bmsize = (nchunks + (NBBY - 1)) / NBBY;
1917 	cmap->cmap_hastrans  = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
1918 	cmap->cmap_candidate = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
1919 
1920 	sidp->sid_cowinfo = cowp;
1921 
1922 	/* initialize kstats for this snapshot */
1923 	mountpoint = vfs_getmntpoint(fsvp->v_vfsp);
1924 	fssnap_create_kstats(sidp, sidp->sid_snapnumber,
1925 	    refstr_value(mountpoint), backpath);
1926 	refstr_rele(mountpoint);
1927 
1928 	mutex_exit(&snapshot_mutex);
1929 
1930 	/*
1931 	 * return with snapshot id rwlock held as a writer until
1932 	 * fssnap_create_done is called
1933 	 */
1934 	return (sidp);
1935 }
1936 
1937 /*
1938  * fssnap_set_candidate_impl() - mark a chunk as a candidate for copy-on-write
1939  *
1940  *    sets a bit in the candidate bitmap that indicates that a chunk is a
1941  *    candidate for copy-on-write.  Typically, chunks that are allocated on
1942  *    the file system at the time the snapshot is taken are candidates,
1943  *    while chunks that have no allocated data do not need to be copied.
1944  *    Chunks containing metadata must be marked as candidates as well.
1945  */
1946 static void
1947 fssnap_set_candidate_impl(void *snapshot_id, chunknumber_t chunknumber)
1948 {
1949 	struct snapshot_id	*sid = snapshot_id;
1950 	struct cow_info *cowp = sid->sid_cowinfo;
1951 	struct cow_map	*cmap = &cowp->cow_map;
1952 
1953 	/* simple bitmap operation for now */
1954 	ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
1955 	setbit(cmap->cmap_candidate, chunknumber);
1956 }
1957 
1958 /*
1959  * fssnap_is_candidate_impl() - check whether a chunk is a candidate
1960  *
1961  *    returns 0 if the chunk is not a candidate and 1 if the chunk is a
1962  *    candidate.  This can be used by the file system to change behavior for
1963  *    chunks that might induce a copy-on-write.  The offset is specified in
1964  *    bytes since the chunk size may not be known by the file system.
1965  */
1966 static int
1967 fssnap_is_candidate_impl(void *snapshot_id, u_offset_t off)
1968 {
1969 	struct snapshot_id	*sid = snapshot_id;
1970 	struct cow_info *cowp = sid->sid_cowinfo;
1971 	struct cow_map	*cmap = &cowp->cow_map;
1972 	ulong_t chunknumber = off / cmap->cmap_chunksz;
1973 
1974 	/* simple bitmap operation for now */
1975 	ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));
1976 	return (isset(cmap->cmap_candidate, chunknumber));
1977 }
1978 
1979 /*
1980  * fssnap_create_done_impl() - complete the snapshot setup process
1981  *
1982  *    called when the file system is done populating the candidate bitmap
1983  *    and it is ready to start using the snapshot.  This routine releases
1984  *    the snapshot locks, allows taskq tasks to start processing, and
1985  *    creates the device minor nodes associated with the snapshot.
1986  */
1987 static int
1988 fssnap_create_done_impl(void *snapshot_id)
1989 {
1990 	struct snapshot_id	**sidpp, *sidp = snapshot_id;
1991 	struct cow_info		*cowp;
1992 	struct cow_map		*cmap;
1993 	int			snapnumber = -1;
1994 	char			name[20];
1995 
1996 	/* sid rwlock and cmap rwlock should be taken from fssnap_create */
1997 	ASSERT(sidp);
1998 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
1999 	ASSERT(sidp->sid_cowinfo);
2000 
2001 	cowp = sidp->sid_cowinfo;
2002 	cmap = &cowp->cow_map;
2003 
2004 	ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
2005 
2006 	sidp->sid_flags &= ~(SID_CREATING | SID_DISABLED);
2007 	snapnumber = sidp->sid_snapnumber;
2008 
2009 	/* allocate state structure and find new snapshot id */
2010 	if (ddi_soft_state_zalloc(statep, snapnumber) != DDI_SUCCESS) {
2011 		cmn_err(CE_WARN,
2012 		    "snap_ioctl: create: could not allocate "
2013 		    "state for snapshot %d.", snapnumber);
2014 		snapnumber = -1;
2015 		goto out;
2016 	}
2017 
2018 	sidpp = ddi_get_soft_state(statep, snapnumber);
2019 	*sidpp = sidp;
2020 
2021 	/* create minor node based on snapshot number */
2022 	ASSERT(fssnap_dip != NULL);
2023 	(void) snprintf(name, sizeof (name), "%d", snapnumber);
2024 	if (ddi_create_minor_node(fssnap_dip, name, S_IFBLK,
2025 	    snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
2026 		cmn_err(CE_WARN, "snap_ioctl: could not create "
2027 		    "block minor node for snapshot %d.", snapnumber);
2028 		snapnumber = -1;
2029 		goto out;
2030 	}
2031 
2032 	(void) snprintf(name, sizeof (name), "%d,raw", snapnumber);
2033 	if (ddi_create_minor_node(fssnap_dip, name, S_IFCHR,
2034 	    snapnumber, DDI_PSEUDO, 0) != DDI_SUCCESS) {
2035 		cmn_err(CE_WARN, "snap_ioctl: could not create "
2036 		    "character minor node for snapshot %d.", snapnumber);
2037 		snapnumber = -1;
2038 	}
2039 
2040 out:
2041 	rw_exit(&sidp->sid_rwlock);
2042 	rw_exit(&cmap->cmap_rwlock);
2043 
2044 	/* let the taskq threads start processing */
2045 	taskq_resume(cowp->cow_taskq);
2046 
2047 	return (snapnumber);
2048 }
2049 
2050 /*
2051  * fssnap_delete_impl() - delete a snapshot
2052  *
2053  *    used when a snapshot is no longer needed.  This is called by the file
2054  *    system when it receives an ioctl request to delete a snapshot.  It is
2055  *    also called internally when error conditions such as disk full, errors
2056  *    writing to the backing file, or backing file maxsize exceeded occur.
2057  *    If the snapshot device is busy when the delete request is received,
2058  *    all state will be deleted except for the soft state and device files
2059  *    associated with the snapshot; they will be deleted when the snapshot
2060  *    device is closed.
2061  *
2062  *    NOTE this function takes a POINTER TO A POINTER to the snapshot id,
2063  *    and expects to be able to set the handle held by the file system to
2064  *    NULL.  This depends on the file system checking that variable for NULL
2065  *    before calling fssnap_strategy().
2066  */
2067 static int
2068 fssnap_delete_impl(void *snapshot_id)
2069 {
2070 	struct snapshot_id	**sidpp = (struct snapshot_id **)snapshot_id;
2071 	struct snapshot_id	*sidp;
2072 	struct snapshot_id	**statesidpp;
2073 	struct cow_info		*cowp;
2074 	struct cow_map		*cmap;
2075 	char			name[20];
2076 	int			snapnumber = -1;
2077 	vnode_t			**vpp;
2078 
2079 	/*
2080 	 * sidp is guaranteed to be valid if sidpp is valid because
2081 	 * the snapshot list is append-only.
2082 	 */
2083 	if (sidpp == NULL) {
2084 		return (-1);
2085 	}
2086 
2087 	sidp = *sidpp;
2088 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
2089 
2090 	ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
2091 
2092 	/*
2093 	 * double check that the snapshot is still valid for THIS file system
2094 	 */
2095 	if (*sidpp == NULL) {
2096 		rw_exit(&sidp->sid_rwlock);
2097 		return (-1);
2098 	}
2099 
2100 	/*
2101 	 * Now we know the snapshot is still valid and will not go away
2102 	 * because we have the write lock.  Once the state is transitioned
2103 	 * to "disabling", the sid_rwlock can be released.  Any pending I/O
2104 	 * waiting for the lock as a reader will check for this state and
2105 	 * abort without touching data that may be getting freed.
2106 	 */
2107 	sidp->sid_flags |= SID_DISABLING;
2108 	if (sidp->sid_flags & SID_DELETE) {
2109 		cmn_err(CE_WARN, "Snapshot %d automatically deleted.",
2110 		    sidp->sid_snapnumber);
2111 		sidp->sid_flags &= ~(SID_DELETE);
2112 	}
2113 
2114 
2115 	/*
2116 	 * This is pointing into file system specific data!  The assumption is
2117 	 * that fssnap_strategy() gets called from the file system based on
2118 	 * whether this reference to the snapshot_id is NULL or not.  So
2119 	 * setting this to NULL should disable snapshots for the file system.
2120 	 */
2121 	*sidpp = NULL;
2122 
2123 	/* remove cowinfo */
2124 	cowp = sidp->sid_cowinfo;
2125 	if (cowp == NULL) {
2126 		rw_exit(&sidp->sid_rwlock);
2127 		return (-1);
2128 	}
2129 	rw_exit(&sidp->sid_rwlock);
2130 
2131 	/* destroy task queues first so they don't reference freed data. */
2132 	if (cowp->cow_taskq) {
2133 		taskq_destroy(cowp->cow_taskq);
2134 		cowp->cow_taskq = NULL;
2135 	}
2136 
2137 	if (cowp->cow_backfile_array != NULL) {
2138 		for (vpp = cowp->cow_backfile_array; *vpp; vpp++)
2139 			VN_RELE(*vpp);
2140 		kmem_free(cowp->cow_backfile_array,
2141 		    (cowp->cow_backcount + 1) * sizeof (vnode_t *));
2142 		cowp->cow_backfile_array = NULL;
2143 	}
2144 
2145 	sidp->sid_cowinfo = NULL;
2146 
2147 	/* remove cmap */
2148 	cmap = &cowp->cow_map;
2149 	ASSERT(cmap);
2150 
2151 	if (cmap->cmap_candidate)
2152 		kmem_free(cmap->cmap_candidate, cmap->cmap_bmsize);
2153 
2154 	if (cmap->cmap_hastrans)
2155 		kmem_free(cmap->cmap_hastrans, cmap->cmap_bmsize);
2156 
2157 	if (cmap->cmap_table)
2158 		transtbl_free(&cowp->cow_map);
2159 
2160 	rw_destroy(&cmap->cmap_rwlock);
2161 
2162 	while (cmap->cmap_waiters) {
2163 		sema_p(&cmap->cmap_throttle_sem);
2164 		sema_v(&cmap->cmap_throttle_sem);
2165 	}
2166 	sema_destroy(&cmap->cmap_throttle_sem);
2167 
2168 	/* remove kstats */
2169 	fssnap_delete_kstats(cowp);
2170 
2171 	kmem_free(cowp, sizeof (struct cow_info));
2172 
2173 	statesidpp = ddi_get_soft_state(statep, sidp->sid_snapnumber);
2174 	if (statesidpp == NULL || *statesidpp == NULL) {
2175 		cmn_err(CE_WARN,
2176 		    "fssnap_delete_impl: could not find state for snapshot %d.",
2177 		    sidp->sid_snapnumber);
2178 	}
2179 	ASSERT(*statesidpp == sidp);
2180 
2181 	/*
2182 	 * Leave the node in the list marked DISABLED so it can be reused
2183 	 * and avoid many race conditions.  Return the snapshot number
2184 	 * that was deleted.
2185 	 */
2186 	mutex_enter(&snapshot_mutex);
2187 	rw_enter(&sidp->sid_rwlock, RW_WRITER);
2188 	sidp->sid_flags &= ~(SID_DISABLING);
2189 	sidp->sid_flags |= SID_DISABLED;
2190 	VN_RELE(sidp->sid_fvp);
2191 	sidp->sid_fvp = NULL;
2192 	snapnumber = sidp->sid_snapnumber;
2193 
2194 	/*
2195 	 * If the snapshot is not busy, free the device info now.  Otherwise
2196 	 * the device nodes are freed in snap_close() when the device is
2197 	 * closed.  The sid will not be reused until the device is not busy.
2198 	 */
2199 	if (SID_AVAILABLE(sidp)) {
2200 		/* remove the device nodes */
2201 		ASSERT(fssnap_dip != NULL);
2202 		(void) snprintf(name, sizeof (name), "%d",
2203 		    sidp->sid_snapnumber);
2204 		ddi_remove_minor_node(fssnap_dip, name);
2205 		(void) snprintf(name, sizeof (name), "%d,raw",
2206 		    sidp->sid_snapnumber);
2207 		ddi_remove_minor_node(fssnap_dip, name);
2208 
2209 		/* delete the state structure */
2210 		ddi_soft_state_free(statep, sidp->sid_snapnumber);
2211 		num_snapshots--;
2212 	}
2213 
2214 	mutex_exit(&snapshot_mutex);
2215 	rw_exit(&sidp->sid_rwlock);
2216 
2217 	return (snapnumber);
2218 }
2219 
2220 /*
2221  * fssnap_create_kstats() - allocate and initialize snapshot kstats
2222  *
2223  */
2224 static void
2225 fssnap_create_kstats(snapshot_id_t *sidp, int snapnum,
2226     const char *mountpoint, const char *backfilename)
2227 {
2228 	kstat_t *num, *mntpoint, *bfname;
2229 	kstat_named_t *hw;
2230 	struct cow_info *cowp = sidp->sid_cowinfo;
2231 	struct cow_kstat_num *stats;
2232 
2233 	/* update the high water mark */
2234 	if (fssnap_highwater_kstat == NULL) {
2235 		cmn_err(CE_WARN, "fssnap_create_kstats: failed to lookup "
2236 		    "high water mark kstat.");
2237 		return;
2238 	}
2239 
2240 	hw = (kstat_named_t *)fssnap_highwater_kstat->ks_data;
2241 	if (hw->value.ui32 < snapnum)
2242 		hw->value.ui32 = snapnum;
2243 
2244 	/* initialize the mount point kstat */
2245 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_MNTPT);
2246 
2247 	if (mountpoint != NULL) {
2248 		mntpoint = kstat_create(snapname, snapnum, FSSNAP_KSTAT_MNTPT,
2249 		    "misc", KSTAT_TYPE_RAW, strlen(mountpoint) + 1, 0);
2250 		if (mntpoint == NULL) {
2251 			cowp->cow_kstat_mntpt = NULL;
2252 			cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
2253 			    "create mount point kstat");
2254 		} else {
2255 			(void) strncpy(mntpoint->ks_data, mountpoint,
2256 			    strlen(mountpoint));
2257 			cowp->cow_kstat_mntpt = mntpoint;
2258 			kstat_install(mntpoint);
2259 		}
2260 	} else {
2261 		cowp->cow_kstat_mntpt = NULL;
2262 		cmn_err(CE_WARN, "fssnap_create_kstats: mount point not "
2263 		    "specified.");
2264 	}
2265 
2266 	/* initialize the backing file kstat */
2267 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_BFNAME);
2268 
2269 	if (backfilename == NULL) {
2270 		cowp->cow_kstat_bfname = NULL;
2271 	} else {
2272 		bfname = kstat_create(snapname, snapnum, FSSNAP_KSTAT_BFNAME,
2273 		    "misc", KSTAT_TYPE_RAW, strlen(backfilename) + 1, 0);
2274 		if (bfname != NULL) {
2275 			(void) strncpy(bfname->ks_data, backfilename,
2276 			    strlen(backfilename));
2277 			cowp->cow_kstat_bfname = bfname;
2278 			kstat_install(bfname);
2279 		} else {
2280 			cowp->cow_kstat_bfname = NULL;
2281 			cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
2282 			    "create backing file name kstat");
2283 		}
2284 	}
2285 
2286 	/* initialize numeric kstats */
2287 	kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_NUM);
2288 
2289 	num = kstat_create(snapname, snapnum, FSSNAP_KSTAT_NUM,
2290 	    "misc", KSTAT_TYPE_NAMED,
2291 	    sizeof (struct cow_kstat_num) / sizeof (kstat_named_t),
2292 	    0);
2293 	if (num == NULL) {
2294 		cmn_err(CE_WARN, "fssnap_create_kstats: failed to create "
2295 		    "numeric kstats");
2296 		cowp->cow_kstat_num = NULL;
2297 		return;
2298 	}
2299 
2300 	cowp->cow_kstat_num = num;
2301 	stats = num->ks_data;
2302 	num->ks_update = fssnap_update_kstat_num;
2303 	num->ks_private = sidp;
2304 
2305 	kstat_named_init(&stats->ckn_state, FSSNAP_KSTAT_NUM_STATE,
2306 	    KSTAT_DATA_INT32);
2307 	kstat_named_init(&stats->ckn_bfsize, FSSNAP_KSTAT_NUM_BFSIZE,
2308 	    KSTAT_DATA_UINT64);
2309 	kstat_named_init(&stats->ckn_maxsize, FSSNAP_KSTAT_NUM_MAXSIZE,
2310 	    KSTAT_DATA_UINT64);
2311 	kstat_named_init(&stats->ckn_createtime, FSSNAP_KSTAT_NUM_CREATETIME,
2312 	    KSTAT_DATA_LONG);
2313 	kstat_named_init(&stats->ckn_chunksize, FSSNAP_KSTAT_NUM_CHUNKSIZE,
2314 	    KSTAT_DATA_UINT32);
2315 
2316 	/* initialize the static kstats */
2317 	stats->ckn_chunksize.value.ui32 = cowp->cow_map.cmap_chunksz;
2318 	stats->ckn_maxsize.value.ui64 = cowp->cow_map.cmap_maxsize;
2319 	stats->ckn_createtime.value.l = gethrestime_sec();
2320 
2321 	kstat_install(num);
2322 }
2323 
2324 /*
2325  * fssnap_update_kstat_num() - update a numerical snapshot kstat value
2326  *
2327  */
2328 int
2329 fssnap_update_kstat_num(kstat_t *ksp, int rw)
2330 {
2331 	snapshot_id_t *sidp = (snapshot_id_t *)ksp->ks_private;
2332 	struct cow_info *cowp = sidp->sid_cowinfo;
2333 	struct cow_kstat_num *stats = ksp->ks_data;
2334 
2335 	if (rw == KSTAT_WRITE)
2336 		return (EACCES);
2337 
2338 	/* state */
2339 	if (sidp->sid_flags & SID_CREATING)
2340 		stats->ckn_state.value.i32 = COWSTATE_CREATING;
2341 	else if (SID_INACTIVE(sidp))
2342 		stats->ckn_state.value.i32 = COWSTATE_DISABLED;
2343 	else if (SID_BUSY(sidp))
2344 		stats->ckn_state.value.i32 = COWSTATE_ACTIVE;
2345 	else
2346 		stats->ckn_state.value.i32 = COWSTATE_IDLE;
2347 
2348 	/* bfsize */
2349 	stats->ckn_bfsize.value.ui64 = cowp->cow_map.cmap_nchunks *
2350 	    cowp->cow_map.cmap_chunksz;
2351 
2352 	return (0);
2353 }
2354 
2355 /*
2356  * fssnap_delete_kstats() - deallocate snapshot kstats
2357  *
2358  */
2359 void
2360 fssnap_delete_kstats(struct cow_info *cowp)
2361 {
2362 	if (cowp->cow_kstat_num != NULL) {
2363 		kstat_delete(cowp->cow_kstat_num);
2364 		cowp->cow_kstat_num = NULL;
2365 	}
2366 	if (cowp->cow_kstat_mntpt != NULL) {
2367 		kstat_delete(cowp->cow_kstat_mntpt);
2368 		cowp->cow_kstat_mntpt = NULL;
2369 	}
2370 	if (cowp->cow_kstat_bfname != NULL) {
2371 		kstat_delete(cowp->cow_kstat_bfname);
2372 		cowp->cow_kstat_bfname = NULL;
2373 	}
2374 }
2375