xref: /illumos-gate/usr/src/uts/common/fs/zfs/zvol.c (revision a2eea2e101e6a163a537dcc6d4e3c4da2a0ea5b2)
1fa9e4066Sahrens /*
2fa9e4066Sahrens  * CDDL HEADER START
3fa9e4066Sahrens  *
4fa9e4066Sahrens  * The contents of this file are subject to the terms of the
5ea8dc4b6Seschrock  * Common Development and Distribution License (the "License").
6ea8dc4b6Seschrock  * You may not use this file except in compliance with the License.
7fa9e4066Sahrens  *
8fa9e4066Sahrens  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9fa9e4066Sahrens  * or http://www.opensolaris.org/os/licensing.
10fa9e4066Sahrens  * See the License for the specific language governing permissions
11fa9e4066Sahrens  * and limitations under the License.
12fa9e4066Sahrens  *
13fa9e4066Sahrens  * When distributing Covered Code, include this CDDL HEADER in each
14fa9e4066Sahrens  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15fa9e4066Sahrens  * If applicable, add the following below this CDDL HEADER, with the
16fa9e4066Sahrens  * fields enclosed by brackets "[]" replaced with your own identifying
17fa9e4066Sahrens  * information: Portions Copyright [yyyy] [name of copyright owner]
18fa9e4066Sahrens  *
19fa9e4066Sahrens  * CDDL HEADER END
20fa9e4066Sahrens  */
21fa9e4066Sahrens /*
227f7322feSeschrock  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23fa9e4066Sahrens  * Use is subject to license terms.
24fa9e4066Sahrens  */
25fa9e4066Sahrens 
26fa9e4066Sahrens #pragma ident	"%Z%%M%	%I%	%E% SMI"
27fa9e4066Sahrens 
28fa9e4066Sahrens /*
29fa9e4066Sahrens  * ZFS volume emulation driver.
30fa9e4066Sahrens  *
31fa9e4066Sahrens  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
32fa9e4066Sahrens  * Volumes are accessed through the symbolic links named:
33fa9e4066Sahrens  *
34fa9e4066Sahrens  * /dev/zvol/dsk/<pool_name>/<dataset_name>
35fa9e4066Sahrens  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
36fa9e4066Sahrens  *
37fa9e4066Sahrens  * These links are created by the ZFS-specific devfsadm link generator.
38fa9e4066Sahrens  * Volumes are persistent through reboot.  No user command needs to be
39fa9e4066Sahrens  * run before opening and using a device.
40fa9e4066Sahrens  */
41fa9e4066Sahrens 
42fa9e4066Sahrens #include <sys/types.h>
43fa9e4066Sahrens #include <sys/param.h>
44fa9e4066Sahrens #include <sys/errno.h>
45fa9e4066Sahrens #include <sys/aio_req.h>
46fa9e4066Sahrens #include <sys/uio.h>
47fa9e4066Sahrens #include <sys/buf.h>
48fa9e4066Sahrens #include <sys/modctl.h>
49fa9e4066Sahrens #include <sys/open.h>
50fa9e4066Sahrens #include <sys/kmem.h>
51fa9e4066Sahrens #include <sys/conf.h>
52fa9e4066Sahrens #include <sys/cmn_err.h>
53fa9e4066Sahrens #include <sys/stat.h>
54fa9e4066Sahrens #include <sys/zap.h>
55fa9e4066Sahrens #include <sys/spa.h>
56fa9e4066Sahrens #include <sys/zio.h>
57fa9e4066Sahrens #include <sys/dsl_prop.h>
58fa9e4066Sahrens #include <sys/dkio.h>
59fa9e4066Sahrens #include <sys/efi_partition.h>
60fa9e4066Sahrens #include <sys/byteorder.h>
61fa9e4066Sahrens #include <sys/pathname.h>
62fa9e4066Sahrens #include <sys/ddi.h>
63fa9e4066Sahrens #include <sys/sunddi.h>
64fa9e4066Sahrens #include <sys/crc32.h>
65fa9e4066Sahrens #include <sys/dirent.h>
66fa9e4066Sahrens #include <sys/policy.h>
67fa9e4066Sahrens #include <sys/fs/zfs.h>
68fa9e4066Sahrens #include <sys/zfs_ioctl.h>
69fa9e4066Sahrens #include <sys/mkdev.h>
7022ac5be4Sperrin #include <sys/zil.h>
71c5c6ffa0Smaybee #include <sys/refcount.h>
72fa9e4066Sahrens 
73fa9e4066Sahrens #include "zfs_namecheck.h"
74fa9e4066Sahrens 
75fa9e4066Sahrens #define	ZVOL_OBJ		1ULL
76fa9e4066Sahrens #define	ZVOL_ZAP_OBJ		2ULL
77fa9e4066Sahrens 
78fa9e4066Sahrens static void *zvol_state;
79fa9e4066Sahrens 
80fa9e4066Sahrens /*
81fa9e4066Sahrens  * This lock protects the zvol_state structure from being modified
82fa9e4066Sahrens  * while it's being used, e.g. an open that comes in before a create
83fa9e4066Sahrens  * finishes.  It also protects temporary opens of the dataset so that,
84fa9e4066Sahrens  * e.g., an open doesn't get a spurious EBUSY.
85fa9e4066Sahrens  */
86fa9e4066Sahrens static kmutex_t zvol_state_lock;
87fa9e4066Sahrens static uint32_t zvol_minors;
88fa9e4066Sahrens 
89fa9e4066Sahrens /*
90fa9e4066Sahrens  * The in-core state of each volume.
91fa9e4066Sahrens  */
92fa9e4066Sahrens typedef struct zvol_state {
93fa9e4066Sahrens 	char		zv_name[MAXPATHLEN]; /* pool/dd name */
94fa9e4066Sahrens 	uint64_t	zv_volsize;	/* amount of space we advertise */
95fa9e4066Sahrens 	minor_t		zv_minor;	/* minor number */
96fa9e4066Sahrens 	uint8_t		zv_min_bs;	/* minimum addressable block shift */
97fa9e4066Sahrens 	uint8_t		zv_readonly;	/* hard readonly; like write-protect */
98fa9e4066Sahrens 	objset_t	*zv_objset;	/* objset handle */
99fa9e4066Sahrens 	uint32_t	zv_mode;	/* DS_MODE_* flags at open time */
100fa9e4066Sahrens 	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
101fa9e4066Sahrens 	uint32_t	zv_total_opens;	/* total open count */
10222ac5be4Sperrin 	zilog_t		*zv_zilog;	/* ZIL handle */
10322ac5be4Sperrin 	uint64_t	zv_txg_assign;	/* txg to assign during ZIL replay */
104a24e15ceSperrin 	krwlock_t	zv_dslock;	/* dmu_sync() rwlock */
105fa9e4066Sahrens } zvol_state_t;
106fa9e4066Sahrens 
107fa9e4066Sahrens static void
108fa9e4066Sahrens zvol_size_changed(zvol_state_t *zv, dev_t dev)
109fa9e4066Sahrens {
110fa9e4066Sahrens 	dev = makedevice(getmajor(dev), zv->zv_minor);
111fa9e4066Sahrens 
112fa9e4066Sahrens 	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
113fa9e4066Sahrens 	    "Size", zv->zv_volsize) == DDI_SUCCESS);
114fa9e4066Sahrens 	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
115fa9e4066Sahrens 	    "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS);
116fa9e4066Sahrens }
117fa9e4066Sahrens 
/*
 * Validate a proposed volume size against the volume block size.
 *
 * Returns:
 *	0		volsize is acceptable
 *	EINVAL		volsize is zero, blocksize is zero, or volsize is
 *			not a whole multiple of blocksize
 *	EOVERFLOW	(ILP32 kernels only) the last byte of the volume
 *			would lie beyond SPEC_MAXOFFSET_T
 */
int
zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
{
	/*
	 * A zero blocksize must be rejected before the modulo below;
	 * otherwise it is a divide-by-zero.
	 */
	if (volsize == 0 || blocksize == 0)
		return (EINVAL);

	if (volsize % blocksize != 0)
		return (EINVAL);

#ifdef _ILP32
	if (volsize - 1 > SPEC_MAXOFFSET_T)
		return (EOVERFLOW);
#endif
	return (0);
}
133fa9e4066Sahrens 
134fa9e4066Sahrens int
135e9dbad6fSeschrock zvol_check_volblocksize(uint64_t volblocksize)
136fa9e4066Sahrens {
137e9dbad6fSeschrock 	if (volblocksize < SPA_MINBLOCKSIZE ||
138e9dbad6fSeschrock 	    volblocksize > SPA_MAXBLOCKSIZE ||
139e9dbad6fSeschrock 	    !ISP2(volblocksize))
140fa9e4066Sahrens 		return (EDOM);
141fa9e4066Sahrens 
142fa9e4066Sahrens 	return (0);
143fa9e4066Sahrens }
144fa9e4066Sahrens 
145fa9e4066Sahrens static void
146fa9e4066Sahrens zvol_readonly_changed_cb(void *arg, uint64_t newval)
147fa9e4066Sahrens {
148fa9e4066Sahrens 	zvol_state_t *zv = arg;
149fa9e4066Sahrens 
150fa9e4066Sahrens 	zv->zv_readonly = (uint8_t)newval;
151fa9e4066Sahrens }
152fa9e4066Sahrens 
153fa9e4066Sahrens int
154*a2eea2e1Sahrens zvol_get_stats(objset_t *os, nvlist_t *nv)
155fa9e4066Sahrens {
156fa9e4066Sahrens 	int error;
157fa9e4066Sahrens 	dmu_object_info_t doi;
158*a2eea2e1Sahrens 	uint64_t val;
159fa9e4066Sahrens 
160fa9e4066Sahrens 
161*a2eea2e1Sahrens 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
162fa9e4066Sahrens 	if (error)
163fa9e4066Sahrens 		return (error);
164fa9e4066Sahrens 
165*a2eea2e1Sahrens 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
166*a2eea2e1Sahrens 
167fa9e4066Sahrens 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
168fa9e4066Sahrens 
169*a2eea2e1Sahrens 	if (error == 0) {
170*a2eea2e1Sahrens 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
171*a2eea2e1Sahrens 		    doi.doi_data_block_size);
172*a2eea2e1Sahrens 	}
173fa9e4066Sahrens 
174fa9e4066Sahrens 	return (error);
175fa9e4066Sahrens }
176fa9e4066Sahrens 
177fa9e4066Sahrens /*
178fa9e4066Sahrens  * Find a free minor number.
179fa9e4066Sahrens  */
180fa9e4066Sahrens static minor_t
181fa9e4066Sahrens zvol_minor_alloc(void)
182fa9e4066Sahrens {
183fa9e4066Sahrens 	minor_t minor;
184fa9e4066Sahrens 
185fa9e4066Sahrens 	ASSERT(MUTEX_HELD(&zvol_state_lock));
186fa9e4066Sahrens 
187fa9e4066Sahrens 	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++)
188fa9e4066Sahrens 		if (ddi_get_soft_state(zvol_state, minor) == NULL)
189fa9e4066Sahrens 			return (minor);
190fa9e4066Sahrens 
191fa9e4066Sahrens 	return (0);
192fa9e4066Sahrens }
193fa9e4066Sahrens 
194fa9e4066Sahrens static zvol_state_t *
195e9dbad6fSeschrock zvol_minor_lookup(const char *name)
196fa9e4066Sahrens {
197fa9e4066Sahrens 	minor_t minor;
198fa9e4066Sahrens 	zvol_state_t *zv;
199fa9e4066Sahrens 
200fa9e4066Sahrens 	ASSERT(MUTEX_HELD(&zvol_state_lock));
201fa9e4066Sahrens 
202fa9e4066Sahrens 	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) {
203fa9e4066Sahrens 		zv = ddi_get_soft_state(zvol_state, minor);
204fa9e4066Sahrens 		if (zv == NULL)
205fa9e4066Sahrens 			continue;
206fa9e4066Sahrens 		if (strcmp(zv->zv_name, name) == 0)
207fa9e4066Sahrens 			break;
208fa9e4066Sahrens 	}
209fa9e4066Sahrens 
210fa9e4066Sahrens 	return (zv);
211fa9e4066Sahrens }
212fa9e4066Sahrens 
213fa9e4066Sahrens void
214fa9e4066Sahrens zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
215fa9e4066Sahrens {
216e9dbad6fSeschrock 	zfs_create_data_t *zc = arg;
217fa9e4066Sahrens 	int error;
218e9dbad6fSeschrock 	uint64_t volblocksize, volsize;
219fa9e4066Sahrens 
220e9dbad6fSeschrock 	VERIFY(nvlist_lookup_uint64(zc->zc_props,
221e9dbad6fSeschrock 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
222e9dbad6fSeschrock 	if (nvlist_lookup_uint64(zc->zc_props,
223e9dbad6fSeschrock 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
224e9dbad6fSeschrock 		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);
225e9dbad6fSeschrock 
226e9dbad6fSeschrock 	/*
227e9dbad6fSeschrock 	 * These properites must be removed from the list so the generic
228e9dbad6fSeschrock 	 * property setting step won't apply to them.
229e9dbad6fSeschrock 	 */
230e9dbad6fSeschrock 	VERIFY(nvlist_remove_all(zc->zc_props,
231e9dbad6fSeschrock 	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
232e9dbad6fSeschrock 	(void) nvlist_remove_all(zc->zc_props,
233e9dbad6fSeschrock 	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));
234e9dbad6fSeschrock 
235e9dbad6fSeschrock 	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
236fa9e4066Sahrens 	    DMU_OT_NONE, 0, tx);
237fa9e4066Sahrens 	ASSERT(error == 0);
238fa9e4066Sahrens 
239fa9e4066Sahrens 	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
240fa9e4066Sahrens 	    DMU_OT_NONE, 0, tx);
241fa9e4066Sahrens 	ASSERT(error == 0);
242fa9e4066Sahrens 
243e9dbad6fSeschrock 	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
244fa9e4066Sahrens 	ASSERT(error == 0);
245fa9e4066Sahrens }
246fa9e4066Sahrens 
247fa9e4066Sahrens /*
24822ac5be4Sperrin  * Replay a TX_WRITE ZIL transaction that didn't get committed
24922ac5be4Sperrin  * after a system failure
25022ac5be4Sperrin  */
25122ac5be4Sperrin static int
25222ac5be4Sperrin zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
25322ac5be4Sperrin {
25422ac5be4Sperrin 	objset_t *os = zv->zv_objset;
25522ac5be4Sperrin 	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
25622ac5be4Sperrin 	uint64_t off = lr->lr_offset;
25722ac5be4Sperrin 	uint64_t len = lr->lr_length;
25822ac5be4Sperrin 	dmu_tx_t *tx;
25922ac5be4Sperrin 	int error;
26022ac5be4Sperrin 
26122ac5be4Sperrin 	if (byteswap)
26222ac5be4Sperrin 		byteswap_uint64_array(lr, sizeof (*lr));
26322ac5be4Sperrin 
26422ac5be4Sperrin 	tx = dmu_tx_create(os);
26522ac5be4Sperrin 	dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
26622ac5be4Sperrin 	error = dmu_tx_assign(tx, zv->zv_txg_assign);
26722ac5be4Sperrin 	if (error) {
26822ac5be4Sperrin 		dmu_tx_abort(tx);
26922ac5be4Sperrin 	} else {
27022ac5be4Sperrin 		dmu_write(os, ZVOL_OBJ, off, len, data, tx);
27122ac5be4Sperrin 		dmu_tx_commit(tx);
27222ac5be4Sperrin 	}
27322ac5be4Sperrin 
27422ac5be4Sperrin 	return (error);
27522ac5be4Sperrin }
27622ac5be4Sperrin 
27722ac5be4Sperrin /* ARGSUSED */
27822ac5be4Sperrin static int
27922ac5be4Sperrin zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
28022ac5be4Sperrin {
28122ac5be4Sperrin 	return (ENOTSUP);
28222ac5be4Sperrin }
28322ac5be4Sperrin 
28422ac5be4Sperrin /*
28522ac5be4Sperrin  * Callback vectors for replaying records.
28622ac5be4Sperrin  * Only TX_WRITE is needed for zvol.
28722ac5be4Sperrin  */
28822ac5be4Sperrin zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
28922ac5be4Sperrin 	zvol_replay_err,	/* 0 no such transaction type */
29022ac5be4Sperrin 	zvol_replay_err,	/* TX_CREATE */
29122ac5be4Sperrin 	zvol_replay_err,	/* TX_MKDIR */
29222ac5be4Sperrin 	zvol_replay_err,	/* TX_MKXATTR */
29322ac5be4Sperrin 	zvol_replay_err,	/* TX_SYMLINK */
29422ac5be4Sperrin 	zvol_replay_err,	/* TX_REMOVE */
29522ac5be4Sperrin 	zvol_replay_err,	/* TX_RMDIR */
29622ac5be4Sperrin 	zvol_replay_err,	/* TX_LINK */
29722ac5be4Sperrin 	zvol_replay_err,	/* TX_RENAME */
29822ac5be4Sperrin 	zvol_replay_write,	/* TX_WRITE */
29922ac5be4Sperrin 	zvol_replay_err,	/* TX_TRUNCATE */
30022ac5be4Sperrin 	zvol_replay_err,	/* TX_SETATTR */
30122ac5be4Sperrin 	zvol_replay_err,	/* TX_ACL */
30222ac5be4Sperrin };
30322ac5be4Sperrin 
30422ac5be4Sperrin /*
305fa9e4066Sahrens  * Create a minor node for the specified volume.
306fa9e4066Sahrens  */
307fa9e4066Sahrens int
308e9dbad6fSeschrock zvol_create_minor(const char *name, dev_t dev)
309fa9e4066Sahrens {
310fa9e4066Sahrens 	zvol_state_t *zv;
311fa9e4066Sahrens 	objset_t *os;
312fa9e4066Sahrens 	uint64_t volsize;
313fa9e4066Sahrens 	minor_t minor = 0;
314fa9e4066Sahrens 	struct pathname linkpath;
315fa9e4066Sahrens 	int ds_mode = DS_MODE_PRIMARY;
316fa9e4066Sahrens 	vnode_t *vp = NULL;
317fa9e4066Sahrens 	char *devpath;
318fa9e4066Sahrens 	size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + 1 + strlen(name) + 1;
319fa9e4066Sahrens 	char chrbuf[30], blkbuf[30];
320fa9e4066Sahrens 	int error;
321fa9e4066Sahrens 
322fa9e4066Sahrens 	mutex_enter(&zvol_state_lock);
323fa9e4066Sahrens 
324fa9e4066Sahrens 	if ((zv = zvol_minor_lookup(name)) != NULL) {
325fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
326fa9e4066Sahrens 		return (EEXIST);
327fa9e4066Sahrens 	}
328fa9e4066Sahrens 
329fa9e4066Sahrens 	if (strchr(name, '@') != 0)
330fa9e4066Sahrens 		ds_mode |= DS_MODE_READONLY;
331fa9e4066Sahrens 
332fa9e4066Sahrens 	error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);
333fa9e4066Sahrens 
334fa9e4066Sahrens 	if (error) {
335fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
336fa9e4066Sahrens 		return (error);
337fa9e4066Sahrens 	}
338fa9e4066Sahrens 
339fa9e4066Sahrens 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
340fa9e4066Sahrens 
341fa9e4066Sahrens 	if (error) {
342fa9e4066Sahrens 		dmu_objset_close(os);
343fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
344fa9e4066Sahrens 		return (error);
345fa9e4066Sahrens 	}
346fa9e4066Sahrens 
347fa9e4066Sahrens 	/*
348fa9e4066Sahrens 	 * If there's an existing /dev/zvol symlink, try to use the
349fa9e4066Sahrens 	 * same minor number we used last time.
350fa9e4066Sahrens 	 */
351fa9e4066Sahrens 	devpath = kmem_alloc(devpathlen, KM_SLEEP);
352fa9e4066Sahrens 
353fa9e4066Sahrens 	(void) sprintf(devpath, "%s/%s", ZVOL_FULL_DEV_DIR, name);
354fa9e4066Sahrens 
355fa9e4066Sahrens 	error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);
356fa9e4066Sahrens 
357fa9e4066Sahrens 	kmem_free(devpath, devpathlen);
358fa9e4066Sahrens 
359fa9e4066Sahrens 	if (error == 0 && vp->v_type != VLNK)
360fa9e4066Sahrens 		error = EINVAL;
361fa9e4066Sahrens 
362fa9e4066Sahrens 	if (error == 0) {
363fa9e4066Sahrens 		pn_alloc(&linkpath);
364fa9e4066Sahrens 		error = pn_getsymlink(vp, &linkpath, kcred);
365fa9e4066Sahrens 		if (error == 0) {
366fa9e4066Sahrens 			char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV);
367fa9e4066Sahrens 			if (ms != NULL) {
368fa9e4066Sahrens 				ms += strlen(ZVOL_PSEUDO_DEV);
369fa9e4066Sahrens 				minor = stoi(&ms);
370fa9e4066Sahrens 			}
371fa9e4066Sahrens 		}
372fa9e4066Sahrens 		pn_free(&linkpath);
373fa9e4066Sahrens 	}
374fa9e4066Sahrens 
375fa9e4066Sahrens 	if (vp != NULL)
376fa9e4066Sahrens 		VN_RELE(vp);
377fa9e4066Sahrens 
378fa9e4066Sahrens 	/*
379fa9e4066Sahrens 	 * If we found a minor but it's already in use, we must pick a new one.
380fa9e4066Sahrens 	 */
381fa9e4066Sahrens 	if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL)
382fa9e4066Sahrens 		minor = 0;
383fa9e4066Sahrens 
384fa9e4066Sahrens 	if (minor == 0)
385fa9e4066Sahrens 		minor = zvol_minor_alloc();
386fa9e4066Sahrens 
387fa9e4066Sahrens 	if (minor == 0) {
388fa9e4066Sahrens 		dmu_objset_close(os);
389fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
390fa9e4066Sahrens 		return (ENXIO);
391fa9e4066Sahrens 	}
392fa9e4066Sahrens 
393fa9e4066Sahrens 	if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) {
394fa9e4066Sahrens 		dmu_objset_close(os);
395fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
396fa9e4066Sahrens 		return (EAGAIN);
397fa9e4066Sahrens 	}
398fa9e4066Sahrens 
399e9dbad6fSeschrock 	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
400e9dbad6fSeschrock 	    (char *)name);
401fa9e4066Sahrens 
402fa9e4066Sahrens 	(void) sprintf(chrbuf, "%uc,raw", minor);
403fa9e4066Sahrens 
404fa9e4066Sahrens 	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
405fa9e4066Sahrens 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
406fa9e4066Sahrens 		ddi_soft_state_free(zvol_state, minor);
407fa9e4066Sahrens 		dmu_objset_close(os);
408fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
409fa9e4066Sahrens 		return (EAGAIN);
410fa9e4066Sahrens 	}
411fa9e4066Sahrens 
412fa9e4066Sahrens 	(void) sprintf(blkbuf, "%uc", minor);
413fa9e4066Sahrens 
414fa9e4066Sahrens 	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
415fa9e4066Sahrens 	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
416fa9e4066Sahrens 		ddi_remove_minor_node(zfs_dip, chrbuf);
417fa9e4066Sahrens 		ddi_soft_state_free(zvol_state, minor);
418fa9e4066Sahrens 		dmu_objset_close(os);
419fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
420fa9e4066Sahrens 		return (EAGAIN);
421fa9e4066Sahrens 	}
422fa9e4066Sahrens 
423fa9e4066Sahrens 	zv = ddi_get_soft_state(zvol_state, minor);
424fa9e4066Sahrens 
425fa9e4066Sahrens 	(void) strcpy(zv->zv_name, name);
426fa9e4066Sahrens 	zv->zv_min_bs = DEV_BSHIFT;
427fa9e4066Sahrens 	zv->zv_minor = minor;
428fa9e4066Sahrens 	zv->zv_volsize = volsize;
429fa9e4066Sahrens 	zv->zv_objset = os;
430fa9e4066Sahrens 	zv->zv_mode = ds_mode;
43122ac5be4Sperrin 	zv->zv_zilog = zil_open(os, NULL);
43222ac5be4Sperrin 
433a24e15ceSperrin 	rw_init(&zv->zv_dslock, NULL, RW_DEFAULT, NULL);
434a24e15ceSperrin 
43522ac5be4Sperrin 	zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL);
436fa9e4066Sahrens 
437fa9e4066Sahrens 	zvol_size_changed(zv, dev);
438fa9e4066Sahrens 
439ea8dc4b6Seschrock 	/* XXX this should handle the possible i/o error */
440fa9e4066Sahrens 	VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
441fa9e4066Sahrens 	    "readonly", zvol_readonly_changed_cb, zv) == 0);
442fa9e4066Sahrens 
443fa9e4066Sahrens 	zvol_minors++;
444fa9e4066Sahrens 
445fa9e4066Sahrens 	mutex_exit(&zvol_state_lock);
446fa9e4066Sahrens 
447fa9e4066Sahrens 	return (0);
448fa9e4066Sahrens }
449fa9e4066Sahrens 
450fa9e4066Sahrens /*
451fa9e4066Sahrens  * Remove minor node for the specified volume.
452fa9e4066Sahrens  */
453fa9e4066Sahrens int
454e9dbad6fSeschrock zvol_remove_minor(const char *name)
455fa9e4066Sahrens {
456fa9e4066Sahrens 	zvol_state_t *zv;
457fa9e4066Sahrens 	char namebuf[30];
458fa9e4066Sahrens 
459fa9e4066Sahrens 	mutex_enter(&zvol_state_lock);
460fa9e4066Sahrens 
461e9dbad6fSeschrock 	if ((zv = zvol_minor_lookup(name)) == NULL) {
462fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
463fa9e4066Sahrens 		return (ENXIO);
464fa9e4066Sahrens 	}
465fa9e4066Sahrens 
466fa9e4066Sahrens 	if (zv->zv_total_opens != 0) {
467fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
468fa9e4066Sahrens 		return (EBUSY);
469fa9e4066Sahrens 	}
470fa9e4066Sahrens 
471fa9e4066Sahrens 	(void) sprintf(namebuf, "%uc,raw", zv->zv_minor);
472fa9e4066Sahrens 	ddi_remove_minor_node(zfs_dip, namebuf);
473fa9e4066Sahrens 
474fa9e4066Sahrens 	(void) sprintf(namebuf, "%uc", zv->zv_minor);
475fa9e4066Sahrens 	ddi_remove_minor_node(zfs_dip, namebuf);
476fa9e4066Sahrens 
477fa9e4066Sahrens 	VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
478fa9e4066Sahrens 	    "readonly", zvol_readonly_changed_cb, zv) == 0);
479fa9e4066Sahrens 
48022ac5be4Sperrin 	zil_close(zv->zv_zilog);
48122ac5be4Sperrin 	zv->zv_zilog = NULL;
482fa9e4066Sahrens 	dmu_objset_close(zv->zv_objset);
483fa9e4066Sahrens 	zv->zv_objset = NULL;
484fa9e4066Sahrens 
485fa9e4066Sahrens 	ddi_soft_state_free(zvol_state, zv->zv_minor);
486fa9e4066Sahrens 
487fa9e4066Sahrens 	zvol_minors--;
488fa9e4066Sahrens 
489fa9e4066Sahrens 	mutex_exit(&zvol_state_lock);
490fa9e4066Sahrens 
491fa9e4066Sahrens 	return (0);
492fa9e4066Sahrens }
493fa9e4066Sahrens 
494fa9e4066Sahrens int
495e9dbad6fSeschrock zvol_set_volsize(const char *name, dev_t dev, uint64_t volsize)
496fa9e4066Sahrens {
497fa9e4066Sahrens 	zvol_state_t *zv;
498fa9e4066Sahrens 	dmu_tx_t *tx;
499fa9e4066Sahrens 	int error;
5005c5460e9Seschrock 	dmu_object_info_t doi;
501fa9e4066Sahrens 
502fa9e4066Sahrens 	mutex_enter(&zvol_state_lock);
503fa9e4066Sahrens 
504e9dbad6fSeschrock 	if ((zv = zvol_minor_lookup(name)) == NULL) {
505fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
506fa9e4066Sahrens 		return (ENXIO);
507fa9e4066Sahrens 	}
508fa9e4066Sahrens 
5095c5460e9Seschrock 	if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
510e9dbad6fSeschrock 	    (error = zvol_check_volsize(volsize,
511e9dbad6fSeschrock 	    doi.doi_data_block_size)) != 0) {
5125c5460e9Seschrock 		mutex_exit(&zvol_state_lock);
5135c5460e9Seschrock 		return (error);
5145c5460e9Seschrock 	}
5155c5460e9Seschrock 
516fa9e4066Sahrens 	if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
517fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
518fa9e4066Sahrens 		return (EROFS);
519fa9e4066Sahrens 	}
520fa9e4066Sahrens 
521fa9e4066Sahrens 	tx = dmu_tx_create(zv->zv_objset);
522ea8dc4b6Seschrock 	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
523e9dbad6fSeschrock 	dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END);
524fa9e4066Sahrens 	error = dmu_tx_assign(tx, TXG_WAIT);
525fa9e4066Sahrens 	if (error) {
526fa9e4066Sahrens 		dmu_tx_abort(tx);
527fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
528fa9e4066Sahrens 		return (error);
529fa9e4066Sahrens 	}
530fa9e4066Sahrens 
531fa9e4066Sahrens 	error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
532e9dbad6fSeschrock 	    &volsize, tx);
533ea8dc4b6Seschrock 	if (error == 0) {
534e9dbad6fSeschrock 		error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, volsize,
535fa9e4066Sahrens 		    DMU_OBJECT_END, tx);
536ea8dc4b6Seschrock 	}
537fa9e4066Sahrens 
538fa9e4066Sahrens 	dmu_tx_commit(tx);
539fa9e4066Sahrens 
540fa9e4066Sahrens 	if (error == 0) {
541e9dbad6fSeschrock 		zv->zv_volsize = volsize;
542fa9e4066Sahrens 		zvol_size_changed(zv, dev);
543fa9e4066Sahrens 	}
544fa9e4066Sahrens 
545fa9e4066Sahrens 	mutex_exit(&zvol_state_lock);
546fa9e4066Sahrens 
547fa9e4066Sahrens 	return (error);
548fa9e4066Sahrens }
549fa9e4066Sahrens 
550fa9e4066Sahrens int
551e9dbad6fSeschrock zvol_set_volblocksize(const char *name, uint64_t volblocksize)
552fa9e4066Sahrens {
553fa9e4066Sahrens 	zvol_state_t *zv;
554fa9e4066Sahrens 	dmu_tx_t *tx;
555fa9e4066Sahrens 	int error;
556fa9e4066Sahrens 
557fa9e4066Sahrens 	mutex_enter(&zvol_state_lock);
558fa9e4066Sahrens 
559e9dbad6fSeschrock 	if ((zv = zvol_minor_lookup(name)) == NULL) {
560fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
561fa9e4066Sahrens 		return (ENXIO);
562fa9e4066Sahrens 	}
563fa9e4066Sahrens 
564fa9e4066Sahrens 	if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
565fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
566fa9e4066Sahrens 		return (EROFS);
567fa9e4066Sahrens 	}
568fa9e4066Sahrens 
569fa9e4066Sahrens 	tx = dmu_tx_create(zv->zv_objset);
570fa9e4066Sahrens 	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
571fa9e4066Sahrens 	error = dmu_tx_assign(tx, TXG_WAIT);
572fa9e4066Sahrens 	if (error) {
573fa9e4066Sahrens 		dmu_tx_abort(tx);
574fa9e4066Sahrens 	} else {
575fa9e4066Sahrens 		error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
576e9dbad6fSeschrock 		    volblocksize, 0, tx);
577fa9e4066Sahrens 		if (error == ENOTSUP)
578fa9e4066Sahrens 			error = EBUSY;
579fa9e4066Sahrens 		dmu_tx_commit(tx);
580fa9e4066Sahrens 	}
581fa9e4066Sahrens 
582fa9e4066Sahrens 	mutex_exit(&zvol_state_lock);
583fa9e4066Sahrens 
584fa9e4066Sahrens 	return (error);
585fa9e4066Sahrens }
586fa9e4066Sahrens 
587fa9e4066Sahrens /*ARGSUSED*/
588fa9e4066Sahrens int
589fa9e4066Sahrens zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
590fa9e4066Sahrens {
591fa9e4066Sahrens 	minor_t minor = getminor(*devp);
592fa9e4066Sahrens 	zvol_state_t *zv;
593fa9e4066Sahrens 
594fa9e4066Sahrens 	if (minor == 0)			/* This is the control device */
595fa9e4066Sahrens 		return (0);
596fa9e4066Sahrens 
597fa9e4066Sahrens 	mutex_enter(&zvol_state_lock);
598fa9e4066Sahrens 
599fa9e4066Sahrens 	zv = ddi_get_soft_state(zvol_state, minor);
600fa9e4066Sahrens 	if (zv == NULL) {
601fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
602fa9e4066Sahrens 		return (ENXIO);
603fa9e4066Sahrens 	}
604fa9e4066Sahrens 
605fa9e4066Sahrens 	ASSERT(zv->zv_objset != NULL);
606fa9e4066Sahrens 
607fa9e4066Sahrens 	if ((flag & FWRITE) &&
608fa9e4066Sahrens 	    (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY))) {
609fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
610fa9e4066Sahrens 		return (EROFS);
611fa9e4066Sahrens 	}
612fa9e4066Sahrens 
613fa9e4066Sahrens 	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
614fa9e4066Sahrens 		zv->zv_open_count[otyp]++;
615fa9e4066Sahrens 		zv->zv_total_opens++;
616fa9e4066Sahrens 	}
617fa9e4066Sahrens 
618fa9e4066Sahrens 	mutex_exit(&zvol_state_lock);
619fa9e4066Sahrens 
620fa9e4066Sahrens 	return (0);
621fa9e4066Sahrens }
622fa9e4066Sahrens 
623fa9e4066Sahrens /*ARGSUSED*/
624fa9e4066Sahrens int
625fa9e4066Sahrens zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
626fa9e4066Sahrens {
627fa9e4066Sahrens 	minor_t minor = getminor(dev);
628fa9e4066Sahrens 	zvol_state_t *zv;
629fa9e4066Sahrens 
630fa9e4066Sahrens 	if (minor == 0)		/* This is the control device */
631fa9e4066Sahrens 		return (0);
632fa9e4066Sahrens 
633fa9e4066Sahrens 	mutex_enter(&zvol_state_lock);
634fa9e4066Sahrens 
635fa9e4066Sahrens 	zv = ddi_get_soft_state(zvol_state, minor);
636fa9e4066Sahrens 	if (zv == NULL) {
637fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
638fa9e4066Sahrens 		return (ENXIO);
639fa9e4066Sahrens 	}
640fa9e4066Sahrens 
641fa9e4066Sahrens 	/*
642fa9e4066Sahrens 	 * The next statement is a workaround for the following DDI bug:
643fa9e4066Sahrens 	 * 6343604 specfs race: multiple "last-close" of the same device
644fa9e4066Sahrens 	 */
645fa9e4066Sahrens 	if (zv->zv_total_opens == 0) {
646fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
647fa9e4066Sahrens 		return (0);
648fa9e4066Sahrens 	}
649fa9e4066Sahrens 
650fa9e4066Sahrens 	/*
651fa9e4066Sahrens 	 * If the open count is zero, this is a spurious close.
652fa9e4066Sahrens 	 * That indicates a bug in the kernel / DDI framework.
653fa9e4066Sahrens 	 */
654fa9e4066Sahrens 	ASSERT(zv->zv_open_count[otyp] != 0);
655fa9e4066Sahrens 	ASSERT(zv->zv_total_opens != 0);
656fa9e4066Sahrens 
657fa9e4066Sahrens 	/*
658fa9e4066Sahrens 	 * You may get multiple opens, but only one close.
659fa9e4066Sahrens 	 */
660fa9e4066Sahrens 	zv->zv_open_count[otyp]--;
661fa9e4066Sahrens 	zv->zv_total_opens--;
662fa9e4066Sahrens 
663fa9e4066Sahrens 	mutex_exit(&zvol_state_lock);
664fa9e4066Sahrens 
665fa9e4066Sahrens 	return (0);
666fa9e4066Sahrens }
667fa9e4066Sahrens 
66822ac5be4Sperrin /*
669a24e15ceSperrin  * Create and return an immediate write ZIL transaction.
670a24e15ceSperrin  */
671a24e15ceSperrin itx_t *
672a24e15ceSperrin zvol_immediate_itx(offset_t off, ssize_t len, char *addr)
673a24e15ceSperrin {
674a24e15ceSperrin 	itx_t *itx;
675a24e15ceSperrin 	lr_write_t *lr;
676a24e15ceSperrin 
677a24e15ceSperrin 	itx = zil_itx_create(TX_WRITE, sizeof (*lr) + len);
678a24e15ceSperrin 	lr = (lr_write_t *)&itx->itx_lr;
679a24e15ceSperrin 	lr->lr_foid = ZVOL_OBJ;
680a24e15ceSperrin 	lr->lr_offset = off;
681a24e15ceSperrin 	lr->lr_length = len;
682a24e15ceSperrin 	lr->lr_blkoff = 0;
683a24e15ceSperrin 	BP_ZERO(&lr->lr_blkptr);
684a24e15ceSperrin 	bcopy(addr, (char *)itx + offsetof(itx_t, itx_lr) +
685a24e15ceSperrin 	    sizeof (*lr), len);
686a24e15ceSperrin 	itx->itx_wr_state = WR_COPIED;
687a24e15ceSperrin 	return (itx);
688a24e15ceSperrin }
689a24e15ceSperrin 
690a24e15ceSperrin /*
691a24e15ceSperrin  * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
69222ac5be4Sperrin  *
69322ac5be4Sperrin  * We store data in the log buffers if it's small enough.
69422ac5be4Sperrin  * Otherwise we flush the data out via dmu_sync().
69522ac5be4Sperrin  */
69622ac5be4Sperrin ssize_t zvol_immediate_write_sz = 65536;
69722ac5be4Sperrin 
69822ac5be4Sperrin int
69922ac5be4Sperrin zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len,
70022ac5be4Sperrin     char *addr)
70122ac5be4Sperrin {
702a24e15ceSperrin 	dmu_object_info_t doi;
703a24e15ceSperrin 	ssize_t nbytes;
70422ac5be4Sperrin 	itx_t *itx;
70522ac5be4Sperrin 	lr_write_t *lr;
706a24e15ceSperrin 	objset_t *os;
707c5c6ffa0Smaybee 	dmu_buf_t *db;
708a24e15ceSperrin 	uint64_t txg;
709c5c6ffa0Smaybee 	uint64_t boff;
71022ac5be4Sperrin 	int error;
711a24e15ceSperrin 	uint32_t blocksize;
71222ac5be4Sperrin 
713a24e15ceSperrin 	/* handle common case */
714a24e15ceSperrin 	if (len <= zvol_immediate_write_sz) {
715a24e15ceSperrin 		itx = zvol_immediate_itx(off, len, addr);
716a24e15ceSperrin 		(void) zil_itx_assign(zv->zv_zilog, itx, tx);
717a24e15ceSperrin 		return (0);
718a24e15ceSperrin 	}
719a24e15ceSperrin 
720a24e15ceSperrin 	txg = dmu_tx_get_txg(tx);
721a24e15ceSperrin 	os = zv->zv_objset;
722a24e15ceSperrin 
723a24e15ceSperrin 	/*
724a24e15ceSperrin 	 * We need to dmu_sync() each block in the range.
725a24e15ceSperrin 	 * For this we need the blocksize.
726a24e15ceSperrin 	 */
727a24e15ceSperrin 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
728a24e15ceSperrin 	if (error)
729a24e15ceSperrin 		return (error);
730a24e15ceSperrin 	blocksize = doi.doi_data_block_size;
731a24e15ceSperrin 
732a24e15ceSperrin 	/*
733a24e15ceSperrin 	 * We need to immediate write or dmu_sync() each block in the range.
734a24e15ceSperrin 	 */
735a24e15ceSperrin 	while (len) {
736a24e15ceSperrin 		nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
737a24e15ceSperrin 		if (nbytes <= zvol_immediate_write_sz) {
738a24e15ceSperrin 			itx = zvol_immediate_itx(off, nbytes, addr);
739a24e15ceSperrin 		} else {
740c5c6ffa0Smaybee 			boff =  P2ALIGN_TYPED(off, blocksize, uint64_t);
741a24e15ceSperrin 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
74222ac5be4Sperrin 			lr = (lr_write_t *)&itx->itx_lr;
74322ac5be4Sperrin 			lr->lr_foid = ZVOL_OBJ;
74422ac5be4Sperrin 			lr->lr_offset = off;
745a24e15ceSperrin 			lr->lr_length = nbytes;
746c5c6ffa0Smaybee 			lr->lr_blkoff = off - boff;
74722ac5be4Sperrin 			BP_ZERO(&lr->lr_blkptr);
74822ac5be4Sperrin 
749c5c6ffa0Smaybee 			/* XXX - we should do these IOs in parallel */
750c5c6ffa0Smaybee 			VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, boff,
751c5c6ffa0Smaybee 			    FTAG, &db));
752c5c6ffa0Smaybee 			ASSERT(boff == db->db_offset);
753c5c6ffa0Smaybee 			error = dmu_sync(NULL, db, &lr->lr_blkptr,
754c5c6ffa0Smaybee 			    txg, NULL, NULL);
755c5c6ffa0Smaybee 			dmu_buf_rele(db, FTAG);
75622ac5be4Sperrin 			if (error) {
75722ac5be4Sperrin 				kmem_free(itx, offsetof(itx_t, itx_lr));
75822ac5be4Sperrin 				return (error);
75922ac5be4Sperrin 			}
760104e2ed7Sperrin 			itx->itx_wr_state = WR_COPIED;
761a24e15ceSperrin 		}
76222ac5be4Sperrin 		(void) zil_itx_assign(zv->zv_zilog, itx, tx);
763a24e15ceSperrin 		len -= nbytes;
764a24e15ceSperrin 		off += nbytes;
765a24e15ceSperrin 	}
76622ac5be4Sperrin 	return (0);
76722ac5be4Sperrin }
76822ac5be4Sperrin 
769fa9e4066Sahrens int
770fa9e4066Sahrens zvol_strategy(buf_t *bp)
771fa9e4066Sahrens {
772fa9e4066Sahrens 	zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev));
773fa9e4066Sahrens 	uint64_t off, volsize;
774fa9e4066Sahrens 	size_t size, resid;
775fa9e4066Sahrens 	char *addr;
77622ac5be4Sperrin 	objset_t *os;
777fa9e4066Sahrens 	int error = 0;
77822ac5be4Sperrin 	int sync;
779a24e15ceSperrin 	int reading;
780a24e15ceSperrin 	int txg_sync_needed = B_FALSE;
781fa9e4066Sahrens 
782fa9e4066Sahrens 	if (zv == NULL) {
783fa9e4066Sahrens 		bioerror(bp, ENXIO);
784fa9e4066Sahrens 		biodone(bp);
785fa9e4066Sahrens 		return (0);
786fa9e4066Sahrens 	}
787fa9e4066Sahrens 
788fa9e4066Sahrens 	if (getminor(bp->b_edev) == 0) {
789fa9e4066Sahrens 		bioerror(bp, EINVAL);
790fa9e4066Sahrens 		biodone(bp);
791fa9e4066Sahrens 		return (0);
792fa9e4066Sahrens 	}
793fa9e4066Sahrens 
794*a2eea2e1Sahrens 	if ((zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) &&
795*a2eea2e1Sahrens 	    !(bp->b_flags & B_READ)) {
796fa9e4066Sahrens 		bioerror(bp, EROFS);
797fa9e4066Sahrens 		biodone(bp);
798fa9e4066Sahrens 		return (0);
799fa9e4066Sahrens 	}
800fa9e4066Sahrens 
801fa9e4066Sahrens 	off = ldbtob(bp->b_blkno);
802fa9e4066Sahrens 	volsize = zv->zv_volsize;
803fa9e4066Sahrens 
80422ac5be4Sperrin 	os = zv->zv_objset;
80522ac5be4Sperrin 	ASSERT(os != NULL);
80622ac5be4Sperrin 	sync = !(bp->b_flags & B_ASYNC) && !(zil_disable);
807fa9e4066Sahrens 
808fa9e4066Sahrens 	bp_mapin(bp);
809fa9e4066Sahrens 	addr = bp->b_un.b_addr;
810fa9e4066Sahrens 	resid = bp->b_bcount;
811fa9e4066Sahrens 
812a24e15ceSperrin 	/*
813a24e15ceSperrin 	 * There must be no buffer changes when doing a dmu_sync() because
814a24e15ceSperrin 	 * we can't change the data whilst calculating the checksum.
815a24e15ceSperrin 	 * A better approach than a per zvol rwlock would be to lock ranges.
816a24e15ceSperrin 	 */
817a24e15ceSperrin 	reading = bp->b_flags & B_READ;
818a24e15ceSperrin 	if (reading || resid <= zvol_immediate_write_sz)
819a24e15ceSperrin 		rw_enter(&zv->zv_dslock, RW_READER);
820a24e15ceSperrin 	else
821a24e15ceSperrin 		rw_enter(&zv->zv_dslock, RW_WRITER);
822a24e15ceSperrin 
823fa9e4066Sahrens 	while (resid != 0 && off < volsize) {
824fa9e4066Sahrens 
825fa9e4066Sahrens 		size = MIN(resid, 1UL << 20);	/* cap at 1MB per tx */
826fa9e4066Sahrens 
827fa9e4066Sahrens 		if (size > volsize - off)	/* don't write past the end */
828fa9e4066Sahrens 			size = volsize - off;
829fa9e4066Sahrens 
830a24e15ceSperrin 		if (reading) {
831a24e15ceSperrin 			error = dmu_read(os, ZVOL_OBJ, off, size, addr);
832fa9e4066Sahrens 		} else {
83322ac5be4Sperrin 			dmu_tx_t *tx = dmu_tx_create(os);
834fa9e4066Sahrens 			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
835fa9e4066Sahrens 			error = dmu_tx_assign(tx, TXG_WAIT);
836fa9e4066Sahrens 			if (error) {
837fa9e4066Sahrens 				dmu_tx_abort(tx);
838fa9e4066Sahrens 			} else {
83922ac5be4Sperrin 				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
84022ac5be4Sperrin 				if (sync) {
84122ac5be4Sperrin 					/* use the ZIL to commit this write */
842a24e15ceSperrin 					if (zvol_log_write(zv, tx, off, size,
843a24e15ceSperrin 					    addr) != 0) {
844a24e15ceSperrin 						txg_sync_needed = B_TRUE;
84522ac5be4Sperrin 					}
84622ac5be4Sperrin 				}
847fa9e4066Sahrens 				dmu_tx_commit(tx);
848fa9e4066Sahrens 			}
849fa9e4066Sahrens 		}
850fa9e4066Sahrens 		if (error)
851fa9e4066Sahrens 			break;
852fa9e4066Sahrens 		off += size;
853fa9e4066Sahrens 		addr += size;
854fa9e4066Sahrens 		resid -= size;
855fa9e4066Sahrens 	}
856a24e15ceSperrin 	rw_exit(&zv->zv_dslock);
857fa9e4066Sahrens 
858fa9e4066Sahrens 	if ((bp->b_resid = resid) == bp->b_bcount)
859fa9e4066Sahrens 		bioerror(bp, off > volsize ? EINVAL : error);
860fa9e4066Sahrens 
861fa9e4066Sahrens 	biodone(bp);
86222ac5be4Sperrin 
863a24e15ceSperrin 	if (sync) {
864a24e15ceSperrin 		if (txg_sync_needed)
865a24e15ceSperrin 			txg_wait_synced(dmu_objset_pool(os), 0);
866a24e15ceSperrin 		else
867b19a79ecSperrin 			zil_commit(zv->zv_zilog, UINT64_MAX, 0);
868a24e15ceSperrin 	}
86922ac5be4Sperrin 
870fa9e4066Sahrens 	return (0);
871fa9e4066Sahrens }
872fa9e4066Sahrens 
873fa9e4066Sahrens /*ARGSUSED*/
874fa9e4066Sahrens int
875fa9e4066Sahrens zvol_read(dev_t dev, uio_t *uiop, cred_t *cr)
876fa9e4066Sahrens {
877fa9e4066Sahrens 	return (physio(zvol_strategy, NULL, dev, B_READ, minphys, uiop));
878fa9e4066Sahrens }
879fa9e4066Sahrens 
880fa9e4066Sahrens /*ARGSUSED*/
881fa9e4066Sahrens int
882fa9e4066Sahrens zvol_write(dev_t dev, uio_t *uiop, cred_t *cr)
883fa9e4066Sahrens {
884fa9e4066Sahrens 	return (physio(zvol_strategy, NULL, dev, B_WRITE, minphys, uiop));
885fa9e4066Sahrens }
886fa9e4066Sahrens 
887fa9e4066Sahrens /*ARGSUSED*/
888fa9e4066Sahrens int
889fa9e4066Sahrens zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr)
890fa9e4066Sahrens {
891fa9e4066Sahrens 	return (aphysio(zvol_strategy, anocancel, dev, B_READ, minphys, aio));
892fa9e4066Sahrens }
893fa9e4066Sahrens 
894fa9e4066Sahrens /*ARGSUSED*/
895fa9e4066Sahrens int
896fa9e4066Sahrens zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr)
897fa9e4066Sahrens {
898fa9e4066Sahrens 	return (aphysio(zvol_strategy, anocancel, dev, B_WRITE, minphys, aio));
899fa9e4066Sahrens }
900fa9e4066Sahrens 
901fa9e4066Sahrens /*
902fa9e4066Sahrens  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
903fa9e4066Sahrens  */
904fa9e4066Sahrens /*ARGSUSED*/
905fa9e4066Sahrens int
906fa9e4066Sahrens zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
907fa9e4066Sahrens {
908fa9e4066Sahrens 	zvol_state_t *zv;
909fa9e4066Sahrens 	struct dk_cinfo dkc;
910fa9e4066Sahrens 	struct dk_minfo dkm;
911fa9e4066Sahrens 	dk_efi_t efi;
912fa9e4066Sahrens 	efi_gpt_t gpt;
913fa9e4066Sahrens 	efi_gpe_t gpe;
914fa9e4066Sahrens 	struct uuid uuid = EFI_RESERVED;
915fa9e4066Sahrens 	uint32_t crc;
916fa9e4066Sahrens 	int error = 0;
917fa9e4066Sahrens 
918fa9e4066Sahrens 	mutex_enter(&zvol_state_lock);
919fa9e4066Sahrens 
920fa9e4066Sahrens 	zv = ddi_get_soft_state(zvol_state, getminor(dev));
921fa9e4066Sahrens 
922fa9e4066Sahrens 	if (zv == NULL) {
923fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
924fa9e4066Sahrens 		return (ENXIO);
925fa9e4066Sahrens 	}
926fa9e4066Sahrens 
927fa9e4066Sahrens 	switch (cmd) {
928fa9e4066Sahrens 
929fa9e4066Sahrens 	case DKIOCINFO:
930fa9e4066Sahrens 		bzero(&dkc, sizeof (dkc));
931fa9e4066Sahrens 		(void) strcpy(dkc.dki_cname, "zvol");
932fa9e4066Sahrens 		(void) strcpy(dkc.dki_dname, "zvol");
933fa9e4066Sahrens 		dkc.dki_ctype = DKC_UNKNOWN;
9348f8be083Sbonwick 		dkc.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
935fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
936fa9e4066Sahrens 		if (ddi_copyout(&dkc, (void *)arg, sizeof (dkc), flag))
937fa9e4066Sahrens 			error = EFAULT;
938fa9e4066Sahrens 		return (error);
939fa9e4066Sahrens 
940fa9e4066Sahrens 	case DKIOCGMEDIAINFO:
941fa9e4066Sahrens 		bzero(&dkm, sizeof (dkm));
942fa9e4066Sahrens 		dkm.dki_lbsize = 1U << zv->zv_min_bs;
943fa9e4066Sahrens 		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
944fa9e4066Sahrens 		dkm.dki_media_type = DK_UNKNOWN;
945fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
946fa9e4066Sahrens 		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
947fa9e4066Sahrens 			error = EFAULT;
948fa9e4066Sahrens 		return (error);
949fa9e4066Sahrens 
950fa9e4066Sahrens 	case DKIOCGETEFI:
951fa9e4066Sahrens 		if (ddi_copyin((void *)arg, &efi, sizeof (dk_efi_t), flag)) {
952fa9e4066Sahrens 			mutex_exit(&zvol_state_lock);
953fa9e4066Sahrens 			return (EFAULT);
954fa9e4066Sahrens 		}
955fa9e4066Sahrens 
956fa9e4066Sahrens 		bzero(&gpt, sizeof (gpt));
957fa9e4066Sahrens 		bzero(&gpe, sizeof (gpe));
958fa9e4066Sahrens 
959fa9e4066Sahrens 		efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
960fa9e4066Sahrens 
961fa9e4066Sahrens 		if (efi.dki_length < sizeof (gpt) + sizeof (gpe)) {
962fa9e4066Sahrens 			mutex_exit(&zvol_state_lock);
963fa9e4066Sahrens 			return (EINVAL);
964fa9e4066Sahrens 		}
965fa9e4066Sahrens 
966fa9e4066Sahrens 		efi.dki_length = sizeof (gpt) + sizeof (gpe);
967fa9e4066Sahrens 
968fa9e4066Sahrens 		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
9695c5460e9Seschrock 		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
970fa9e4066Sahrens 		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
971fa9e4066Sahrens 		gpt.efi_gpt_FirstUsableLBA = LE_64(0ULL);
972fa9e4066Sahrens 		gpt.efi_gpt_LastUsableLBA =
973fa9e4066Sahrens 		    LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1);
974fa9e4066Sahrens 		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
975fa9e4066Sahrens 		gpt.efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (gpe));
976fa9e4066Sahrens 
977fa9e4066Sahrens 		UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
978fa9e4066Sahrens 		gpe.efi_gpe_StartingLBA = gpt.efi_gpt_FirstUsableLBA;
979fa9e4066Sahrens 		gpe.efi_gpe_EndingLBA = gpt.efi_gpt_LastUsableLBA;
980fa9e4066Sahrens 
981fa9e4066Sahrens 		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
982fa9e4066Sahrens 		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
983fa9e4066Sahrens 
984fa9e4066Sahrens 		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
985fa9e4066Sahrens 		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
986fa9e4066Sahrens 
987fa9e4066Sahrens 		mutex_exit(&zvol_state_lock);
988fa9e4066Sahrens 		if (ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), flag) ||
989fa9e4066Sahrens 		    ddi_copyout(&gpe, efi.dki_data + 1, sizeof (gpe), flag))
990fa9e4066Sahrens 			error = EFAULT;
991fa9e4066Sahrens 		return (error);
992fa9e4066Sahrens 
993fa9e4066Sahrens 	default:
994fa9e4066Sahrens 		error = ENOTSUP;
995fa9e4066Sahrens 		break;
996fa9e4066Sahrens 
997fa9e4066Sahrens 	}
998fa9e4066Sahrens 	mutex_exit(&zvol_state_lock);
999fa9e4066Sahrens 	return (error);
1000fa9e4066Sahrens }
1001fa9e4066Sahrens 
1002fa9e4066Sahrens int
1003fa9e4066Sahrens zvol_busy(void)
1004fa9e4066Sahrens {
1005fa9e4066Sahrens 	return (zvol_minors != 0);
1006fa9e4066Sahrens }
1007fa9e4066Sahrens 
1008fa9e4066Sahrens void
1009fa9e4066Sahrens zvol_init(void)
1010fa9e4066Sahrens {
1011fa9e4066Sahrens 	VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0);
1012fa9e4066Sahrens 	mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
1013fa9e4066Sahrens }
1014fa9e4066Sahrens 
1015fa9e4066Sahrens void
1016fa9e4066Sahrens zvol_fini(void)
1017fa9e4066Sahrens {
1018fa9e4066Sahrens 	mutex_destroy(&zvol_state_lock);
1019fa9e4066Sahrens 	ddi_soft_state_fini(&zvol_state);
1020fa9e4066Sahrens }
1021