xref: /titanic_41/usr/src/uts/common/fs/zfs/zvol.c (revision a192e900f6d2b0e1a822e3252c0dfd795ed49d76)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * ZFS volume emulation driver.
30  *
31  * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
32  * Volumes are accessed through the symbolic links named:
33  *
34  * /dev/zvol/dsk/<pool_name>/<dataset_name>
35  * /dev/zvol/rdsk/<pool_name>/<dataset_name>
36  *
37  * These links are created by the ZFS-specific devfsadm link generator.
38  * Volumes are persistent through reboot.  No user command needs to be
39  * run before opening and using a device.
40  */
41 
42 #include <sys/types.h>
43 #include <sys/param.h>
44 #include <sys/errno.h>
45 #include <sys/aio_req.h>
46 #include <sys/uio.h>
47 #include <sys/buf.h>
48 #include <sys/modctl.h>
49 #include <sys/open.h>
50 #include <sys/kmem.h>
51 #include <sys/conf.h>
52 #include <sys/cmn_err.h>
53 #include <sys/stat.h>
54 #include <sys/zap.h>
55 #include <sys/spa.h>
56 #include <sys/zio.h>
57 #include <sys/dsl_prop.h>
58 #include <sys/dkio.h>
59 #include <sys/efi_partition.h>
60 #include <sys/byteorder.h>
61 #include <sys/pathname.h>
62 #include <sys/ddi.h>
63 #include <sys/sunddi.h>
64 #include <sys/crc32.h>
65 #include <sys/dirent.h>
66 #include <sys/policy.h>
67 #include <sys/fs/zfs.h>
68 #include <sys/zfs_ioctl.h>
69 #include <sys/mkdev.h>
70 #include <sys/zil.h>
71 #include <sys/refcount.h>
72 
73 #include "zfs_namecheck.h"
74 
/* Fixed DMU object numbers inside every zvol objset. */
#define	ZVOL_OBJ		1ULL
#define	ZVOL_ZAP_OBJ		2ULL

/* ddi_soft_state(9F) handle; one zvol_state_t per minor number. */
static void *zvol_state;

/*
 * This lock protects the zvol_state structure from being modified
 * while it's being used, e.g. an open that comes in before a create
 * finishes.  It also protects temporary opens of the dataset so that,
 * e.g., an open doesn't get a spurious EBUSY.
 */
static kmutex_t zvol_state_lock;
/* Count of instantiated minors; gates module unload via zvol_busy(). */
static uint32_t zvol_minors;
88 
89 /*
90  * The in-core state of each volume.
91  */
typedef struct zvol_state {
	/* NOTE(review): fields appear protected by zvol_state_lock — confirm */
	char		zv_name[MAXPATHLEN]; /* pool/dd name */
	uint64_t	zv_volsize;	/* amount of space we advertise */
	minor_t		zv_minor;	/* minor number */
	uint8_t		zv_min_bs;	/* minimum addressable block shift */
	uint8_t		zv_readonly;	/* hard readonly; like write-protect */
	objset_t	*zv_objset;	/* objset handle */
	uint32_t	zv_mode;	/* DS_MODE_* flags at open time */
	uint32_t	zv_open_count[OTYPCNT];	/* open counts per otyp */
	uint32_t	zv_total_opens;	/* total open count */
	zilog_t		*zv_zilog;	/* ZIL handle */
	uint64_t	zv_txg_assign;	/* txg to assign during ZIL replay */
	krwlock_t	zv_dslock;	/* dmu_sync() rwlock */
} zvol_state_t;
106 
/*
 * Keep the devinfo "Size" and "Nblocks" properties in sync with the
 * volume's advertised size.  The dev_t passed in may carry any minor;
 * it is rebuilt with this volume's own minor before the update.
 */
static void
zvol_size_changed(zvol_state_t *zv, dev_t dev)
{
	dev = makedevice(getmajor(dev), zv->zv_minor);

	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Size", zv->zv_volsize) == DDI_SUCCESS);
	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS);
}
117 
118 int
119 zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
120 {
121 	if (volsize == 0)
122 		return (EINVAL);
123 
124 	if (volsize % blocksize != 0)
125 		return (EINVAL);
126 
127 #ifdef _ILP32
128 	if (volsize - 1 > SPEC_MAXOFFSET_T)
129 		return (EOVERFLOW);
130 #endif
131 	return (0);
132 }
133 
134 int
135 zvol_check_volblocksize(uint64_t volblocksize)
136 {
137 	if (volblocksize < SPA_MINBLOCKSIZE ||
138 	    volblocksize > SPA_MAXBLOCKSIZE ||
139 	    !ISP2(volblocksize))
140 		return (EDOM);
141 
142 	return (0);
143 }
144 
145 static void
146 zvol_readonly_changed_cb(void *arg, uint64_t newval)
147 {
148 	zvol_state_t *zv = arg;
149 
150 	zv->zv_readonly = (uint8_t)newval;
151 }
152 
153 int
154 zvol_get_stats(objset_t *os, nvlist_t *nv)
155 {
156 	int error;
157 	dmu_object_info_t doi;
158 	uint64_t val;
159 
160 
161 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
162 	if (error)
163 		return (error);
164 
165 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);
166 
167 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
168 
169 	if (error == 0) {
170 		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
171 		    doi.doi_data_block_size);
172 	}
173 
174 	return (error);
175 }
176 
177 /*
178  * Find a free minor number.
179  */
180 static minor_t
181 zvol_minor_alloc(void)
182 {
183 	minor_t minor;
184 
185 	ASSERT(MUTEX_HELD(&zvol_state_lock));
186 
187 	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++)
188 		if (ddi_get_soft_state(zvol_state, minor) == NULL)
189 			return (minor);
190 
191 	return (0);
192 }
193 
194 static zvol_state_t *
195 zvol_minor_lookup(const char *name)
196 {
197 	minor_t minor;
198 	zvol_state_t *zv;
199 
200 	ASSERT(MUTEX_HELD(&zvol_state_lock));
201 
202 	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) {
203 		zv = ddi_get_soft_state(zvol_state, minor);
204 		if (zv == NULL)
205 			continue;
206 		if (strcmp(zv->zv_name, name) == 0)
207 			break;
208 	}
209 
210 	return (zv);
211 }
212 
/*
 * Creation callback for a new zvol objset: claim the data object and
 * the property ZAP inside the new objset, and persist the initial
 * "size" entry.  Runs in the context of the creating transaction.
 */
void
zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx)
{
	zfs_create_data_t *zc = arg;
	int error;
	uint64_t volblocksize, volsize;

	/* volsize is mandatory; volblocksize falls back to its default. */
	VERIFY(nvlist_lookup_uint64(zc->zc_props,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
	if (nvlist_lookup_uint64(zc->zc_props,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);

	/*
	 * These properties must be removed from the list so the generic
	 * property setting step won't apply to them.
	 */
	VERIFY(nvlist_remove_all(zc->zc_props,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
	(void) nvlist_remove_all(zc->zc_props,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));

	/* Object 1 holds the volume data ... */
	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	/* ... and object 2 is the ZAP holding persistent volume properties. */
	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
	ASSERT(error == 0);
}
246 
247 /*
248  * Replay a TX_WRITE ZIL transaction that didn't get committed
249  * after a system failure
250  */
251 static int
252 zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
253 {
254 	objset_t *os = zv->zv_objset;
255 	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
256 	uint64_t off = lr->lr_offset;
257 	uint64_t len = lr->lr_length;
258 	dmu_tx_t *tx;
259 	int error;
260 
261 	if (byteswap)
262 		byteswap_uint64_array(lr, sizeof (*lr));
263 
264 	tx = dmu_tx_create(os);
265 	dmu_tx_hold_write(tx, ZVOL_OBJ, off, len);
266 	error = dmu_tx_assign(tx, zv->zv_txg_assign);
267 	if (error) {
268 		dmu_tx_abort(tx);
269 	} else {
270 		dmu_write(os, ZVOL_OBJ, off, len, data, tx);
271 		dmu_tx_commit(tx);
272 	}
273 
274 	return (error);
275 }
276 
/*
 * Replay handler for record types a zvol never generates; always
 * rejects the record.
 */
/* ARGSUSED */
static int
zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
{
	return (ENOTSUP);
}

/*
 * Callback vectors for replaying records.
 * Only TX_WRITE is needed for zvol.
 */
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
	zvol_replay_err,	/* 0 no such transaction type */
	zvol_replay_err,	/* TX_CREATE */
	zvol_replay_err,	/* TX_MKDIR */
	zvol_replay_err,	/* TX_MKXATTR */
	zvol_replay_err,	/* TX_SYMLINK */
	zvol_replay_err,	/* TX_REMOVE */
	zvol_replay_err,	/* TX_RMDIR */
	zvol_replay_err,	/* TX_LINK */
	zvol_replay_err,	/* TX_RENAME */
	zvol_replay_write,	/* TX_WRITE */
	zvol_replay_err,	/* TX_TRUNCATE */
	zvol_replay_err,	/* TX_SETATTR */
	zvol_replay_err,	/* TX_ACL */
};
303 
304 /*
305  * Create a minor node for the specified volume.
306  */
int
zvol_create_minor(const char *name, dev_t dev)
{
	zvol_state_t *zv;
	objset_t *os;
	uint64_t volsize;
	minor_t minor = 0;
	struct pathname linkpath;
	int ds_mode = DS_MODE_PRIMARY;
	vnode_t *vp = NULL;
	char *devpath;
	size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + 1 + strlen(name) + 1;
	char chrbuf[30], blkbuf[30];
	int error;

	mutex_enter(&zvol_state_lock);

	/* Refuse to instantiate the same volume twice. */
	if ((zv = zvol_minor_lookup(name)) != NULL) {
		mutex_exit(&zvol_state_lock);
		return (EEXIST);
	}

	/* A '@' in the name marks a snapshot: open it read-only. */
	if (strchr(name, '@') != 0)
		ds_mode |= DS_MODE_READONLY;

	error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os);

	if (error) {
		mutex_exit(&zvol_state_lock);
		return (error);
	}

	/* The advertised size is persisted in the volume's property ZAP. */
	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);

	if (error) {
		dmu_objset_close(os);
		mutex_exit(&zvol_state_lock);
		return (error);
	}

	/*
	 * If there's an existing /dev/zvol symlink, try to use the
	 * same minor number we used last time.
	 */
	devpath = kmem_alloc(devpathlen, KM_SLEEP);

	(void) sprintf(devpath, "%s/%s", ZVOL_FULL_DEV_DIR, name);

	error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp);

	kmem_free(devpath, devpathlen);

	/* It must actually be a symlink for us to parse a minor out of it. */
	if (error == 0 && vp->v_type != VLNK)
		error = EINVAL;

	if (error == 0) {
		pn_alloc(&linkpath);
		error = pn_getsymlink(vp, &linkpath, kcred);
		if (error == 0) {
			/*
			 * The link target embeds the pseudo-device name
			 * followed by the minor number; parse it out.
			 */
			char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV);
			if (ms != NULL) {
				ms += strlen(ZVOL_PSEUDO_DEV);
				minor = stoi(&ms);
			}
		}
		pn_free(&linkpath);
	}

	if (vp != NULL)
		VN_RELE(vp);

	/*
	 * If we found a minor but it's already in use, we must pick a new one.
	 */
	if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL)
		minor = 0;

	if (minor == 0)
		minor = zvol_minor_alloc();

	if (minor == 0) {
		/* Minor number space exhausted. */
		dmu_objset_close(os);
		mutex_exit(&zvol_state_lock);
		return (ENXIO);
	}

	if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) {
		dmu_objset_close(os);
		mutex_exit(&zvol_state_lock);
		return (EAGAIN);
	}

	/* Publish the dataset name on the node for the link generator. */
	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
	    (char *)name);

	/* Character (raw) minor node, named e.g. "1c,raw". */
	(void) sprintf(chrbuf, "%uc,raw", minor);

	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		ddi_soft_state_free(zvol_state, minor);
		dmu_objset_close(os);
		mutex_exit(&zvol_state_lock);
		return (EAGAIN);
	}

	/* Block minor node, named e.g. "1c". */
	(void) sprintf(blkbuf, "%uc", minor);

	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		/* Unwind the character node and soft state created above. */
		ddi_remove_minor_node(zfs_dip, chrbuf);
		ddi_soft_state_free(zvol_state, minor);
		dmu_objset_close(os);
		mutex_exit(&zvol_state_lock);
		return (EAGAIN);
	}

	zv = ddi_get_soft_state(zvol_state, minor);

	(void) strcpy(zv->zv_name, name);
	zv->zv_min_bs = DEV_BSHIFT;
	zv->zv_minor = minor;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;
	zv->zv_mode = ds_mode;
	zv->zv_zilog = zil_open(os, NULL);

	rw_init(&zv->zv_dslock, NULL, RW_DEFAULT, NULL);

	/* Replay any intent-log writes that never made it to the pool. */
	zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL);

	/* Export the "Size"/"Nblocks" devinfo properties. */
	zvol_size_changed(zv, dev);

	/* XXX this should handle the possible i/o error */
	VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
	    "readonly", zvol_readonly_changed_cb, zv) == 0);

	zvol_minors++;

	mutex_exit(&zvol_state_lock);

	return (0);
}
449 
450 /*
451  * Remove minor node for the specified volume.
452  */
int
zvol_remove_minor(const char *name)
{
	zvol_state_t *zv;
	char namebuf[30];

	mutex_enter(&zvol_state_lock);

	if ((zv = zvol_minor_lookup(name)) == NULL) {
		mutex_exit(&zvol_state_lock);
		return (ENXIO);
	}

	/* Refuse to tear down a volume that is still open. */
	if (zv->zv_total_opens != 0) {
		mutex_exit(&zvol_state_lock);
		return (EBUSY);
	}

	/* Remove both minor nodes created by zvol_create_minor(). */
	(void) sprintf(namebuf, "%uc,raw", zv->zv_minor);
	ddi_remove_minor_node(zfs_dip, namebuf);

	(void) sprintf(namebuf, "%uc", zv->zv_minor);
	ddi_remove_minor_node(zfs_dip, namebuf);

	VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset),
	    "readonly", zvol_readonly_changed_cb, zv) == 0);

	/* Close the ZIL before the objset that backs it. */
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;
	dmu_objset_close(zv->zv_objset);
	zv->zv_objset = NULL;

	ddi_soft_state_free(zvol_state, zv->zv_minor);

	zvol_minors--;

	mutex_exit(&zvol_state_lock);

	return (0);
}
493 
/*
 * Resize an existing volume: persist the new "size" ZAP entry, free
 * any blocks beyond the new end of volume, then update the in-core
 * size and the devinfo properties.  Returns 0 or an errno.
 */
int
zvol_set_volsize(const char *name, dev_t dev, uint64_t volsize)
{
	zvol_state_t *zv;
	dmu_tx_t *tx;
	int error;
	dmu_object_info_t doi;

	mutex_enter(&zvol_state_lock);

	if ((zv = zvol_minor_lookup(name)) == NULL) {
		mutex_exit(&zvol_state_lock);
		return (ENXIO);
	}

	/* The new size must be a multiple of the volume's block size. */
	if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 ||
	    (error = zvol_check_volsize(volsize,
	    doi.doi_data_block_size)) != 0) {
		mutex_exit(&zvol_state_lock);
		return (error);
	}

	if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
		mutex_exit(&zvol_state_lock);
		return (EROFS);
	}

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	/* A shrink frees everything from the new end to DMU_OBJECT_END. */
	dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		mutex_exit(&zvol_state_lock);
		return (error);
	}

	error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
	    &volsize, tx);
	if (error == 0) {
		error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, volsize,
		    DMU_OBJECT_END, tx);
	}

	dmu_tx_commit(tx);

	/* Only advertise the new size once everything above succeeded. */
	if (error == 0) {
		zv->zv_volsize = volsize;
		zvol_size_changed(zv, dev);
	}

	mutex_exit(&zvol_state_lock);

	return (error);
}
549 
550 int
551 zvol_set_volblocksize(const char *name, uint64_t volblocksize)
552 {
553 	zvol_state_t *zv;
554 	dmu_tx_t *tx;
555 	int error;
556 
557 	mutex_enter(&zvol_state_lock);
558 
559 	if ((zv = zvol_minor_lookup(name)) == NULL) {
560 		mutex_exit(&zvol_state_lock);
561 		return (ENXIO);
562 	}
563 
564 	if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) {
565 		mutex_exit(&zvol_state_lock);
566 		return (EROFS);
567 	}
568 
569 	tx = dmu_tx_create(zv->zv_objset);
570 	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
571 	error = dmu_tx_assign(tx, TXG_WAIT);
572 	if (error) {
573 		dmu_tx_abort(tx);
574 	} else {
575 		error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ,
576 		    volblocksize, 0, tx);
577 		if (error == ENOTSUP)
578 			error = EBUSY;
579 		dmu_tx_commit(tx);
580 	}
581 
582 	mutex_exit(&zvol_state_lock);
583 
584 	return (error);
585 }
586 
/*ARGSUSED*/
int
zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
{
	minor_t minor = getminor(*devp);
	zvol_state_t *zv;

	if (minor == 0)			/* This is the control device */
		return (0);

	mutex_enter(&zvol_state_lock);

	zv = ddi_get_soft_state(zvol_state, minor);
	if (zv == NULL) {
		mutex_exit(&zvol_state_lock);
		return (ENXIO);
	}

	ASSERT(zv->zv_objset != NULL);

	/* Disallow writable opens of read-only volumes. */
	if ((flag & FWRITE) &&
	    (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY))) {
		mutex_exit(&zvol_state_lock);
		return (EROFS);
	}

	/*
	 * Count each otyp only once, except OTYP_LYR which is counted on
	 * every call.  NOTE(review): presumably this mirrors how specfs
	 * delivers closes per otyp — confirm against zvol_close().
	 */
	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
		zv->zv_open_count[otyp]++;
		zv->zv_total_opens++;
	}

	mutex_exit(&zvol_state_lock);

	return (0);
}
622 
/*ARGSUSED*/
int
zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;

	if (minor == 0)		/* This is the control device */
		return (0);

	mutex_enter(&zvol_state_lock);

	zv = ddi_get_soft_state(zvol_state, minor);
	if (zv == NULL) {
		mutex_exit(&zvol_state_lock);
		return (ENXIO);
	}

	/*
	 * The next statement is a workaround for the following DDI bug:
	 * 6343604 specfs race: multiple "last-close" of the same device
	 */
	if (zv->zv_total_opens == 0) {
		mutex_exit(&zvol_state_lock);
		return (0);
	}

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_open_count[otyp] != 0);
	ASSERT(zv->zv_total_opens != 0);

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count[otyp]--;
	zv->zv_total_opens--;

	mutex_exit(&zvol_state_lock);

	return (0);
}
667 
668 /*
669  * Create and return an immediate write ZIL transaction.
670  */
671 itx_t *
672 zvol_immediate_itx(offset_t off, ssize_t len, char *addr)
673 {
674 	itx_t *itx;
675 	lr_write_t *lr;
676 
677 	itx = zil_itx_create(TX_WRITE, sizeof (*lr) + len);
678 	lr = (lr_write_t *)&itx->itx_lr;
679 	lr->lr_foid = ZVOL_OBJ;
680 	lr->lr_offset = off;
681 	lr->lr_length = len;
682 	lr->lr_blkoff = 0;
683 	BP_ZERO(&lr->lr_blkptr);
684 	bcopy(addr, (char *)itx + offsetof(itx_t, itx_lr) +
685 	    sizeof (*lr), len);
686 	itx->itx_wr_state = WR_COPIED;
687 	return (itx);
688 }
689 
690 /*
691  * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
692  *
693  * We store data in the log buffers if it's small enough.
694  * Otherwise we flush the data out via dmu_sync().
695  */
696 ssize_t zvol_immediate_write_sz = 65536;
697 
698 int
699 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len,
700     char *addr)
701 {
702 	dmu_object_info_t doi;
703 	ssize_t nbytes;
704 	itx_t *itx;
705 	lr_write_t *lr;
706 	objset_t *os;
707 	dmu_buf_t *db;
708 	uint64_t txg;
709 	uint64_t boff;
710 	int error;
711 	uint32_t blocksize;
712 
713 	/* handle common case */
714 	if (len <= zvol_immediate_write_sz) {
715 		itx = zvol_immediate_itx(off, len, addr);
716 		(void) zil_itx_assign(zv->zv_zilog, itx, tx);
717 		return (0);
718 	}
719 
720 	txg = dmu_tx_get_txg(tx);
721 	os = zv->zv_objset;
722 
723 	/*
724 	 * We need to dmu_sync() each block in the range.
725 	 * For this we need the blocksize.
726 	 */
727 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
728 	if (error)
729 		return (error);
730 	blocksize = doi.doi_data_block_size;
731 
732 	/*
733 	 * We need to immediate write or dmu_sync() each block in the range.
734 	 */
735 	while (len) {
736 		nbytes = MIN(len, blocksize - P2PHASE(off, blocksize));
737 		if (nbytes <= zvol_immediate_write_sz) {
738 			itx = zvol_immediate_itx(off, nbytes, addr);
739 		} else {
740 			boff =  P2ALIGN_TYPED(off, blocksize, uint64_t);
741 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
742 			lr = (lr_write_t *)&itx->itx_lr;
743 			lr->lr_foid = ZVOL_OBJ;
744 			lr->lr_offset = off;
745 			lr->lr_length = nbytes;
746 			lr->lr_blkoff = off - boff;
747 			BP_ZERO(&lr->lr_blkptr);
748 
749 			/* XXX - we should do these IOs in parallel */
750 			VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, boff,
751 			    FTAG, &db));
752 			ASSERT(boff == db->db_offset);
753 			error = dmu_sync(NULL, db, &lr->lr_blkptr,
754 			    txg, NULL, NULL);
755 			dmu_buf_rele(db, FTAG);
756 			if (error) {
757 				kmem_free(itx, offsetof(itx_t, itx_lr));
758 				return (error);
759 			}
760 			itx->itx_wr_state = WR_COPIED;
761 		}
762 		(void) zil_itx_assign(zv->zv_zilog, itx, tx);
763 		len -= nbytes;
764 		off += nbytes;
765 	}
766 	return (0);
767 }
768 
/*
 * Block-device strategy routine: translate the buf's block range into
 * DMU reads or transactional writes against ZVOL_OBJ, clipping at the
 * end of the volume.  Synchronous writes are committed through the ZIL
 * (or, on ZIL failure, by waiting for the txg to sync).
 */
int
zvol_strategy(buf_t *bp)
{
	zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev));
	uint64_t off, volsize;
	size_t size, resid;
	char *addr;
	objset_t *os;
	int error = 0;
	int sync;
	int reading;
	int txg_sync_needed = B_FALSE;

	if (zv == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	/* No I/O on the control device (minor 0). */
	if (getminor(bp->b_edev) == 0) {
		bioerror(bp, EINVAL);
		biodone(bp);
		return (0);
	}

	if ((zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) &&
	    !(bp->b_flags & B_READ)) {
		bioerror(bp, EROFS);
		biodone(bp);
		return (0);
	}

	off = ldbtob(bp->b_blkno);
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);
	/* Synchronous semantics unless async I/O or the ZIL is disabled. */
	sync = !(bp->b_flags & B_ASYNC) && !(zil_disable);

	bp_mapin(bp);
	addr = bp->b_un.b_addr;
	resid = bp->b_bcount;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 * A better approach than a per zvol rwlock would be to lock ranges.
	 */
	reading = bp->b_flags & B_READ;
	if (reading || resid <= zvol_immediate_write_sz)
		rw_enter(&zv->zv_dslock, RW_READER);
	else
		rw_enter(&zv->zv_dslock, RW_WRITER);

	while (resid != 0 && off < volsize) {

		size = MIN(resid, 1UL << 20);	/* cap at 1MB per tx */

		if (size > volsize - off)	/* don't write past the end */
			size = volsize - off;

		if (reading) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				if (sync) {
					/* use the ZIL to commit this write */
					if (zvol_log_write(zv, tx, off, size,
					    addr) != 0) {
						/* fall back to txg sync */
						txg_sync_needed = B_TRUE;
					}
				}
				dmu_tx_commit(tx);
			}
		}
		if (error)
			break;
		off += size;
		addr += size;
		resid -= size;
	}
	rw_exit(&zv->zv_dslock);

	/* Report an error only if nothing at all was transferred. */
	if ((bp->b_resid = resid) == bp->b_bcount)
		bioerror(bp, off > volsize ? EINVAL : error);

	biodone(bp);

	/* Make synchronous writes durable before returning. */
	if (sync) {
		if (txg_sync_needed)
			txg_wait_synced(dmu_objset_pool(os), 0);
		else
			zil_commit(zv->zv_zilog, UINT64_MAX, 0);
	}

	return (0);
}
872 
/*ARGSUSED*/
int
zvol_read(dev_t dev, uio_t *uiop, cred_t *cr)
{
	/* Character-device read: route through zvol_strategy() via physio. */
	return (physio(zvol_strategy, NULL, dev, B_READ, minphys, uiop));
}
879 
/*ARGSUSED*/
int
zvol_write(dev_t dev, uio_t *uiop, cred_t *cr)
{
	/* Character-device write: route through zvol_strategy() via physio. */
	return (physio(zvol_strategy, NULL, dev, B_WRITE, minphys, uiop));
}
886 
/*ARGSUSED*/
int
zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr)
{
	/* Async read: same path as zvol_read() but via aphysio. */
	return (aphysio(zvol_strategy, anocancel, dev, B_READ, minphys, aio));
}
893 
/*ARGSUSED*/
int
zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr)
{
	/* Async write: same path as zvol_write() but via aphysio. */
	return (aphysio(zvol_strategy, anocancel, dev, B_WRITE, minphys, aio));
}
900 
901 /*
902  * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
903  */
904 /*ARGSUSED*/
905 int
906 zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
907 {
908 	zvol_state_t *zv;
909 	struct dk_cinfo dkc;
910 	struct dk_minfo dkm;
911 	dk_efi_t efi;
912 	struct uuid uuid = EFI_RESERVED;
913 	uint32_t crc;
914 	int error = 0;
915 
916 	mutex_enter(&zvol_state_lock);
917 
918 	zv = ddi_get_soft_state(zvol_state, getminor(dev));
919 
920 	if (zv == NULL) {
921 		mutex_exit(&zvol_state_lock);
922 		return (ENXIO);
923 	}
924 
925 	switch (cmd) {
926 
927 	case DKIOCINFO:
928 		bzero(&dkc, sizeof (dkc));
929 		(void) strcpy(dkc.dki_cname, "zvol");
930 		(void) strcpy(dkc.dki_dname, "zvol");
931 		dkc.dki_ctype = DKC_UNKNOWN;
932 		dkc.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
933 		mutex_exit(&zvol_state_lock);
934 		if (ddi_copyout(&dkc, (void *)arg, sizeof (dkc), flag))
935 			error = EFAULT;
936 		return (error);
937 
938 	case DKIOCGMEDIAINFO:
939 		bzero(&dkm, sizeof (dkm));
940 		dkm.dki_lbsize = 1U << zv->zv_min_bs;
941 		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
942 		dkm.dki_media_type = DK_UNKNOWN;
943 		mutex_exit(&zvol_state_lock);
944 		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
945 			error = EFAULT;
946 		return (error);
947 
948 	case DKIOCGETEFI:
949 		if (ddi_copyin((void *)arg, &efi, sizeof (dk_efi_t), flag)) {
950 			mutex_exit(&zvol_state_lock);
951 			return (EFAULT);
952 		}
953 		efi.dki_data = (void *)(uintptr_t)efi.dki_data_64;
954 
955 		/*
956 		 * Some clients may attempt to request a PMBR for the
957 		 * zvol.  Currently this interface will return ENOTTY to
958 		 * such requests.  These requests could be supported by
959 		 * adding a check for lba == 0 and consing up an appropriate
960 		 * RMBR.
961 		 */
962 		if (efi.dki_lba == 1) {
963 			efi_gpt_t gpt;
964 			efi_gpe_t gpe;
965 
966 			bzero(&gpt, sizeof (gpt));
967 			bzero(&gpe, sizeof (gpe));
968 
969 			if (efi.dki_length < sizeof (gpt)) {
970 				mutex_exit(&zvol_state_lock);
971 				return (EINVAL);
972 			}
973 
974 			gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
975 			gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
976 			gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
977 			gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
978 			gpt.efi_gpt_LastUsableLBA =
979 			    LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1);
980 			gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
981 			gpt.efi_gpt_PartitionEntryLBA = LE_32(2);
982 			gpt.efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (gpe));
983 
984 			UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
985 			gpe.efi_gpe_StartingLBA = gpt.efi_gpt_FirstUsableLBA;
986 			gpe.efi_gpe_EndingLBA = gpt.efi_gpt_LastUsableLBA;
987 
988 			CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
989 			gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
990 
991 			CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
992 			gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
993 
994 			mutex_exit(&zvol_state_lock);
995 			if (ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), flag))
996 				error = EFAULT;
997 		} else if (efi.dki_lba == 2) {
998 			efi_gpe_t gpe;
999 
1000 			bzero(&gpe, sizeof (gpe));
1001 
1002 			if (efi.dki_length < sizeof (gpe)) {
1003 				mutex_exit(&zvol_state_lock);
1004 				return (EINVAL);
1005 			}
1006 
1007 			UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);
1008 			gpe.efi_gpe_StartingLBA = LE_64(34ULL);
1009 			gpe.efi_gpe_EndingLBA =
1010 			    LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1);
1011 
1012 			mutex_exit(&zvol_state_lock);
1013 			if (ddi_copyout(&gpe, efi.dki_data, sizeof (gpe), flag))
1014 				error = EFAULT;
1015 		} else {
1016 			mutex_exit(&zvol_state_lock);
1017 			error = EINVAL;
1018 		}
1019 		return (error);
1020 
1021 	default:
1022 		error = ENOTTY;
1023 		break;
1024 
1025 	}
1026 	mutex_exit(&zvol_state_lock);
1027 	return (error);
1028 }
1029 
1030 int
1031 zvol_busy(void)
1032 {
1033 	return (zvol_minors != 0);
1034 }
1035 
/*
 * Module initialization: set up the per-minor soft-state allocator and
 * the global state lock.
 */
void
zvol_init(void)
{
	VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0);
	mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
}
1042 
/*
 * Module teardown: release the resources created by zvol_init().
 */
void
zvol_fini(void)
{
	mutex_destroy(&zvol_state_lock);
	ddi_soft_state_fini(&zvol_state);
}
1049