xref: /titanic_53/usr/src/uts/sun4v/io/vds.c (revision f0ca1d9a12d54d304791bc74525e2010ca924726)
11ae08745Sheppo /*
21ae08745Sheppo  * CDDL HEADER START
31ae08745Sheppo  *
41ae08745Sheppo  * The contents of this file are subject to the terms of the
51ae08745Sheppo  * Common Development and Distribution License (the "License").
61ae08745Sheppo  * You may not use this file except in compliance with the License.
71ae08745Sheppo  *
81ae08745Sheppo  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
91ae08745Sheppo  * or http://www.opensolaris.org/os/licensing.
101ae08745Sheppo  * See the License for the specific language governing permissions
111ae08745Sheppo  * and limitations under the License.
121ae08745Sheppo  *
131ae08745Sheppo  * When distributing Covered Code, include this CDDL HEADER in each
141ae08745Sheppo  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
151ae08745Sheppo  * If applicable, add the following below this CDDL HEADER, with the
161ae08745Sheppo  * fields enclosed by brackets "[]" replaced with your own identifying
171ae08745Sheppo  * information: Portions Copyright [yyyy] [name of copyright owner]
181ae08745Sheppo  *
191ae08745Sheppo  * CDDL HEADER END
201ae08745Sheppo  */
211ae08745Sheppo 
221ae08745Sheppo /*
23edcc0754Sachartre  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
241ae08745Sheppo  * Use is subject to license terms.
251ae08745Sheppo  */
261ae08745Sheppo 
271ae08745Sheppo #pragma ident	"%Z%%M%	%I%	%E% SMI"
281ae08745Sheppo 
291ae08745Sheppo /*
301ae08745Sheppo  * Virtual disk server
311ae08745Sheppo  */
321ae08745Sheppo 
331ae08745Sheppo 
341ae08745Sheppo #include <sys/types.h>
351ae08745Sheppo #include <sys/conf.h>
364bac2208Snarayan #include <sys/crc32.h>
371ae08745Sheppo #include <sys/ddi.h>
381ae08745Sheppo #include <sys/dkio.h>
391ae08745Sheppo #include <sys/file.h>
4017cadca8Slm66018 #include <sys/fs/hsfs_isospec.h>
411ae08745Sheppo #include <sys/mdeg.h>
422f5224aeSachartre #include <sys/mhd.h>
431ae08745Sheppo #include <sys/modhash.h>
441ae08745Sheppo #include <sys/note.h>
451ae08745Sheppo #include <sys/pathname.h>
46205eeb1aSlm66018 #include <sys/sdt.h>
471ae08745Sheppo #include <sys/sunddi.h>
481ae08745Sheppo #include <sys/sunldi.h>
491ae08745Sheppo #include <sys/sysmacros.h>
501ae08745Sheppo #include <sys/vio_common.h>
5117cadca8Slm66018 #include <sys/vio_util.h>
521ae08745Sheppo #include <sys/vdsk_mailbox.h>
531ae08745Sheppo #include <sys/vdsk_common.h>
541ae08745Sheppo #include <sys/vtoc.h>
553c96341aSnarayan #include <sys/vfs.h>
563c96341aSnarayan #include <sys/stat.h>
5787a7269eSachartre #include <sys/scsi/impl/uscsi.h>
58690555a1Sachartre #include <vm/seg_map.h>
591ae08745Sheppo 
/* Initialization flags for a virtual disk server (vds) instance */
#define	VDS_LDI			0x01
#define	VDS_MDEG		0x02

/* Default values for the virtual disk server tunable parameters */
#define	VDS_RETRIES		5
#define	VDS_LDC_DELAY		1000		/* 1 msec */
#define	VDS_DEV_DELAY		10000000	/* 10 secs */
#define	VDS_NCHAINS		32

/*
 * Names used to identify the server and its disks: in the machine
 * description (MD), in synthetic dkio(7i) structures, etc.
 */
#define	VDS_NAME		"virtual-disk-server"

#define	VD_NAME			"vd"
#define	VD_VOLUME_NAME		"vdisk"
#define	VD_ASCIILABEL		"Virtual Disk"

#define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
#define	VD_ID_PROP		"id"
#define	VD_BLOCK_DEVICE_PROP	"vds-block-device"
#define	VD_BLOCK_DEVICE_OPTS	"vds-block-device-opts"
#define	VD_REG_PROP		"reg"

/* Initialization flags for a single virtual disk (vd) instance */
#define	VD_DISK_READY		0x01
#define	VD_LOCKING		0x02
#define	VD_LDC			0x04
#define	VD_DRING		0x08
#define	VD_SID			0x10
#define	VD_SEQ_NUM		0x20
#define	VD_SETUP_ERROR		0x40

/* Default flags used when writing to a vdisk which is backed by a file */
#define	VD_FILE_WRITE_FLAGS	SM_ASYNC

/* Number of backup labels */
#define	VD_FILE_NUM_BACKUP	5

/* Default timeout for SCSI I/O, in seconds */
#define	VD_SCSI_RDWR_TIMEOUT	30

/* Maximum number of logical partitions */
#define	VD_MAXPART	(NDKMAP + 1)

/*
 * Index of the slice representing the entire disk.  Using slice/partition 2
 * for this purpose is a Solaris convention; unfortunately, the convention
 * does not appear to be codified anywhere.
 */
#define	VD_ENTIRE_DISK_SLICE	2

/*
 * Turn a cpp token into a string literal.  The argument is stringized
 * as written, i.e. without being macro-expanded first.
 */
#define	STRINGIZE(token)	#token
1121ae08745Sheppo 
/*
 * Message logging helpers.
 *
 * PRN(fmt, ...) prints a CE_CONT message prefixed with the name of the
 * calling function to the message log (and optionally to the console for
 * verbose boots).
 *
 * The two macros work as a pair and rely on cpp's concatenation of adjacent
 * string literals and on C99 variable-length-argument-list macros:
 *  - PRN glues the "?%s():  " prefix onto the caller's format string and
 *    appends a dummy "" argument;
 *  - _PRN appends a matching "%s" to the format and supplies __func__ for
 *    the prefix, so its __VA_ARGS__ is never empty, even when the caller
 *    passed nothing but a format string.
 */
#define	_PRN(format, ...)					\
	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)
#define	PRN(...)	_PRN("?%s():  "__VA_ARGS__, "")
1221ae08745Sheppo 
/*
 * Return a pointer to the "i"th vdisk dring element.
 *
 * Note that this macro expects a variable named "vd" to be in scope at the
 * point of use: the element address is computed from vd->dring and
 * vd->descriptor_size.
 */
#define	VD_DRING_ELEM(i)						\
	((vd_dring_entry_t *)(void *)(vd->dring + (i) * vd->descriptor_size))

/*
 * Return the virtual disk client's type as a string (for use in messages),
 * based on the transfer mode recorded in (vd)->xfer_mode.
 */
#define	VD_CLIENT(vd)							\
	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
	(((vd)->xfer_mode == VIO_DRING_MODE_V1_0) ? "dring client" :	\
	(((vd)->xfer_mode == 0) ? "null client" : "unsupported client")))
1331ae08745Sheppo 
/*
 * Read the disk label from a disk on file.
 *
 * Transfers sizeof (struct dk_label) bytes, starting at absolute block 0
 * of the disk image (VD_SLICE_NONE), into the buffer "labelp".  Evaluates
 * to the return value of vd_file_rw().  Both arguments are parenthesized
 * in the expansion so that any expression can safely be passed (the cast
 * would otherwise bind only to the first operand of e.g. "p + 1").
 */
#define	VD_FILE_LABEL_READ(vd, labelp) \
	vd_file_rw((vd), VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)(labelp), \
	    0, sizeof (struct dk_label))

/*
 * Write the disk label to a disk on file.
 *
 * Transfers sizeof (struct dk_label) bytes from the buffer "labelp" to
 * absolute block 0 of the disk image (VD_SLICE_NONE).  Evaluates to the
 * return value of vd_file_rw().
 */
#define	VD_FILE_LABEL_WRITE(vd, labelp)	\
	vd_file_rw((vd), VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)(labelp), \
	    0, sizeof (struct dk_label))

/*
 * Message for disk access rights reset failure; the single %s conversion
 * is the disk the message is about.
 */
#define	VD_RESET_ACCESS_FAILURE_MSG \
	"Fail to reset disk access rights for disk %s"
1472f5224aeSachartre 
148445b4c2eSsb155480 /*
149445b4c2eSsb155480  * Specification of an MD node passed to the MDEG to filter any
150445b4c2eSsb155480  * 'vport' nodes that do not belong to the specified node. This
151445b4c2eSsb155480  * template is copied for each vds instance and filled in with
152445b4c2eSsb155480  * the appropriate 'cfg-handle' value before being passed to the MDEG.
153445b4c2eSsb155480  */
154445b4c2eSsb155480 static mdeg_prop_spec_t	vds_prop_template[] = {
155445b4c2eSsb155480 	{ MDET_PROP_STR,	"name",		VDS_NAME },
156445b4c2eSsb155480 	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
157445b4c2eSsb155480 	{ MDET_LIST_END,	NULL, 		NULL }
158445b4c2eSsb155480 };
159445b4c2eSsb155480 
160445b4c2eSsb155480 #define	VDS_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val);
161445b4c2eSsb155480 
162445b4c2eSsb155480 /*
163445b4c2eSsb155480  * Matching criteria passed to the MDEG to register interest
164445b4c2eSsb155480  * in changes to 'virtual-device-port' nodes identified by their
165445b4c2eSsb155480  * 'id' property.
166445b4c2eSsb155480  */
167445b4c2eSsb155480 static md_prop_match_t	vd_prop_match[] = {
168445b4c2eSsb155480 	{ MDET_PROP_VAL,	VD_ID_PROP },
169445b4c2eSsb155480 	{ MDET_LIST_END,	NULL }
170445b4c2eSsb155480 };
171445b4c2eSsb155480 
172445b4c2eSsb155480 static mdeg_node_match_t vd_match = {"virtual-device-port",
173445b4c2eSsb155480 				    vd_prop_match};
174445b4c2eSsb155480 
175047ba61eSachartre /*
176047ba61eSachartre  * Options for the VD_BLOCK_DEVICE_OPTS property.
177047ba61eSachartre  */
178047ba61eSachartre #define	VD_OPT_RDONLY		0x1	/* read-only  */
179047ba61eSachartre #define	VD_OPT_SLICE		0x2	/* single slice */
180047ba61eSachartre #define	VD_OPT_EXCLUSIVE	0x4	/* exclusive access */
181047ba61eSachartre 
182047ba61eSachartre #define	VD_OPTION_NLEN	128
183047ba61eSachartre 
184047ba61eSachartre typedef struct vd_option {
185047ba61eSachartre 	char vdo_name[VD_OPTION_NLEN];
186047ba61eSachartre 	uint64_t vdo_value;
187047ba61eSachartre } vd_option_t;
188047ba61eSachartre 
189047ba61eSachartre vd_option_t vd_bdev_options[] = {
190047ba61eSachartre 	{ "ro",		VD_OPT_RDONLY },
191047ba61eSachartre 	{ "slice", 	VD_OPT_SLICE },
192047ba61eSachartre 	{ "excl",	VD_OPT_EXCLUSIVE }
193047ba61eSachartre };
194047ba61eSachartre 
1951ae08745Sheppo /* Debugging macros */
1961ae08745Sheppo #ifdef DEBUG
1973af08d82Slm66018 
1983af08d82Slm66018 static int	vd_msglevel = 0;
1993af08d82Slm66018 
2001ae08745Sheppo #define	PR0 if (vd_msglevel > 0)	PRN
2011ae08745Sheppo #define	PR1 if (vd_msglevel > 1)	PRN
2021ae08745Sheppo #define	PR2 if (vd_msglevel > 2)	PRN
2031ae08745Sheppo 
2041ae08745Sheppo #define	VD_DUMP_DRING_ELEM(elem)					\
2053c96341aSnarayan 	PR0("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
2061ae08745Sheppo 	    elem->hdr.dstate,						\
2071ae08745Sheppo 	    elem->payload.operation,					\
2081ae08745Sheppo 	    elem->payload.status,					\
2091ae08745Sheppo 	    elem->payload.nbytes,					\
2101ae08745Sheppo 	    elem->payload.addr,						\
2111ae08745Sheppo 	    elem->payload.ncookies);
2121ae08745Sheppo 
2133af08d82Slm66018 char *
2143af08d82Slm66018 vd_decode_state(int state)
2153af08d82Slm66018 {
2163af08d82Slm66018 	char *str;
2173af08d82Slm66018 
2183af08d82Slm66018 #define	CASE_STATE(_s)	case _s: str = #_s; break;
2193af08d82Slm66018 
2203af08d82Slm66018 	switch (state) {
2213af08d82Slm66018 	CASE_STATE(VD_STATE_INIT)
2223af08d82Slm66018 	CASE_STATE(VD_STATE_VER)
2233af08d82Slm66018 	CASE_STATE(VD_STATE_ATTR)
2243af08d82Slm66018 	CASE_STATE(VD_STATE_DRING)
2253af08d82Slm66018 	CASE_STATE(VD_STATE_RDX)
2263af08d82Slm66018 	CASE_STATE(VD_STATE_DATA)
2273af08d82Slm66018 	default: str = "unknown"; break;
2283af08d82Slm66018 	}
2293af08d82Slm66018 
2303af08d82Slm66018 #undef CASE_STATE
2313af08d82Slm66018 
2323af08d82Slm66018 	return (str);
2333af08d82Slm66018 }
2343af08d82Slm66018 
2353af08d82Slm66018 void
2363af08d82Slm66018 vd_decode_tag(vio_msg_t *msg)
2373af08d82Slm66018 {
2383af08d82Slm66018 	char *tstr, *sstr, *estr;
2393af08d82Slm66018 
2403af08d82Slm66018 #define	CASE_TYPE(_s)	case _s: tstr = #_s; break;
2413af08d82Slm66018 
2423af08d82Slm66018 	switch (msg->tag.vio_msgtype) {
2433af08d82Slm66018 	CASE_TYPE(VIO_TYPE_CTRL)
2443af08d82Slm66018 	CASE_TYPE(VIO_TYPE_DATA)
2453af08d82Slm66018 	CASE_TYPE(VIO_TYPE_ERR)
2463af08d82Slm66018 	default: tstr = "unknown"; break;
2473af08d82Slm66018 	}
2483af08d82Slm66018 
2493af08d82Slm66018 #undef CASE_TYPE
2503af08d82Slm66018 
2513af08d82Slm66018 #define	CASE_SUBTYPE(_s) case _s: sstr = #_s; break;
2523af08d82Slm66018 
2533af08d82Slm66018 	switch (msg->tag.vio_subtype) {
2543af08d82Slm66018 	CASE_SUBTYPE(VIO_SUBTYPE_INFO)
2553af08d82Slm66018 	CASE_SUBTYPE(VIO_SUBTYPE_ACK)
2563af08d82Slm66018 	CASE_SUBTYPE(VIO_SUBTYPE_NACK)
2573af08d82Slm66018 	default: sstr = "unknown"; break;
2583af08d82Slm66018 	}
2593af08d82Slm66018 
2603af08d82Slm66018 #undef CASE_SUBTYPE
2613af08d82Slm66018 
2623af08d82Slm66018 #define	CASE_ENV(_s)	case _s: estr = #_s; break;
2633af08d82Slm66018 
2643af08d82Slm66018 	switch (msg->tag.vio_subtype_env) {
2653af08d82Slm66018 	CASE_ENV(VIO_VER_INFO)
2663af08d82Slm66018 	CASE_ENV(VIO_ATTR_INFO)
2673af08d82Slm66018 	CASE_ENV(VIO_DRING_REG)
2683af08d82Slm66018 	CASE_ENV(VIO_DRING_UNREG)
2693af08d82Slm66018 	CASE_ENV(VIO_RDX)
2703af08d82Slm66018 	CASE_ENV(VIO_PKT_DATA)
2713af08d82Slm66018 	CASE_ENV(VIO_DESC_DATA)
2723af08d82Slm66018 	CASE_ENV(VIO_DRING_DATA)
2733af08d82Slm66018 	default: estr = "unknown"; break;
2743af08d82Slm66018 	}
2753af08d82Slm66018 
2763af08d82Slm66018 #undef CASE_ENV
2773af08d82Slm66018 
2783af08d82Slm66018 	PR1("(%x/%x/%x) message : (%s/%s/%s)",
2793af08d82Slm66018 	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
2803af08d82Slm66018 	    msg->tag.vio_subtype_env, tstr, sstr, estr);
2813af08d82Slm66018 }
2823af08d82Slm66018 
2831ae08745Sheppo #else	/* !DEBUG */
2843af08d82Slm66018 
2851ae08745Sheppo #define	PR0(...)
2861ae08745Sheppo #define	PR1(...)
2871ae08745Sheppo #define	PR2(...)
2881ae08745Sheppo 
2891ae08745Sheppo #define	VD_DUMP_DRING_ELEM(elem)
2901ae08745Sheppo 
2913af08d82Slm66018 #define	vd_decode_state(_s)	(NULL)
2923af08d82Slm66018 #define	vd_decode_tag(_s)	(NULL)
2933af08d82Slm66018 
2941ae08745Sheppo #endif	/* DEBUG */
2951ae08745Sheppo 
2961ae08745Sheppo 
297d10e4ef2Snarayan /*
298d10e4ef2Snarayan  * Soft state structure for a vds instance
299d10e4ef2Snarayan  */
3001ae08745Sheppo typedef struct vds {
3011ae08745Sheppo 	uint_t		initialized;	/* driver inst initialization flags */
3021ae08745Sheppo 	dev_info_t	*dip;		/* driver inst devinfo pointer */
3031ae08745Sheppo 	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
3041ae08745Sheppo 	mod_hash_t	*vd_table;	/* table of virtual disks served */
305445b4c2eSsb155480 	mdeg_node_spec_t *ispecp;	/* mdeg node specification */
3061ae08745Sheppo 	mdeg_handle_t	mdeg;		/* handle for MDEG operations  */
3071ae08745Sheppo } vds_t;
3081ae08745Sheppo 
/*
 * Kinds of descriptor-processing tasks: a task is either for the last
 * descriptor of a range of descriptors, or for one of the descriptors
 * preceding it.
 */
typedef enum vd_task_type {
	/* task for intermediate descriptor in range */
	VD_NONFINAL_RANGE_TASK,
	/* task for last in a range of descriptors */
	VD_FINAL_RANGE_TASK
} vd_task_type_t;
316d10e4ef2Snarayan 
317d10e4ef2Snarayan /*
318d10e4ef2Snarayan  * Structure describing the task for processing a descriptor
319d10e4ef2Snarayan  */
320d10e4ef2Snarayan typedef struct vd_task {
321d10e4ef2Snarayan 	struct vd		*vd;		/* vd instance task is for */
322d10e4ef2Snarayan 	vd_task_type_t		type;		/* type of descriptor task */
323d10e4ef2Snarayan 	int			index;		/* dring elem index for task */
324d10e4ef2Snarayan 	vio_msg_t		*msg;		/* VIO message task is for */
325d10e4ef2Snarayan 	size_t			msglen;		/* length of message content */
326d10e4ef2Snarayan 	vd_dring_payload_t	*request;	/* request task will perform */
327d10e4ef2Snarayan 	struct buf		buf;		/* buf(9s) for I/O request */
3284bac2208Snarayan 	ldc_mem_handle_t	mhdl;		/* task memory handle */
329205eeb1aSlm66018 	int			status;		/* status of processing task */
330205eeb1aSlm66018 	int	(*completef)(struct vd_task *task); /* completion func ptr */
331d10e4ef2Snarayan } vd_task_t;
332d10e4ef2Snarayan 
333d10e4ef2Snarayan /*
334d10e4ef2Snarayan  * Soft state structure for a virtual disk instance
335d10e4ef2Snarayan  */
3361ae08745Sheppo typedef struct vd {
3371ae08745Sheppo 	uint_t			initialized;	/* vdisk initialization flags */
33817cadca8Slm66018 	uint64_t		operations;	/* bitmask of VD_OPs exported */
33917cadca8Slm66018 	vio_ver_t		version;	/* ver negotiated with client */
3401ae08745Sheppo 	vds_t			*vds;		/* server for this vdisk */
341d10e4ef2Snarayan 	ddi_taskq_t		*startq;	/* queue for I/O start tasks */
342d10e4ef2Snarayan 	ddi_taskq_t		*completionq;	/* queue for completion tasks */
3431ae08745Sheppo 	ldi_handle_t		ldi_handle[V_NUMPAR];	/* LDI slice handles */
3443c96341aSnarayan 	char			device_path[MAXPATHLEN + 1]; /* vdisk device */
3451ae08745Sheppo 	dev_t			dev[V_NUMPAR];	/* dev numbers for slices */
346047ba61eSachartre 	int			open_flags;	/* open flags */
347e1ebb9ecSlm66018 	uint_t			nslices;	/* number of slices */
3481ae08745Sheppo 	size_t			vdisk_size;	/* number of blocks in vdisk */
34917cadca8Slm66018 	size_t			vdisk_block_size; /* size of each vdisk block */
3501ae08745Sheppo 	vd_disk_type_t		vdisk_type;	/* slice or entire disk */
3514bac2208Snarayan 	vd_disk_label_t		vdisk_label;	/* EFI or VTOC label */
35217cadca8Slm66018 	vd_media_t		vdisk_media;	/* media type of backing dev. */
35317cadca8Slm66018 	boolean_t		is_atapi_dev;	/* Is this an IDE CD-ROM dev? */
354e1ebb9ecSlm66018 	ushort_t		max_xfer_sz;	/* max xfer size in DEV_BSIZE */
35517cadca8Slm66018 	size_t			block_size;	/* blk size of actual device */
3561ae08745Sheppo 	boolean_t		pseudo;		/* underlying pseudo dev */
35717cadca8Slm66018 	boolean_t		file;		/* is vDisk backed by a file? */
3582f5224aeSachartre 	boolean_t		scsi;		/* is vDisk backed by scsi? */
3593c96341aSnarayan 	vnode_t			*file_vnode;	/* file vnode */
3603c96341aSnarayan 	size_t			file_size;	/* file size */
36187a7269eSachartre 	ddi_devid_t		file_devid;	/* devid for disk image */
362edcc0754Sachartre 	efi_gpt_t		efi_gpt;	/* EFI GPT for slice type */
363edcc0754Sachartre 	efi_gpe_t		efi_gpe;	/* EFI GPE for slice type */
364edcc0754Sachartre 	int			efi_reserved;	/* EFI reserved slice */
3651ae08745Sheppo 	struct dk_geom		dk_geom;	/* synthetic for slice type */
3661ae08745Sheppo 	struct vtoc		vtoc;		/* synthetic for slice type */
367edcc0754Sachartre 	vd_slice_t		slices[VD_MAXPART]; /* logical partitions */
3682f5224aeSachartre 	boolean_t		ownership;	/* disk ownership status */
3691ae08745Sheppo 	ldc_status_t		ldc_state;	/* LDC connection state */
3701ae08745Sheppo 	ldc_handle_t		ldc_handle;	/* handle for LDC comm */
3711ae08745Sheppo 	size_t			max_msglen;	/* largest LDC message len */
3721ae08745Sheppo 	vd_state_t		state;		/* client handshake state */
3731ae08745Sheppo 	uint8_t			xfer_mode;	/* transfer mode with client */
3741ae08745Sheppo 	uint32_t		sid;		/* client's session ID */
3751ae08745Sheppo 	uint64_t		seq_num;	/* message sequence number */
3761ae08745Sheppo 	uint64_t		dring_ident;	/* identifier of dring */
3771ae08745Sheppo 	ldc_dring_handle_t	dring_handle;	/* handle for dring ops */
3781ae08745Sheppo 	uint32_t		descriptor_size;	/* num bytes in desc */
3791ae08745Sheppo 	uint32_t		dring_len;	/* number of dring elements */
3801ae08745Sheppo 	caddr_t			dring;		/* address of dring */
3813af08d82Slm66018 	caddr_t			vio_msgp;	/* vio msg staging buffer */
382d10e4ef2Snarayan 	vd_task_t		inband_task;	/* task for inband descriptor */
383d10e4ef2Snarayan 	vd_task_t		*dring_task;	/* tasks dring elements */
384d10e4ef2Snarayan 
385d10e4ef2Snarayan 	kmutex_t		lock;		/* protects variables below */
386d10e4ef2Snarayan 	boolean_t		enabled;	/* is vdisk enabled? */
387d10e4ef2Snarayan 	boolean_t		reset_state;	/* reset connection state? */
388d10e4ef2Snarayan 	boolean_t		reset_ldc;	/* reset LDC channel? */
3891ae08745Sheppo } vd_t;
3901ae08745Sheppo 
3911ae08745Sheppo typedef struct vds_operation {
3923af08d82Slm66018 	char	*namep;
3931ae08745Sheppo 	uint8_t	operation;
394d10e4ef2Snarayan 	int	(*start)(vd_task_t *task);
395205eeb1aSlm66018 	int	(*complete)(vd_task_t *task);
3961ae08745Sheppo } vds_operation_t;
3971ae08745Sheppo 
3980a55fbb7Slm66018 typedef struct vd_ioctl {
3990a55fbb7Slm66018 	uint8_t		operation;		/* vdisk operation */
4000a55fbb7Slm66018 	const char	*operation_name;	/* vdisk operation name */
4010a55fbb7Slm66018 	size_t		nbytes;			/* size of operation buffer */
4020a55fbb7Slm66018 	int		cmd;			/* corresponding ioctl cmd */
4030a55fbb7Slm66018 	const char	*cmd_name;		/* ioctl cmd name */
4040a55fbb7Slm66018 	void		*arg;			/* ioctl cmd argument */
4050a55fbb7Slm66018 	/* convert input vd_buf to output ioctl_arg */
4062f5224aeSachartre 	int		(*copyin)(void *vd_buf, size_t, void *ioctl_arg);
4070a55fbb7Slm66018 	/* convert input ioctl_arg to output vd_buf */
4080a55fbb7Slm66018 	void		(*copyout)(void *ioctl_arg, void *vd_buf);
409047ba61eSachartre 	/* write is true if the operation writes any data to the backend */
410047ba61eSachartre 	boolean_t	write;
4110a55fbb7Slm66018 } vd_ioctl_t;
4120a55fbb7Slm66018 
4130a55fbb7Slm66018 /* Define trivial copyin/copyout conversion function flag */
4142f5224aeSachartre #define	VD_IDENTITY_IN	((int (*)(void *, size_t, void *))-1)
4152f5224aeSachartre #define	VD_IDENTITY_OUT	((void (*)(void *, void *))-1)
4161ae08745Sheppo 
4171ae08745Sheppo 
4183c96341aSnarayan static int	vds_ldc_retries = VDS_RETRIES;
4193af08d82Slm66018 static int	vds_ldc_delay = VDS_LDC_DELAY;
4203c96341aSnarayan static int	vds_dev_retries = VDS_RETRIES;
4213c96341aSnarayan static int	vds_dev_delay = VDS_DEV_DELAY;
4221ae08745Sheppo static void	*vds_state;
4231ae08745Sheppo 
424eba0cb4eSachartre static uint_t	vd_file_write_flags = VD_FILE_WRITE_FLAGS;
425eba0cb4eSachartre 
42687a7269eSachartre static short	vd_scsi_rdwr_timeout = VD_SCSI_RDWR_TIMEOUT;
4272f5224aeSachartre static int	vd_scsi_debug = USCSI_SILENT;
4282f5224aeSachartre 
4292f5224aeSachartre /*
4302f5224aeSachartre  * Tunable to define the behavior of the service domain if the vdisk server
4312f5224aeSachartre  * fails to reset disk exclusive access when a LDC channel is reset. When a
4322f5224aeSachartre  * LDC channel is reset the vdisk server will try to reset disk exclusive
4332f5224aeSachartre  * access by releasing any SCSI-2 reservation or resetting the disk. If these
4342f5224aeSachartre  * actions fail then the default behavior (vd_reset_access_failure = 0) is to
4352f5224aeSachartre  * print a warning message. This default behavior can be changed by setting
4362f5224aeSachartre  * the vd_reset_access_failure variable to A_REBOOT (= 0x1) and that will
4372f5224aeSachartre  * cause the service domain to reboot, or A_DUMP (= 0x5) and that will cause
4382f5224aeSachartre  * the service domain to panic. In both cases, the reset of the service domain
4392f5224aeSachartre  * should trigger a reset SCSI buses and hopefully clear any SCSI-2 reservation.
4402f5224aeSachartre  */
4412f5224aeSachartre static int 	vd_reset_access_failure = 0;
4422f5224aeSachartre 
4432f5224aeSachartre /*
4442f5224aeSachartre  * Tunable for backward compatibility. When this variable is set to B_TRUE,
4452f5224aeSachartre  * all disk volumes (ZFS, SVM, VxvM volumes) will be exported as single
4462f5224aeSachartre  * slice disks whether or not they have the "slice" option set. This is
4472f5224aeSachartre  * to provide a simple backward compatibility mechanism when upgrading
4482f5224aeSachartre  * the vds driver and using a domain configuration created before the
4492f5224aeSachartre  * "slice" option was available.
4502f5224aeSachartre  */
4512f5224aeSachartre static boolean_t vd_volume_force_slice = B_FALSE;
45287a7269eSachartre 
4530a55fbb7Slm66018 /*
4540a55fbb7Slm66018  * Supported protocol version pairs, from highest (newest) to lowest (oldest)
4550a55fbb7Slm66018  *
4560a55fbb7Slm66018  * Each supported major version should appear only once, paired with (and only
4570a55fbb7Slm66018  * with) its highest supported minor version number (as the protocol requires
4580a55fbb7Slm66018  * supporting all lower minor version numbers as well)
4590a55fbb7Slm66018  */
46017cadca8Slm66018 static const vio_ver_t	vds_version[] = {{1, 1}};
4610a55fbb7Slm66018 static const size_t	vds_num_versions =
4620a55fbb7Slm66018     sizeof (vds_version)/sizeof (vds_version[0]);
4630a55fbb7Slm66018 
4643af08d82Slm66018 static void vd_free_dring_task(vd_t *vdp);
4653c96341aSnarayan static int vd_setup_vd(vd_t *vd);
466047ba61eSachartre static int vd_setup_single_slice_disk(vd_t *vd);
4672f5224aeSachartre static int vd_setup_mediainfo(vd_t *vd);
4683c96341aSnarayan static boolean_t vd_enabled(vd_t *vd);
46978fcd0a1Sachartre static ushort_t vd_lbl2cksum(struct dk_label *label);
47078fcd0a1Sachartre static int vd_file_validate_geometry(vd_t *vd);
47117cadca8Slm66018 static boolean_t vd_file_is_iso_image(vd_t *vd);
47217cadca8Slm66018 static void vd_set_exported_operations(vd_t *vd);
4732f5224aeSachartre static void vd_reset_access(vd_t *vd);
474edcc0754Sachartre static int vd_backend_ioctl(vd_t *vd, int cmd, caddr_t arg);
475edcc0754Sachartre static int vds_efi_alloc_and_read(vd_t *, efi_gpt_t **, efi_gpe_t **);
476edcc0754Sachartre static void vds_efi_free(vd_t *, efi_gpt_t *, efi_gpe_t *);
477047ba61eSachartre 
478690555a1Sachartre /*
479690555a1Sachartre  * Function:
480690555a1Sachartre  *	vd_file_rw
481690555a1Sachartre  *
482690555a1Sachartre  * Description:
483690555a1Sachartre  * 	Read or write to a disk on file.
484690555a1Sachartre  *
485690555a1Sachartre  * Parameters:
486690555a1Sachartre  *	vd		- disk on which the operation is performed.
487690555a1Sachartre  *	slice		- slice on which the operation is performed,
48887a7269eSachartre  *			  VD_SLICE_NONE indicates that the operation
48987a7269eSachartre  *			  is done using an absolute disk offset.
490690555a1Sachartre  *	operation	- operation to execute: read (VD_OP_BREAD) or
491690555a1Sachartre  *			  write (VD_OP_BWRITE).
492690555a1Sachartre  *	data		- buffer where data are read to or written from.
493690555a1Sachartre  *	blk		- starting block for the operation.
494690555a1Sachartre  *	len		- number of bytes to read or write.
495690555a1Sachartre  *
496690555a1Sachartre  * Return Code:
497690555a1Sachartre  *	n >= 0		- success, n indicates the number of bytes read
498690555a1Sachartre  *			  or written.
499690555a1Sachartre  *	-1		- error.
500690555a1Sachartre  */
501690555a1Sachartre static ssize_t
502690555a1Sachartre vd_file_rw(vd_t *vd, int slice, int operation, caddr_t data, size_t blk,
503690555a1Sachartre     size_t len)
504690555a1Sachartre {
505690555a1Sachartre 	caddr_t	maddr;
506690555a1Sachartre 	size_t offset, maxlen, moffset, mlen, n;
507690555a1Sachartre 	uint_t smflags;
508690555a1Sachartre 	enum seg_rw srw;
509690555a1Sachartre 
510690555a1Sachartre 	ASSERT(vd->file);
511690555a1Sachartre 	ASSERT(len > 0);
512690555a1Sachartre 
513047ba61eSachartre 	/*
514047ba61eSachartre 	 * If a file is exported as a slice then we don't care about the vtoc.
515047ba61eSachartre 	 * In that case, the vtoc is a fake mainly to make newfs happy and we
516047ba61eSachartre 	 * handle any I/O as a raw disk access so that we can have access to the
517047ba61eSachartre 	 * entire backend.
518047ba61eSachartre 	 */
519047ba61eSachartre 	if (vd->vdisk_type == VD_DISK_TYPE_SLICE || slice == VD_SLICE_NONE) {
520690555a1Sachartre 		/* raw disk access */
521690555a1Sachartre 		offset = blk * DEV_BSIZE;
522690555a1Sachartre 	} else {
523690555a1Sachartre 		ASSERT(slice >= 0 && slice < V_NUMPAR);
52478fcd0a1Sachartre 
52517cadca8Slm66018 		/*
52617cadca8Slm66018 		 * v1.0 vDisk clients depended on the server not verifying
52717cadca8Slm66018 		 * the label of a unformatted disk.  This "feature" is
52817cadca8Slm66018 		 * maintained for backward compatibility but all versions
52917cadca8Slm66018 		 * from v1.1 onwards must do the right thing.
53017cadca8Slm66018 		 */
53178fcd0a1Sachartre 		if (vd->vdisk_label == VD_DISK_LABEL_UNK &&
532edcc0754Sachartre 		    vio_ver_is_supported(vd->version, 1, 1)) {
533edcc0754Sachartre 			(void) vd_file_validate_geometry(vd);
534edcc0754Sachartre 			if (vd->vdisk_label == VD_DISK_LABEL_UNK) {
535edcc0754Sachartre 				PR0("Unknown disk label, can't do I/O "
536edcc0754Sachartre 				    "from slice %d", slice);
53778fcd0a1Sachartre 				return (-1);
53878fcd0a1Sachartre 			}
539edcc0754Sachartre 		}
54078fcd0a1Sachartre 
541edcc0754Sachartre 		if (vd->vdisk_label == VD_DISK_LABEL_VTOC) {
542edcc0754Sachartre 			ASSERT(vd->vtoc.v_sectorsz == DEV_BSIZE);
543edcc0754Sachartre 		} else {
544edcc0754Sachartre 			ASSERT(vd->vdisk_label == VD_DISK_LABEL_EFI);
545edcc0754Sachartre 			ASSERT(vd->vdisk_block_size == DEV_BSIZE);
546edcc0754Sachartre 		}
547edcc0754Sachartre 
548edcc0754Sachartre 		if (blk >= vd->slices[slice].nblocks) {
549690555a1Sachartre 			/* address past the end of the slice */
550690555a1Sachartre 			PR0("req_addr (0x%lx) > psize (0x%lx)",
551edcc0754Sachartre 			    blk, vd->slices[slice].nblocks);
552690555a1Sachartre 			return (0);
553690555a1Sachartre 		}
554690555a1Sachartre 
555edcc0754Sachartre 		offset = (vd->slices[slice].start + blk) * DEV_BSIZE;
556690555a1Sachartre 
557690555a1Sachartre 		/*
558690555a1Sachartre 		 * If the requested size is greater than the size
559690555a1Sachartre 		 * of the partition, truncate the read/write.
560690555a1Sachartre 		 */
561edcc0754Sachartre 		maxlen = (vd->slices[slice].nblocks - blk) * DEV_BSIZE;
562690555a1Sachartre 
563690555a1Sachartre 		if (len > maxlen) {
564690555a1Sachartre 			PR0("I/O size truncated to %lu bytes from %lu bytes",
565690555a1Sachartre 			    maxlen, len);
566690555a1Sachartre 			len = maxlen;
567690555a1Sachartre 		}
568690555a1Sachartre 	}
569690555a1Sachartre 
570690555a1Sachartre 	/*
571690555a1Sachartre 	 * We have to ensure that we are reading/writing into the mmap
572690555a1Sachartre 	 * range. If we have a partial disk image (e.g. an image of
573690555a1Sachartre 	 * s0 instead s2) the system can try to access slices that
574690555a1Sachartre 	 * are not included into the disk image.
575690555a1Sachartre 	 */
576edcc0754Sachartre 	if ((offset + len) > vd->file_size) {
577edcc0754Sachartre 		PR0("offset + nbytes (0x%lx + 0x%lx) > "
578690555a1Sachartre 		    "file_size (0x%lx)", offset, len, vd->file_size);
579690555a1Sachartre 		return (-1);
580690555a1Sachartre 	}
581690555a1Sachartre 
582690555a1Sachartre 	srw = (operation == VD_OP_BREAD)? S_READ : S_WRITE;
583eba0cb4eSachartre 	smflags = (operation == VD_OP_BREAD)? 0 :
584eba0cb4eSachartre 	    (SM_WRITE | vd_file_write_flags);
585690555a1Sachartre 	n = len;
586690555a1Sachartre 
587690555a1Sachartre 	do {
588690555a1Sachartre 		/*
589690555a1Sachartre 		 * segmap_getmapflt() returns a MAXBSIZE chunk which is
590690555a1Sachartre 		 * MAXBSIZE aligned.
591690555a1Sachartre 		 */
592690555a1Sachartre 		moffset = offset & MAXBOFFSET;
593690555a1Sachartre 		mlen = MIN(MAXBSIZE - moffset, n);
594690555a1Sachartre 		maddr = segmap_getmapflt(segkmap, vd->file_vnode, offset,
595690555a1Sachartre 		    mlen, 1, srw);
596690555a1Sachartre 		/*
597690555a1Sachartre 		 * Fault in the pages so we can check for error and ensure
598690555a1Sachartre 		 * that we can safely used the mapped address.
599690555a1Sachartre 		 */
600690555a1Sachartre 		if (segmap_fault(kas.a_hat, segkmap, maddr, mlen,
601690555a1Sachartre 		    F_SOFTLOCK, srw) != 0) {
602690555a1Sachartre 			(void) segmap_release(segkmap, maddr, 0);
603690555a1Sachartre 			return (-1);
604690555a1Sachartre 		}
605690555a1Sachartre 
606690555a1Sachartre 		if (operation == VD_OP_BREAD)
607690555a1Sachartre 			bcopy(maddr + moffset, data, mlen);
608690555a1Sachartre 		else
609690555a1Sachartre 			bcopy(data, maddr + moffset, mlen);
610690555a1Sachartre 
611690555a1Sachartre 		if (segmap_fault(kas.a_hat, segkmap, maddr, mlen,
612690555a1Sachartre 		    F_SOFTUNLOCK, srw) != 0) {
613690555a1Sachartre 			(void) segmap_release(segkmap, maddr, 0);
614690555a1Sachartre 			return (-1);
615690555a1Sachartre 		}
616690555a1Sachartre 		if (segmap_release(segkmap, maddr, smflags) != 0)
617690555a1Sachartre 			return (-1);
618690555a1Sachartre 		n -= mlen;
619690555a1Sachartre 		offset += mlen;
620690555a1Sachartre 		data += mlen;
621690555a1Sachartre 
622690555a1Sachartre 	} while (n > 0);
623690555a1Sachartre 
624690555a1Sachartre 	return (len);
625690555a1Sachartre }
626690555a1Sachartre 
62787a7269eSachartre /*
62887a7269eSachartre  * Function:
62978fcd0a1Sachartre  *	vd_file_build_default_label
63078fcd0a1Sachartre  *
63178fcd0a1Sachartre  * Description:
63278fcd0a1Sachartre  *	Return a default label for the given disk. This is used when the disk
63378fcd0a1Sachartre  *	does not have a valid VTOC so that the user can get a valid default
63417cadca8Slm66018  *	configuration. The default label has all slice sizes set to 0 (except
63578fcd0a1Sachartre  *	slice 2 which is the entire disk) to force the user to write a valid
63678fcd0a1Sachartre  *	label onto the disk image.
63778fcd0a1Sachartre  *
63878fcd0a1Sachartre  * Parameters:
63978fcd0a1Sachartre  *	vd		- disk on which the operation is performed.
64078fcd0a1Sachartre  *	label		- the returned default label.
64178fcd0a1Sachartre  *
64278fcd0a1Sachartre  * Return Code:
64378fcd0a1Sachartre  *	none.
64478fcd0a1Sachartre  */
64578fcd0a1Sachartre static void
64678fcd0a1Sachartre vd_file_build_default_label(vd_t *vd, struct dk_label *label)
64778fcd0a1Sachartre {
64878fcd0a1Sachartre 	size_t size;
64978fcd0a1Sachartre 	char prefix;
65078fcd0a1Sachartre 
65178fcd0a1Sachartre 	ASSERT(vd->file);
652edcc0754Sachartre 	ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK);
653edcc0754Sachartre 
654edcc0754Sachartre 	bzero(label, sizeof (struct dk_label));
65578fcd0a1Sachartre 
65678fcd0a1Sachartre 	/*
65778fcd0a1Sachartre 	 * We must have a resonable number of cylinders and sectors so
65878fcd0a1Sachartre 	 * that newfs can run using default values.
65978fcd0a1Sachartre 	 *
66078fcd0a1Sachartre 	 * if (disk_size < 2MB)
66178fcd0a1Sachartre 	 * 	phys_cylinders = disk_size / 100K
66278fcd0a1Sachartre 	 * else
66378fcd0a1Sachartre 	 * 	phys_cylinders = disk_size / 300K
66478fcd0a1Sachartre 	 *
66578fcd0a1Sachartre 	 * phys_cylinders = (phys_cylinders == 0) ? 1 : phys_cylinders
66678fcd0a1Sachartre 	 * alt_cylinders = (phys_cylinders > 2) ? 2 : 0;
66778fcd0a1Sachartre 	 * data_cylinders = phys_cylinders - alt_cylinders
66878fcd0a1Sachartre 	 *
66978fcd0a1Sachartre 	 * sectors = disk_size / (phys_cylinders * blk_size)
67078fcd0a1Sachartre 	 *
67178fcd0a1Sachartre 	 * The file size test is an attempt to not have too few cylinders
67278fcd0a1Sachartre 	 * for a small file, or so many on a big file that you waste space
67378fcd0a1Sachartre 	 * for backup superblocks or cylinder group structures.
67478fcd0a1Sachartre 	 */
67578fcd0a1Sachartre 	if (vd->file_size < (2 * 1024 * 1024))
67678fcd0a1Sachartre 		label->dkl_pcyl = vd->file_size / (100 * 1024);
67778fcd0a1Sachartre 	else
67878fcd0a1Sachartre 		label->dkl_pcyl = vd->file_size / (300 * 1024);
67978fcd0a1Sachartre 
68078fcd0a1Sachartre 	if (label->dkl_pcyl == 0)
68178fcd0a1Sachartre 		label->dkl_pcyl = 1;
68278fcd0a1Sachartre 
683047ba61eSachartre 	label->dkl_acyl = 0;
684047ba61eSachartre 
68578fcd0a1Sachartre 	if (label->dkl_pcyl > 2)
68678fcd0a1Sachartre 		label->dkl_acyl = 2;
68778fcd0a1Sachartre 
68878fcd0a1Sachartre 	label->dkl_nsect = vd->file_size /
68978fcd0a1Sachartre 	    (DEV_BSIZE * label->dkl_pcyl);
69078fcd0a1Sachartre 	label->dkl_ncyl = label->dkl_pcyl - label->dkl_acyl;
69178fcd0a1Sachartre 	label->dkl_nhead = 1;
69278fcd0a1Sachartre 	label->dkl_write_reinstruct = 0;
69378fcd0a1Sachartre 	label->dkl_read_reinstruct = 0;
69478fcd0a1Sachartre 	label->dkl_rpm = 7200;
69578fcd0a1Sachartre 	label->dkl_apc = 0;
69678fcd0a1Sachartre 	label->dkl_intrlv = 0;
69778fcd0a1Sachartre 
69878fcd0a1Sachartre 	PR0("requested disk size: %ld bytes\n", vd->file_size);
69978fcd0a1Sachartre 	PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label->dkl_pcyl,
70078fcd0a1Sachartre 	    label->dkl_nhead, label->dkl_nsect);
70178fcd0a1Sachartre 	PR0("provided disk size: %ld bytes\n", (uint64_t)
70278fcd0a1Sachartre 	    (label->dkl_pcyl * label->dkl_nhead *
70378fcd0a1Sachartre 	    label->dkl_nsect * DEV_BSIZE));
70478fcd0a1Sachartre 
70578fcd0a1Sachartre 	if (vd->file_size < (1ULL << 20)) {
70678fcd0a1Sachartre 		size = vd->file_size >> 10;
70778fcd0a1Sachartre 		prefix = 'K'; /* Kilobyte */
70878fcd0a1Sachartre 	} else if (vd->file_size < (1ULL << 30)) {
70978fcd0a1Sachartre 		size = vd->file_size >> 20;
71078fcd0a1Sachartre 		prefix = 'M'; /* Megabyte */
71178fcd0a1Sachartre 	} else if (vd->file_size < (1ULL << 40)) {
71278fcd0a1Sachartre 		size = vd->file_size >> 30;
71378fcd0a1Sachartre 		prefix = 'G'; /* Gigabyte */
71478fcd0a1Sachartre 	} else {
71578fcd0a1Sachartre 		size = vd->file_size >> 40;
71678fcd0a1Sachartre 		prefix = 'T'; /* Terabyte */
71778fcd0a1Sachartre 	}
71878fcd0a1Sachartre 
71978fcd0a1Sachartre 	/*
72078fcd0a1Sachartre 	 * We must have a correct label name otherwise format(1m) will
72178fcd0a1Sachartre 	 * not recognized the disk as labeled.
72278fcd0a1Sachartre 	 */
72378fcd0a1Sachartre 	(void) snprintf(label->dkl_asciilabel, LEN_DKL_ASCII,
72478fcd0a1Sachartre 	    "SUN-DiskImage-%ld%cB cyl %d alt %d hd %d sec %d",
72578fcd0a1Sachartre 	    size, prefix,
72678fcd0a1Sachartre 	    label->dkl_ncyl, label->dkl_acyl, label->dkl_nhead,
72778fcd0a1Sachartre 	    label->dkl_nsect);
72878fcd0a1Sachartre 
72978fcd0a1Sachartre 	/* default VTOC */
73078fcd0a1Sachartre 	label->dkl_vtoc.v_version = V_VERSION;
731edcc0754Sachartre 	label->dkl_vtoc.v_nparts = V_NUMPAR;
73278fcd0a1Sachartre 	label->dkl_vtoc.v_sanity = VTOC_SANE;
733edcc0754Sachartre 	label->dkl_vtoc.v_part[VD_ENTIRE_DISK_SLICE].p_tag = V_BACKUP;
734edcc0754Sachartre 	label->dkl_map[VD_ENTIRE_DISK_SLICE].dkl_cylno = 0;
735edcc0754Sachartre 	label->dkl_map[VD_ENTIRE_DISK_SLICE].dkl_nblk = label->dkl_ncyl *
73678fcd0a1Sachartre 	    label->dkl_nhead * label->dkl_nsect;
737edcc0754Sachartre 	label->dkl_magic = DKL_MAGIC;
73878fcd0a1Sachartre 	label->dkl_cksum = vd_lbl2cksum(label);
73978fcd0a1Sachartre }
74078fcd0a1Sachartre 
74178fcd0a1Sachartre /*
74278fcd0a1Sachartre  * Function:
74387a7269eSachartre  *	vd_file_set_vtoc
74487a7269eSachartre  *
74587a7269eSachartre  * Description:
74687a7269eSachartre  *	Set the vtoc of a disk image by writing the label and backup
74787a7269eSachartre  *	labels into the disk image backend.
74887a7269eSachartre  *
74987a7269eSachartre  * Parameters:
75087a7269eSachartre  *	vd		- disk on which the operation is performed.
75187a7269eSachartre  *	label		- the data to be written.
75287a7269eSachartre  *
75387a7269eSachartre  * Return Code:
75487a7269eSachartre  *	0		- success.
75587a7269eSachartre  *	n > 0		- error, n indicates the errno code.
75687a7269eSachartre  */
75787a7269eSachartre static int
75887a7269eSachartre vd_file_set_vtoc(vd_t *vd, struct dk_label *label)
75987a7269eSachartre {
76087a7269eSachartre 	int blk, sec, cyl, head, cnt;
76187a7269eSachartre 
76287a7269eSachartre 	ASSERT(vd->file);
76387a7269eSachartre 
76487a7269eSachartre 	if (VD_FILE_LABEL_WRITE(vd, label) < 0) {
76587a7269eSachartre 		PR0("fail to write disk label");
76687a7269eSachartre 		return (EIO);
76787a7269eSachartre 	}
76887a7269eSachartre 
76987a7269eSachartre 	/*
77087a7269eSachartre 	 * Backup labels are on the last alternate cylinder's
77187a7269eSachartre 	 * first five odd sectors.
77287a7269eSachartre 	 */
77387a7269eSachartre 	if (label->dkl_acyl == 0) {
77487a7269eSachartre 		PR0("no alternate cylinder, can not store backup labels");
77587a7269eSachartre 		return (0);
77687a7269eSachartre 	}
77787a7269eSachartre 
77887a7269eSachartre 	cyl = label->dkl_ncyl  + label->dkl_acyl - 1;
77987a7269eSachartre 	head = label->dkl_nhead - 1;
78087a7269eSachartre 
78187a7269eSachartre 	blk = (cyl * ((label->dkl_nhead * label->dkl_nsect) - label->dkl_apc)) +
78287a7269eSachartre 	    (head * label->dkl_nsect);
78387a7269eSachartre 
78487a7269eSachartre 	/*
78587a7269eSachartre 	 * Write the backup labels. Make sure we don't try to write past
78687a7269eSachartre 	 * the last cylinder.
78787a7269eSachartre 	 */
78887a7269eSachartre 	sec = 1;
78987a7269eSachartre 
79087a7269eSachartre 	for (cnt = 0; cnt < VD_FILE_NUM_BACKUP; cnt++) {
79187a7269eSachartre 
79287a7269eSachartre 		if (sec >= label->dkl_nsect) {
79387a7269eSachartre 			PR0("not enough sector to store all backup labels");
79487a7269eSachartre 			return (0);
79587a7269eSachartre 		}
79687a7269eSachartre 
79787a7269eSachartre 		if (vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE, (caddr_t)label,
79887a7269eSachartre 		    blk + sec, sizeof (struct dk_label)) < 0) {
79987a7269eSachartre 			PR0("error writing backup label at block %d\n",
80087a7269eSachartre 			    blk + sec);
80187a7269eSachartre 			return (EIO);
80287a7269eSachartre 		}
80387a7269eSachartre 
80487a7269eSachartre 		PR1("wrote backup label at block %d\n", blk + sec);
80587a7269eSachartre 
80687a7269eSachartre 		sec += 2;
80787a7269eSachartre 	}
80887a7269eSachartre 
80987a7269eSachartre 	return (0);
81087a7269eSachartre }
81187a7269eSachartre 
81287a7269eSachartre /*
81387a7269eSachartre  * Function:
81487a7269eSachartre  *	vd_file_get_devid_block
81587a7269eSachartre  *
81687a7269eSachartre  * Description:
81787a7269eSachartre  *	Return the block number where the device id is stored.
81887a7269eSachartre  *
81987a7269eSachartre  * Parameters:
82087a7269eSachartre  *	vd		- disk on which the operation is performed.
82187a7269eSachartre  *	blkp		- pointer to the block number
82287a7269eSachartre  *
82387a7269eSachartre  * Return Code:
82487a7269eSachartre  *	0		- success
82587a7269eSachartre  *	ENOSPC		- disk has no space to store a device id
82687a7269eSachartre  */
82787a7269eSachartre static int
82887a7269eSachartre vd_file_get_devid_block(vd_t *vd, size_t *blkp)
82987a7269eSachartre {
83087a7269eSachartre 	diskaddr_t spc, head, cyl;
83187a7269eSachartre 
83287a7269eSachartre 	ASSERT(vd->file);
833edcc0754Sachartre 
834edcc0754Sachartre 	if (vd->vdisk_label == VD_DISK_LABEL_UNK) {
835edcc0754Sachartre 		/*
836edcc0754Sachartre 		 * If no label is defined we don't know where to find
837edcc0754Sachartre 		 * a device id.
838edcc0754Sachartre 		 */
839edcc0754Sachartre 		return (ENOSPC);
840edcc0754Sachartre 	}
841edcc0754Sachartre 
842edcc0754Sachartre 	if (vd->vdisk_label == VD_DISK_LABEL_EFI) {
843edcc0754Sachartre 		/*
844edcc0754Sachartre 		 * For an EFI disk, the devid is at the beginning of
845edcc0754Sachartre 		 * the reserved slice
846edcc0754Sachartre 		 */
847edcc0754Sachartre 		if (vd->efi_reserved == -1) {
848edcc0754Sachartre 			PR0("EFI disk has no reserved slice");
849edcc0754Sachartre 			return (ENOSPC);
850edcc0754Sachartre 		}
851edcc0754Sachartre 
852edcc0754Sachartre 		*blkp = vd->slices[vd->efi_reserved].start;
853edcc0754Sachartre 		return (0);
854edcc0754Sachartre 	}
855edcc0754Sachartre 
85687a7269eSachartre 	ASSERT(vd->vdisk_label == VD_DISK_LABEL_VTOC);
85787a7269eSachartre 
85887a7269eSachartre 	/* this geometry doesn't allow us to have a devid */
85987a7269eSachartre 	if (vd->dk_geom.dkg_acyl < 2) {
86087a7269eSachartre 		PR0("not enough alternate cylinder available for devid "
86187a7269eSachartre 		    "(acyl=%u)", vd->dk_geom.dkg_acyl);
86287a7269eSachartre 		return (ENOSPC);
86387a7269eSachartre 	}
86487a7269eSachartre 
86587a7269eSachartre 	/* the devid is in on the track next to the last cylinder */
86687a7269eSachartre 	cyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl - 2;
86787a7269eSachartre 	spc = vd->dk_geom.dkg_nhead * vd->dk_geom.dkg_nsect;
86887a7269eSachartre 	head = vd->dk_geom.dkg_nhead - 1;
86987a7269eSachartre 
87087a7269eSachartre 	*blkp = (cyl * (spc - vd->dk_geom.dkg_apc)) +
87187a7269eSachartre 	    (head * vd->dk_geom.dkg_nsect) + 1;
87287a7269eSachartre 
87387a7269eSachartre 	return (0);
87487a7269eSachartre }
87587a7269eSachartre 
87687a7269eSachartre /*
87787a7269eSachartre  * Return the checksum of a disk block containing an on-disk devid.
87887a7269eSachartre  */
87987a7269eSachartre static uint_t
88087a7269eSachartre vd_dkdevid2cksum(struct dk_devid *dkdevid)
88187a7269eSachartre {
88287a7269eSachartre 	uint_t chksum, *ip;
88387a7269eSachartre 	int i;
88487a7269eSachartre 
88587a7269eSachartre 	chksum = 0;
88687a7269eSachartre 	ip = (uint_t *)dkdevid;
88787a7269eSachartre 	for (i = 0; i < ((DEV_BSIZE - sizeof (int)) / sizeof (int)); i++)
88887a7269eSachartre 		chksum ^= ip[i];
88987a7269eSachartre 
89087a7269eSachartre 	return (chksum);
89187a7269eSachartre }
89287a7269eSachartre 
89387a7269eSachartre /*
89487a7269eSachartre  * Function:
89587a7269eSachartre  *	vd_file_read_devid
89687a7269eSachartre  *
89787a7269eSachartre  * Description:
89887a7269eSachartre  *	Read the device id stored on a disk image.
89987a7269eSachartre  *
90087a7269eSachartre  * Parameters:
90187a7269eSachartre  *	vd		- disk on which the operation is performed.
90287a7269eSachartre  *	devid		- the return address of the device ID.
90387a7269eSachartre  *
90487a7269eSachartre  * Return Code:
90587a7269eSachartre  *	0		- success
90687a7269eSachartre  *	EIO		- I/O error while trying to access the disk image
90787a7269eSachartre  *	EINVAL		- no valid device id was found
90887a7269eSachartre  *	ENOSPC		- disk has no space to store a device id
90987a7269eSachartre  */
91087a7269eSachartre static int
91187a7269eSachartre vd_file_read_devid(vd_t *vd, ddi_devid_t *devid)
91287a7269eSachartre {
91387a7269eSachartre 	struct dk_devid *dkdevid;
91487a7269eSachartre 	size_t blk;
91587a7269eSachartre 	uint_t chksum;
91687a7269eSachartre 	int status, sz;
91787a7269eSachartre 
91887a7269eSachartre 	if ((status = vd_file_get_devid_block(vd, &blk)) != 0)
91987a7269eSachartre 		return (status);
92087a7269eSachartre 
92187a7269eSachartre 	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);
92287a7269eSachartre 
92387a7269eSachartre 	/* get the devid */
92487a7269eSachartre 	if ((vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)dkdevid, blk,
92587a7269eSachartre 	    DEV_BSIZE)) < 0) {
92687a7269eSachartre 		PR0("error reading devid block at %lu", blk);
92787a7269eSachartre 		status = EIO;
92887a7269eSachartre 		goto done;
92987a7269eSachartre 	}
93087a7269eSachartre 
93187a7269eSachartre 	/* validate the revision */
93287a7269eSachartre 	if ((dkdevid->dkd_rev_hi != DK_DEVID_REV_MSB) ||
93387a7269eSachartre 	    (dkdevid->dkd_rev_lo != DK_DEVID_REV_LSB)) {
93487a7269eSachartre 		PR0("invalid devid found at block %lu (bad revision)", blk);
93587a7269eSachartre 		status = EINVAL;
93687a7269eSachartre 		goto done;
93787a7269eSachartre 	}
93887a7269eSachartre 
93987a7269eSachartre 	/* compute checksum */
94087a7269eSachartre 	chksum = vd_dkdevid2cksum(dkdevid);
94187a7269eSachartre 
94287a7269eSachartre 	/* compare the checksums */
94387a7269eSachartre 	if (DKD_GETCHKSUM(dkdevid) != chksum) {
94487a7269eSachartre 		PR0("invalid devid found at block %lu (bad checksum)", blk);
94587a7269eSachartre 		status = EINVAL;
94687a7269eSachartre 		goto done;
94787a7269eSachartre 	}
94887a7269eSachartre 
94987a7269eSachartre 	/* validate the device id */
95087a7269eSachartre 	if (ddi_devid_valid((ddi_devid_t)&dkdevid->dkd_devid) != DDI_SUCCESS) {
95187a7269eSachartre 		PR0("invalid devid found at block %lu", blk);
95287a7269eSachartre 		status = EINVAL;
95387a7269eSachartre 		goto done;
95487a7269eSachartre 	}
95587a7269eSachartre 
95687a7269eSachartre 	PR1("devid read at block %lu", blk);
95787a7269eSachartre 
95887a7269eSachartre 	sz = ddi_devid_sizeof((ddi_devid_t)&dkdevid->dkd_devid);
95987a7269eSachartre 	*devid = kmem_alloc(sz, KM_SLEEP);
96087a7269eSachartre 	bcopy(&dkdevid->dkd_devid, *devid, sz);
96187a7269eSachartre 
96287a7269eSachartre done:
96387a7269eSachartre 	kmem_free(dkdevid, DEV_BSIZE);
96487a7269eSachartre 	return (status);
96587a7269eSachartre 
96687a7269eSachartre }
96787a7269eSachartre 
96887a7269eSachartre /*
96987a7269eSachartre  * Function:
97087a7269eSachartre  *	vd_file_write_devid
97187a7269eSachartre  *
97287a7269eSachartre  * Description:
97387a7269eSachartre  *	Write a device id into disk image.
97487a7269eSachartre  *
97587a7269eSachartre  * Parameters:
97687a7269eSachartre  *	vd		- disk on which the operation is performed.
97787a7269eSachartre  *	devid		- the device ID to store.
97887a7269eSachartre  *
97987a7269eSachartre  * Return Code:
98087a7269eSachartre  *	0		- success
98187a7269eSachartre  *	EIO		- I/O error while trying to access the disk image
98287a7269eSachartre  *	ENOSPC		- disk has no space to store a device id
98387a7269eSachartre  */
98487a7269eSachartre static int
98587a7269eSachartre vd_file_write_devid(vd_t *vd, ddi_devid_t devid)
98687a7269eSachartre {
98787a7269eSachartre 	struct dk_devid *dkdevid;
98887a7269eSachartre 	uint_t chksum;
98987a7269eSachartre 	size_t blk;
99087a7269eSachartre 	int status;
99187a7269eSachartre 
992edcc0754Sachartre 	if (devid == NULL) {
993edcc0754Sachartre 		/* nothing to write */
994edcc0754Sachartre 		return (0);
995edcc0754Sachartre 	}
996edcc0754Sachartre 
99787a7269eSachartre 	if ((status = vd_file_get_devid_block(vd, &blk)) != 0)
99887a7269eSachartre 		return (status);
99987a7269eSachartre 
100087a7269eSachartre 	dkdevid = kmem_zalloc(DEV_BSIZE, KM_SLEEP);
100187a7269eSachartre 
100287a7269eSachartre 	/* set revision */
100387a7269eSachartre 	dkdevid->dkd_rev_hi = DK_DEVID_REV_MSB;
100487a7269eSachartre 	dkdevid->dkd_rev_lo = DK_DEVID_REV_LSB;
100587a7269eSachartre 
100687a7269eSachartre 	/* copy devid */
100787a7269eSachartre 	bcopy(devid, &dkdevid->dkd_devid, ddi_devid_sizeof(devid));
100887a7269eSachartre 
100987a7269eSachartre 	/* compute checksum */
101087a7269eSachartre 	chksum = vd_dkdevid2cksum(dkdevid);
101187a7269eSachartre 
101287a7269eSachartre 	/* set checksum */
101387a7269eSachartre 	DKD_FORMCHKSUM(chksum, dkdevid);
101487a7269eSachartre 
101587a7269eSachartre 	/* store the devid */
101687a7269eSachartre 	if ((status = vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE,
101787a7269eSachartre 	    (caddr_t)dkdevid, blk, DEV_BSIZE)) < 0) {
101887a7269eSachartre 		PR0("Error writing devid block at %lu", blk);
101987a7269eSachartre 		status = EIO;
102087a7269eSachartre 	} else {
102187a7269eSachartre 		PR1("devid written at block %lu", blk);
102287a7269eSachartre 		status = 0;
102387a7269eSachartre 	}
102487a7269eSachartre 
102587a7269eSachartre 	kmem_free(dkdevid, DEV_BSIZE);
102687a7269eSachartre 	return (status);
102787a7269eSachartre }
102887a7269eSachartre 
102987a7269eSachartre /*
103087a7269eSachartre  * Function:
103117cadca8Slm66018  *	vd_do_scsi_rdwr
103287a7269eSachartre  *
103387a7269eSachartre  * Description:
103487a7269eSachartre  * 	Read or write to a SCSI disk using an absolute disk offset.
103587a7269eSachartre  *
103687a7269eSachartre  * Parameters:
103787a7269eSachartre  *	vd		- disk on which the operation is performed.
103887a7269eSachartre  *	operation	- operation to execute: read (VD_OP_BREAD) or
103987a7269eSachartre  *			  write (VD_OP_BWRITE).
104087a7269eSachartre  *	data		- buffer where data are read to or written from.
104187a7269eSachartre  *	blk		- starting block for the operation.
104287a7269eSachartre  *	len		- number of bytes to read or write.
104387a7269eSachartre  *
104487a7269eSachartre  * Return Code:
104587a7269eSachartre  *	0		- success
104687a7269eSachartre  *	n != 0		- error.
104787a7269eSachartre  */
104887a7269eSachartre static int
104917cadca8Slm66018 vd_do_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t blk, size_t len)
105087a7269eSachartre {
105187a7269eSachartre 	struct uscsi_cmd ucmd;
105287a7269eSachartre 	union scsi_cdb cdb;
105387a7269eSachartre 	int nsectors, nblk;
105487a7269eSachartre 	int max_sectors;
105587a7269eSachartre 	int status, rval;
105687a7269eSachartre 
105787a7269eSachartre 	ASSERT(!vd->file);
105817cadca8Slm66018 	ASSERT(vd->vdisk_block_size > 0);
105987a7269eSachartre 
106087a7269eSachartre 	max_sectors = vd->max_xfer_sz;
106117cadca8Slm66018 	nblk = (len / vd->vdisk_block_size);
106287a7269eSachartre 
106317cadca8Slm66018 	if (len % vd->vdisk_block_size != 0)
106487a7269eSachartre 		return (EINVAL);
106587a7269eSachartre 
106687a7269eSachartre 	/*
106787a7269eSachartre 	 * Build and execute the uscsi ioctl.  We build a group0, group1
106887a7269eSachartre 	 * or group4 command as necessary, since some targets
106987a7269eSachartre 	 * do not support group1 commands.
107087a7269eSachartre 	 */
107187a7269eSachartre 	while (nblk) {
107287a7269eSachartre 
107387a7269eSachartre 		bzero(&ucmd, sizeof (ucmd));
107487a7269eSachartre 		bzero(&cdb, sizeof (cdb));
107587a7269eSachartre 
107687a7269eSachartre 		nsectors = (max_sectors < nblk) ? max_sectors : nblk;
107787a7269eSachartre 
107817cadca8Slm66018 		/*
107917cadca8Slm66018 		 * Some of the optical drives on sun4v machines are ATAPI
108017cadca8Slm66018 		 * devices which use Group 1 Read/Write commands so we need
108117cadca8Slm66018 		 * to explicitly check a flag which is set when a domain
108217cadca8Slm66018 		 * is bound.
108317cadca8Slm66018 		 */
108417cadca8Slm66018 		if (blk < (2 << 20) && nsectors <= 0xff && !vd->is_atapi_dev) {
108587a7269eSachartre 			FORMG0ADDR(&cdb, blk);
108687a7269eSachartre 			FORMG0COUNT(&cdb, nsectors);
108787a7269eSachartre 			ucmd.uscsi_cdblen = CDB_GROUP0;
108887a7269eSachartre 		} else if (blk > 0xffffffff) {
108987a7269eSachartre 			FORMG4LONGADDR(&cdb, blk);
109087a7269eSachartre 			FORMG4COUNT(&cdb, nsectors);
109187a7269eSachartre 			ucmd.uscsi_cdblen = CDB_GROUP4;
109287a7269eSachartre 			cdb.scc_cmd |= SCMD_GROUP4;
109387a7269eSachartre 		} else {
109487a7269eSachartre 			FORMG1ADDR(&cdb, blk);
109587a7269eSachartre 			FORMG1COUNT(&cdb, nsectors);
109687a7269eSachartre 			ucmd.uscsi_cdblen = CDB_GROUP1;
109787a7269eSachartre 			cdb.scc_cmd |= SCMD_GROUP1;
109887a7269eSachartre 		}
109987a7269eSachartre 		ucmd.uscsi_cdb = (caddr_t)&cdb;
110087a7269eSachartre 		ucmd.uscsi_bufaddr = data;
110117cadca8Slm66018 		ucmd.uscsi_buflen = nsectors * vd->block_size;
110287a7269eSachartre 		ucmd.uscsi_timeout = vd_scsi_rdwr_timeout;
110387a7269eSachartre 		/*
110487a7269eSachartre 		 * Set flags so that the command is isolated from normal
110587a7269eSachartre 		 * commands and no error message is printed.
110687a7269eSachartre 		 */
110787a7269eSachartre 		ucmd.uscsi_flags = USCSI_ISOLATE | USCSI_SILENT;
110887a7269eSachartre 
110987a7269eSachartre 		if (operation == VD_OP_BREAD) {
111087a7269eSachartre 			cdb.scc_cmd |= SCMD_READ;
111187a7269eSachartre 			ucmd.uscsi_flags |= USCSI_READ;
111287a7269eSachartre 		} else {
111387a7269eSachartre 			cdb.scc_cmd |= SCMD_WRITE;
111487a7269eSachartre 		}
111587a7269eSachartre 
111687a7269eSachartre 		status = ldi_ioctl(vd->ldi_handle[VD_ENTIRE_DISK_SLICE],
1117047ba61eSachartre 		    USCSICMD, (intptr_t)&ucmd, (vd->open_flags | FKIOCTL),
111887a7269eSachartre 		    kcred, &rval);
111987a7269eSachartre 
112087a7269eSachartre 		if (status == 0)
112187a7269eSachartre 			status = ucmd.uscsi_status;
112287a7269eSachartre 
112387a7269eSachartre 		if (status != 0)
112487a7269eSachartre 			break;
112587a7269eSachartre 
112687a7269eSachartre 		/*
112787a7269eSachartre 		 * Check if partial DMA breakup is required. If so, reduce
112887a7269eSachartre 		 * the request size by half and retry the last request.
112987a7269eSachartre 		 */
113087a7269eSachartre 		if (ucmd.uscsi_resid == ucmd.uscsi_buflen) {
113187a7269eSachartre 			max_sectors >>= 1;
113287a7269eSachartre 			if (max_sectors <= 0) {
113387a7269eSachartre 				status = EIO;
113487a7269eSachartre 				break;
113587a7269eSachartre 			}
113687a7269eSachartre 			continue;
113787a7269eSachartre 		}
113887a7269eSachartre 
113987a7269eSachartre 		if (ucmd.uscsi_resid != 0) {
114087a7269eSachartre 			status = EIO;
114187a7269eSachartre 			break;
114287a7269eSachartre 		}
114387a7269eSachartre 
114487a7269eSachartre 		blk += nsectors;
114587a7269eSachartre 		nblk -= nsectors;
114617cadca8Slm66018 		data += nsectors * vd->vdisk_block_size; /* SECSIZE */
114787a7269eSachartre 	}
114887a7269eSachartre 
114987a7269eSachartre 	return (status);
115087a7269eSachartre }
115187a7269eSachartre 
1152205eeb1aSlm66018 /*
115317cadca8Slm66018  * Function:
115417cadca8Slm66018  *	vd_scsi_rdwr
115517cadca8Slm66018  *
115617cadca8Slm66018  * Description:
115717cadca8Slm66018  * 	Wrapper function to read or write to a SCSI disk using an absolute
115817cadca8Slm66018  *	disk offset. It checks the blocksize of the underlying device and,
115917cadca8Slm66018  *	if necessary, adjusts the buffers accordingly before calling
116017cadca8Slm66018  *	vd_do_scsi_rdwr() to do the actual read or write.
116117cadca8Slm66018  *
116217cadca8Slm66018  * Parameters:
116317cadca8Slm66018  *	vd		- disk on which the operation is performed.
116417cadca8Slm66018  *	operation	- operation to execute: read (VD_OP_BREAD) or
116517cadca8Slm66018  *			  write (VD_OP_BWRITE).
116617cadca8Slm66018  *	data		- buffer where data are read to or written from.
116717cadca8Slm66018  *	blk		- starting block for the operation.
116817cadca8Slm66018  *	len		- number of bytes to read or write.
116917cadca8Slm66018  *
117017cadca8Slm66018  * Return Code:
117117cadca8Slm66018  *	0		- success
117217cadca8Slm66018  *	n != 0		- error.
117317cadca8Slm66018  */
117417cadca8Slm66018 static int
117517cadca8Slm66018 vd_scsi_rdwr(vd_t *vd, int operation, caddr_t data, size_t vblk, size_t vlen)
117617cadca8Slm66018 {
117717cadca8Slm66018 	int	rv;
117817cadca8Slm66018 
117917cadca8Slm66018 	size_t	pblk;	/* physical device block number of data on device */
118017cadca8Slm66018 	size_t	delta;	/* relative offset between pblk and vblk */
118117cadca8Slm66018 	size_t	pnblk;	/* number of physical blocks to be read from device */
118217cadca8Slm66018 	size_t	plen;	/* length of data to be read from physical device */
118317cadca8Slm66018 	char	*buf;	/* buffer area to fit physical device's block size */
118417cadca8Slm66018 
11852f5224aeSachartre 	if (vd->block_size == 0) {
11862f5224aeSachartre 		/*
11872f5224aeSachartre 		 * The block size was not available during the attach,
11882f5224aeSachartre 		 * try to update it now.
11892f5224aeSachartre 		 */
11902f5224aeSachartre 		if (vd_setup_mediainfo(vd) != 0)
11912f5224aeSachartre 			return (EIO);
11922f5224aeSachartre 	}
11932f5224aeSachartre 
119417cadca8Slm66018 	/*
119517cadca8Slm66018 	 * If the vdisk block size and the block size of the underlying device
119617cadca8Slm66018 	 * match we can skip straight to vd_do_scsi_rdwr(), otherwise we need
119717cadca8Slm66018 	 * to create a buffer large enough to handle the device's block size
119817cadca8Slm66018 	 * and adjust the block to be read from and the amount of data to
119917cadca8Slm66018 	 * read to correspond with the device's block size.
120017cadca8Slm66018 	 */
120117cadca8Slm66018 	if (vd->vdisk_block_size == vd->block_size)
120217cadca8Slm66018 		return (vd_do_scsi_rdwr(vd, operation, data, vblk, vlen));
120317cadca8Slm66018 
120417cadca8Slm66018 	if (vd->vdisk_block_size > vd->block_size)
120517cadca8Slm66018 		return (EINVAL);
120617cadca8Slm66018 
120717cadca8Slm66018 	/*
120817cadca8Slm66018 	 * Writing of physical block sizes larger than the virtual block size
120917cadca8Slm66018 	 * is not supported. This would be added if/when support for guests
121017cadca8Slm66018 	 * writing to DVDs is implemented.
121117cadca8Slm66018 	 */
121217cadca8Slm66018 	if (operation == VD_OP_BWRITE)
121317cadca8Slm66018 		return (ENOTSUP);
121417cadca8Slm66018 
121517cadca8Slm66018 	/* BEGIN CSTYLED */
121617cadca8Slm66018 	/*
121717cadca8Slm66018 	 * Below is a diagram showing the relationship between the physical
121817cadca8Slm66018 	 * and virtual blocks. If the virtual blocks marked by 'X' below are
121917cadca8Slm66018 	 * requested, then the physical blocks denoted by 'Y' are read.
122017cadca8Slm66018 	 *
122117cadca8Slm66018 	 *           vblk
122217cadca8Slm66018 	 *             |      vlen
122317cadca8Slm66018 	 *             |<--------------->|
122417cadca8Slm66018 	 *             v                 v
122517cadca8Slm66018 	 *  --+--+--+--+--+--+--+--+--+--+--+--+--+--+--+-   virtual disk:
122617cadca8Slm66018 	 *    |  |  |  |XX|XX|XX|XX|XX|XX|  |  |  |  |  |  } block size is
122717cadca8Slm66018 	 *  --+--+--+--+--+--+--+--+--+--+--+--+--+--+--+-  vd->vdisk_block_size
122817cadca8Slm66018 	 *          :  :                 :  :
122917cadca8Slm66018 	 *         >:==:< delta          :  :
123017cadca8Slm66018 	 *          :  :                 :  :
123117cadca8Slm66018 	 *  --+-----+-----+-----+-----+-----+-----+-----+--   physical disk:
123217cadca8Slm66018 	 *    |     |YY:YY|YYYYY|YYYYY|YY:YY|     |     |   } block size is
123317cadca8Slm66018 	 *  --+-----+-----+-----+-----+-----+-----+-----+--   vd->block_size
123417cadca8Slm66018 	 *          ^                       ^
123517cadca8Slm66018 	 *          |<--------------------->|
123617cadca8Slm66018 	 *          |         plen
123717cadca8Slm66018 	 *         pblk
123817cadca8Slm66018 	 */
123917cadca8Slm66018 	/* END CSTYLED */
124017cadca8Slm66018 	pblk = (vblk * vd->vdisk_block_size) / vd->block_size;
124117cadca8Slm66018 	delta = (vblk * vd->vdisk_block_size) - (pblk * vd->block_size);
124217cadca8Slm66018 	pnblk = ((delta + vlen - 1) / vd->block_size) + 1;
124317cadca8Slm66018 	plen = pnblk * vd->block_size;
124417cadca8Slm66018 
124517cadca8Slm66018 	PR2("vblk %lx:pblk %lx: vlen %ld:plen %ld", vblk, pblk, vlen, plen);
124617cadca8Slm66018 
124717cadca8Slm66018 	buf = kmem_zalloc(sizeof (caddr_t) * plen, KM_SLEEP);
124817cadca8Slm66018 	rv = vd_do_scsi_rdwr(vd, operation, (caddr_t)buf, pblk, plen);
124917cadca8Slm66018 	bcopy(buf + delta, data, vlen);
125017cadca8Slm66018 
125117cadca8Slm66018 	kmem_free(buf, sizeof (caddr_t) * plen);
125217cadca8Slm66018 
125317cadca8Slm66018 	return (rv);
125417cadca8Slm66018 }
125517cadca8Slm66018 
125617cadca8Slm66018 /*
1257205eeb1aSlm66018  * Return Values
1258205eeb1aSlm66018  *	EINPROGRESS	- operation was successfully started
1259205eeb1aSlm66018  *	EIO		- encountered LDC (aka. task error)
1260205eeb1aSlm66018  *	0		- operation completed successfully
1261205eeb1aSlm66018  *
1262205eeb1aSlm66018  * Side Effect
1263205eeb1aSlm66018  *     sets request->status = <disk operation status>
1264205eeb1aSlm66018  */
12651ae08745Sheppo static int
1266d10e4ef2Snarayan vd_start_bio(vd_task_t *task)
12671ae08745Sheppo {
12684bac2208Snarayan 	int			rv, status = 0;
1269d10e4ef2Snarayan 	vd_t			*vd		= task->vd;
1270d10e4ef2Snarayan 	vd_dring_payload_t	*request	= task->request;
1271d10e4ef2Snarayan 	struct buf		*buf		= &task->buf;
12724bac2208Snarayan 	uint8_t			mtype;
12733c96341aSnarayan 	int 			slice;
1274047ba61eSachartre 	char			*bufaddr = 0;
1275047ba61eSachartre 	size_t			buflen;
1276d10e4ef2Snarayan 
1277d10e4ef2Snarayan 	ASSERT(vd != NULL);
1278d10e4ef2Snarayan 	ASSERT(request != NULL);
12793c96341aSnarayan 
12803c96341aSnarayan 	slice = request->slice;
12813c96341aSnarayan 
128287a7269eSachartre 	ASSERT(slice == VD_SLICE_NONE || slice < vd->nslices);
1283d10e4ef2Snarayan 	ASSERT((request->operation == VD_OP_BREAD) ||
1284d10e4ef2Snarayan 	    (request->operation == VD_OP_BWRITE));
1285d10e4ef2Snarayan 
1286205eeb1aSlm66018 	if (request->nbytes == 0) {
1287205eeb1aSlm66018 		/* no service for trivial requests */
1288205eeb1aSlm66018 		request->status = EINVAL;
1289205eeb1aSlm66018 		return (0);
1290205eeb1aSlm66018 	}
12911ae08745Sheppo 
1292d10e4ef2Snarayan 	PR1("%s %lu bytes at block %lu",
1293d10e4ef2Snarayan 	    (request->operation == VD_OP_BREAD) ? "Read" : "Write",
1294d10e4ef2Snarayan 	    request->nbytes, request->addr);
12951ae08745Sheppo 
1296047ba61eSachartre 	/*
1297047ba61eSachartre 	 * We have to check the open flags because the functions processing
1298047ba61eSachartre 	 * the read/write request will not do it.
1299047ba61eSachartre 	 */
1300047ba61eSachartre 	if (request->operation == VD_OP_BWRITE && !(vd->open_flags & FWRITE)) {
1301047ba61eSachartre 		PR0("write fails because backend is opened read-only");
1302047ba61eSachartre 		request->nbytes = 0;
1303047ba61eSachartre 		request->status = EROFS;
1304047ba61eSachartre 		return (0);
1305047ba61eSachartre 	}
1306d10e4ef2Snarayan 
13074bac2208Snarayan 	mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP;
13084bac2208Snarayan 
13094bac2208Snarayan 	/* Map memory exported by client */
13104bac2208Snarayan 	status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies,
13114bac2208Snarayan 	    mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R,
1312047ba61eSachartre 	    &bufaddr, NULL);
13134bac2208Snarayan 	if (status != 0) {
13143af08d82Slm66018 		PR0("ldc_mem_map() returned err %d ", status);
1315205eeb1aSlm66018 		return (EIO);
1316d10e4ef2Snarayan 	}
1317d10e4ef2Snarayan 
1318047ba61eSachartre 	buflen = request->nbytes;
1319047ba61eSachartre 
1320047ba61eSachartre 	status = ldc_mem_acquire(task->mhdl, 0, buflen);
13214bac2208Snarayan 	if (status != 0) {
13224bac2208Snarayan 		(void) ldc_mem_unmap(task->mhdl);
13233af08d82Slm66018 		PR0("ldc_mem_acquire() returned err %d ", status);
1324205eeb1aSlm66018 		return (EIO);
13254bac2208Snarayan 	}
13264bac2208Snarayan 
1327d10e4ef2Snarayan 	/* Start the block I/O */
13283c96341aSnarayan 	if (vd->file) {
1329047ba61eSachartre 		rv = vd_file_rw(vd, slice, request->operation, bufaddr,
1330690555a1Sachartre 		    request->addr, request->nbytes);
1331690555a1Sachartre 		if (rv < 0) {
13323c96341aSnarayan 			request->nbytes = 0;
1333205eeb1aSlm66018 			request->status = EIO;
1334690555a1Sachartre 		} else {
1335690555a1Sachartre 			request->nbytes = rv;
1336205eeb1aSlm66018 			request->status = 0;
13373c96341aSnarayan 		}
13383c96341aSnarayan 	} else {
133987a7269eSachartre 		if (slice == VD_SLICE_NONE) {
134087a7269eSachartre 			/*
134187a7269eSachartre 			 * This is not a disk image so it is a real disk. We
134287a7269eSachartre 			 * assume that the underlying device driver supports
134387a7269eSachartre 			 * USCSICMD ioctls. This is the case of all SCSI devices
134487a7269eSachartre 			 * (sd, ssd...).
134587a7269eSachartre 			 *
134687a7269eSachartre 			 * In the future if we have non-SCSI disks we would need
134787a7269eSachartre 			 * to invoke the appropriate function to do I/O using an
134817cadca8Slm66018 			 * absolute disk offset (for example using DIOCTL_RWCMD
134987a7269eSachartre 			 * for IDE disks).
135087a7269eSachartre 			 */
1351047ba61eSachartre 			rv = vd_scsi_rdwr(vd, request->operation, bufaddr,
1352047ba61eSachartre 			    request->addr, request->nbytes);
135387a7269eSachartre 			if (rv != 0) {
135487a7269eSachartre 				request->nbytes = 0;
1355205eeb1aSlm66018 				request->status = EIO;
135687a7269eSachartre 			} else {
1357205eeb1aSlm66018 				request->status = 0;
135887a7269eSachartre 			}
135987a7269eSachartre 		} else {
1360047ba61eSachartre 			bioinit(buf);
1361047ba61eSachartre 			buf->b_flags	= B_BUSY;
1362047ba61eSachartre 			buf->b_bcount	= request->nbytes;
1363047ba61eSachartre 			buf->b_lblkno	= request->addr;
1364047ba61eSachartre 			buf->b_edev 	= vd->dev[slice];
1365047ba61eSachartre 			buf->b_un.b_addr = bufaddr;
1366047ba61eSachartre 			buf->b_flags 	|= (request->operation == VD_OP_BREAD)?
1367047ba61eSachartre 			    B_READ : B_WRITE;
1368047ba61eSachartre 
1369205eeb1aSlm66018 			request->status =
1370205eeb1aSlm66018 			    ldi_strategy(vd->ldi_handle[slice], buf);
1371205eeb1aSlm66018 
1372205eeb1aSlm66018 			/*
1373205eeb1aSlm66018 			 * This is to indicate to the caller that the request
1374205eeb1aSlm66018 			 * needs to be finished by vd_complete_bio() by calling
1375205eeb1aSlm66018 			 * biowait() there and waiting for that to return before
1376205eeb1aSlm66018 			 * triggering the notification of the vDisk client.
1377205eeb1aSlm66018 			 *
1378205eeb1aSlm66018 			 * This is necessary when writing to real disks as
1379205eeb1aSlm66018 			 * otherwise calls to ldi_strategy() would be serialized
1380205eeb1aSlm66018 			 * behind the calls to biowait() and performance would
1381205eeb1aSlm66018 			 * suffer.
1382205eeb1aSlm66018 			 */
1383205eeb1aSlm66018 			if (request->status == 0)
138487a7269eSachartre 				return (EINPROGRESS);
1385047ba61eSachartre 
1386047ba61eSachartre 			biofini(buf);
138787a7269eSachartre 		}
13883c96341aSnarayan 	}
13893c96341aSnarayan 
1390d10e4ef2Snarayan 	/* Clean up after error */
1391047ba61eSachartre 	rv = ldc_mem_release(task->mhdl, 0, buflen);
13924bac2208Snarayan 	if (rv) {
13933af08d82Slm66018 		PR0("ldc_mem_release() returned err %d ", rv);
1394205eeb1aSlm66018 		status = EIO;
13954bac2208Snarayan 	}
13964bac2208Snarayan 	rv = ldc_mem_unmap(task->mhdl);
13974bac2208Snarayan 	if (rv) {
1398205eeb1aSlm66018 		PR0("ldc_mem_unmap() returned err %d ", rv);
1399205eeb1aSlm66018 		status = EIO;
14004bac2208Snarayan 	}
14014bac2208Snarayan 
1402d10e4ef2Snarayan 	return (status);
1403d10e4ef2Snarayan }
1404d10e4ef2Snarayan 
1405205eeb1aSlm66018 /*
1406205eeb1aSlm66018  * This function should only be called from vd_notify to ensure that requests
1407205eeb1aSlm66018  * are responded to in the order that they are received.
1408205eeb1aSlm66018  */
1409d10e4ef2Snarayan static int
1410d10e4ef2Snarayan send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
1411d10e4ef2Snarayan {
14123af08d82Slm66018 	int	status;
1413d10e4ef2Snarayan 	size_t	nbytes;
1414d10e4ef2Snarayan 
14153af08d82Slm66018 	do {
1416d10e4ef2Snarayan 		nbytes = msglen;
1417d10e4ef2Snarayan 		status = ldc_write(ldc_handle, msg, &nbytes);
14183af08d82Slm66018 		if (status != EWOULDBLOCK)
14193af08d82Slm66018 			break;
14203af08d82Slm66018 		drv_usecwait(vds_ldc_delay);
14213af08d82Slm66018 	} while (status == EWOULDBLOCK);
1422d10e4ef2Snarayan 
1423d10e4ef2Snarayan 	if (status != 0) {
14243af08d82Slm66018 		if (status != ECONNRESET)
14253af08d82Slm66018 			PR0("ldc_write() returned errno %d", status);
1426d10e4ef2Snarayan 		return (status);
1427d10e4ef2Snarayan 	} else if (nbytes != msglen) {
14283af08d82Slm66018 		PR0("ldc_write() performed only partial write");
1429d10e4ef2Snarayan 		return (EIO);
1430d10e4ef2Snarayan 	}
1431d10e4ef2Snarayan 
1432d10e4ef2Snarayan 	PR1("SENT %lu bytes", msglen);
1433d10e4ef2Snarayan 	return (0);
1434d10e4ef2Snarayan }
1435d10e4ef2Snarayan 
1436d10e4ef2Snarayan static void
1437d10e4ef2Snarayan vd_need_reset(vd_t *vd, boolean_t reset_ldc)
1438d10e4ef2Snarayan {
1439d10e4ef2Snarayan 	mutex_enter(&vd->lock);
1440d10e4ef2Snarayan 	vd->reset_state	= B_TRUE;
1441d10e4ef2Snarayan 	vd->reset_ldc	= reset_ldc;
1442d10e4ef2Snarayan 	mutex_exit(&vd->lock);
1443d10e4ef2Snarayan }
1444d10e4ef2Snarayan 
1445d10e4ef2Snarayan /*
1446d10e4ef2Snarayan  * Reset the state of the connection with a client, if needed; reset the LDC
1447d10e4ef2Snarayan  * transport as well, if needed.  This function should only be called from the
14483af08d82Slm66018  * "vd_recv_msg", as it waits for tasks - otherwise a deadlock can occur.
1449d10e4ef2Snarayan  */
1450d10e4ef2Snarayan static void
1451d10e4ef2Snarayan vd_reset_if_needed(vd_t *vd)
1452d10e4ef2Snarayan {
1453d10e4ef2Snarayan 	int	status = 0;
1454d10e4ef2Snarayan 
1455d10e4ef2Snarayan 	mutex_enter(&vd->lock);
1456d10e4ef2Snarayan 	if (!vd->reset_state) {
1457d10e4ef2Snarayan 		ASSERT(!vd->reset_ldc);
1458d10e4ef2Snarayan 		mutex_exit(&vd->lock);
1459d10e4ef2Snarayan 		return;
1460d10e4ef2Snarayan 	}
1461d10e4ef2Snarayan 	mutex_exit(&vd->lock);
1462d10e4ef2Snarayan 
1463d10e4ef2Snarayan 	PR0("Resetting connection state with %s", VD_CLIENT(vd));
1464d10e4ef2Snarayan 
1465d10e4ef2Snarayan 	/*
1466d10e4ef2Snarayan 	 * Let any asynchronous I/O complete before possibly pulling the rug
1467d10e4ef2Snarayan 	 * out from under it; defer checking vd->reset_ldc, as one of the
1468d10e4ef2Snarayan 	 * asynchronous tasks might set it
1469d10e4ef2Snarayan 	 */
1470d10e4ef2Snarayan 	ddi_taskq_wait(vd->completionq);
1471d10e4ef2Snarayan 
14723c96341aSnarayan 	if (vd->file) {
1473da6c28aaSamw 		status = VOP_FSYNC(vd->file_vnode, FSYNC, kcred, NULL);
14743c96341aSnarayan 		if (status) {
14753c96341aSnarayan 			PR0("VOP_FSYNC returned errno %d", status);
14763c96341aSnarayan 		}
14773c96341aSnarayan 	}
14783c96341aSnarayan 
1479d10e4ef2Snarayan 	if ((vd->initialized & VD_DRING) &&
1480d10e4ef2Snarayan 	    ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
14813af08d82Slm66018 		PR0("ldc_mem_dring_unmap() returned errno %d", status);
1482d10e4ef2Snarayan 
14833af08d82Slm66018 	vd_free_dring_task(vd);
14843af08d82Slm66018 
14853af08d82Slm66018 	/* Free the staging buffer for msgs */
14863af08d82Slm66018 	if (vd->vio_msgp != NULL) {
14873af08d82Slm66018 		kmem_free(vd->vio_msgp, vd->max_msglen);
14883af08d82Slm66018 		vd->vio_msgp = NULL;
1489d10e4ef2Snarayan 	}
1490d10e4ef2Snarayan 
14913af08d82Slm66018 	/* Free the inband message buffer */
14923af08d82Slm66018 	if (vd->inband_task.msg != NULL) {
14933af08d82Slm66018 		kmem_free(vd->inband_task.msg, vd->max_msglen);
14943af08d82Slm66018 		vd->inband_task.msg = NULL;
14953af08d82Slm66018 	}
1496d10e4ef2Snarayan 
1497d10e4ef2Snarayan 	mutex_enter(&vd->lock);
14983af08d82Slm66018 
14993af08d82Slm66018 	if (vd->reset_ldc)
15003af08d82Slm66018 		PR0("taking down LDC channel");
1501e1ebb9ecSlm66018 	if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0))
15023af08d82Slm66018 		PR0("ldc_down() returned errno %d", status);
1503d10e4ef2Snarayan 
15042f5224aeSachartre 	/* Reset exclusive access rights */
15052f5224aeSachartre 	vd_reset_access(vd);
15062f5224aeSachartre 
1507d10e4ef2Snarayan 	vd->initialized	&= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
1508d10e4ef2Snarayan 	vd->state	= VD_STATE_INIT;
1509d10e4ef2Snarayan 	vd->max_msglen	= sizeof (vio_msg_t);	/* baseline vio message size */
1510d10e4ef2Snarayan 
15113af08d82Slm66018 	/* Allocate the staging buffer */
15123af08d82Slm66018 	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);
15133af08d82Slm66018 
15143af08d82Slm66018 	PR0("calling ldc_up\n");
15153af08d82Slm66018 	(void) ldc_up(vd->ldc_handle);
15163af08d82Slm66018 
1517d10e4ef2Snarayan 	vd->reset_state	= B_FALSE;
1518d10e4ef2Snarayan 	vd->reset_ldc	= B_FALSE;
15193af08d82Slm66018 
1520d10e4ef2Snarayan 	mutex_exit(&vd->lock);
1521d10e4ef2Snarayan }
1522d10e4ef2Snarayan 
15233af08d82Slm66018 static void vd_recv_msg(void *arg);
15243af08d82Slm66018 
15253af08d82Slm66018 static void
15263af08d82Slm66018 vd_mark_in_reset(vd_t *vd)
15273af08d82Slm66018 {
15283af08d82Slm66018 	int status;
15293af08d82Slm66018 
15303af08d82Slm66018 	PR0("vd_mark_in_reset: marking vd in reset\n");
15313af08d82Slm66018 
15323af08d82Slm66018 	vd_need_reset(vd, B_FALSE);
15333af08d82Slm66018 	status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP);
15343af08d82Slm66018 	if (status == DDI_FAILURE) {
15353af08d82Slm66018 		PR0("cannot schedule task to recv msg\n");
15363af08d82Slm66018 		vd_need_reset(vd, B_TRUE);
15373af08d82Slm66018 		return;
15383af08d82Slm66018 	}
15393af08d82Slm66018 }
15403af08d82Slm66018 
1541d10e4ef2Snarayan static int
15423c96341aSnarayan vd_mark_elem_done(vd_t *vd, int idx, int elem_status, int elem_nbytes)
1543d10e4ef2Snarayan {
1544d10e4ef2Snarayan 	boolean_t		accepted;
1545d10e4ef2Snarayan 	int			status;
1546d10e4ef2Snarayan 	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);
1547d10e4ef2Snarayan 
15483af08d82Slm66018 	if (vd->reset_state)
15493af08d82Slm66018 		return (0);
1550d10e4ef2Snarayan 
1551d10e4ef2Snarayan 	/* Acquire the element */
15523af08d82Slm66018 	if (!vd->reset_state &&
15533af08d82Slm66018 	    (status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
15543af08d82Slm66018 		if (status == ECONNRESET) {
15553af08d82Slm66018 			vd_mark_in_reset(vd);
15563af08d82Slm66018 			return (0);
15573af08d82Slm66018 		} else {
15583af08d82Slm66018 			PR0("ldc_mem_dring_acquire() returned errno %d",
15593af08d82Slm66018 			    status);
1560d10e4ef2Snarayan 			return (status);
1561d10e4ef2Snarayan 		}
15623af08d82Slm66018 	}
1563d10e4ef2Snarayan 
1564d10e4ef2Snarayan 	/* Set the element's status and mark it done */
1565d10e4ef2Snarayan 	accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED);
1566d10e4ef2Snarayan 	if (accepted) {
15673c96341aSnarayan 		elem->payload.nbytes	= elem_nbytes;
1568d10e4ef2Snarayan 		elem->payload.status	= elem_status;
1569d10e4ef2Snarayan 		elem->hdr.dstate	= VIO_DESC_DONE;
1570d10e4ef2Snarayan 	} else {
1571d10e4ef2Snarayan 		/* Perhaps client timed out waiting for I/O... */
15723af08d82Slm66018 		PR0("element %u no longer \"accepted\"", idx);
1573d10e4ef2Snarayan 		VD_DUMP_DRING_ELEM(elem);
1574d10e4ef2Snarayan 	}
1575d10e4ef2Snarayan 	/* Release the element */
15763af08d82Slm66018 	if (!vd->reset_state &&
15773af08d82Slm66018 	    (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
15783af08d82Slm66018 		if (status == ECONNRESET) {
15793af08d82Slm66018 			vd_mark_in_reset(vd);
15803af08d82Slm66018 			return (0);
15813af08d82Slm66018 		} else {
15823af08d82Slm66018 			PR0("ldc_mem_dring_release() returned errno %d",
15833af08d82Slm66018 			    status);
1584d10e4ef2Snarayan 			return (status);
1585d10e4ef2Snarayan 		}
15863af08d82Slm66018 	}
1587d10e4ef2Snarayan 
1588d10e4ef2Snarayan 	return (accepted ? 0 : EINVAL);
1589d10e4ef2Snarayan }
1590d10e4ef2Snarayan 
1591205eeb1aSlm66018 /*
1592205eeb1aSlm66018  * Return Values
1593205eeb1aSlm66018  *	0	- operation completed successfully
1594205eeb1aSlm66018  *	EIO	- encountered LDC / task error
1595205eeb1aSlm66018  *
1596205eeb1aSlm66018  * Side Effect
1597205eeb1aSlm66018  *	sets request->status = <disk operation status>
1598205eeb1aSlm66018  */
1599205eeb1aSlm66018 static int
1600205eeb1aSlm66018 vd_complete_bio(vd_task_t *task)
1601d10e4ef2Snarayan {
1602d10e4ef2Snarayan 	int			status		= 0;
1603205eeb1aSlm66018 	int			rv		= 0;
1604d10e4ef2Snarayan 	vd_t			*vd		= task->vd;
1605d10e4ef2Snarayan 	vd_dring_payload_t	*request	= task->request;
1606d10e4ef2Snarayan 	struct buf		*buf		= &task->buf;
1607d10e4ef2Snarayan 
1608d10e4ef2Snarayan 
1609d10e4ef2Snarayan 	ASSERT(vd != NULL);
1610d10e4ef2Snarayan 	ASSERT(request != NULL);
1611d10e4ef2Snarayan 	ASSERT(task->msg != NULL);
1612d10e4ef2Snarayan 	ASSERT(task->msglen >= sizeof (*task->msg));
16133c96341aSnarayan 	ASSERT(!vd->file);
1614205eeb1aSlm66018 	ASSERT(request->slice != VD_SLICE_NONE);
1615d10e4ef2Snarayan 
1616205eeb1aSlm66018 	/* Wait for the I/O to complete [ call to ldi_strategy(9f) ] */
1617d10e4ef2Snarayan 	request->status = biowait(buf);
1618d10e4ef2Snarayan 
16193c96341aSnarayan 	/* return back the number of bytes read/written */
16203c96341aSnarayan 	request->nbytes = buf->b_bcount - buf->b_resid;
16213c96341aSnarayan 
16224bac2208Snarayan 	/* Release the buffer */
16233af08d82Slm66018 	if (!vd->reset_state)
16244bac2208Snarayan 		status = ldc_mem_release(task->mhdl, 0, buf->b_bcount);
16254bac2208Snarayan 	if (status) {
16263af08d82Slm66018 		PR0("ldc_mem_release() returned errno %d copying to "
16273af08d82Slm66018 		    "client", status);
16283af08d82Slm66018 		if (status == ECONNRESET) {
16293af08d82Slm66018 			vd_mark_in_reset(vd);
16303af08d82Slm66018 		}
1631205eeb1aSlm66018 		rv = EIO;
16321ae08745Sheppo 	}
16331ae08745Sheppo 
16343af08d82Slm66018 	/* Unmap the memory, even if in reset */
16354bac2208Snarayan 	status = ldc_mem_unmap(task->mhdl);
16364bac2208Snarayan 	if (status) {
16373af08d82Slm66018 		PR0("ldc_mem_unmap() returned errno %d copying to client",
16384bac2208Snarayan 		    status);
16393af08d82Slm66018 		if (status == ECONNRESET) {
16403af08d82Slm66018 			vd_mark_in_reset(vd);
16413af08d82Slm66018 		}
1642205eeb1aSlm66018 		rv = EIO;
16434bac2208Snarayan 	}
16444bac2208Snarayan 
1645d10e4ef2Snarayan 	biofini(buf);
16461ae08745Sheppo 
1647205eeb1aSlm66018 	return (rv);
1648205eeb1aSlm66018 }
1649205eeb1aSlm66018 
1650205eeb1aSlm66018 /*
1651205eeb1aSlm66018  * Description:
1652205eeb1aSlm66018  *	This function is called by the two functions called by a taskq
1653205eeb1aSlm66018  *	[ vd_complete_notify() and vd_serial_notify()) ] to send the
1654205eeb1aSlm66018  *	message to the client.
1655205eeb1aSlm66018  *
1656205eeb1aSlm66018  * Parameters:
1657205eeb1aSlm66018  *	arg 	- opaque pointer to structure containing task to be completed
1658205eeb1aSlm66018  *
1659205eeb1aSlm66018  * Return Values
1660205eeb1aSlm66018  *	None
1661205eeb1aSlm66018  */
1662205eeb1aSlm66018 static void
1663205eeb1aSlm66018 vd_notify(vd_task_t *task)
1664205eeb1aSlm66018 {
1665205eeb1aSlm66018 	int	status;
1666205eeb1aSlm66018 
1667205eeb1aSlm66018 	ASSERT(task != NULL);
1668205eeb1aSlm66018 	ASSERT(task->vd != NULL);
1669205eeb1aSlm66018 
1670205eeb1aSlm66018 	if (task->vd->reset_state)
1671205eeb1aSlm66018 		return;
1672205eeb1aSlm66018 
1673205eeb1aSlm66018 	/*
1674205eeb1aSlm66018 	 * Send the "ack" or "nack" back to the client; if sending the message
1675205eeb1aSlm66018 	 * via LDC fails, arrange to reset both the connection state and LDC
1676205eeb1aSlm66018 	 * itself
1677205eeb1aSlm66018 	 */
1678205eeb1aSlm66018 	PR2("Sending %s",
1679205eeb1aSlm66018 	    (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");
1680205eeb1aSlm66018 
1681205eeb1aSlm66018 	status = send_msg(task->vd->ldc_handle, task->msg, task->msglen);
1682205eeb1aSlm66018 	switch (status) {
1683205eeb1aSlm66018 	case 0:
1684205eeb1aSlm66018 		break;
1685205eeb1aSlm66018 	case ECONNRESET:
1686205eeb1aSlm66018 		vd_mark_in_reset(task->vd);
1687205eeb1aSlm66018 		break;
1688205eeb1aSlm66018 	default:
1689205eeb1aSlm66018 		PR0("initiating full reset");
1690205eeb1aSlm66018 		vd_need_reset(task->vd, B_TRUE);
1691205eeb1aSlm66018 		break;
1692205eeb1aSlm66018 	}
1693205eeb1aSlm66018 
1694205eeb1aSlm66018 	DTRACE_PROBE1(task__end, vd_task_t *, task);
1695205eeb1aSlm66018 }
1696205eeb1aSlm66018 
1697205eeb1aSlm66018 /*
1698205eeb1aSlm66018  * Description:
1699205eeb1aSlm66018  *	Mark the Dring entry as Done and (if necessary) send an ACK/NACK to
1700205eeb1aSlm66018  *	the vDisk client
1701205eeb1aSlm66018  *
1702205eeb1aSlm66018  * Parameters:
1703205eeb1aSlm66018  *	task 		- structure containing the request sent from client
1704205eeb1aSlm66018  *
1705205eeb1aSlm66018  * Return Values
1706205eeb1aSlm66018  *	None
1707205eeb1aSlm66018  */
1708205eeb1aSlm66018 static void
1709205eeb1aSlm66018 vd_complete_notify(vd_task_t *task)
1710205eeb1aSlm66018 {
1711205eeb1aSlm66018 	int			status		= 0;
1712205eeb1aSlm66018 	vd_t			*vd		= task->vd;
1713205eeb1aSlm66018 	vd_dring_payload_t	*request	= task->request;
1714205eeb1aSlm66018 
1715d10e4ef2Snarayan 	/* Update the dring element for a dring client */
1716*f0ca1d9aSsb155480 	if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE_V1_0)) {
17173c96341aSnarayan 		status = vd_mark_elem_done(vd, task->index,
17183c96341aSnarayan 		    request->status, request->nbytes);
17193af08d82Slm66018 		if (status == ECONNRESET)
17203af08d82Slm66018 			vd_mark_in_reset(vd);
17213af08d82Slm66018 	}
17221ae08745Sheppo 
1723d10e4ef2Snarayan 	/*
1724205eeb1aSlm66018 	 * If a transport error occurred while marking the element done or
1725205eeb1aSlm66018 	 * previously while executing the task, arrange to "nack" the message
1726205eeb1aSlm66018 	 * when the final task in the descriptor element range completes
1727d10e4ef2Snarayan 	 */
1728205eeb1aSlm66018 	if ((status != 0) || (task->status != 0))
1729d10e4ef2Snarayan 		task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
17301ae08745Sheppo 
1731d10e4ef2Snarayan 	/*
1732d10e4ef2Snarayan 	 * Only the final task for a range of elements will respond to and
1733d10e4ef2Snarayan 	 * free the message
1734d10e4ef2Snarayan 	 */
17353af08d82Slm66018 	if (task->type == VD_NONFINAL_RANGE_TASK) {
1736d10e4ef2Snarayan 		return;
17373af08d82Slm66018 	}
17381ae08745Sheppo 
1739205eeb1aSlm66018 	vd_notify(task);
1740205eeb1aSlm66018 }
1741205eeb1aSlm66018 
1742d10e4ef2Snarayan /*
1743205eeb1aSlm66018  * Description:
1744205eeb1aSlm66018  *	This is the basic completion function called to handle inband data
1745205eeb1aSlm66018  *	requests and handshake messages. All it needs to do is trigger a
1746205eeb1aSlm66018  *	message to the client that the request is completed.
1747205eeb1aSlm66018  *
1748205eeb1aSlm66018  * Parameters:
1749205eeb1aSlm66018  *	arg 	- opaque pointer to structure containing task to be completed
1750205eeb1aSlm66018  *
1751205eeb1aSlm66018  * Return Values
1752205eeb1aSlm66018  *	None
1753d10e4ef2Snarayan  */
1754205eeb1aSlm66018 static void
1755205eeb1aSlm66018 vd_serial_notify(void *arg)
1756205eeb1aSlm66018 {
1757205eeb1aSlm66018 	vd_task_t		*task = (vd_task_t *)arg;
1758205eeb1aSlm66018 
1759205eeb1aSlm66018 	ASSERT(task != NULL);
1760205eeb1aSlm66018 	vd_notify(task);
17611ae08745Sheppo }
17621ae08745Sheppo 
17632f5224aeSachartre /* ARGSUSED */
17642f5224aeSachartre static int
17652f5224aeSachartre vd_geom2dk_geom(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
17660a55fbb7Slm66018 {
17670a55fbb7Slm66018 	VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg);
17682f5224aeSachartre 	return (0);
17690a55fbb7Slm66018 }
17700a55fbb7Slm66018 
17712f5224aeSachartre /* ARGSUSED */
17722f5224aeSachartre static int
17732f5224aeSachartre vd_vtoc2vtoc(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
17740a55fbb7Slm66018 {
17750a55fbb7Slm66018 	VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg);
17762f5224aeSachartre 	return (0);
17770a55fbb7Slm66018 }
17780a55fbb7Slm66018 
17790a55fbb7Slm66018 static void
17800a55fbb7Slm66018 dk_geom2vd_geom(void *ioctl_arg, void *vd_buf)
17810a55fbb7Slm66018 {
17820a55fbb7Slm66018 	DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf);
17830a55fbb7Slm66018 }
17840a55fbb7Slm66018 
17850a55fbb7Slm66018 static void
17860a55fbb7Slm66018 vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf)
17870a55fbb7Slm66018 {
17880a55fbb7Slm66018 	VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf);
17890a55fbb7Slm66018 }
17900a55fbb7Slm66018 
17912f5224aeSachartre static int
17922f5224aeSachartre vd_get_efi_in(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
17934bac2208Snarayan {
17944bac2208Snarayan 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
17954bac2208Snarayan 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
17962f5224aeSachartre 	size_t data_len;
17972f5224aeSachartre 
17982f5224aeSachartre 	data_len = vd_buf_len - (sizeof (vd_efi_t) - sizeof (uint64_t));
17992f5224aeSachartre 	if (vd_efi->length > data_len)
18002f5224aeSachartre 		return (EINVAL);
18014bac2208Snarayan 
18024bac2208Snarayan 	dk_efi->dki_lba = vd_efi->lba;
18034bac2208Snarayan 	dk_efi->dki_length = vd_efi->length;
18044bac2208Snarayan 	dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP);
18052f5224aeSachartre 	return (0);
18064bac2208Snarayan }
18074bac2208Snarayan 
18084bac2208Snarayan static void
18094bac2208Snarayan vd_get_efi_out(void *ioctl_arg, void *vd_buf)
18104bac2208Snarayan {
18114bac2208Snarayan 	int len;
18124bac2208Snarayan 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
18134bac2208Snarayan 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
18144bac2208Snarayan 
18154bac2208Snarayan 	len = vd_efi->length;
18164bac2208Snarayan 	DK_EFI2VD_EFI(dk_efi, vd_efi);
18174bac2208Snarayan 	kmem_free(dk_efi->dki_data, len);
18184bac2208Snarayan }
18194bac2208Snarayan 
18202f5224aeSachartre static int
18212f5224aeSachartre vd_set_efi_in(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
18224bac2208Snarayan {
18234bac2208Snarayan 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
18244bac2208Snarayan 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
18252f5224aeSachartre 	size_t data_len;
18262f5224aeSachartre 
18272f5224aeSachartre 	data_len = vd_buf_len - (sizeof (vd_efi_t) - sizeof (uint64_t));
18282f5224aeSachartre 	if (vd_efi->length > data_len)
18292f5224aeSachartre 		return (EINVAL);
18304bac2208Snarayan 
18314bac2208Snarayan 	dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP);
18324bac2208Snarayan 	VD_EFI2DK_EFI(vd_efi, dk_efi);
18332f5224aeSachartre 	return (0);
18344bac2208Snarayan }
18354bac2208Snarayan 
18364bac2208Snarayan static void
18374bac2208Snarayan vd_set_efi_out(void *ioctl_arg, void *vd_buf)
18384bac2208Snarayan {
18394bac2208Snarayan 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
18404bac2208Snarayan 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
18414bac2208Snarayan 
18424bac2208Snarayan 	kmem_free(dk_efi->dki_data, vd_efi->length);
18434bac2208Snarayan }
18444bac2208Snarayan 
18452f5224aeSachartre static int
18462f5224aeSachartre vd_scsicmd_in(void *vd_buf, size_t vd_buf_len, void *ioctl_arg)
18472f5224aeSachartre {
18482f5224aeSachartre 	size_t vd_scsi_len;
18492f5224aeSachartre 	vd_scsi_t *vd_scsi = (vd_scsi_t *)vd_buf;
18502f5224aeSachartre 	struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl_arg;
18512f5224aeSachartre 
18522f5224aeSachartre 	/* check buffer size */
18532f5224aeSachartre 	vd_scsi_len = VD_SCSI_SIZE;
18542f5224aeSachartre 	vd_scsi_len += P2ROUNDUP(vd_scsi->cdb_len, sizeof (uint64_t));
18552f5224aeSachartre 	vd_scsi_len += P2ROUNDUP(vd_scsi->sense_len, sizeof (uint64_t));
18562f5224aeSachartre 	vd_scsi_len += P2ROUNDUP(vd_scsi->datain_len, sizeof (uint64_t));
18572f5224aeSachartre 	vd_scsi_len += P2ROUNDUP(vd_scsi->dataout_len, sizeof (uint64_t));
18582f5224aeSachartre 
18592f5224aeSachartre 	ASSERT(vd_scsi_len % sizeof (uint64_t) == 0);
18602f5224aeSachartre 
18612f5224aeSachartre 	if (vd_buf_len < vd_scsi_len)
18622f5224aeSachartre 		return (EINVAL);
18632f5224aeSachartre 
18642f5224aeSachartre 	/* set flags */
18652f5224aeSachartre 	uscsi->uscsi_flags = vd_scsi_debug;
18662f5224aeSachartre 
18672f5224aeSachartre 	if (vd_scsi->options & VD_SCSI_OPT_NORETRY) {
18682f5224aeSachartre 		uscsi->uscsi_flags |= USCSI_ISOLATE;
18692f5224aeSachartre 		uscsi->uscsi_flags |= USCSI_DIAGNOSE;
18702f5224aeSachartre 	}
18712f5224aeSachartre 
18722f5224aeSachartre 	/* task attribute */
18732f5224aeSachartre 	switch (vd_scsi->task_attribute) {
18742f5224aeSachartre 	case VD_SCSI_TASK_ACA:
18752f5224aeSachartre 		uscsi->uscsi_flags |= USCSI_HEAD;
18762f5224aeSachartre 		break;
18772f5224aeSachartre 	case VD_SCSI_TASK_HQUEUE:
18782f5224aeSachartre 		uscsi->uscsi_flags |= USCSI_HTAG;
18792f5224aeSachartre 		break;
18802f5224aeSachartre 	case VD_SCSI_TASK_ORDERED:
18812f5224aeSachartre 		uscsi->uscsi_flags |= USCSI_OTAG;
18822f5224aeSachartre 		break;
18832f5224aeSachartre 	default:
18842f5224aeSachartre 		uscsi->uscsi_flags |= USCSI_NOTAG;
18852f5224aeSachartre 		break;
18862f5224aeSachartre 	}
18872f5224aeSachartre 
18882f5224aeSachartre 	/* timeout */
18892f5224aeSachartre 	uscsi->uscsi_timeout = vd_scsi->timeout;
18902f5224aeSachartre 
18912f5224aeSachartre 	/* cdb data */
18922f5224aeSachartre 	uscsi->uscsi_cdb = (caddr_t)VD_SCSI_DATA_CDB(vd_scsi);
18932f5224aeSachartre 	uscsi->uscsi_cdblen = vd_scsi->cdb_len;
18942f5224aeSachartre 
18952f5224aeSachartre 	/* sense buffer */
18962f5224aeSachartre 	if (vd_scsi->sense_len != 0) {
18972f5224aeSachartre 		uscsi->uscsi_flags |= USCSI_RQENABLE;
18982f5224aeSachartre 		uscsi->uscsi_rqbuf = (caddr_t)VD_SCSI_DATA_SENSE(vd_scsi);
18992f5224aeSachartre 		uscsi->uscsi_rqlen = vd_scsi->sense_len;
19002f5224aeSachartre 	}
19012f5224aeSachartre 
19022f5224aeSachartre 	if (vd_scsi->datain_len != 0 && vd_scsi->dataout_len != 0) {
19032f5224aeSachartre 		/* uscsi does not support read/write request */
19042f5224aeSachartre 		return (EINVAL);
19052f5224aeSachartre 	}
19062f5224aeSachartre 
19072f5224aeSachartre 	/* request data-in */
19082f5224aeSachartre 	if (vd_scsi->datain_len != 0) {
19092f5224aeSachartre 		uscsi->uscsi_flags |= USCSI_READ;
19102f5224aeSachartre 		uscsi->uscsi_buflen = vd_scsi->datain_len;
19112f5224aeSachartre 		uscsi->uscsi_bufaddr = (char *)VD_SCSI_DATA_IN(vd_scsi);
19122f5224aeSachartre 	}
19132f5224aeSachartre 
19142f5224aeSachartre 	/* request data-out */
19152f5224aeSachartre 	if (vd_scsi->dataout_len != 0) {
19162f5224aeSachartre 		uscsi->uscsi_buflen = vd_scsi->dataout_len;
19172f5224aeSachartre 		uscsi->uscsi_bufaddr = (char *)VD_SCSI_DATA_OUT(vd_scsi);
19182f5224aeSachartre 	}
19192f5224aeSachartre 
19202f5224aeSachartre 	return (0);
19212f5224aeSachartre }
19222f5224aeSachartre 
19232f5224aeSachartre static void
19242f5224aeSachartre vd_scsicmd_out(void *ioctl_arg, void *vd_buf)
19252f5224aeSachartre {
19262f5224aeSachartre 	vd_scsi_t *vd_scsi = (vd_scsi_t *)vd_buf;
19272f5224aeSachartre 	struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl_arg;
19282f5224aeSachartre 
19292f5224aeSachartre 	/* output fields */
19302f5224aeSachartre 	vd_scsi->cmd_status = uscsi->uscsi_status;
19312f5224aeSachartre 
19322f5224aeSachartre 	/* sense data */
19332f5224aeSachartre 	if ((uscsi->uscsi_flags & USCSI_RQENABLE) &&
19342f5224aeSachartre 	    (uscsi->uscsi_status == STATUS_CHECK ||
19352f5224aeSachartre 	    uscsi->uscsi_status == STATUS_TERMINATED)) {
19362f5224aeSachartre 		vd_scsi->sense_status = uscsi->uscsi_rqstatus;
19372f5224aeSachartre 		if (uscsi->uscsi_rqstatus == STATUS_GOOD)
19382f5224aeSachartre 			vd_scsi->sense_len -= uscsi->uscsi_resid;
19392f5224aeSachartre 		else
19402f5224aeSachartre 			vd_scsi->sense_len = 0;
19412f5224aeSachartre 	} else {
19422f5224aeSachartre 		vd_scsi->sense_len = 0;
19432f5224aeSachartre 	}
19442f5224aeSachartre 
19452f5224aeSachartre 	if (uscsi->uscsi_status != STATUS_GOOD) {
19462f5224aeSachartre 		vd_scsi->dataout_len = 0;
19472f5224aeSachartre 		vd_scsi->datain_len = 0;
19482f5224aeSachartre 		return;
19492f5224aeSachartre 	}
19502f5224aeSachartre 
19512f5224aeSachartre 	if (uscsi->uscsi_flags & USCSI_READ) {
19522f5224aeSachartre 		/* request data (read) */
19532f5224aeSachartre 		vd_scsi->datain_len -= uscsi->uscsi_resid;
19542f5224aeSachartre 		vd_scsi->dataout_len = 0;
19552f5224aeSachartre 	} else {
19562f5224aeSachartre 		/* request data (write) */
19572f5224aeSachartre 		vd_scsi->datain_len = 0;
19582f5224aeSachartre 		vd_scsi->dataout_len -= uscsi->uscsi_resid;
19592f5224aeSachartre 	}
19602f5224aeSachartre }
19612f5224aeSachartre 
1962690555a1Sachartre static ushort_t
19633c96341aSnarayan vd_lbl2cksum(struct dk_label *label)
19643c96341aSnarayan {
19653c96341aSnarayan 	int	count;
1966690555a1Sachartre 	ushort_t sum, *sp;
19673c96341aSnarayan 
19683c96341aSnarayan 	count =	(sizeof (struct dk_label)) / (sizeof (short)) - 1;
1969690555a1Sachartre 	sp = (ushort_t *)label;
19703c96341aSnarayan 	sum = 0;
19713c96341aSnarayan 	while (count--) {
19723c96341aSnarayan 		sum ^= *sp++;
19733c96341aSnarayan 	}
19743c96341aSnarayan 
19753c96341aSnarayan 	return (sum);
19763c96341aSnarayan }
19773c96341aSnarayan 
197887a7269eSachartre /*
197987a7269eSachartre  * Handle ioctls to a disk slice.
1980205eeb1aSlm66018  *
1981205eeb1aSlm66018  * Return Values
1982205eeb1aSlm66018  *	0	- Indicates that there are no errors in disk operations
1983205eeb1aSlm66018  *	ENOTSUP	- Unknown disk label type or unsupported DKIO ioctl
1984205eeb1aSlm66018  *	EINVAL	- Not enough room to copy the EFI label
1985205eeb1aSlm66018  *
198687a7269eSachartre  */
19871ae08745Sheppo static int
19880a55fbb7Slm66018 vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
19891ae08745Sheppo {
19904bac2208Snarayan 	dk_efi_t *dk_ioc;
1991edcc0754Sachartre 	int rval;
1992edcc0754Sachartre 
1993edcc0754Sachartre 	ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
1994edcc0754Sachartre 
1995edcc0754Sachartre 	if (cmd == DKIOCFLUSHWRITECACHE) {
1996edcc0754Sachartre 		if (vd->file) {
1997edcc0754Sachartre 			return (VOP_FSYNC(vd->file_vnode, FSYNC, kcred, NULL));
1998edcc0754Sachartre 		} else {
1999edcc0754Sachartre 			return (ldi_ioctl(vd->ldi_handle[0], cmd,
2000edcc0754Sachartre 			    (intptr_t)ioctl_arg, vd->open_flags | FKIOCTL,
2001edcc0754Sachartre 			    kcred, &rval));
2002edcc0754Sachartre 		}
2003edcc0754Sachartre 	}
20044bac2208Snarayan 
20054bac2208Snarayan 	switch (vd->vdisk_label) {
20064bac2208Snarayan 
2007edcc0754Sachartre 	/* ioctls for a single slice disk with a VTOC label */
20084bac2208Snarayan 	case VD_DISK_LABEL_VTOC:
20094bac2208Snarayan 
20101ae08745Sheppo 		switch (cmd) {
20111ae08745Sheppo 		case DKIOCGGEOM:
20120a55fbb7Slm66018 			ASSERT(ioctl_arg != NULL);
20130a55fbb7Slm66018 			bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom));
20141ae08745Sheppo 			return (0);
20151ae08745Sheppo 		case DKIOCGVTOC:
20160a55fbb7Slm66018 			ASSERT(ioctl_arg != NULL);
20170a55fbb7Slm66018 			bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc));
20181ae08745Sheppo 			return (0);
201987a7269eSachartre 		default:
20203c96341aSnarayan 			return (ENOTSUP);
202187a7269eSachartre 		}
202287a7269eSachartre 
2023edcc0754Sachartre 	/* ioctls for a single slice disk with an EFI label */
202487a7269eSachartre 	case VD_DISK_LABEL_EFI:
202587a7269eSachartre 
202687a7269eSachartre 		switch (cmd) {
202787a7269eSachartre 		case DKIOCGETEFI:
20283c96341aSnarayan 			ASSERT(ioctl_arg != NULL);
202987a7269eSachartre 			dk_ioc = (dk_efi_t *)ioctl_arg;
2030edcc0754Sachartre 
2031edcc0754Sachartre 			/*
2032edcc0754Sachartre 			 * For a single slice disk with an EFI label, we define
2033edcc0754Sachartre 			 * a fake EFI label with the GPT at LBA 1 and one GPE
2034edcc0754Sachartre 			 * at LBA 2. So we return the GPT or the GPE depending
2035edcc0754Sachartre 			 * on which LBA is requested.
2036edcc0754Sachartre 			 */
2037edcc0754Sachartre 			if (dk_ioc->dki_lba == 1) {
2038edcc0754Sachartre 
2039edcc0754Sachartre 				/* return the EFI GPT */
2040edcc0754Sachartre 				if (dk_ioc->dki_length < sizeof (efi_gpt_t))
204187a7269eSachartre 					return (EINVAL);
2042edcc0754Sachartre 
2043edcc0754Sachartre 				bcopy(&vd->efi_gpt, dk_ioc->dki_data,
2044edcc0754Sachartre 				    sizeof (efi_gpt_t));
2045edcc0754Sachartre 
2046edcc0754Sachartre 				/* also return the GPE if possible */
2047edcc0754Sachartre 				if (dk_ioc->dki_length >= sizeof (efi_gpt_t) +
2048edcc0754Sachartre 				    sizeof (efi_gpe_t)) {
2049edcc0754Sachartre 					bcopy(&vd->efi_gpe, dk_ioc->dki_data +
2050edcc0754Sachartre 					    1, sizeof (efi_gpe_t));
2051edcc0754Sachartre 				}
2052edcc0754Sachartre 
2053edcc0754Sachartre 			} else if (dk_ioc->dki_lba == 2) {
2054edcc0754Sachartre 
2055edcc0754Sachartre 				/* return the EFI GPE */
2056edcc0754Sachartre 				if (dk_ioc->dki_length < sizeof (efi_gpe_t))
2057edcc0754Sachartre 					return (EINVAL);
2058edcc0754Sachartre 
2059edcc0754Sachartre 				bcopy(&vd->efi_gpe, dk_ioc->dki_data,
2060edcc0754Sachartre 				    sizeof (efi_gpe_t));
2061edcc0754Sachartre 
2062edcc0754Sachartre 			} else {
2063edcc0754Sachartre 				return (EINVAL);
2064edcc0754Sachartre 			}
2065edcc0754Sachartre 
206687a7269eSachartre 			return (0);
206787a7269eSachartre 		default:
206887a7269eSachartre 			return (ENOTSUP);
206987a7269eSachartre 		}
207087a7269eSachartre 
207187a7269eSachartre 	default:
2072205eeb1aSlm66018 		/* Unknown disk label type */
207387a7269eSachartre 		return (ENOTSUP);
207487a7269eSachartre 	}
207587a7269eSachartre }
207687a7269eSachartre 
2077edcc0754Sachartre static int
2078edcc0754Sachartre vds_efi_alloc_and_read(vd_t *vd, efi_gpt_t **gpt, efi_gpe_t **gpe)
2079edcc0754Sachartre {
2080edcc0754Sachartre 	vd_efi_dev_t edev;
2081edcc0754Sachartre 	int status;
2082edcc0754Sachartre 
2083edcc0754Sachartre 	VD_EFI_DEV_SET(edev, vd, (vd_efi_ioctl_func)vd_backend_ioctl);
2084edcc0754Sachartre 
2085edcc0754Sachartre 	status = vd_efi_alloc_and_read(&edev, gpt, gpe);
2086edcc0754Sachartre 
2087edcc0754Sachartre 	return (status);
2088edcc0754Sachartre }
2089edcc0754Sachartre 
2090edcc0754Sachartre static void
2091edcc0754Sachartre vds_efi_free(vd_t *vd, efi_gpt_t *gpt, efi_gpe_t *gpe)
2092edcc0754Sachartre {
2093edcc0754Sachartre 	vd_efi_dev_t edev;
2094edcc0754Sachartre 
2095edcc0754Sachartre 	VD_EFI_DEV_SET(edev, vd, (vd_efi_ioctl_func)vd_backend_ioctl);
2096edcc0754Sachartre 
2097edcc0754Sachartre 	vd_efi_free(&edev, gpt, gpe);
2098edcc0754Sachartre }
2099edcc0754Sachartre 
2100edcc0754Sachartre static int
2101edcc0754Sachartre vd_file_validate_efi(vd_t *vd)
2102edcc0754Sachartre {
2103edcc0754Sachartre 	efi_gpt_t *gpt;
2104edcc0754Sachartre 	efi_gpe_t *gpe;
2105edcc0754Sachartre 	int i, nparts, status;
2106edcc0754Sachartre 	struct uuid efi_reserved = EFI_RESERVED;
2107edcc0754Sachartre 
2108edcc0754Sachartre 	if ((status = vds_efi_alloc_and_read(vd, &gpt, &gpe)) != 0)
2109edcc0754Sachartre 		return (status);
2110edcc0754Sachartre 
2111edcc0754Sachartre 	bzero(&vd->vtoc, sizeof (struct vtoc));
2112edcc0754Sachartre 	bzero(&vd->dk_geom, sizeof (struct dk_geom));
2113edcc0754Sachartre 	bzero(vd->slices, sizeof (vd_slice_t) * VD_MAXPART);
2114edcc0754Sachartre 
2115edcc0754Sachartre 	vd->efi_reserved = -1;
2116edcc0754Sachartre 
2117edcc0754Sachartre 	nparts = gpt->efi_gpt_NumberOfPartitionEntries;
2118edcc0754Sachartre 
2119edcc0754Sachartre 	for (i = 0; i < nparts && i < VD_MAXPART; i++) {
2120edcc0754Sachartre 
2121edcc0754Sachartre 		if (gpe[i].efi_gpe_StartingLBA == 0 ||
2122edcc0754Sachartre 		    gpe[i].efi_gpe_EndingLBA == 0) {
2123edcc0754Sachartre 			continue;
2124edcc0754Sachartre 		}
2125edcc0754Sachartre 
2126edcc0754Sachartre 		vd->slices[i].start = gpe[i].efi_gpe_StartingLBA;
2127edcc0754Sachartre 		vd->slices[i].nblocks = gpe[i].efi_gpe_EndingLBA -
2128edcc0754Sachartre 		    gpe[i].efi_gpe_StartingLBA + 1;
2129edcc0754Sachartre 
2130edcc0754Sachartre 		if (bcmp(&gpe[i].efi_gpe_PartitionTypeGUID, &efi_reserved,
2131edcc0754Sachartre 		    sizeof (struct uuid)) == 0)
2132edcc0754Sachartre 			vd->efi_reserved = i;
2133edcc0754Sachartre 
2134edcc0754Sachartre 	}
2135edcc0754Sachartre 
2136edcc0754Sachartre 	ASSERT(vd->vdisk_size != 0);
2137edcc0754Sachartre 	vd->slices[VD_EFI_WD_SLICE].start = 0;
2138edcc0754Sachartre 	vd->slices[VD_EFI_WD_SLICE].nblocks = vd->vdisk_size;
2139edcc0754Sachartre 
2140edcc0754Sachartre 	vds_efi_free(vd, gpt, gpe);
2141edcc0754Sachartre 
2142edcc0754Sachartre 	return (status);
2143edcc0754Sachartre }
2144edcc0754Sachartre 
214587a7269eSachartre /*
214678fcd0a1Sachartre  * Function:
214778fcd0a1Sachartre  *	vd_file_validate_geometry
2148205eeb1aSlm66018  *
214978fcd0a1Sachartre  * Description:
215078fcd0a1Sachartre  *	Read the label and validate the geometry of a disk image. The driver
215178fcd0a1Sachartre  *	label, vtoc and geometry information are updated according to the
215278fcd0a1Sachartre  *	label read from the disk image.
215378fcd0a1Sachartre  *
215478fcd0a1Sachartre  *	If no valid label is found, the label is set to unknown and the
215578fcd0a1Sachartre  *	function returns EINVAL, but a default vtoc and geometry are provided
2156edcc0754Sachartre  *	to the driver. If an EFI label is found, ENOTSUP is returned.
215778fcd0a1Sachartre  *
215878fcd0a1Sachartre  * Parameters:
215978fcd0a1Sachartre  *	vd	- disk on which the operation is performed.
216078fcd0a1Sachartre  *
216178fcd0a1Sachartre  * Return Code:
216278fcd0a1Sachartre  *	0	- success.
216378fcd0a1Sachartre  *	EIO	- error reading the label from the disk image.
216478fcd0a1Sachartre  *	EINVAL	- unknown disk label.
2165edcc0754Sachartre  *	ENOTSUP	- geometry not applicable (EFI label).
216687a7269eSachartre  */
216787a7269eSachartre static int
216878fcd0a1Sachartre vd_file_validate_geometry(vd_t *vd)
216987a7269eSachartre {
217087a7269eSachartre 	struct dk_label label;
217178fcd0a1Sachartre 	struct dk_geom *geom = &vd->dk_geom;
217278fcd0a1Sachartre 	struct vtoc *vtoc = &vd->vtoc;
217378fcd0a1Sachartre 	int i;
217478fcd0a1Sachartre 	int status = 0;
217587a7269eSachartre 
217687a7269eSachartre 	ASSERT(vd->file);
2177edcc0754Sachartre 	ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK);
217887a7269eSachartre 
217987a7269eSachartre 	if (VD_FILE_LABEL_READ(vd, &label) < 0)
218087a7269eSachartre 		return (EIO);
218187a7269eSachartre 
218287a7269eSachartre 	if (label.dkl_magic != DKL_MAGIC ||
218378fcd0a1Sachartre 	    label.dkl_cksum != vd_lbl2cksum(&label) ||
218478fcd0a1Sachartre 	    label.dkl_vtoc.v_sanity != VTOC_SANE ||
218578fcd0a1Sachartre 	    label.dkl_vtoc.v_nparts != V_NUMPAR) {
2186edcc0754Sachartre 
2187edcc0754Sachartre 		if (vd_file_validate_efi(vd) == 0) {
2188edcc0754Sachartre 			vd->vdisk_label = VD_DISK_LABEL_EFI;
2189edcc0754Sachartre 			return (ENOTSUP);
2190edcc0754Sachartre 		}
2191edcc0754Sachartre 
219278fcd0a1Sachartre 		vd->vdisk_label = VD_DISK_LABEL_UNK;
219378fcd0a1Sachartre 		vd_file_build_default_label(vd, &label);
219478fcd0a1Sachartre 		status = EINVAL;
219578fcd0a1Sachartre 	} else {
219678fcd0a1Sachartre 		vd->vdisk_label = VD_DISK_LABEL_VTOC;
219778fcd0a1Sachartre 	}
219887a7269eSachartre 
219978fcd0a1Sachartre 	/* Update the driver geometry */
220087a7269eSachartre 	bzero(geom, sizeof (struct dk_geom));
220178fcd0a1Sachartre 
220287a7269eSachartre 	geom->dkg_ncyl = label.dkl_ncyl;
220387a7269eSachartre 	geom->dkg_acyl = label.dkl_acyl;
220487a7269eSachartre 	geom->dkg_nhead = label.dkl_nhead;
220587a7269eSachartre 	geom->dkg_nsect = label.dkl_nsect;
220687a7269eSachartre 	geom->dkg_intrlv = label.dkl_intrlv;
220787a7269eSachartre 	geom->dkg_apc = label.dkl_apc;
220887a7269eSachartre 	geom->dkg_rpm = label.dkl_rpm;
220987a7269eSachartre 	geom->dkg_pcyl = label.dkl_pcyl;
221087a7269eSachartre 	geom->dkg_write_reinstruct = label.dkl_write_reinstruct;
221187a7269eSachartre 	geom->dkg_read_reinstruct = label.dkl_read_reinstruct;
221287a7269eSachartre 
221378fcd0a1Sachartre 	/* Update the driver vtoc */
221487a7269eSachartre 	bzero(vtoc, sizeof (struct vtoc));
221587a7269eSachartre 
221687a7269eSachartre 	vtoc->v_sanity = label.dkl_vtoc.v_sanity;
221787a7269eSachartre 	vtoc->v_version = label.dkl_vtoc.v_version;
221887a7269eSachartre 	vtoc->v_sectorsz = DEV_BSIZE;
221987a7269eSachartre 	vtoc->v_nparts = label.dkl_vtoc.v_nparts;
222087a7269eSachartre 
222187a7269eSachartre 	for (i = 0; i < vtoc->v_nparts; i++) {
222287a7269eSachartre 		vtoc->v_part[i].p_tag =
222387a7269eSachartre 		    label.dkl_vtoc.v_part[i].p_tag;
222487a7269eSachartre 		vtoc->v_part[i].p_flag =
222587a7269eSachartre 		    label.dkl_vtoc.v_part[i].p_flag;
222687a7269eSachartre 		vtoc->v_part[i].p_start =
222787a7269eSachartre 		    label.dkl_map[i].dkl_cylno *
222887a7269eSachartre 		    (label.dkl_nhead * label.dkl_nsect);
222987a7269eSachartre 		vtoc->v_part[i].p_size = label.dkl_map[i].dkl_nblk;
223087a7269eSachartre 		vtoc->timestamp[i] =
223187a7269eSachartre 		    label.dkl_vtoc.v_timestamp[i];
223287a7269eSachartre 	}
223387a7269eSachartre 	/*
223487a7269eSachartre 	 * The bootinfo array can not be copied with bcopy() because
223587a7269eSachartre 	 * elements are of type long in vtoc (so 64-bit) and of type
223687a7269eSachartre 	 * int in dk_vtoc (so 32-bit).
223787a7269eSachartre 	 */
223887a7269eSachartre 	vtoc->v_bootinfo[0] = label.dkl_vtoc.v_bootinfo[0];
223987a7269eSachartre 	vtoc->v_bootinfo[1] = label.dkl_vtoc.v_bootinfo[1];
224087a7269eSachartre 	vtoc->v_bootinfo[2] = label.dkl_vtoc.v_bootinfo[2];
224187a7269eSachartre 	bcopy(label.dkl_asciilabel, vtoc->v_asciilabel,
224287a7269eSachartre 	    LEN_DKL_ASCII);
224387a7269eSachartre 	bcopy(label.dkl_vtoc.v_volume, vtoc->v_volume,
224487a7269eSachartre 	    LEN_DKL_VVOL);
224587a7269eSachartre 
2246edcc0754Sachartre 	/* Update logical partitions */
2247edcc0754Sachartre 	bzero(vd->slices, sizeof (vd_slice_t) * VD_MAXPART);
2248edcc0754Sachartre 	if (vd->vdisk_label != VD_DISK_LABEL_UNK) {
2249edcc0754Sachartre 		for (i = 0; i < vtoc->v_nparts; i++) {
2250edcc0754Sachartre 			vd->slices[i].start = vtoc->v_part[i].p_start;
2251edcc0754Sachartre 			vd->slices[i].nblocks = vtoc->v_part[i].p_size;
2252edcc0754Sachartre 		}
2253edcc0754Sachartre 	}
2254edcc0754Sachartre 
225578fcd0a1Sachartre 	return (status);
225678fcd0a1Sachartre }
225778fcd0a1Sachartre 
225878fcd0a1Sachartre /*
225978fcd0a1Sachartre  * Handle ioctls to a disk image (file-based).
226078fcd0a1Sachartre  *
226178fcd0a1Sachartre  * Return Values
226278fcd0a1Sachartre  *	0	- Indicates that there are no errors
226378fcd0a1Sachartre  *	!= 0	- Disk operation returned an error
226478fcd0a1Sachartre  */
226578fcd0a1Sachartre static int
226678fcd0a1Sachartre vd_do_file_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
226778fcd0a1Sachartre {
226878fcd0a1Sachartre 	struct dk_label label;
226978fcd0a1Sachartre 	struct dk_geom *geom;
227078fcd0a1Sachartre 	struct vtoc *vtoc;
2271edcc0754Sachartre 	dk_efi_t *efi;
227278fcd0a1Sachartre 	int i, rc;
227378fcd0a1Sachartre 
227478fcd0a1Sachartre 	ASSERT(vd->file);
2275edcc0754Sachartre 	ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK);
227678fcd0a1Sachartre 
227778fcd0a1Sachartre 	switch (cmd) {
227878fcd0a1Sachartre 
227978fcd0a1Sachartre 	case DKIOCGGEOM:
228078fcd0a1Sachartre 		ASSERT(ioctl_arg != NULL);
228178fcd0a1Sachartre 		geom = (struct dk_geom *)ioctl_arg;
228278fcd0a1Sachartre 
228378fcd0a1Sachartre 		rc = vd_file_validate_geometry(vd);
2284edcc0754Sachartre 		if (rc != 0 && rc != EINVAL)
228578fcd0a1Sachartre 			return (rc);
228678fcd0a1Sachartre 		bcopy(&vd->dk_geom, geom, sizeof (struct dk_geom));
228778fcd0a1Sachartre 		return (0);
228878fcd0a1Sachartre 
228978fcd0a1Sachartre 	case DKIOCGVTOC:
229078fcd0a1Sachartre 		ASSERT(ioctl_arg != NULL);
229178fcd0a1Sachartre 		vtoc = (struct vtoc *)ioctl_arg;
229278fcd0a1Sachartre 
229378fcd0a1Sachartre 		rc = vd_file_validate_geometry(vd);
2294edcc0754Sachartre 		if (rc != 0 && rc != EINVAL)
229578fcd0a1Sachartre 			return (rc);
229678fcd0a1Sachartre 		bcopy(&vd->vtoc, vtoc, sizeof (struct vtoc));
229787a7269eSachartre 		return (0);
229887a7269eSachartre 
229987a7269eSachartre 	case DKIOCSGEOM:
230087a7269eSachartre 		ASSERT(ioctl_arg != NULL);
230187a7269eSachartre 		geom = (struct dk_geom *)ioctl_arg;
230287a7269eSachartre 
230387a7269eSachartre 		if (geom->dkg_nhead == 0 || geom->dkg_nsect == 0)
230487a7269eSachartre 			return (EINVAL);
230587a7269eSachartre 
230687a7269eSachartre 		/*
230787a7269eSachartre 		 * The current device geometry is not updated, just the driver
230887a7269eSachartre 		 * "notion" of it. The device geometry will be effectively
230987a7269eSachartre 		 * updated when a label is written to the device during a next
231087a7269eSachartre 		 * DKIOCSVTOC.
231187a7269eSachartre 		 */
231287a7269eSachartre 		bcopy(ioctl_arg, &vd->dk_geom, sizeof (vd->dk_geom));
231387a7269eSachartre 		return (0);
231487a7269eSachartre 
231587a7269eSachartre 	case DKIOCSVTOC:
231687a7269eSachartre 		ASSERT(ioctl_arg != NULL);
231787a7269eSachartre 		ASSERT(vd->dk_geom.dkg_nhead != 0 &&
231887a7269eSachartre 		    vd->dk_geom.dkg_nsect != 0);
2319690555a1Sachartre 		vtoc = (struct vtoc *)ioctl_arg;
2320690555a1Sachartre 
2321690555a1Sachartre 		if (vtoc->v_sanity != VTOC_SANE ||
2322690555a1Sachartre 		    vtoc->v_sectorsz != DEV_BSIZE ||
2323690555a1Sachartre 		    vtoc->v_nparts != V_NUMPAR)
2324690555a1Sachartre 			return (EINVAL);
2325690555a1Sachartre 
2326690555a1Sachartre 		bzero(&label, sizeof (label));
2327690555a1Sachartre 		label.dkl_ncyl = vd->dk_geom.dkg_ncyl;
2328690555a1Sachartre 		label.dkl_acyl = vd->dk_geom.dkg_acyl;
2329690555a1Sachartre 		label.dkl_pcyl = vd->dk_geom.dkg_pcyl;
2330690555a1Sachartre 		label.dkl_nhead = vd->dk_geom.dkg_nhead;
2331690555a1Sachartre 		label.dkl_nsect = vd->dk_geom.dkg_nsect;
2332690555a1Sachartre 		label.dkl_intrlv = vd->dk_geom.dkg_intrlv;
2333690555a1Sachartre 		label.dkl_apc = vd->dk_geom.dkg_apc;
2334690555a1Sachartre 		label.dkl_rpm = vd->dk_geom.dkg_rpm;
233587a7269eSachartre 		label.dkl_write_reinstruct = vd->dk_geom.dkg_write_reinstruct;
233687a7269eSachartre 		label.dkl_read_reinstruct = vd->dk_geom.dkg_read_reinstruct;
2337690555a1Sachartre 
233887a7269eSachartre 		label.dkl_vtoc.v_nparts = V_NUMPAR;
233987a7269eSachartre 		label.dkl_vtoc.v_sanity = VTOC_SANE;
2340690555a1Sachartre 		label.dkl_vtoc.v_version = vtoc->v_version;
234187a7269eSachartre 		for (i = 0; i < V_NUMPAR; i++) {
2342690555a1Sachartre 			label.dkl_vtoc.v_timestamp[i] =
2343690555a1Sachartre 			    vtoc->timestamp[i];
2344690555a1Sachartre 			label.dkl_vtoc.v_part[i].p_tag =
2345690555a1Sachartre 			    vtoc->v_part[i].p_tag;
2346690555a1Sachartre 			label.dkl_vtoc.v_part[i].p_flag =
2347690555a1Sachartre 			    vtoc->v_part[i].p_flag;
2348690555a1Sachartre 			label.dkl_map[i].dkl_cylno =
2349690555a1Sachartre 			    vtoc->v_part[i].p_start /
2350690555a1Sachartre 			    (label.dkl_nhead * label.dkl_nsect);
2351690555a1Sachartre 			label.dkl_map[i].dkl_nblk =
2352690555a1Sachartre 			    vtoc->v_part[i].p_size;
23533c96341aSnarayan 		}
235487a7269eSachartre 		/*
235587a7269eSachartre 		 * The bootinfo array can not be copied with bcopy() because
235687a7269eSachartre 		 * elements are of type long in vtoc (so 64-bit) and of type
235787a7269eSachartre 		 * int in dk_vtoc (so 32-bit).
235887a7269eSachartre 		 */
235987a7269eSachartre 		label.dkl_vtoc.v_bootinfo[0] = vtoc->v_bootinfo[0];
236087a7269eSachartre 		label.dkl_vtoc.v_bootinfo[1] = vtoc->v_bootinfo[1];
236187a7269eSachartre 		label.dkl_vtoc.v_bootinfo[2] = vtoc->v_bootinfo[2];
2362690555a1Sachartre 		bcopy(vtoc->v_asciilabel, label.dkl_asciilabel,
2363690555a1Sachartre 		    LEN_DKL_ASCII);
2364690555a1Sachartre 		bcopy(vtoc->v_volume, label.dkl_vtoc.v_volume,
2365690555a1Sachartre 		    LEN_DKL_VVOL);
23663c96341aSnarayan 
23673c96341aSnarayan 		/* re-compute checksum */
2368690555a1Sachartre 		label.dkl_magic = DKL_MAGIC;
2369690555a1Sachartre 		label.dkl_cksum = vd_lbl2cksum(&label);
2370690555a1Sachartre 
237187a7269eSachartre 		/* write label to the disk image */
237287a7269eSachartre 		if ((rc = vd_file_set_vtoc(vd, &label)) != 0)
237387a7269eSachartre 			return (rc);
2374690555a1Sachartre 
2375edcc0754Sachartre 		break;
2376edcc0754Sachartre 
2377edcc0754Sachartre 	case DKIOCFLUSHWRITECACHE:
2378edcc0754Sachartre 		return (VOP_FSYNC(vd->file_vnode, FSYNC, kcred, NULL));
2379edcc0754Sachartre 
2380edcc0754Sachartre 	case DKIOCGETEFI:
2381edcc0754Sachartre 		ASSERT(ioctl_arg != NULL);
2382edcc0754Sachartre 		efi = (dk_efi_t *)ioctl_arg;
2383edcc0754Sachartre 
2384edcc0754Sachartre 		if (vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD,
2385edcc0754Sachartre 		    (caddr_t)efi->dki_data, efi->dki_lba, efi->dki_length) < 0)
2386edcc0754Sachartre 			return (EIO);
2387edcc0754Sachartre 
2388edcc0754Sachartre 		return (0);
2389edcc0754Sachartre 
2390edcc0754Sachartre 	case DKIOCSETEFI:
2391edcc0754Sachartre 		ASSERT(ioctl_arg != NULL);
2392edcc0754Sachartre 		efi = (dk_efi_t *)ioctl_arg;
2393edcc0754Sachartre 
2394edcc0754Sachartre 		if (vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BWRITE,
2395edcc0754Sachartre 		    (caddr_t)efi->dki_data, efi->dki_lba, efi->dki_length) < 0)
2396edcc0754Sachartre 			return (EIO);
2397edcc0754Sachartre 
2398edcc0754Sachartre 		break;
2399edcc0754Sachartre 
2400edcc0754Sachartre 
2401edcc0754Sachartre 	default:
2402edcc0754Sachartre 		return (ENOTSUP);
2403edcc0754Sachartre 	}
2404edcc0754Sachartre 
2405edcc0754Sachartre 	ASSERT(cmd == DKIOCSVTOC || cmd == DKIOCSETEFI);
2406edcc0754Sachartre 
2407edcc0754Sachartre 	/* label has changed, revalidate the geometry */
2408edcc0754Sachartre 	(void) vd_file_validate_geometry(vd);
24093c96341aSnarayan 
241087a7269eSachartre 	/*
241187a7269eSachartre 	 * The disk geometry may have changed, so we need to write
241287a7269eSachartre 	 * the devid (if there is one) so that it is stored at the
241387a7269eSachartre 	 * right location.
241487a7269eSachartre 	 */
2415edcc0754Sachartre 	if (vd_file_write_devid(vd, vd->file_devid) != 0) {
241687a7269eSachartre 		PR0("Fail to write devid");
24171ae08745Sheppo 	}
24184bac2208Snarayan 
24194bac2208Snarayan 	return (0);
24204bac2208Snarayan }
2421edcc0754Sachartre 
2422edcc0754Sachartre static int
2423edcc0754Sachartre vd_backend_ioctl(vd_t *vd, int cmd, caddr_t arg)
2424edcc0754Sachartre {
2425edcc0754Sachartre 	int rval = 0, status;
2426edcc0754Sachartre 
2427edcc0754Sachartre 	/*
2428edcc0754Sachartre 	 * Call the appropriate function to execute the ioctl depending
2429edcc0754Sachartre 	 * on the type of vdisk.
2430edcc0754Sachartre 	 */
2431edcc0754Sachartre 	if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
2432edcc0754Sachartre 
2433edcc0754Sachartre 		/* slice, file or volume exported as a single slice disk */
2434edcc0754Sachartre 		status = vd_do_slice_ioctl(vd, cmd, arg);
2435edcc0754Sachartre 
2436edcc0754Sachartre 	} else if (vd->file) {
2437edcc0754Sachartre 
2438edcc0754Sachartre 		/* file or volume exported as a full disk */
2439edcc0754Sachartre 		status = vd_do_file_ioctl(vd, cmd, arg);
2440edcc0754Sachartre 
2441edcc0754Sachartre 	} else {
2442edcc0754Sachartre 
2443edcc0754Sachartre 		/* disk device exported as a full disk */
2444edcc0754Sachartre 		status = ldi_ioctl(vd->ldi_handle[0], cmd, (intptr_t)arg,
2445edcc0754Sachartre 		    vd->open_flags | FKIOCTL, kcred, &rval);
2446edcc0754Sachartre 	}
2447edcc0754Sachartre 
2448edcc0754Sachartre #ifdef DEBUG
2449edcc0754Sachartre 	if (rval != 0) {
2450edcc0754Sachartre 		PR0("ioctl %x set rval = %d, which is not being returned"
2451edcc0754Sachartre 		    " to caller", cmd, rval);
2452edcc0754Sachartre 	}
2453edcc0754Sachartre #endif /* DEBUG */
2454edcc0754Sachartre 
2455edcc0754Sachartre 	return (status);
24561ae08745Sheppo }
24571ae08745Sheppo 
2458205eeb1aSlm66018 /*
2459205eeb1aSlm66018  * Description:
2460205eeb1aSlm66018  *	This is the function that processes the ioctl requests (farming it
2461205eeb1aSlm66018  *	out to functions that handle slices, files or whole disks)
2462205eeb1aSlm66018  *
2463205eeb1aSlm66018  * Return Values
2464205eeb1aSlm66018  *     0		- ioctl operation completed successfully
2465205eeb1aSlm66018  *     != 0		- The LDC error value encountered
2466205eeb1aSlm66018  *			  (propagated back up the call stack as a task error)
2467205eeb1aSlm66018  *
2468205eeb1aSlm66018  * Side Effect
2469205eeb1aSlm66018  *     sets request->status to the return value of the ioctl function.
2470205eeb1aSlm66018  */
24711ae08745Sheppo static int
24720a55fbb7Slm66018 vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl)
24731ae08745Sheppo {
2474edcc0754Sachartre 	int	status = 0;
24751ae08745Sheppo 	size_t	nbytes = request->nbytes;	/* modifiable copy */
24761ae08745Sheppo 
24771ae08745Sheppo 
24781ae08745Sheppo 	ASSERT(request->slice < vd->nslices);
24791ae08745Sheppo 	PR0("Performing %s", ioctl->operation_name);
24801ae08745Sheppo 
24810a55fbb7Slm66018 	/* Get data from client and convert, if necessary */
24820a55fbb7Slm66018 	if (ioctl->copyin != NULL)  {
24831ae08745Sheppo 		ASSERT(nbytes != 0 && buf != NULL);
24841ae08745Sheppo 		PR1("Getting \"arg\" data from client");
24851ae08745Sheppo 		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
24861ae08745Sheppo 		    request->cookie, request->ncookies,
24871ae08745Sheppo 		    LDC_COPY_IN)) != 0) {
24883af08d82Slm66018 			PR0("ldc_mem_copy() returned errno %d "
24891ae08745Sheppo 			    "copying from client", status);
24901ae08745Sheppo 			return (status);
24911ae08745Sheppo 		}
24920a55fbb7Slm66018 
24930a55fbb7Slm66018 		/* Convert client's data, if necessary */
24942f5224aeSachartre 		if (ioctl->copyin == VD_IDENTITY_IN) {
24952f5224aeSachartre 			/* use client buffer */
24960a55fbb7Slm66018 			ioctl->arg = buf;
24972f5224aeSachartre 		} else {
24982f5224aeSachartre 			/* convert client vdisk operation data to ioctl data */
24992f5224aeSachartre 			status = (ioctl->copyin)(buf, nbytes,
25002f5224aeSachartre 			    (void *)ioctl->arg);
25012f5224aeSachartre 			if (status != 0) {
25022f5224aeSachartre 				request->status = status;
25032f5224aeSachartre 				return (0);
25042f5224aeSachartre 			}
25052f5224aeSachartre 		}
25062f5224aeSachartre 	}
25072f5224aeSachartre 
25082f5224aeSachartre 	if (ioctl->operation == VD_OP_SCSICMD) {
25092f5224aeSachartre 		struct uscsi_cmd *uscsi = (struct uscsi_cmd *)ioctl->arg;
25102f5224aeSachartre 
25112f5224aeSachartre 		/* check write permission */
25122f5224aeSachartre 		if (!(vd->open_flags & FWRITE) &&
25132f5224aeSachartre 		    !(uscsi->uscsi_flags & USCSI_READ)) {
25142f5224aeSachartre 			PR0("uscsi fails because backend is opened read-only");
25152f5224aeSachartre 			request->status = EROFS;
25162f5224aeSachartre 			return (0);
25172f5224aeSachartre 		}
25181ae08745Sheppo 	}
25191ae08745Sheppo 
25201ae08745Sheppo 	/*
2521edcc0754Sachartre 	 * Send the ioctl to the disk backend.
25221ae08745Sheppo 	 */
2523edcc0754Sachartre 	request->status = vd_backend_ioctl(vd, ioctl->cmd, ioctl->arg);
2524205eeb1aSlm66018 
2525205eeb1aSlm66018 	if (request->status != 0) {
2526205eeb1aSlm66018 		PR0("ioctl(%s) = errno %d", ioctl->cmd_name, request->status);
25272f5224aeSachartre 		if (ioctl->operation == VD_OP_SCSICMD &&
25282f5224aeSachartre 		    ((struct uscsi_cmd *)ioctl->arg)->uscsi_status != 0)
25292f5224aeSachartre 			/*
25302f5224aeSachartre 			 * USCSICMD has reported an error and the uscsi_status
25312f5224aeSachartre 			 * field is not zero. This means that the SCSI command
25322f5224aeSachartre 			 * has completed but it has an error. So we should
25332f5224aeSachartre 			 * mark the VD operation has succesfully completed
25342f5224aeSachartre 			 * and clients can check the SCSI status field for
25352f5224aeSachartre 			 * SCSI errors.
25362f5224aeSachartre 			 */
25372f5224aeSachartre 			request->status = 0;
25382f5224aeSachartre 		else
2539205eeb1aSlm66018 			return (0);
2540205eeb1aSlm66018 	}
25411ae08745Sheppo 
25420a55fbb7Slm66018 	/* Convert data and send to client, if necessary */
25430a55fbb7Slm66018 	if (ioctl->copyout != NULL)  {
25441ae08745Sheppo 		ASSERT(nbytes != 0 && buf != NULL);
25451ae08745Sheppo 		PR1("Sending \"arg\" data to client");
25460a55fbb7Slm66018 
25470a55fbb7Slm66018 		/* Convert ioctl data to vdisk operation data, if necessary */
25482f5224aeSachartre 		if (ioctl->copyout != VD_IDENTITY_OUT)
25490a55fbb7Slm66018 			(ioctl->copyout)((void *)ioctl->arg, buf);
25500a55fbb7Slm66018 
25511ae08745Sheppo 		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
25521ae08745Sheppo 		    request->cookie, request->ncookies,
25531ae08745Sheppo 		    LDC_COPY_OUT)) != 0) {
25543af08d82Slm66018 			PR0("ldc_mem_copy() returned errno %d "
25551ae08745Sheppo 			    "copying to client", status);
25561ae08745Sheppo 			return (status);
25571ae08745Sheppo 		}
25581ae08745Sheppo 	}
25591ae08745Sheppo 
25601ae08745Sheppo 	return (status);
25611ae08745Sheppo }
25621ae08745Sheppo 
/*
 * Size of a type or expression rounded up with P2ROUNDUP() to a multiple
 * of sizeof (uint64_t). It is used for the sizes of the ioctl arguments
 * exchanged with the client, because LDC memory operations require 8-byte
 * multiples.
 */
#define	RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t))
2564205eeb1aSlm66018 
2565205eeb1aSlm66018 /*
2566205eeb1aSlm66018  * Description:
2567205eeb1aSlm66018  *	This generic function is called by the task queue to complete
2568205eeb1aSlm66018  *	the processing of the tasks. The specific completion function
2569205eeb1aSlm66018  *	is passed in as a field in the task pointer.
2570205eeb1aSlm66018  *
2571205eeb1aSlm66018  * Parameters:
2572205eeb1aSlm66018  *	arg 	- opaque pointer to structure containing task to be completed
2573205eeb1aSlm66018  *
2574205eeb1aSlm66018  * Return Values
2575205eeb1aSlm66018  *	None
2576205eeb1aSlm66018  */
2577205eeb1aSlm66018 static void
2578205eeb1aSlm66018 vd_complete(void *arg)
2579205eeb1aSlm66018 {
2580205eeb1aSlm66018 	vd_task_t	*task = (vd_task_t *)arg;
2581205eeb1aSlm66018 
2582205eeb1aSlm66018 	ASSERT(task != NULL);
2583205eeb1aSlm66018 	ASSERT(task->status == EINPROGRESS);
2584205eeb1aSlm66018 	ASSERT(task->completef != NULL);
2585205eeb1aSlm66018 
2586205eeb1aSlm66018 	task->status = task->completef(task);
2587205eeb1aSlm66018 	if (task->status)
2588205eeb1aSlm66018 		PR0("%s: Error %d completing task", __func__, task->status);
2589205eeb1aSlm66018 
2590205eeb1aSlm66018 	/* Now notify the vDisk client */
2591205eeb1aSlm66018 	vd_complete_notify(task);
2592205eeb1aSlm66018 }
2593205eeb1aSlm66018 
25941ae08745Sheppo static int
2595d10e4ef2Snarayan vd_ioctl(vd_task_t *task)
25961ae08745Sheppo {
259787a7269eSachartre 	int			i, status;
25981ae08745Sheppo 	void			*buf = NULL;
25990a55fbb7Slm66018 	struct dk_geom		dk_geom = {0};
26000a55fbb7Slm66018 	struct vtoc		vtoc = {0};
26014bac2208Snarayan 	struct dk_efi		dk_efi = {0};
26022f5224aeSachartre 	struct uscsi_cmd	uscsi = {0};
2603d10e4ef2Snarayan 	vd_t			*vd		= task->vd;
2604d10e4ef2Snarayan 	vd_dring_payload_t	*request	= task->request;
26050a55fbb7Slm66018 	vd_ioctl_t		ioctl[] = {
26060a55fbb7Slm66018 		/* Command (no-copy) operations */
26070a55fbb7Slm66018 		{VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0,
26080a55fbb7Slm66018 		    DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE),
2609047ba61eSachartre 		    NULL, NULL, NULL, B_TRUE},
26100a55fbb7Slm66018 
26110a55fbb7Slm66018 		/* "Get" (copy-out) operations */
26120a55fbb7Slm66018 		{VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int),
26130a55fbb7Slm66018 		    DKIOCGETWCE, STRINGIZE(DKIOCGETWCE),
26142f5224aeSachartre 		    NULL, VD_IDENTITY_IN, VD_IDENTITY_OUT, B_FALSE},
26150a55fbb7Slm66018 		{VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM),
26160a55fbb7Slm66018 		    RNDSIZE(vd_geom_t),
26170a55fbb7Slm66018 		    DKIOCGGEOM, STRINGIZE(DKIOCGGEOM),
2618047ba61eSachartre 		    &dk_geom, NULL, dk_geom2vd_geom, B_FALSE},
26190a55fbb7Slm66018 		{VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t),
26200a55fbb7Slm66018 		    DKIOCGVTOC, STRINGIZE(DKIOCGVTOC),
2621047ba61eSachartre 		    &vtoc, NULL, vtoc2vd_vtoc, B_FALSE},
26224bac2208Snarayan 		{VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t),
26234bac2208Snarayan 		    DKIOCGETEFI, STRINGIZE(DKIOCGETEFI),
2624047ba61eSachartre 		    &dk_efi, vd_get_efi_in, vd_get_efi_out, B_FALSE},
26250a55fbb7Slm66018 
26260a55fbb7Slm66018 		/* "Set" (copy-in) operations */
26270a55fbb7Slm66018 		{VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int),
26280a55fbb7Slm66018 		    DKIOCSETWCE, STRINGIZE(DKIOCSETWCE),
26292f5224aeSachartre 		    NULL, VD_IDENTITY_IN, VD_IDENTITY_OUT, B_TRUE},
26300a55fbb7Slm66018 		{VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM),
26310a55fbb7Slm66018 		    RNDSIZE(vd_geom_t),
26320a55fbb7Slm66018 		    DKIOCSGEOM, STRINGIZE(DKIOCSGEOM),
2633047ba61eSachartre 		    &dk_geom, vd_geom2dk_geom, NULL, B_TRUE},
26340a55fbb7Slm66018 		{VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t),
26350a55fbb7Slm66018 		    DKIOCSVTOC, STRINGIZE(DKIOCSVTOC),
2636047ba61eSachartre 		    &vtoc, vd_vtoc2vtoc, NULL, B_TRUE},
26374bac2208Snarayan 		{VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t),
26384bac2208Snarayan 		    DKIOCSETEFI, STRINGIZE(DKIOCSETEFI),
2639047ba61eSachartre 		    &dk_efi, vd_set_efi_in, vd_set_efi_out, B_TRUE},
26402f5224aeSachartre 
26412f5224aeSachartre 		{VD_OP_SCSICMD, STRINGIZE(VD_OP_SCSICMD), RNDSIZE(vd_scsi_t),
26422f5224aeSachartre 		    USCSICMD, STRINGIZE(USCSICMD),
26432f5224aeSachartre 		    &uscsi, vd_scsicmd_in, vd_scsicmd_out, B_FALSE},
26440a55fbb7Slm66018 	};
26451ae08745Sheppo 	size_t		nioctls = (sizeof (ioctl))/(sizeof (ioctl[0]));
26461ae08745Sheppo 
26471ae08745Sheppo 
2648d10e4ef2Snarayan 	ASSERT(vd != NULL);
2649d10e4ef2Snarayan 	ASSERT(request != NULL);
26501ae08745Sheppo 	ASSERT(request->slice < vd->nslices);
26511ae08745Sheppo 
26521ae08745Sheppo 	/*
26531ae08745Sheppo 	 * Determine ioctl corresponding to caller's "operation" and
26541ae08745Sheppo 	 * validate caller's "nbytes"
26551ae08745Sheppo 	 */
26561ae08745Sheppo 	for (i = 0; i < nioctls; i++) {
26571ae08745Sheppo 		if (request->operation == ioctl[i].operation) {
26580a55fbb7Slm66018 			/* LDC memory operations require 8-byte multiples */
26590a55fbb7Slm66018 			ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0);
26600a55fbb7Slm66018 
26614bac2208Snarayan 			if (request->operation == VD_OP_GET_EFI ||
26622f5224aeSachartre 			    request->operation == VD_OP_SET_EFI ||
26632f5224aeSachartre 			    request->operation == VD_OP_SCSICMD) {
26644bac2208Snarayan 				if (request->nbytes >= ioctl[i].nbytes)
26654bac2208Snarayan 					break;
26663af08d82Slm66018 				PR0("%s:  Expected at least nbytes = %lu, "
26674bac2208Snarayan 				    "got %lu", ioctl[i].operation_name,
26684bac2208Snarayan 				    ioctl[i].nbytes, request->nbytes);
26694bac2208Snarayan 				return (EINVAL);
26704bac2208Snarayan 			}
26714bac2208Snarayan 
26720a55fbb7Slm66018 			if (request->nbytes != ioctl[i].nbytes) {
26733af08d82Slm66018 				PR0("%s:  Expected nbytes = %lu, got %lu",
26740a55fbb7Slm66018 				    ioctl[i].operation_name, ioctl[i].nbytes,
26750a55fbb7Slm66018 				    request->nbytes);
26761ae08745Sheppo 				return (EINVAL);
26771ae08745Sheppo 			}
26781ae08745Sheppo 
26791ae08745Sheppo 			break;
26801ae08745Sheppo 		}
26811ae08745Sheppo 	}
26821ae08745Sheppo 	ASSERT(i < nioctls);	/* because "operation" already validated */
26831ae08745Sheppo 
2684047ba61eSachartre 	if (!(vd->open_flags & FWRITE) && ioctl[i].write) {
2685047ba61eSachartre 		PR0("%s fails because backend is opened read-only",
2686047ba61eSachartre 		    ioctl[i].operation_name);
2687047ba61eSachartre 		request->status = EROFS;
2688047ba61eSachartre 		return (0);
2689047ba61eSachartre 	}
2690047ba61eSachartre 
26911ae08745Sheppo 	if (request->nbytes)
26921ae08745Sheppo 		buf = kmem_zalloc(request->nbytes, KM_SLEEP);
26931ae08745Sheppo 	status = vd_do_ioctl(vd, request, buf, &ioctl[i]);
26941ae08745Sheppo 	if (request->nbytes)
26951ae08745Sheppo 		kmem_free(buf, request->nbytes);
269687a7269eSachartre 
26971ae08745Sheppo 	return (status);
26981ae08745Sheppo }
26991ae08745Sheppo 
27004bac2208Snarayan static int
27014bac2208Snarayan vd_get_devid(vd_task_t *task)
27024bac2208Snarayan {
27034bac2208Snarayan 	vd_t *vd = task->vd;
27044bac2208Snarayan 	vd_dring_payload_t *request = task->request;
27054bac2208Snarayan 	vd_devid_t *vd_devid;
27064bac2208Snarayan 	impl_devid_t *devid;
270787a7269eSachartre 	int status, bufid_len, devid_len, len, sz;
27083af08d82Slm66018 	int bufbytes;
27094bac2208Snarayan 
27103af08d82Slm66018 	PR1("Get Device ID, nbytes=%ld", request->nbytes);
27114bac2208Snarayan 
27123c96341aSnarayan 	if (vd->file) {
271387a7269eSachartre 		if (vd->file_devid == NULL) {
27143af08d82Slm66018 			PR2("No Device ID");
2715205eeb1aSlm66018 			request->status = ENOENT;
2716205eeb1aSlm66018 			return (0);
271787a7269eSachartre 		} else {
271887a7269eSachartre 			sz = ddi_devid_sizeof(vd->file_devid);
271987a7269eSachartre 			devid = kmem_alloc(sz, KM_SLEEP);
272087a7269eSachartre 			bcopy(vd->file_devid, devid, sz);
272187a7269eSachartre 		}
272287a7269eSachartre 	} else {
272387a7269eSachartre 		if (ddi_lyr_get_devid(vd->dev[request->slice],
272487a7269eSachartre 		    (ddi_devid_t *)&devid) != DDI_SUCCESS) {
272587a7269eSachartre 			PR2("No Device ID");
2726205eeb1aSlm66018 			request->status = ENOENT;
2727205eeb1aSlm66018 			return (0);
272887a7269eSachartre 		}
27294bac2208Snarayan 	}
27304bac2208Snarayan 
27314bac2208Snarayan 	bufid_len = request->nbytes - sizeof (vd_devid_t) + 1;
27324bac2208Snarayan 	devid_len = DEVID_GETLEN(devid);
27334bac2208Snarayan 
27343af08d82Slm66018 	/*
27353af08d82Slm66018 	 * Save the buffer size here for use in deallocation.
27363af08d82Slm66018 	 * The actual number of bytes copied is returned in
27373af08d82Slm66018 	 * the 'nbytes' field of the request structure.
27383af08d82Slm66018 	 */
27393af08d82Slm66018 	bufbytes = request->nbytes;
27403af08d82Slm66018 
27413af08d82Slm66018 	vd_devid = kmem_zalloc(bufbytes, KM_SLEEP);
27424bac2208Snarayan 	vd_devid->length = devid_len;
27434bac2208Snarayan 	vd_devid->type = DEVID_GETTYPE(devid);
27444bac2208Snarayan 
27454bac2208Snarayan 	len = (devid_len > bufid_len)? bufid_len : devid_len;
27464bac2208Snarayan 
27474bac2208Snarayan 	bcopy(devid->did_id, vd_devid->id, len);
27484bac2208Snarayan 
274978fcd0a1Sachartre 	request->status = 0;
275078fcd0a1Sachartre 
27514bac2208Snarayan 	/* LDC memory operations require 8-byte multiples */
27524bac2208Snarayan 	ASSERT(request->nbytes % sizeof (uint64_t) == 0);
27534bac2208Snarayan 
27544bac2208Snarayan 	if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0,
27554bac2208Snarayan 	    &request->nbytes, request->cookie, request->ncookies,
27564bac2208Snarayan 	    LDC_COPY_OUT)) != 0) {
27573af08d82Slm66018 		PR0("ldc_mem_copy() returned errno %d copying to client",
27584bac2208Snarayan 		    status);
27594bac2208Snarayan 	}
27603af08d82Slm66018 	PR1("post mem_copy: nbytes=%ld", request->nbytes);
27614bac2208Snarayan 
27623af08d82Slm66018 	kmem_free(vd_devid, bufbytes);
27634bac2208Snarayan 	ddi_devid_free((ddi_devid_t)devid);
27644bac2208Snarayan 
27654bac2208Snarayan 	return (status);
27664bac2208Snarayan }
27674bac2208Snarayan 
27682f5224aeSachartre static int
27692f5224aeSachartre vd_scsi_reset(vd_t *vd)
27702f5224aeSachartre {
27712f5224aeSachartre 	int rval, status;
27722f5224aeSachartre 	struct uscsi_cmd uscsi = { 0 };
27732f5224aeSachartre 
27742f5224aeSachartre 	uscsi.uscsi_flags = vd_scsi_debug | USCSI_RESET;
27752f5224aeSachartre 	uscsi.uscsi_timeout = vd_scsi_rdwr_timeout;
27762f5224aeSachartre 
27772f5224aeSachartre 	status = ldi_ioctl(vd->ldi_handle[0], USCSICMD, (intptr_t)&uscsi,
27782f5224aeSachartre 	    (vd->open_flags | FKIOCTL), kcred, &rval);
27792f5224aeSachartre 
27802f5224aeSachartre 	return (status);
27812f5224aeSachartre }
27822f5224aeSachartre 
27832f5224aeSachartre static int
27842f5224aeSachartre vd_reset(vd_task_t *task)
27852f5224aeSachartre {
27862f5224aeSachartre 	vd_t *vd = task->vd;
27872f5224aeSachartre 	vd_dring_payload_t *request = task->request;
27882f5224aeSachartre 
27892f5224aeSachartre 	ASSERT(request->operation == VD_OP_RESET);
27902f5224aeSachartre 	ASSERT(vd->scsi);
27912f5224aeSachartre 
27922f5224aeSachartre 	PR0("Performing VD_OP_RESET");
27932f5224aeSachartre 
27942f5224aeSachartre 	if (request->nbytes != 0) {
27952f5224aeSachartre 		PR0("VD_OP_RESET:  Expected nbytes = 0, got %lu",
27962f5224aeSachartre 		    request->nbytes);
27972f5224aeSachartre 		return (EINVAL);
27982f5224aeSachartre 	}
27992f5224aeSachartre 
28002f5224aeSachartre 	request->status = vd_scsi_reset(vd);
28012f5224aeSachartre 
28022f5224aeSachartre 	return (0);
28032f5224aeSachartre }
28042f5224aeSachartre 
28052f5224aeSachartre static int
28062f5224aeSachartre vd_get_capacity(vd_task_t *task)
28072f5224aeSachartre {
28082f5224aeSachartre 	int rv;
28092f5224aeSachartre 	size_t nbytes;
28102f5224aeSachartre 	vd_t *vd = task->vd;
28112f5224aeSachartre 	vd_dring_payload_t *request = task->request;
28122f5224aeSachartre 	vd_capacity_t vd_cap = { 0 };
28132f5224aeSachartre 
28142f5224aeSachartre 	ASSERT(request->operation == VD_OP_GET_CAPACITY);
28152f5224aeSachartre 	ASSERT(vd->scsi);
28162f5224aeSachartre 
28172f5224aeSachartre 	PR0("Performing VD_OP_GET_CAPACITY");
28182f5224aeSachartre 
28192f5224aeSachartre 	nbytes = request->nbytes;
28202f5224aeSachartre 
28212f5224aeSachartre 	if (nbytes != RNDSIZE(vd_capacity_t)) {
28222f5224aeSachartre 		PR0("VD_OP_GET_CAPACITY:  Expected nbytes = %lu, got %lu",
28232f5224aeSachartre 		    RNDSIZE(vd_capacity_t), nbytes);
28242f5224aeSachartre 		return (EINVAL);
28252f5224aeSachartre 	}
28262f5224aeSachartre 
28272f5224aeSachartre 	if (vd->vdisk_size == VD_SIZE_UNKNOWN) {
28282f5224aeSachartre 		if (vd_setup_mediainfo(vd) != 0)
28292f5224aeSachartre 			ASSERT(vd->vdisk_size == VD_SIZE_UNKNOWN);
28302f5224aeSachartre 	}
28312f5224aeSachartre 
28322f5224aeSachartre 	ASSERT(vd->vdisk_size != 0);
28332f5224aeSachartre 
28342f5224aeSachartre 	request->status = 0;
28352f5224aeSachartre 
28362f5224aeSachartre 	vd_cap.vdisk_block_size = vd->vdisk_block_size;
28372f5224aeSachartre 	vd_cap.vdisk_size = vd->vdisk_size;
28382f5224aeSachartre 
28392f5224aeSachartre 	if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&vd_cap, 0, &nbytes,
28402f5224aeSachartre 	    request->cookie, request->ncookies, LDC_COPY_OUT)) != 0) {
28412f5224aeSachartre 		PR0("ldc_mem_copy() returned errno %d copying to client", rv);
28422f5224aeSachartre 		return (rv);
28432f5224aeSachartre 	}
28442f5224aeSachartre 
28452f5224aeSachartre 	return (0);
28462f5224aeSachartre }
28472f5224aeSachartre 
28482f5224aeSachartre static int
28492f5224aeSachartre vd_get_access(vd_task_t *task)
28502f5224aeSachartre {
28512f5224aeSachartre 	uint64_t access;
28522f5224aeSachartre 	int rv, rval = 0;
28532f5224aeSachartre 	size_t nbytes;
28542f5224aeSachartre 	vd_t *vd = task->vd;
28552f5224aeSachartre 	vd_dring_payload_t *request = task->request;
28562f5224aeSachartre 
28572f5224aeSachartre 	ASSERT(request->operation == VD_OP_GET_ACCESS);
28582f5224aeSachartre 	ASSERT(vd->scsi);
28592f5224aeSachartre 
28602f5224aeSachartre 	PR0("Performing VD_OP_GET_ACCESS");
28612f5224aeSachartre 
28622f5224aeSachartre 	nbytes = request->nbytes;
28632f5224aeSachartre 
28642f5224aeSachartre 	if (nbytes != sizeof (uint64_t)) {
28652f5224aeSachartre 		PR0("VD_OP_GET_ACCESS:  Expected nbytes = %lu, got %lu",
28662f5224aeSachartre 		    sizeof (uint64_t), nbytes);
28672f5224aeSachartre 		return (EINVAL);
28682f5224aeSachartre 	}
28692f5224aeSachartre 
28702f5224aeSachartre 	request->status = ldi_ioctl(vd->ldi_handle[request->slice], MHIOCSTATUS,
28712f5224aeSachartre 	    NULL, (vd->open_flags | FKIOCTL), kcred, &rval);
28722f5224aeSachartre 
28732f5224aeSachartre 	if (request->status != 0)
28742f5224aeSachartre 		return (0);
28752f5224aeSachartre 
28762f5224aeSachartre 	access = (rval == 0)? VD_ACCESS_ALLOWED : VD_ACCESS_DENIED;
28772f5224aeSachartre 
28782f5224aeSachartre 	if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&access, 0, &nbytes,
28792f5224aeSachartre 	    request->cookie, request->ncookies, LDC_COPY_OUT)) != 0) {
28802f5224aeSachartre 		PR0("ldc_mem_copy() returned errno %d copying to client", rv);
28812f5224aeSachartre 		return (rv);
28822f5224aeSachartre 	}
28832f5224aeSachartre 
28842f5224aeSachartre 	return (0);
28852f5224aeSachartre }
28862f5224aeSachartre 
28872f5224aeSachartre static int
28882f5224aeSachartre vd_set_access(vd_task_t *task)
28892f5224aeSachartre {
28902f5224aeSachartre 	uint64_t flags;
28912f5224aeSachartre 	int rv, rval;
28922f5224aeSachartre 	size_t nbytes;
28932f5224aeSachartre 	vd_t *vd = task->vd;
28942f5224aeSachartre 	vd_dring_payload_t *request = task->request;
28952f5224aeSachartre 
28962f5224aeSachartre 	ASSERT(request->operation == VD_OP_SET_ACCESS);
28972f5224aeSachartre 	ASSERT(vd->scsi);
28982f5224aeSachartre 
28992f5224aeSachartre 	nbytes = request->nbytes;
29002f5224aeSachartre 
29012f5224aeSachartre 	if (nbytes != sizeof (uint64_t)) {
29022f5224aeSachartre 		PR0("VD_OP_SET_ACCESS:  Expected nbytes = %lu, got %lu",
29032f5224aeSachartre 		    sizeof (uint64_t), nbytes);
29042f5224aeSachartre 		return (EINVAL);
29052f5224aeSachartre 	}
29062f5224aeSachartre 
29072f5224aeSachartre 	if ((rv = ldc_mem_copy(vd->ldc_handle, (char *)&flags, 0, &nbytes,
29082f5224aeSachartre 	    request->cookie, request->ncookies, LDC_COPY_IN)) != 0) {
29092f5224aeSachartre 		PR0("ldc_mem_copy() returned errno %d copying from client", rv);
29102f5224aeSachartre 		return (rv);
29112f5224aeSachartre 	}
29122f5224aeSachartre 
29132f5224aeSachartre 	if (flags == VD_ACCESS_SET_CLEAR) {
29142f5224aeSachartre 		PR0("Performing VD_OP_SET_ACCESS (CLEAR)");
29152f5224aeSachartre 		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
29162f5224aeSachartre 		    MHIOCRELEASE, NULL, (vd->open_flags | FKIOCTL), kcred,
29172f5224aeSachartre 		    &rval);
29182f5224aeSachartre 		if (request->status == 0)
29192f5224aeSachartre 			vd->ownership = B_FALSE;
29202f5224aeSachartre 		return (0);
29212f5224aeSachartre 	}
29222f5224aeSachartre 
29232f5224aeSachartre 	/*
29242f5224aeSachartre 	 * As per the VIO spec, the PREEMPT and PRESERVE flags are only valid
29252f5224aeSachartre 	 * when the EXCLUSIVE flag is set.
29262f5224aeSachartre 	 */
29272f5224aeSachartre 	if (!(flags & VD_ACCESS_SET_EXCLUSIVE)) {
29282f5224aeSachartre 		PR0("Invalid VD_OP_SET_ACCESS flags: 0x%lx", flags);
29292f5224aeSachartre 		request->status = EINVAL;
29302f5224aeSachartre 		return (0);
29312f5224aeSachartre 	}
29322f5224aeSachartre 
29332f5224aeSachartre 	switch (flags & (VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE)) {
29342f5224aeSachartre 
29352f5224aeSachartre 	case VD_ACCESS_SET_PREEMPT | VD_ACCESS_SET_PRESERVE:
29362f5224aeSachartre 		/*
29372f5224aeSachartre 		 * Flags EXCLUSIVE and PREEMPT and PRESERVE. We have to
29382f5224aeSachartre 		 * acquire exclusive access rights, preserve them and we
29392f5224aeSachartre 		 * can use preemption. So we can use the MHIOCTKNOWN ioctl.
29402f5224aeSachartre 		 */
29412f5224aeSachartre 		PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PREEMPT|PRESERVE)");
29422f5224aeSachartre 		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
29432f5224aeSachartre 		    MHIOCTKOWN, NULL, (vd->open_flags | FKIOCTL), kcred, &rval);
29442f5224aeSachartre 		break;
29452f5224aeSachartre 
29462f5224aeSachartre 	case VD_ACCESS_SET_PRESERVE:
29472f5224aeSachartre 		/*
29482f5224aeSachartre 		 * Flags EXCLUSIVE and PRESERVE. We have to acquire exclusive
29492f5224aeSachartre 		 * access rights and preserve them, but not preempt any other
29502f5224aeSachartre 		 * host. So we need to use the MHIOCTKOWN ioctl to enable the
29512f5224aeSachartre 		 * "preserve" feature but we can not called it directly
29522f5224aeSachartre 		 * because it uses preemption. So before that, we use the
29532f5224aeSachartre 		 * MHIOCQRESERVE ioctl to ensure we can get exclusive rights
29542f5224aeSachartre 		 * without preempting anyone.
29552f5224aeSachartre 		 */
29562f5224aeSachartre 		PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PRESERVE)");
29572f5224aeSachartre 		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
29582f5224aeSachartre 		    MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred,
29592f5224aeSachartre 		    &rval);
29602f5224aeSachartre 		if (request->status != 0)
29612f5224aeSachartre 			break;
29622f5224aeSachartre 		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
29632f5224aeSachartre 		    MHIOCTKOWN, NULL, (vd->open_flags | FKIOCTL), kcred, &rval);
29642f5224aeSachartre 		break;
29652f5224aeSachartre 
29662f5224aeSachartre 	case VD_ACCESS_SET_PREEMPT:
29672f5224aeSachartre 		/*
29682f5224aeSachartre 		 * Flags EXCLUSIVE and PREEMPT. We have to acquire exclusive
29692f5224aeSachartre 		 * access rights and we can use preemption. So we try to do
29702f5224aeSachartre 		 * a SCSI reservation, if it fails we reset the disk to clear
29712f5224aeSachartre 		 * any reservation and we try to reserve again.
29722f5224aeSachartre 		 */
29732f5224aeSachartre 		PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE|PREEMPT)");
29742f5224aeSachartre 		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
29752f5224aeSachartre 		    MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred,
29762f5224aeSachartre 		    &rval);
29772f5224aeSachartre 		if (request->status == 0)
29782f5224aeSachartre 			break;
29792f5224aeSachartre 
29802f5224aeSachartre 		/* reset the disk */
29812f5224aeSachartre 		(void) vd_scsi_reset(vd);
29822f5224aeSachartre 
29832f5224aeSachartre 		/* try again even if the reset has failed */
29842f5224aeSachartre 		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
29852f5224aeSachartre 		    MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred,
29862f5224aeSachartre 		    &rval);
29872f5224aeSachartre 		break;
29882f5224aeSachartre 
29892f5224aeSachartre 	case 0:
29902f5224aeSachartre 		/* Flag EXCLUSIVE only. Just issue a SCSI reservation */
29912f5224aeSachartre 		PR0("Performing VD_OP_SET_ACCESS (EXCLUSIVE)");
29922f5224aeSachartre 		request->status = ldi_ioctl(vd->ldi_handle[request->slice],
29932f5224aeSachartre 		    MHIOCQRESERVE, NULL, (vd->open_flags | FKIOCTL), kcred,
29942f5224aeSachartre 		    &rval);
29952f5224aeSachartre 		break;
29962f5224aeSachartre 	}
29972f5224aeSachartre 
29982f5224aeSachartre 	if (request->status == 0)
29992f5224aeSachartre 		vd->ownership = B_TRUE;
30002f5224aeSachartre 	else
30012f5224aeSachartre 		PR0("VD_OP_SET_ACCESS: error %d", request->status);
30022f5224aeSachartre 
30032f5224aeSachartre 	return (0);
30042f5224aeSachartre }
30052f5224aeSachartre 
30062f5224aeSachartre static void
30072f5224aeSachartre vd_reset_access(vd_t *vd)
30082f5224aeSachartre {
30092f5224aeSachartre 	int status, rval;
30102f5224aeSachartre 
30112f5224aeSachartre 	if (vd->file || !vd->ownership)
30122f5224aeSachartre 		return;
30132f5224aeSachartre 
30142f5224aeSachartre 	PR0("Releasing disk ownership");
30152f5224aeSachartre 	status = ldi_ioctl(vd->ldi_handle[0], MHIOCRELEASE, NULL,
30162f5224aeSachartre 	    (vd->open_flags | FKIOCTL), kcred, &rval);
30172f5224aeSachartre 
30182f5224aeSachartre 	/*
30192f5224aeSachartre 	 * An EACCES failure means that there is a reservation conflict,
30202f5224aeSachartre 	 * so we are not the owner of the disk anymore.
30212f5224aeSachartre 	 */
30222f5224aeSachartre 	if (status == 0 || status == EACCES) {
30232f5224aeSachartre 		vd->ownership = B_FALSE;
30242f5224aeSachartre 		return;
30252f5224aeSachartre 	}
30262f5224aeSachartre 
30272f5224aeSachartre 	PR0("Fail to release ownership, error %d", status);
30282f5224aeSachartre 
30292f5224aeSachartre 	/*
30302f5224aeSachartre 	 * We have failed to release the ownership, try to reset the disk
30312f5224aeSachartre 	 * to release reservations.
30322f5224aeSachartre 	 */
30332f5224aeSachartre 	PR0("Resetting disk");
30342f5224aeSachartre 	status = vd_scsi_reset(vd);
30352f5224aeSachartre 
30362f5224aeSachartre 	if (status != 0)
30372f5224aeSachartre 		PR0("Fail to reset disk, error %d", status);
30382f5224aeSachartre 
30392f5224aeSachartre 	/* whatever the result of the reset is, we try the release again */
30402f5224aeSachartre 	status = ldi_ioctl(vd->ldi_handle[0], MHIOCRELEASE, NULL,
30412f5224aeSachartre 	    (vd->open_flags | FKIOCTL), kcred, &rval);
30422f5224aeSachartre 
30432f5224aeSachartre 	if (status == 0 || status == EACCES) {
30442f5224aeSachartre 		vd->ownership = B_FALSE;
30452f5224aeSachartre 		return;
30462f5224aeSachartre 	}
30472f5224aeSachartre 
30482f5224aeSachartre 	PR0("Fail to release ownership, error %d", status);
30492f5224aeSachartre 
30502f5224aeSachartre 	/*
30512f5224aeSachartre 	 * At this point we have done our best to try to reset the
30522f5224aeSachartre 	 * access rights to the disk and we don't know if we still
30532f5224aeSachartre 	 * own a reservation and if any mechanism to preserve the
30542f5224aeSachartre 	 * ownership is still in place. The ultimate solution would
30552f5224aeSachartre 	 * be to reset the system but this is usually not what we
30562f5224aeSachartre 	 * want to happen.
30572f5224aeSachartre 	 */
30582f5224aeSachartre 
30592f5224aeSachartre 	if (vd_reset_access_failure == A_REBOOT) {
30602f5224aeSachartre 		cmn_err(CE_WARN, VD_RESET_ACCESS_FAILURE_MSG
30612f5224aeSachartre 		    ", rebooting the system", vd->device_path);
30622f5224aeSachartre 		(void) uadmin(A_SHUTDOWN, AD_BOOT, NULL);
30632f5224aeSachartre 	} else if (vd_reset_access_failure == A_DUMP) {
30642f5224aeSachartre 		panic(VD_RESET_ACCESS_FAILURE_MSG, vd->device_path);
30652f5224aeSachartre 	}
30662f5224aeSachartre 
30672f5224aeSachartre 	cmn_err(CE_WARN, VD_RESET_ACCESS_FAILURE_MSG, vd->device_path);
30682f5224aeSachartre }
30692f5224aeSachartre 
30701ae08745Sheppo /*
30711ae08745Sheppo  * Define the supported operations once the functions for performing them have
30721ae08745Sheppo  * been defined
30731ae08745Sheppo  */
30741ae08745Sheppo static const vds_operation_t	vds_operation[] = {
30753af08d82Slm66018 #define	X(_s)	#_s, _s
30763af08d82Slm66018 	{X(VD_OP_BREAD),	vd_start_bio,	vd_complete_bio},
30773af08d82Slm66018 	{X(VD_OP_BWRITE),	vd_start_bio,	vd_complete_bio},
30783af08d82Slm66018 	{X(VD_OP_FLUSH),	vd_ioctl,	NULL},
30793af08d82Slm66018 	{X(VD_OP_GET_WCE),	vd_ioctl,	NULL},
30803af08d82Slm66018 	{X(VD_OP_SET_WCE),	vd_ioctl,	NULL},
30813af08d82Slm66018 	{X(VD_OP_GET_VTOC),	vd_ioctl,	NULL},
30823af08d82Slm66018 	{X(VD_OP_SET_VTOC),	vd_ioctl,	NULL},
30833af08d82Slm66018 	{X(VD_OP_GET_DISKGEOM),	vd_ioctl,	NULL},
30843af08d82Slm66018 	{X(VD_OP_SET_DISKGEOM),	vd_ioctl,	NULL},
30853af08d82Slm66018 	{X(VD_OP_GET_EFI),	vd_ioctl,	NULL},
30863af08d82Slm66018 	{X(VD_OP_SET_EFI),	vd_ioctl,	NULL},
30873af08d82Slm66018 	{X(VD_OP_GET_DEVID),	vd_get_devid,	NULL},
30882f5224aeSachartre 	{X(VD_OP_SCSICMD),	vd_ioctl,	NULL},
30892f5224aeSachartre 	{X(VD_OP_RESET),	vd_reset,	NULL},
30902f5224aeSachartre 	{X(VD_OP_GET_CAPACITY),	vd_get_capacity, NULL},
30912f5224aeSachartre 	{X(VD_OP_SET_ACCESS),	vd_set_access,	NULL},
30922f5224aeSachartre 	{X(VD_OP_GET_ACCESS),	vd_get_access,	NULL},
30933af08d82Slm66018 #undef	X
30941ae08745Sheppo };
30951ae08745Sheppo 
30961ae08745Sheppo static const size_t	vds_noperations =
30971ae08745Sheppo 	(sizeof (vds_operation))/(sizeof (vds_operation[0]));
30981ae08745Sheppo 
30991ae08745Sheppo /*
3100d10e4ef2Snarayan  * Process a task specifying a client I/O request
3101205eeb1aSlm66018  *
3102205eeb1aSlm66018  * Parameters:
3103205eeb1aSlm66018  *	task 		- structure containing the request sent from client
3104205eeb1aSlm66018  *
3105205eeb1aSlm66018  * Return Value
3106205eeb1aSlm66018  *	0	- success
3107205eeb1aSlm66018  *	ENOTSUP	- Unknown/Unsupported VD_OP_XXX operation
3108205eeb1aSlm66018  *	EINVAL	- Invalid disk slice
3109205eeb1aSlm66018  *	!= 0	- some other non-zero return value from start function
31101ae08745Sheppo  */
31111ae08745Sheppo static int
3112205eeb1aSlm66018 vd_do_process_task(vd_task_t *task)
31131ae08745Sheppo {
3114205eeb1aSlm66018 	int			i;
3115d10e4ef2Snarayan 	vd_t			*vd		= task->vd;
3116d10e4ef2Snarayan 	vd_dring_payload_t	*request	= task->request;
31171ae08745Sheppo 
3118d10e4ef2Snarayan 	ASSERT(vd != NULL);
3119d10e4ef2Snarayan 	ASSERT(request != NULL);
31201ae08745Sheppo 
3121d10e4ef2Snarayan 	/* Find the requested operation */
3122205eeb1aSlm66018 	for (i = 0; i < vds_noperations; i++) {
3123205eeb1aSlm66018 		if (request->operation == vds_operation[i].operation) {
3124205eeb1aSlm66018 			/* all operations should have a start func */
3125205eeb1aSlm66018 			ASSERT(vds_operation[i].start != NULL);
3126205eeb1aSlm66018 
3127205eeb1aSlm66018 			task->completef = vds_operation[i].complete;
3128d10e4ef2Snarayan 			break;
3129205eeb1aSlm66018 		}
3130205eeb1aSlm66018 	}
313117cadca8Slm66018 
313217cadca8Slm66018 	/*
313317cadca8Slm66018 	 * We need to check that the requested operation is permitted
313417cadca8Slm66018 	 * for the particular client that sent it or that the loop above
313517cadca8Slm66018 	 * did not complete without finding the operation type (indicating
313617cadca8Slm66018 	 * that the requested operation is unknown/unimplemented)
313717cadca8Slm66018 	 */
313817cadca8Slm66018 	if ((VD_OP_SUPPORTED(vd->operations, request->operation) == B_FALSE) ||
313917cadca8Slm66018 	    (i == vds_noperations)) {
31403af08d82Slm66018 		PR0("Unsupported operation %u", request->operation);
314117cadca8Slm66018 		request->status = ENOTSUP;
314217cadca8Slm66018 		return (0);
31431ae08745Sheppo 	}
31441ae08745Sheppo 
31457636cb21Slm66018 	/* Range-check slice */
314687a7269eSachartre 	if (request->slice >= vd->nslices &&
314787a7269eSachartre 	    (vd->vdisk_type != VD_DISK_TYPE_DISK ||
314887a7269eSachartre 	    request->slice != VD_SLICE_NONE)) {
31493af08d82Slm66018 		PR0("Invalid \"slice\" %u (max %u) for virtual disk",
31507636cb21Slm66018 		    request->slice, (vd->nslices - 1));
31517636cb21Slm66018 		return (EINVAL);
31527636cb21Slm66018 	}
31537636cb21Slm66018 
3154205eeb1aSlm66018 	/*
3155205eeb1aSlm66018 	 * Call the function pointer that starts the operation.
3156205eeb1aSlm66018 	 */
3157205eeb1aSlm66018 	return (vds_operation[i].start(task));
31581ae08745Sheppo }
31591ae08745Sheppo 
3160205eeb1aSlm66018 /*
3161205eeb1aSlm66018  * Description:
3162205eeb1aSlm66018  *	This function is called by both the in-band and descriptor ring
3163205eeb1aSlm66018  *	message processing functions paths to actually execute the task
3164205eeb1aSlm66018  *	requested by the vDisk client. It in turn calls its worker
3165205eeb1aSlm66018  *	function, vd_do_process_task(), to carry our the request.
3166205eeb1aSlm66018  *
3167205eeb1aSlm66018  *	Any transport errors (e.g. LDC errors, vDisk protocol errors) are
3168205eeb1aSlm66018  *	saved in the 'status' field of the task and are propagated back
3169205eeb1aSlm66018  *	up the call stack to trigger a NACK
3170205eeb1aSlm66018  *
3171205eeb1aSlm66018  *	Any request errors (e.g. ENOTTY from an ioctl) are saved in
3172205eeb1aSlm66018  *	the 'status' field of the request and result in an ACK being sent
3173205eeb1aSlm66018  *	by the completion handler.
3174205eeb1aSlm66018  *
3175205eeb1aSlm66018  * Parameters:
3176205eeb1aSlm66018  *	task 		- structure containing the request sent from client
3177205eeb1aSlm66018  *
3178205eeb1aSlm66018  * Return Value
3179205eeb1aSlm66018  *	0		- successful synchronous request.
3180205eeb1aSlm66018  *	!= 0		- transport error (e.g. LDC errors, vDisk protocol)
3181205eeb1aSlm66018  *	EINPROGRESS	- task will be finished in a completion handler
3182205eeb1aSlm66018  */
3183205eeb1aSlm66018 static int
3184205eeb1aSlm66018 vd_process_task(vd_task_t *task)
3185205eeb1aSlm66018 {
3186205eeb1aSlm66018 	vd_t	*vd = task->vd;
3187205eeb1aSlm66018 	int	status;
31881ae08745Sheppo 
3189205eeb1aSlm66018 	DTRACE_PROBE1(task__start, vd_task_t *, task);
31903af08d82Slm66018 
3191205eeb1aSlm66018 	task->status =  vd_do_process_task(task);
3192205eeb1aSlm66018 
3193205eeb1aSlm66018 	/*
3194205eeb1aSlm66018 	 * If the task processing function returned EINPROGRESS indicating
3195205eeb1aSlm66018 	 * that the task needs completing then schedule a taskq entry to
3196205eeb1aSlm66018 	 * finish it now.
3197205eeb1aSlm66018 	 *
3198205eeb1aSlm66018 	 * Otherwise the task processing function returned either zero
3199205eeb1aSlm66018 	 * indicating that the task was finished in the start function (and we
3200205eeb1aSlm66018 	 * don't need to wait in a completion function) or the start function
3201205eeb1aSlm66018 	 * returned an error - in both cases all that needs to happen is the
3202205eeb1aSlm66018 	 * notification to the vDisk client higher up the call stack.
3203205eeb1aSlm66018 	 * If the task was using a Descriptor Ring, we need to mark it as done
3204205eeb1aSlm66018 	 * at this stage.
3205205eeb1aSlm66018 	 */
3206205eeb1aSlm66018 	if (task->status == EINPROGRESS) {
3207d10e4ef2Snarayan 		/* Queue a task to complete the operation */
3208205eeb1aSlm66018 		(void) ddi_taskq_dispatch(vd->completionq, vd_complete,
3209d10e4ef2Snarayan 		    task, DDI_SLEEP);
3210d10e4ef2Snarayan 
3211*f0ca1d9aSsb155480 	} else if (!vd->reset_state && (vd->xfer_mode == VIO_DRING_MODE_V1_0)) {
3212205eeb1aSlm66018 		/* Update the dring element if it's a dring client */
3213205eeb1aSlm66018 		status = vd_mark_elem_done(vd, task->index,
3214205eeb1aSlm66018 		    task->request->status, task->request->nbytes);
3215205eeb1aSlm66018 		if (status == ECONNRESET)
3216205eeb1aSlm66018 			vd_mark_in_reset(vd);
3217205eeb1aSlm66018 	}
3218205eeb1aSlm66018 
3219205eeb1aSlm66018 	return (task->status);
32201ae08745Sheppo }
32211ae08745Sheppo 
32221ae08745Sheppo /*
32230a55fbb7Slm66018  * Return true if the "type", "subtype", and "env" fields of the "tag" first
32240a55fbb7Slm66018  * argument match the corresponding remaining arguments; otherwise, return false
32251ae08745Sheppo  */
32260a55fbb7Slm66018 boolean_t
32271ae08745Sheppo vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env)
32281ae08745Sheppo {
32291ae08745Sheppo 	return ((tag->vio_msgtype == type) &&
32301ae08745Sheppo 	    (tag->vio_subtype == subtype) &&
32310a55fbb7Slm66018 	    (tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE;
32321ae08745Sheppo }
32331ae08745Sheppo 
32340a55fbb7Slm66018 /*
32350a55fbb7Slm66018  * Check whether the major/minor version specified in "ver_msg" is supported
32360a55fbb7Slm66018  * by this server.
32370a55fbb7Slm66018  */
32380a55fbb7Slm66018 static boolean_t
32390a55fbb7Slm66018 vds_supported_version(vio_ver_msg_t *ver_msg)
32400a55fbb7Slm66018 {
32410a55fbb7Slm66018 	for (int i = 0; i < vds_num_versions; i++) {
32420a55fbb7Slm66018 		ASSERT(vds_version[i].major > 0);
32430a55fbb7Slm66018 		ASSERT((i == 0) ||
32440a55fbb7Slm66018 		    (vds_version[i].major < vds_version[i-1].major));
32450a55fbb7Slm66018 
32460a55fbb7Slm66018 		/*
32470a55fbb7Slm66018 		 * If the major versions match, adjust the minor version, if
32480a55fbb7Slm66018 		 * necessary, down to the highest value supported by this
32490a55fbb7Slm66018 		 * server and return true so this message will get "ack"ed;
32500a55fbb7Slm66018 		 * the client should also support all minor versions lower
32510a55fbb7Slm66018 		 * than the value it sent
32520a55fbb7Slm66018 		 */
32530a55fbb7Slm66018 		if (ver_msg->ver_major == vds_version[i].major) {
32540a55fbb7Slm66018 			if (ver_msg->ver_minor > vds_version[i].minor) {
32550a55fbb7Slm66018 				PR0("Adjusting minor version from %u to %u",
32560a55fbb7Slm66018 				    ver_msg->ver_minor, vds_version[i].minor);
32570a55fbb7Slm66018 				ver_msg->ver_minor = vds_version[i].minor;
32580a55fbb7Slm66018 			}
32590a55fbb7Slm66018 			return (B_TRUE);
32600a55fbb7Slm66018 		}
32610a55fbb7Slm66018 
32620a55fbb7Slm66018 		/*
32630a55fbb7Slm66018 		 * If the message contains a higher major version number, set
32640a55fbb7Slm66018 		 * the message's major/minor versions to the current values
32650a55fbb7Slm66018 		 * and return false, so this message will get "nack"ed with
32660a55fbb7Slm66018 		 * these values, and the client will potentially try again
32670a55fbb7Slm66018 		 * with the same or a lower version
32680a55fbb7Slm66018 		 */
32690a55fbb7Slm66018 		if (ver_msg->ver_major > vds_version[i].major) {
32700a55fbb7Slm66018 			ver_msg->ver_major = vds_version[i].major;
32710a55fbb7Slm66018 			ver_msg->ver_minor = vds_version[i].minor;
32720a55fbb7Slm66018 			return (B_FALSE);
32730a55fbb7Slm66018 		}
32740a55fbb7Slm66018 
32750a55fbb7Slm66018 		/*
32760a55fbb7Slm66018 		 * Otherwise, the message's major version is less than the
32770a55fbb7Slm66018 		 * current major version, so continue the loop to the next
32780a55fbb7Slm66018 		 * (lower) supported version
32790a55fbb7Slm66018 		 */
32800a55fbb7Slm66018 	}
32810a55fbb7Slm66018 
32820a55fbb7Slm66018 	/*
32830a55fbb7Slm66018 	 * No common version was found; "ground" the version pair in the
32840a55fbb7Slm66018 	 * message to terminate negotiation
32850a55fbb7Slm66018 	 */
32860a55fbb7Slm66018 	ver_msg->ver_major = 0;
32870a55fbb7Slm66018 	ver_msg->ver_minor = 0;
32880a55fbb7Slm66018 	return (B_FALSE);
32890a55fbb7Slm66018 }
32900a55fbb7Slm66018 
32910a55fbb7Slm66018 /*
32920a55fbb7Slm66018  * Process a version message from a client.  vds expects to receive version
32930a55fbb7Slm66018  * messages from clients seeking service, but never issues version messages
32940a55fbb7Slm66018  * itself; therefore, vds can ACK or NACK client version messages, but does
32950a55fbb7Slm66018  * not expect to receive version-message ACKs or NACKs (and will treat such
32960a55fbb7Slm66018  * messages as invalid).
32970a55fbb7Slm66018  */
32981ae08745Sheppo static int
32990a55fbb7Slm66018 vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
33001ae08745Sheppo {
33011ae08745Sheppo 	vio_ver_msg_t	*ver_msg = (vio_ver_msg_t *)msg;
33021ae08745Sheppo 
33031ae08745Sheppo 
33041ae08745Sheppo 	ASSERT(msglen >= sizeof (msg->tag));
33051ae08745Sheppo 
33061ae08745Sheppo 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
33071ae08745Sheppo 	    VIO_VER_INFO)) {
33081ae08745Sheppo 		return (ENOMSG);	/* not a version message */
33091ae08745Sheppo 	}
33101ae08745Sheppo 
33111ae08745Sheppo 	if (msglen != sizeof (*ver_msg)) {
33123af08d82Slm66018 		PR0("Expected %lu-byte version message; "
33131ae08745Sheppo 		    "received %lu bytes", sizeof (*ver_msg), msglen);
33141ae08745Sheppo 		return (EBADMSG);
33151ae08745Sheppo 	}
33161ae08745Sheppo 
33171ae08745Sheppo 	if (ver_msg->dev_class != VDEV_DISK) {
33183af08d82Slm66018 		PR0("Expected device class %u (disk); received %u",
33191ae08745Sheppo 		    VDEV_DISK, ver_msg->dev_class);
33201ae08745Sheppo 		return (EBADMSG);
33211ae08745Sheppo 	}
33221ae08745Sheppo 
33230a55fbb7Slm66018 	/*
33240a55fbb7Slm66018 	 * We're talking to the expected kind of client; set our device class
33250a55fbb7Slm66018 	 * for "ack/nack" back to the client
33260a55fbb7Slm66018 	 */
33271ae08745Sheppo 	ver_msg->dev_class = VDEV_DISK_SERVER;
33280a55fbb7Slm66018 
33290a55fbb7Slm66018 	/*
33300a55fbb7Slm66018 	 * Check whether the (valid) version message specifies a version
33310a55fbb7Slm66018 	 * supported by this server.  If the version is not supported, return
33320a55fbb7Slm66018 	 * EBADMSG so the message will get "nack"ed; vds_supported_version()
33330a55fbb7Slm66018 	 * will have updated the message with a supported version for the
33340a55fbb7Slm66018 	 * client to consider
33350a55fbb7Slm66018 	 */
33360a55fbb7Slm66018 	if (!vds_supported_version(ver_msg))
33370a55fbb7Slm66018 		return (EBADMSG);
33380a55fbb7Slm66018 
33390a55fbb7Slm66018 
33400a55fbb7Slm66018 	/*
33410a55fbb7Slm66018 	 * A version has been agreed upon; use the client's SID for
33420a55fbb7Slm66018 	 * communication on this channel now
33430a55fbb7Slm66018 	 */
33440a55fbb7Slm66018 	ASSERT(!(vd->initialized & VD_SID));
33450a55fbb7Slm66018 	vd->sid = ver_msg->tag.vio_sid;
33460a55fbb7Slm66018 	vd->initialized |= VD_SID;
33470a55fbb7Slm66018 
33480a55fbb7Slm66018 	/*
334917cadca8Slm66018 	 * Store the negotiated major and minor version values in the "vd" data
335017cadca8Slm66018 	 * structure so that we can check if certain operations are supported
335117cadca8Slm66018 	 * by the client.
33520a55fbb7Slm66018 	 */
335317cadca8Slm66018 	vd->version.major = ver_msg->ver_major;
335417cadca8Slm66018 	vd->version.minor = ver_msg->ver_minor;
33550a55fbb7Slm66018 
33560a55fbb7Slm66018 	PR0("Using major version %u, minor version %u",
33570a55fbb7Slm66018 	    ver_msg->ver_major, ver_msg->ver_minor);
33581ae08745Sheppo 	return (0);
33591ae08745Sheppo }
33601ae08745Sheppo 
336117cadca8Slm66018 static void
336217cadca8Slm66018 vd_set_exported_operations(vd_t *vd)
336317cadca8Slm66018 {
336417cadca8Slm66018 	vd->operations = 0;	/* clear field */
336517cadca8Slm66018 
336617cadca8Slm66018 	/*
336717cadca8Slm66018 	 * We need to check from the highest version supported to the
336817cadca8Slm66018 	 * lowest because versions with a higher minor number implicitly
336917cadca8Slm66018 	 * support versions with a lower minor number.
337017cadca8Slm66018 	 */
337117cadca8Slm66018 	if (vio_ver_is_supported(vd->version, 1, 1)) {
337217cadca8Slm66018 		ASSERT(vd->open_flags & FREAD);
337317cadca8Slm66018 		vd->operations |= VD_OP_MASK_READ;
337417cadca8Slm66018 
337517cadca8Slm66018 		if (vd->open_flags & FWRITE)
337617cadca8Slm66018 			vd->operations |= VD_OP_MASK_WRITE;
337717cadca8Slm66018 
33782f5224aeSachartre 		if (vd->scsi)
33792f5224aeSachartre 			vd->operations |= VD_OP_MASK_SCSI;
33802f5224aeSachartre 
338117cadca8Slm66018 		if (vd->file && vd_file_is_iso_image(vd)) {
338217cadca8Slm66018 			/*
338317cadca8Slm66018 			 * can't write to ISO images, make sure that write
338417cadca8Slm66018 			 * support is not set in case administrator did not
338517cadca8Slm66018 			 * use "options=ro" when doing an ldm add-vdsdev
338617cadca8Slm66018 			 */
338717cadca8Slm66018 			vd->operations &= ~VD_OP_MASK_WRITE;
338817cadca8Slm66018 		}
338917cadca8Slm66018 	} else if (vio_ver_is_supported(vd->version, 1, 0)) {
339017cadca8Slm66018 		vd->operations = VD_OP_MASK_READ | VD_OP_MASK_WRITE;
339117cadca8Slm66018 	}
339217cadca8Slm66018 
339317cadca8Slm66018 	/* we should have already agreed on a version */
339417cadca8Slm66018 	ASSERT(vd->operations != 0);
339517cadca8Slm66018 }
339617cadca8Slm66018 
33971ae08745Sheppo static int
33981ae08745Sheppo vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
33991ae08745Sheppo {
34001ae08745Sheppo 	vd_attr_msg_t	*attr_msg = (vd_attr_msg_t *)msg;
34013c96341aSnarayan 	int		status, retry = 0;
34021ae08745Sheppo 
34031ae08745Sheppo 
34041ae08745Sheppo 	ASSERT(msglen >= sizeof (msg->tag));
34051ae08745Sheppo 
34061ae08745Sheppo 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
34071ae08745Sheppo 	    VIO_ATTR_INFO)) {
3408d10e4ef2Snarayan 		PR0("Message is not an attribute message");
3409d10e4ef2Snarayan 		return (ENOMSG);
34101ae08745Sheppo 	}
34111ae08745Sheppo 
34121ae08745Sheppo 	if (msglen != sizeof (*attr_msg)) {
34133af08d82Slm66018 		PR0("Expected %lu-byte attribute message; "
34141ae08745Sheppo 		    "received %lu bytes", sizeof (*attr_msg), msglen);
34151ae08745Sheppo 		return (EBADMSG);
34161ae08745Sheppo 	}
34171ae08745Sheppo 
34181ae08745Sheppo 	if (attr_msg->max_xfer_sz == 0) {
34193af08d82Slm66018 		PR0("Received maximum transfer size of 0 from client");
34201ae08745Sheppo 		return (EBADMSG);
34211ae08745Sheppo 	}
34221ae08745Sheppo 
34231ae08745Sheppo 	if ((attr_msg->xfer_mode != VIO_DESC_MODE) &&
3424*f0ca1d9aSsb155480 	    (attr_msg->xfer_mode != VIO_DRING_MODE_V1_0)) {
34253af08d82Slm66018 		PR0("Client requested unsupported transfer mode");
34261ae08745Sheppo 		return (EBADMSG);
34271ae08745Sheppo 	}
34281ae08745Sheppo 
34293c96341aSnarayan 	/*
34303c96341aSnarayan 	 * check if the underlying disk is ready, if not try accessing
34313c96341aSnarayan 	 * the device again. Open the vdisk device and extract info
34323c96341aSnarayan 	 * about it, as this is needed to respond to the attr info msg
34333c96341aSnarayan 	 */
34343c96341aSnarayan 	if ((vd->initialized & VD_DISK_READY) == 0) {
34353c96341aSnarayan 		PR0("Retry setting up disk (%s)", vd->device_path);
34363c96341aSnarayan 		do {
34373c96341aSnarayan 			status = vd_setup_vd(vd);
34383c96341aSnarayan 			if (status != EAGAIN || ++retry > vds_dev_retries)
34393c96341aSnarayan 				break;
34403c96341aSnarayan 
34413c96341aSnarayan 			/* incremental delay */
34423c96341aSnarayan 			delay(drv_usectohz(vds_dev_delay));
34433c96341aSnarayan 
34443c96341aSnarayan 			/* if vdisk is no longer enabled - return error */
34453c96341aSnarayan 			if (!vd_enabled(vd))
34463c96341aSnarayan 				return (ENXIO);
34473c96341aSnarayan 
34483c96341aSnarayan 		} while (status == EAGAIN);
34493c96341aSnarayan 
34503c96341aSnarayan 		if (status)
34513c96341aSnarayan 			return (ENXIO);
34523c96341aSnarayan 
34533c96341aSnarayan 		vd->initialized |= VD_DISK_READY;
34543c96341aSnarayan 		ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
34553c96341aSnarayan 		PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u",
34563c96341aSnarayan 		    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
34573c96341aSnarayan 		    (vd->pseudo ? "yes" : "no"),
34583c96341aSnarayan 		    (vd->file ? "yes" : "no"),
34593c96341aSnarayan 		    vd->nslices);
34603c96341aSnarayan 	}
34613c96341aSnarayan 
34621ae08745Sheppo 	/* Success:  valid message and transfer mode */
34631ae08745Sheppo 	vd->xfer_mode = attr_msg->xfer_mode;
34643af08d82Slm66018 
34651ae08745Sheppo 	if (vd->xfer_mode == VIO_DESC_MODE) {
34663af08d82Slm66018 
34671ae08745Sheppo 		/*
34681ae08745Sheppo 		 * The vd_dring_inband_msg_t contains one cookie; need room
34691ae08745Sheppo 		 * for up to n-1 more cookies, where "n" is the number of full
34701ae08745Sheppo 		 * pages plus possibly one partial page required to cover
34711ae08745Sheppo 		 * "max_xfer_sz".  Add room for one more cookie if
34721ae08745Sheppo 		 * "max_xfer_sz" isn't an integral multiple of the page size.
34731ae08745Sheppo 		 * Must first get the maximum transfer size in bytes.
34741ae08745Sheppo 		 */
34751ae08745Sheppo 		size_t	max_xfer_bytes = attr_msg->vdisk_block_size ?
34761ae08745Sheppo 		    attr_msg->vdisk_block_size*attr_msg->max_xfer_sz :
34771ae08745Sheppo 		    attr_msg->max_xfer_sz;
34781ae08745Sheppo 		size_t	max_inband_msglen =
34791ae08745Sheppo 		    sizeof (vd_dring_inband_msg_t) +
34801ae08745Sheppo 		    ((max_xfer_bytes/PAGESIZE +
34811ae08745Sheppo 		    ((max_xfer_bytes % PAGESIZE) ? 1 : 0))*
34821ae08745Sheppo 		    (sizeof (ldc_mem_cookie_t)));
34831ae08745Sheppo 
34841ae08745Sheppo 		/*
34851ae08745Sheppo 		 * Set the maximum expected message length to
34861ae08745Sheppo 		 * accommodate in-band-descriptor messages with all
34871ae08745Sheppo 		 * their cookies
34881ae08745Sheppo 		 */
34891ae08745Sheppo 		vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen);
3490d10e4ef2Snarayan 
3491d10e4ef2Snarayan 		/*
3492d10e4ef2Snarayan 		 * Initialize the data structure for processing in-band I/O
3493d10e4ef2Snarayan 		 * request descriptors
3494d10e4ef2Snarayan 		 */
3495d10e4ef2Snarayan 		vd->inband_task.vd	= vd;
34963af08d82Slm66018 		vd->inband_task.msg	= kmem_alloc(vd->max_msglen, KM_SLEEP);
3497d10e4ef2Snarayan 		vd->inband_task.index	= 0;
3498d10e4ef2Snarayan 		vd->inband_task.type	= VD_FINAL_RANGE_TASK;	/* range == 1 */
34991ae08745Sheppo 	}
35001ae08745Sheppo 
3501e1ebb9ecSlm66018 	/* Return the device's block size and max transfer size to the client */
35022f5224aeSachartre 	attr_msg->vdisk_block_size	= vd->vdisk_block_size;
3503e1ebb9ecSlm66018 	attr_msg->max_xfer_sz		= vd->max_xfer_sz;
3504e1ebb9ecSlm66018 
35051ae08745Sheppo 	attr_msg->vdisk_size = vd->vdisk_size;
35061ae08745Sheppo 	attr_msg->vdisk_type = vd->vdisk_type;
350717cadca8Slm66018 	attr_msg->vdisk_media = vd->vdisk_media;
350817cadca8Slm66018 
350917cadca8Slm66018 	/* Discover and save the list of supported VD_OP_XXX operations */
351017cadca8Slm66018 	vd_set_exported_operations(vd);
351117cadca8Slm66018 	attr_msg->operations = vd->operations;
351217cadca8Slm66018 
35131ae08745Sheppo 	PR0("%s", VD_CLIENT(vd));
35143af08d82Slm66018 
35153af08d82Slm66018 	ASSERT(vd->dring_task == NULL);
35163af08d82Slm66018 
35171ae08745Sheppo 	return (0);
35181ae08745Sheppo }
35191ae08745Sheppo 
35201ae08745Sheppo static int
35211ae08745Sheppo vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
35221ae08745Sheppo {
35231ae08745Sheppo 	int			status;
35241ae08745Sheppo 	size_t			expected;
35251ae08745Sheppo 	ldc_mem_info_t		dring_minfo;
35261ae08745Sheppo 	vio_dring_reg_msg_t	*reg_msg = (vio_dring_reg_msg_t *)msg;
35271ae08745Sheppo 
35281ae08745Sheppo 
35291ae08745Sheppo 	ASSERT(msglen >= sizeof (msg->tag));
35301ae08745Sheppo 
35311ae08745Sheppo 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
35321ae08745Sheppo 	    VIO_DRING_REG)) {
3533d10e4ef2Snarayan 		PR0("Message is not a register-dring message");
3534d10e4ef2Snarayan 		return (ENOMSG);
35351ae08745Sheppo 	}
35361ae08745Sheppo 
35371ae08745Sheppo 	if (msglen < sizeof (*reg_msg)) {
35383af08d82Slm66018 		PR0("Expected at least %lu-byte register-dring message; "
35391ae08745Sheppo 		    "received %lu bytes", sizeof (*reg_msg), msglen);
35401ae08745Sheppo 		return (EBADMSG);
35411ae08745Sheppo 	}
35421ae08745Sheppo 
35431ae08745Sheppo 	expected = sizeof (*reg_msg) +
35441ae08745Sheppo 	    (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0]));
35451ae08745Sheppo 	if (msglen != expected) {
35463af08d82Slm66018 		PR0("Expected %lu-byte register-dring message; "
35471ae08745Sheppo 		    "received %lu bytes", expected, msglen);
35481ae08745Sheppo 		return (EBADMSG);
35491ae08745Sheppo 	}
35501ae08745Sheppo 
35511ae08745Sheppo 	if (vd->initialized & VD_DRING) {
35523af08d82Slm66018 		PR0("A dring was previously registered; only support one");
35531ae08745Sheppo 		return (EBADMSG);
35541ae08745Sheppo 	}
35551ae08745Sheppo 
3556d10e4ef2Snarayan 	if (reg_msg->num_descriptors > INT32_MAX) {
35573af08d82Slm66018 		PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)",
3558d10e4ef2Snarayan 		    reg_msg->ncookies, INT32_MAX, STRINGIZE(INT32_MAX));
3559d10e4ef2Snarayan 		return (EBADMSG);
3560d10e4ef2Snarayan 	}
3561d10e4ef2Snarayan 
35621ae08745Sheppo 	if (reg_msg->ncookies != 1) {
35631ae08745Sheppo 		/*
35641ae08745Sheppo 		 * In addition to fixing the assertion in the success case
35651ae08745Sheppo 		 * below, supporting drings which require more than one
35661ae08745Sheppo 		 * "cookie" requires increasing the value of vd->max_msglen
35671ae08745Sheppo 		 * somewhere in the code path prior to receiving the message
35681ae08745Sheppo 		 * which results in calling this function.  Note that without
35691ae08745Sheppo 		 * making this change, the larger message size required to
35701ae08745Sheppo 		 * accommodate multiple cookies cannot be successfully
35711ae08745Sheppo 		 * received, so this function will not even get called.
35721ae08745Sheppo 		 * Gracefully accommodating more dring cookies might
35731ae08745Sheppo 		 * reasonably demand exchanging an additional attribute or
35741ae08745Sheppo 		 * making a minor protocol adjustment
35751ae08745Sheppo 		 */
35763af08d82Slm66018 		PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies);
35771ae08745Sheppo 		return (EBADMSG);
35781ae08745Sheppo 	}
35791ae08745Sheppo 
35801ae08745Sheppo 	status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie,
35811ae08745Sheppo 	    reg_msg->ncookies, reg_msg->num_descriptors,
35824bac2208Snarayan 	    reg_msg->descriptor_size, LDC_DIRECT_MAP, &vd->dring_handle);
35831ae08745Sheppo 	if (status != 0) {
35843af08d82Slm66018 		PR0("ldc_mem_dring_map() returned errno %d", status);
35851ae08745Sheppo 		return (status);
35861ae08745Sheppo 	}
35871ae08745Sheppo 
35881ae08745Sheppo 	/*
35891ae08745Sheppo 	 * To remove the need for this assertion, must call
35901ae08745Sheppo 	 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a
35911ae08745Sheppo 	 * successful call to ldc_mem_dring_map()
35921ae08745Sheppo 	 */
35931ae08745Sheppo 	ASSERT(reg_msg->ncookies == 1);
35941ae08745Sheppo 
35951ae08745Sheppo 	if ((status =
35961ae08745Sheppo 	    ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) {
35973af08d82Slm66018 		PR0("ldc_mem_dring_info() returned errno %d", status);
35981ae08745Sheppo 		if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)
35993af08d82Slm66018 			PR0("ldc_mem_dring_unmap() returned errno %d", status);
36001ae08745Sheppo 		return (status);
36011ae08745Sheppo 	}
36021ae08745Sheppo 
36031ae08745Sheppo 	if (dring_minfo.vaddr == NULL) {
36043af08d82Slm66018 		PR0("Descriptor ring virtual address is NULL");
36050a55fbb7Slm66018 		return (ENXIO);
36061ae08745Sheppo 	}
36071ae08745Sheppo 
36081ae08745Sheppo 
3609d10e4ef2Snarayan 	/* Initialize for valid message and mapped dring */
36101ae08745Sheppo 	PR1("descriptor size = %u, dring length = %u",
36111ae08745Sheppo 	    vd->descriptor_size, vd->dring_len);
36121ae08745Sheppo 	vd->initialized |= VD_DRING;
36131ae08745Sheppo 	vd->dring_ident = 1;	/* "There Can Be Only One" */
36141ae08745Sheppo 	vd->dring = dring_minfo.vaddr;
36151ae08745Sheppo 	vd->descriptor_size = reg_msg->descriptor_size;
36161ae08745Sheppo 	vd->dring_len = reg_msg->num_descriptors;
36171ae08745Sheppo 	reg_msg->dring_ident = vd->dring_ident;
3618d10e4ef2Snarayan 
3619d10e4ef2Snarayan 	/*
3620d10e4ef2Snarayan 	 * Allocate and initialize a "shadow" array of data structures for
3621d10e4ef2Snarayan 	 * tasks to process I/O requests in dring elements
3622d10e4ef2Snarayan 	 */
3623d10e4ef2Snarayan 	vd->dring_task =
3624d10e4ef2Snarayan 	    kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP);
3625d10e4ef2Snarayan 	for (int i = 0; i < vd->dring_len; i++) {
3626d10e4ef2Snarayan 		vd->dring_task[i].vd		= vd;
3627d10e4ef2Snarayan 		vd->dring_task[i].index		= i;
3628d10e4ef2Snarayan 		vd->dring_task[i].request	= &VD_DRING_ELEM(i)->payload;
36294bac2208Snarayan 
36304bac2208Snarayan 		status = ldc_mem_alloc_handle(vd->ldc_handle,
36314bac2208Snarayan 		    &(vd->dring_task[i].mhdl));
36324bac2208Snarayan 		if (status) {
36333af08d82Slm66018 			PR0("ldc_mem_alloc_handle() returned err %d ", status);
36344bac2208Snarayan 			return (ENXIO);
36354bac2208Snarayan 		}
36363af08d82Slm66018 
36373af08d82Slm66018 		vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP);
3638d10e4ef2Snarayan 	}
3639d10e4ef2Snarayan 
36401ae08745Sheppo 	return (0);
36411ae08745Sheppo }
36421ae08745Sheppo 
36431ae08745Sheppo static int
36441ae08745Sheppo vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
36451ae08745Sheppo {
36461ae08745Sheppo 	vio_dring_unreg_msg_t	*unreg_msg = (vio_dring_unreg_msg_t *)msg;
36471ae08745Sheppo 
36481ae08745Sheppo 
36491ae08745Sheppo 	ASSERT(msglen >= sizeof (msg->tag));
36501ae08745Sheppo 
36511ae08745Sheppo 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
36521ae08745Sheppo 	    VIO_DRING_UNREG)) {
3653d10e4ef2Snarayan 		PR0("Message is not an unregister-dring message");
3654d10e4ef2Snarayan 		return (ENOMSG);
36551ae08745Sheppo 	}
36561ae08745Sheppo 
36571ae08745Sheppo 	if (msglen != sizeof (*unreg_msg)) {
36583af08d82Slm66018 		PR0("Expected %lu-byte unregister-dring message; "
36591ae08745Sheppo 		    "received %lu bytes", sizeof (*unreg_msg), msglen);
36601ae08745Sheppo 		return (EBADMSG);
36611ae08745Sheppo 	}
36621ae08745Sheppo 
36631ae08745Sheppo 	if (unreg_msg->dring_ident != vd->dring_ident) {
36643af08d82Slm66018 		PR0("Expected dring ident %lu; received %lu",
36651ae08745Sheppo 		    vd->dring_ident, unreg_msg->dring_ident);
36661ae08745Sheppo 		return (EBADMSG);
36671ae08745Sheppo 	}
36681ae08745Sheppo 
36691ae08745Sheppo 	return (0);
36701ae08745Sheppo }
36711ae08745Sheppo 
36721ae08745Sheppo static int
36731ae08745Sheppo process_rdx_msg(vio_msg_t *msg, size_t msglen)
36741ae08745Sheppo {
36751ae08745Sheppo 	ASSERT(msglen >= sizeof (msg->tag));
36761ae08745Sheppo 
3677d10e4ef2Snarayan 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) {
3678d10e4ef2Snarayan 		PR0("Message is not an RDX message");
3679d10e4ef2Snarayan 		return (ENOMSG);
3680d10e4ef2Snarayan 	}
36811ae08745Sheppo 
36821ae08745Sheppo 	if (msglen != sizeof (vio_rdx_msg_t)) {
36833af08d82Slm66018 		PR0("Expected %lu-byte RDX message; received %lu bytes",
36841ae08745Sheppo 		    sizeof (vio_rdx_msg_t), msglen);
36851ae08745Sheppo 		return (EBADMSG);
36861ae08745Sheppo 	}
36871ae08745Sheppo 
3688d10e4ef2Snarayan 	PR0("Valid RDX message");
36891ae08745Sheppo 	return (0);
36901ae08745Sheppo }
36911ae08745Sheppo 
36921ae08745Sheppo static int
36931ae08745Sheppo vd_check_seq_num(vd_t *vd, uint64_t seq_num)
36941ae08745Sheppo {
36951ae08745Sheppo 	if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) {
36963af08d82Slm66018 		PR0("Received seq_num %lu; expected %lu",
36971ae08745Sheppo 		    seq_num, (vd->seq_num + 1));
36983af08d82Slm66018 		PR0("initiating soft reset");
3699d10e4ef2Snarayan 		vd_need_reset(vd, B_FALSE);
37001ae08745Sheppo 		return (1);
37011ae08745Sheppo 	}
37021ae08745Sheppo 
37031ae08745Sheppo 	vd->seq_num = seq_num;
37041ae08745Sheppo 	vd->initialized |= VD_SEQ_NUM;	/* superfluous after first time... */
37051ae08745Sheppo 	return (0);
37061ae08745Sheppo }
37071ae08745Sheppo 
37081ae08745Sheppo /*
37091ae08745Sheppo  * Return the expected size of an inband-descriptor message with all the
37101ae08745Sheppo  * cookies it claims to include
37111ae08745Sheppo  */
37121ae08745Sheppo static size_t
37131ae08745Sheppo expected_inband_size(vd_dring_inband_msg_t *msg)
37141ae08745Sheppo {
37151ae08745Sheppo 	return ((sizeof (*msg)) +
37161ae08745Sheppo 	    (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0])));
37171ae08745Sheppo }
37181ae08745Sheppo 
37191ae08745Sheppo /*
37201ae08745Sheppo  * Process an in-band descriptor message:  used with clients like OBP, with
37211ae08745Sheppo  * which vds exchanges descriptors within VIO message payloads, rather than
37221ae08745Sheppo  * operating on them within a descriptor ring
37231ae08745Sheppo  */
37241ae08745Sheppo static int
37253af08d82Slm66018 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
37261ae08745Sheppo {
37271ae08745Sheppo 	size_t			expected;
37281ae08745Sheppo 	vd_dring_inband_msg_t	*desc_msg = (vd_dring_inband_msg_t *)msg;
37291ae08745Sheppo 
37301ae08745Sheppo 
37311ae08745Sheppo 	ASSERT(msglen >= sizeof (msg->tag));
37321ae08745Sheppo 
37331ae08745Sheppo 	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
3734d10e4ef2Snarayan 	    VIO_DESC_DATA)) {
3735d10e4ef2Snarayan 		PR1("Message is not an in-band-descriptor message");
3736d10e4ef2Snarayan 		return (ENOMSG);
3737d10e4ef2Snarayan 	}
37381ae08745Sheppo 
37391ae08745Sheppo 	if (msglen < sizeof (*desc_msg)) {
37403af08d82Slm66018 		PR0("Expected at least %lu-byte descriptor message; "
37411ae08745Sheppo 		    "received %lu bytes", sizeof (*desc_msg), msglen);
37421ae08745Sheppo 		return (EBADMSG);
37431ae08745Sheppo 	}
37441ae08745Sheppo 
37451ae08745Sheppo 	if (msglen != (expected = expected_inband_size(desc_msg))) {
37463af08d82Slm66018 		PR0("Expected %lu-byte descriptor message; "
37471ae08745Sheppo 		    "received %lu bytes", expected, msglen);
37481ae08745Sheppo 		return (EBADMSG);
37491ae08745Sheppo 	}
37501ae08745Sheppo 
3751d10e4ef2Snarayan 	if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0)
37521ae08745Sheppo 		return (EBADMSG);
37531ae08745Sheppo 
3754d10e4ef2Snarayan 	/*
3755d10e4ef2Snarayan 	 * Valid message:  Set up the in-band descriptor task and process the
3756d10e4ef2Snarayan 	 * request.  Arrange to acknowledge the client's message, unless an
3757d10e4ef2Snarayan 	 * error processing the descriptor task results in setting
3758d10e4ef2Snarayan 	 * VIO_SUBTYPE_NACK
3759d10e4ef2Snarayan 	 */
3760d10e4ef2Snarayan 	PR1("Valid in-band-descriptor message");
3761d10e4ef2Snarayan 	msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
37623af08d82Slm66018 
37633af08d82Slm66018 	ASSERT(vd->inband_task.msg != NULL);
37643af08d82Slm66018 
37653af08d82Slm66018 	bcopy(msg, vd->inband_task.msg, msglen);
3766d10e4ef2Snarayan 	vd->inband_task.msglen	= msglen;
37673af08d82Slm66018 
37683af08d82Slm66018 	/*
37693af08d82Slm66018 	 * The task request is now the payload of the message
37703af08d82Slm66018 	 * that was just copied into the body of the task.
37713af08d82Slm66018 	 */
37723af08d82Slm66018 	desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg;
3773d10e4ef2Snarayan 	vd->inband_task.request	= &desc_msg->payload;
37743af08d82Slm66018 
3775d10e4ef2Snarayan 	return (vd_process_task(&vd->inband_task));
37761ae08745Sheppo }
37771ae08745Sheppo 
37781ae08745Sheppo static int
3779d10e4ef2Snarayan vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx,
37803af08d82Slm66018     vio_msg_t *msg, size_t msglen)
37811ae08745Sheppo {
37821ae08745Sheppo 	int			status;
3783d10e4ef2Snarayan 	boolean_t		ready;
3784d10e4ef2Snarayan 	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);
37851ae08745Sheppo 
37861ae08745Sheppo 
3787d10e4ef2Snarayan 	/* Accept the updated dring element */
3788d10e4ef2Snarayan 	if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
37893af08d82Slm66018 		PR0("ldc_mem_dring_acquire() returned errno %d", status);
37901ae08745Sheppo 		return (status);
37911ae08745Sheppo 	}
3792d10e4ef2Snarayan 	ready = (elem->hdr.dstate == VIO_DESC_READY);
3793d10e4ef2Snarayan 	if (ready) {
3794d10e4ef2Snarayan 		elem->hdr.dstate = VIO_DESC_ACCEPTED;
3795d10e4ef2Snarayan 	} else {
37963af08d82Slm66018 		PR0("descriptor %u not ready", idx);
3797d10e4ef2Snarayan 		VD_DUMP_DRING_ELEM(elem);
3798d10e4ef2Snarayan 	}
3799d10e4ef2Snarayan 	if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
38003af08d82Slm66018 		PR0("ldc_mem_dring_release() returned errno %d", status);
38011ae08745Sheppo 		return (status);
38021ae08745Sheppo 	}
3803d10e4ef2Snarayan 	if (!ready)
3804d10e4ef2Snarayan 		return (EBUSY);
38051ae08745Sheppo 
38061ae08745Sheppo 
3807d10e4ef2Snarayan 	/* Initialize a task and process the accepted element */
3808d10e4ef2Snarayan 	PR1("Processing dring element %u", idx);
3809d10e4ef2Snarayan 	vd->dring_task[idx].type	= type;
38103af08d82Slm66018 
38113af08d82Slm66018 	/* duplicate msg buf for cookies etc. */
38123af08d82Slm66018 	bcopy(msg, vd->dring_task[idx].msg, msglen);
38133af08d82Slm66018 
3814d10e4ef2Snarayan 	vd->dring_task[idx].msglen	= msglen;
3815205eeb1aSlm66018 	return (vd_process_task(&vd->dring_task[idx]));
38161ae08745Sheppo }
38171ae08745Sheppo 
38181ae08745Sheppo static int
3819d10e4ef2Snarayan vd_process_element_range(vd_t *vd, int start, int end,
38203af08d82Slm66018     vio_msg_t *msg, size_t msglen)
3821d10e4ef2Snarayan {
3822d10e4ef2Snarayan 	int		i, n, nelem, status = 0;
3823d10e4ef2Snarayan 	boolean_t	inprogress = B_FALSE;
3824d10e4ef2Snarayan 	vd_task_type_t	type;
3825d10e4ef2Snarayan 
3826d10e4ef2Snarayan 
3827d10e4ef2Snarayan 	ASSERT(start >= 0);
3828d10e4ef2Snarayan 	ASSERT(end >= 0);
3829d10e4ef2Snarayan 
3830d10e4ef2Snarayan 	/*
3831d10e4ef2Snarayan 	 * Arrange to acknowledge the client's message, unless an error
3832d10e4ef2Snarayan 	 * processing one of the dring elements results in setting
3833d10e4ef2Snarayan 	 * VIO_SUBTYPE_NACK
3834d10e4ef2Snarayan 	 */
3835d10e4ef2Snarayan 	msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
3836d10e4ef2Snarayan 
3837d10e4ef2Snarayan 	/*
3838d10e4ef2Snarayan 	 * Process the dring elements in the range
3839d10e4ef2Snarayan 	 */
3840d10e4ef2Snarayan 	nelem = ((end < start) ? end + vd->dring_len : end) - start + 1;
3841d10e4ef2Snarayan 	for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) {
3842d10e4ef2Snarayan 		((vio_dring_msg_t *)msg)->end_idx = i;
3843d10e4ef2Snarayan 		type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK;
38443af08d82Slm66018 		status = vd_process_element(vd, type, i, msg, msglen);
3845d10e4ef2Snarayan 		if (status == EINPROGRESS)
3846d10e4ef2Snarayan 			inprogress = B_TRUE;
3847d10e4ef2Snarayan 		else if (status != 0)
3848d10e4ef2Snarayan 			break;
3849d10e4ef2Snarayan 	}
3850d10e4ef2Snarayan 
3851d10e4ef2Snarayan 	/*
3852d10e4ef2Snarayan 	 * If some, but not all, operations of a multi-element range are in
3853d10e4ef2Snarayan 	 * progress, wait for other operations to complete before returning
3854d10e4ef2Snarayan 	 * (which will result in "ack" or "nack" of the message).  Note that
3855d10e4ef2Snarayan 	 * all outstanding operations will need to complete, not just the ones
3856d10e4ef2Snarayan 	 * corresponding to the current range of dring elements; howevever, as
3857d10e4ef2Snarayan 	 * this situation is an error case, performance is less critical.
3858d10e4ef2Snarayan 	 */
3859d10e4ef2Snarayan 	if ((nelem > 1) && (status != EINPROGRESS) && inprogress)
3860d10e4ef2Snarayan 		ddi_taskq_wait(vd->completionq);
3861d10e4ef2Snarayan 
3862d10e4ef2Snarayan 	return (status);
3863d10e4ef2Snarayan }
3864d10e4ef2Snarayan 
3865d10e4ef2Snarayan static int
38663af08d82Slm66018 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
38671ae08745Sheppo {
38681ae08745Sheppo 	vio_dring_msg_t	*dring_msg = (vio_dring_msg_t *)msg;
38691ae08745Sheppo 
38701ae08745Sheppo 
38711ae08745Sheppo 	ASSERT(msglen >= sizeof (msg->tag));
38721ae08745Sheppo 
38731ae08745Sheppo 	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
38741ae08745Sheppo 	    VIO_DRING_DATA)) {
3875d10e4ef2Snarayan 		PR1("Message is not a dring-data message");
3876d10e4ef2Snarayan 		return (ENOMSG);
38771ae08745Sheppo 	}
38781ae08745Sheppo 
38791ae08745Sheppo 	if (msglen != sizeof (*dring_msg)) {
38803af08d82Slm66018 		PR0("Expected %lu-byte dring message; received %lu bytes",
38811ae08745Sheppo 		    sizeof (*dring_msg), msglen);
38821ae08745Sheppo 		return (EBADMSG);
38831ae08745Sheppo 	}
38841ae08745Sheppo 
3885d10e4ef2Snarayan 	if (vd_check_seq_num(vd, dring_msg->seq_num) != 0)
38861ae08745Sheppo 		return (EBADMSG);
38871ae08745Sheppo 
38881ae08745Sheppo 	if (dring_msg->dring_ident != vd->dring_ident) {
38893af08d82Slm66018 		PR0("Expected dring ident %lu; received ident %lu",
38901ae08745Sheppo 		    vd->dring_ident, dring_msg->dring_ident);
38911ae08745Sheppo 		return (EBADMSG);
38921ae08745Sheppo 	}
38931ae08745Sheppo 
3894d10e4ef2Snarayan 	if (dring_msg->start_idx >= vd->dring_len) {
38953af08d82Slm66018 		PR0("\"start_idx\" = %u; must be less than %u",
3896d10e4ef2Snarayan 		    dring_msg->start_idx, vd->dring_len);
3897d10e4ef2Snarayan 		return (EBADMSG);
3898d10e4ef2Snarayan 	}
38991ae08745Sheppo 
3900d10e4ef2Snarayan 	if ((dring_msg->end_idx < 0) ||
3901d10e4ef2Snarayan 	    (dring_msg->end_idx >= vd->dring_len)) {
39023af08d82Slm66018 		PR0("\"end_idx\" = %u; must be >= 0 and less than %u",
3903d10e4ef2Snarayan 		    dring_msg->end_idx, vd->dring_len);
3904d10e4ef2Snarayan 		return (EBADMSG);
3905d10e4ef2Snarayan 	}
3906d10e4ef2Snarayan 
3907d10e4ef2Snarayan 	/* Valid message; process range of updated dring elements */
3908d10e4ef2Snarayan 	PR1("Processing descriptor range, start = %u, end = %u",
3909d10e4ef2Snarayan 	    dring_msg->start_idx, dring_msg->end_idx);
3910d10e4ef2Snarayan 	return (vd_process_element_range(vd, dring_msg->start_idx,
39113af08d82Slm66018 	    dring_msg->end_idx, msg, msglen));
39121ae08745Sheppo }
39131ae08745Sheppo 
39141ae08745Sheppo static int
39151ae08745Sheppo recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes)
39161ae08745Sheppo {
39171ae08745Sheppo 	int	retry, status;
39181ae08745Sheppo 	size_t	size = *nbytes;
39191ae08745Sheppo 
39201ae08745Sheppo 
39211ae08745Sheppo 	for (retry = 0, status = ETIMEDOUT;
39221ae08745Sheppo 	    retry < vds_ldc_retries && status == ETIMEDOUT;
39231ae08745Sheppo 	    retry++) {
39241ae08745Sheppo 		PR1("ldc_read() attempt %d", (retry + 1));
39251ae08745Sheppo 		*nbytes = size;
39261ae08745Sheppo 		status = ldc_read(ldc_handle, msg, nbytes);
39271ae08745Sheppo 	}
39281ae08745Sheppo 
39293af08d82Slm66018 	if (status) {
39303af08d82Slm66018 		PR0("ldc_read() returned errno %d", status);
39313af08d82Slm66018 		if (status != ECONNRESET)
39323af08d82Slm66018 			return (ENOMSG);
39331ae08745Sheppo 		return (status);
39341ae08745Sheppo 	} else if (*nbytes == 0) {
39351ae08745Sheppo 		PR1("ldc_read() returned 0 and no message read");
39361ae08745Sheppo 		return (ENOMSG);
39371ae08745Sheppo 	}
39381ae08745Sheppo 
39391ae08745Sheppo 	PR1("RCVD %lu-byte message", *nbytes);
39401ae08745Sheppo 	return (0);
39411ae08745Sheppo }
39421ae08745Sheppo 
39431ae08745Sheppo static int
39443af08d82Slm66018 vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
39451ae08745Sheppo {
39461ae08745Sheppo 	int		status;
39471ae08745Sheppo 
39481ae08745Sheppo 
39491ae08745Sheppo 	PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype,
39501ae08745Sheppo 	    msg->tag.vio_subtype, msg->tag.vio_subtype_env);
39513af08d82Slm66018 #ifdef	DEBUG
39523af08d82Slm66018 	vd_decode_tag(msg);
39533af08d82Slm66018 #endif
39541ae08745Sheppo 
39551ae08745Sheppo 	/*
39561ae08745Sheppo 	 * Validate session ID up front, since it applies to all messages
39571ae08745Sheppo 	 * once set
39581ae08745Sheppo 	 */
39591ae08745Sheppo 	if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) {
39603af08d82Slm66018 		PR0("Expected SID %u, received %u", vd->sid,
39611ae08745Sheppo 		    msg->tag.vio_sid);
39621ae08745Sheppo 		return (EBADMSG);
39631ae08745Sheppo 	}
39641ae08745Sheppo 
39653af08d82Slm66018 	PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state));
39661ae08745Sheppo 
39671ae08745Sheppo 	/*
39681ae08745Sheppo 	 * Process the received message based on connection state
39691ae08745Sheppo 	 */
39701ae08745Sheppo 	switch (vd->state) {
39711ae08745Sheppo 	case VD_STATE_INIT:	/* expect version message */
39720a55fbb7Slm66018 		if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0)
39731ae08745Sheppo 			return (status);
39741ae08745Sheppo 
39751ae08745Sheppo 		/* Version negotiated, move to that state */
39761ae08745Sheppo 		vd->state = VD_STATE_VER;
39771ae08745Sheppo 		return (0);
39781ae08745Sheppo 
39791ae08745Sheppo 	case VD_STATE_VER:	/* expect attribute message */
39801ae08745Sheppo 		if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0)
39811ae08745Sheppo 			return (status);
39821ae08745Sheppo 
39831ae08745Sheppo 		/* Attributes exchanged, move to that state */
39841ae08745Sheppo 		vd->state = VD_STATE_ATTR;
39851ae08745Sheppo 		return (0);
39861ae08745Sheppo 
39871ae08745Sheppo 	case VD_STATE_ATTR:
39881ae08745Sheppo 		switch (vd->xfer_mode) {
39891ae08745Sheppo 		case VIO_DESC_MODE:	/* expect RDX message */
39901ae08745Sheppo 			if ((status = process_rdx_msg(msg, msglen)) != 0)
39911ae08745Sheppo 				return (status);
39921ae08745Sheppo 
39931ae08745Sheppo 			/* Ready to receive in-band descriptors */
39941ae08745Sheppo 			vd->state = VD_STATE_DATA;
39951ae08745Sheppo 			return (0);
39961ae08745Sheppo 
3997*f0ca1d9aSsb155480 		case VIO_DRING_MODE_V1_0:  /* expect register-dring message */
39981ae08745Sheppo 			if ((status =
39991ae08745Sheppo 			    vd_process_dring_reg_msg(vd, msg, msglen)) != 0)
40001ae08745Sheppo 				return (status);
40011ae08745Sheppo 
40021ae08745Sheppo 			/* One dring negotiated, move to that state */
40031ae08745Sheppo 			vd->state = VD_STATE_DRING;
40041ae08745Sheppo 			return (0);
40051ae08745Sheppo 
40061ae08745Sheppo 		default:
40071ae08745Sheppo 			ASSERT("Unsupported transfer mode");
40083af08d82Slm66018 			PR0("Unsupported transfer mode");
40091ae08745Sheppo 			return (ENOTSUP);
40101ae08745Sheppo 		}
40111ae08745Sheppo 
40121ae08745Sheppo 	case VD_STATE_DRING:	/* expect RDX, register-dring, or unreg-dring */
40131ae08745Sheppo 		if ((status = process_rdx_msg(msg, msglen)) == 0) {
40141ae08745Sheppo 			/* Ready to receive data */
40151ae08745Sheppo 			vd->state = VD_STATE_DATA;
40161ae08745Sheppo 			return (0);
40171ae08745Sheppo 		} else if (status != ENOMSG) {
40181ae08745Sheppo 			return (status);
40191ae08745Sheppo 		}
40201ae08745Sheppo 
40211ae08745Sheppo 
40221ae08745Sheppo 		/*
40231ae08745Sheppo 		 * If another register-dring message is received, stay in
40241ae08745Sheppo 		 * dring state in case the client sends RDX; although the
40251ae08745Sheppo 		 * protocol allows multiple drings, this server does not
40261ae08745Sheppo 		 * support using more than one
40271ae08745Sheppo 		 */
40281ae08745Sheppo 		if ((status =
40291ae08745Sheppo 		    vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG)
40301ae08745Sheppo 			return (status);
40311ae08745Sheppo 
40321ae08745Sheppo 		/*
40331ae08745Sheppo 		 * Acknowledge an unregister-dring message, but reset the
40341ae08745Sheppo 		 * connection anyway:  Although the protocol allows
40351ae08745Sheppo 		 * unregistering drings, this server cannot serve a vdisk
40361ae08745Sheppo 		 * without its only dring
40371ae08745Sheppo 		 */
40381ae08745Sheppo 		status = vd_process_dring_unreg_msg(vd, msg, msglen);
40391ae08745Sheppo 		return ((status == 0) ? ENOTSUP : status);
40401ae08745Sheppo 
40411ae08745Sheppo 	case VD_STATE_DATA:
40421ae08745Sheppo 		switch (vd->xfer_mode) {
40431ae08745Sheppo 		case VIO_DESC_MODE:	/* expect in-band-descriptor message */
40443af08d82Slm66018 			return (vd_process_desc_msg(vd, msg, msglen));
40451ae08745Sheppo 
4046*f0ca1d9aSsb155480 		case VIO_DRING_MODE_V1_0: /* expect dring-data or unreg-dring */
40471ae08745Sheppo 			/*
40481ae08745Sheppo 			 * Typically expect dring-data messages, so handle
40491ae08745Sheppo 			 * them first
40501ae08745Sheppo 			 */
40511ae08745Sheppo 			if ((status = vd_process_dring_msg(vd, msg,
40523af08d82Slm66018 			    msglen)) != ENOMSG)
40531ae08745Sheppo 				return (status);
40541ae08745Sheppo 
40551ae08745Sheppo 			/*
40561ae08745Sheppo 			 * Acknowledge an unregister-dring message, but reset
40571ae08745Sheppo 			 * the connection anyway:  Although the protocol
40581ae08745Sheppo 			 * allows unregistering drings, this server cannot
40591ae08745Sheppo 			 * serve a vdisk without its only dring
40601ae08745Sheppo 			 */
40611ae08745Sheppo 			status = vd_process_dring_unreg_msg(vd, msg, msglen);
40621ae08745Sheppo 			return ((status == 0) ? ENOTSUP : status);
40631ae08745Sheppo 
40641ae08745Sheppo 		default:
40651ae08745Sheppo 			ASSERT("Unsupported transfer mode");
40663af08d82Slm66018 			PR0("Unsupported transfer mode");
40671ae08745Sheppo 			return (ENOTSUP);
40681ae08745Sheppo 		}
40691ae08745Sheppo 
40701ae08745Sheppo 	default:
40711ae08745Sheppo 		ASSERT("Invalid client connection state");
40723af08d82Slm66018 		PR0("Invalid client connection state");
40731ae08745Sheppo 		return (ENOTSUP);
40741ae08745Sheppo 	}
40751ae08745Sheppo }
40761ae08745Sheppo 
4077d10e4ef2Snarayan static int
40783af08d82Slm66018 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
40791ae08745Sheppo {
40801ae08745Sheppo 	int		status;
40811ae08745Sheppo 	boolean_t	reset_ldc = B_FALSE;
4082205eeb1aSlm66018 	vd_task_t	task;
40831ae08745Sheppo 
40841ae08745Sheppo 	/*
40851ae08745Sheppo 	 * Check that the message is at least big enough for a "tag", so that
40861ae08745Sheppo 	 * message processing can proceed based on tag-specified message type
40871ae08745Sheppo 	 */
40881ae08745Sheppo 	if (msglen < sizeof (vio_msg_tag_t)) {
40893af08d82Slm66018 		PR0("Received short (%lu-byte) message", msglen);
40901ae08745Sheppo 		/* Can't "nack" short message, so drop the big hammer */
40913af08d82Slm66018 		PR0("initiating full reset");
4092d10e4ef2Snarayan 		vd_need_reset(vd, B_TRUE);
4093d10e4ef2Snarayan 		return (EBADMSG);
40941ae08745Sheppo 	}
40951ae08745Sheppo 
40961ae08745Sheppo 	/*
40971ae08745Sheppo 	 * Process the message
40981ae08745Sheppo 	 */
40993af08d82Slm66018 	switch (status = vd_do_process_msg(vd, msg, msglen)) {
41001ae08745Sheppo 	case 0:
41011ae08745Sheppo 		/* "ack" valid, successfully-processed messages */
41021ae08745Sheppo 		msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
41031ae08745Sheppo 		break;
41041ae08745Sheppo 
4105d10e4ef2Snarayan 	case EINPROGRESS:
4106d10e4ef2Snarayan 		/* The completion handler will "ack" or "nack" the message */
4107d10e4ef2Snarayan 		return (EINPROGRESS);
41081ae08745Sheppo 	case ENOMSG:
41093af08d82Slm66018 		PR0("Received unexpected message");
41101ae08745Sheppo 		_NOTE(FALLTHROUGH);
41111ae08745Sheppo 	case EBADMSG:
41121ae08745Sheppo 	case ENOTSUP:
4113205eeb1aSlm66018 		/* "transport" error will cause NACK of invalid messages */
41141ae08745Sheppo 		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
41151ae08745Sheppo 		break;
41161ae08745Sheppo 
41171ae08745Sheppo 	default:
4118205eeb1aSlm66018 		/* "transport" error will cause NACK of invalid messages */
41191ae08745Sheppo 		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
41201ae08745Sheppo 		/* An LDC error probably occurred, so try resetting it */
41211ae08745Sheppo 		reset_ldc = B_TRUE;
41221ae08745Sheppo 		break;
41231ae08745Sheppo 	}
41241ae08745Sheppo 
41253af08d82Slm66018 	PR1("\tResulting in state %d (%s)", vd->state,
41263af08d82Slm66018 	    vd_decode_state(vd->state));
41273af08d82Slm66018 
4128205eeb1aSlm66018 	/* populate the task so we can dispatch it on the taskq */
4129205eeb1aSlm66018 	task.vd = vd;
4130205eeb1aSlm66018 	task.msg = msg;
4131205eeb1aSlm66018 	task.msglen = msglen;
4132205eeb1aSlm66018 
4133205eeb1aSlm66018 	/*
4134205eeb1aSlm66018 	 * Queue a task to send the notification that the operation completed.
4135205eeb1aSlm66018 	 * We need to ensure that requests are responded to in the correct
4136205eeb1aSlm66018 	 * order and since the taskq is processed serially this ordering
4137205eeb1aSlm66018 	 * is maintained.
4138205eeb1aSlm66018 	 */
4139205eeb1aSlm66018 	(void) ddi_taskq_dispatch(vd->completionq, vd_serial_notify,
4140205eeb1aSlm66018 	    &task, DDI_SLEEP);
4141205eeb1aSlm66018 
4142205eeb1aSlm66018 	/*
4143205eeb1aSlm66018 	 * To ensure handshake negotiations do not happen out of order, such
4144205eeb1aSlm66018 	 * requests that come through this path should not be done in parallel
4145205eeb1aSlm66018 	 * so we need to wait here until the response is sent to the client.
4146205eeb1aSlm66018 	 */
4147205eeb1aSlm66018 	ddi_taskq_wait(vd->completionq);
41481ae08745Sheppo 
4149d10e4ef2Snarayan 	/* Arrange to reset the connection for nack'ed or failed messages */
41503af08d82Slm66018 	if ((status != 0) || reset_ldc) {
41513af08d82Slm66018 		PR0("initiating %s reset",
41523af08d82Slm66018 		    (reset_ldc) ? "full" : "soft");
4153d10e4ef2Snarayan 		vd_need_reset(vd, reset_ldc);
41543af08d82Slm66018 	}
4155d10e4ef2Snarayan 
4156d10e4ef2Snarayan 	return (status);
4157d10e4ef2Snarayan }
4158d10e4ef2Snarayan 
4159d10e4ef2Snarayan static boolean_t
4160d10e4ef2Snarayan vd_enabled(vd_t *vd)
4161d10e4ef2Snarayan {
4162d10e4ef2Snarayan 	boolean_t	enabled;
4163d10e4ef2Snarayan 
4164d10e4ef2Snarayan 	mutex_enter(&vd->lock);
4165d10e4ef2Snarayan 	enabled = vd->enabled;
4166d10e4ef2Snarayan 	mutex_exit(&vd->lock);
4167d10e4ef2Snarayan 	return (enabled);
41681ae08745Sheppo }
41691ae08745Sheppo 
41701ae08745Sheppo static void
41710a55fbb7Slm66018 vd_recv_msg(void *arg)
41721ae08745Sheppo {
41731ae08745Sheppo 	vd_t	*vd = (vd_t *)arg;
41743af08d82Slm66018 	int	rv = 0, status = 0;
41751ae08745Sheppo 
41761ae08745Sheppo 	ASSERT(vd != NULL);
41773af08d82Slm66018 
4178d10e4ef2Snarayan 	PR2("New task to receive incoming message(s)");
41793af08d82Slm66018 
41803af08d82Slm66018 
4181d10e4ef2Snarayan 	while (vd_enabled(vd) && status == 0) {
4182d10e4ef2Snarayan 		size_t		msglen, msgsize;
41833af08d82Slm66018 		ldc_status_t	lstatus;
4184d10e4ef2Snarayan 
41850a55fbb7Slm66018 		/*
4186d10e4ef2Snarayan 		 * Receive and process a message
41870a55fbb7Slm66018 		 */
4188d10e4ef2Snarayan 		vd_reset_if_needed(vd);	/* can change vd->max_msglen */
41893af08d82Slm66018 
41903af08d82Slm66018 		/*
41913af08d82Slm66018 		 * check if channel is UP - else break out of loop
41923af08d82Slm66018 		 */
41933af08d82Slm66018 		status = ldc_status(vd->ldc_handle, &lstatus);
41943af08d82Slm66018 		if (lstatus != LDC_UP) {
41953af08d82Slm66018 			PR0("channel not up (status=%d), exiting recv loop\n",
41963af08d82Slm66018 			    lstatus);
41973af08d82Slm66018 			break;
41983af08d82Slm66018 		}
41993af08d82Slm66018 
42003af08d82Slm66018 		ASSERT(vd->max_msglen != 0);
42013af08d82Slm66018 
4202d10e4ef2Snarayan 		msgsize = vd->max_msglen; /* stable copy for alloc/free */
42033af08d82Slm66018 		msglen	= msgsize;	  /* actual len after recv_msg() */
42043af08d82Slm66018 
42053af08d82Slm66018 		status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen);
42063af08d82Slm66018 		switch (status) {
42073af08d82Slm66018 		case 0:
42083af08d82Slm66018 			rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp,
42093af08d82Slm66018 			    msglen);
42103af08d82Slm66018 			/* check if max_msglen changed */
42113af08d82Slm66018 			if (msgsize != vd->max_msglen) {
42123af08d82Slm66018 				PR0("max_msglen changed 0x%lx to 0x%lx bytes\n",
42133af08d82Slm66018 				    msgsize, vd->max_msglen);
42143af08d82Slm66018 				kmem_free(vd->vio_msgp, msgsize);
42153af08d82Slm66018 				vd->vio_msgp =
42163af08d82Slm66018 				    kmem_alloc(vd->max_msglen, KM_SLEEP);
42173af08d82Slm66018 			}
42183af08d82Slm66018 			if (rv == EINPROGRESS)
42193af08d82Slm66018 				continue;
42203af08d82Slm66018 			break;
42213af08d82Slm66018 
42223af08d82Slm66018 		case ENOMSG:
42233af08d82Slm66018 			break;
42243af08d82Slm66018 
42253af08d82Slm66018 		case ECONNRESET:
42263af08d82Slm66018 			PR0("initiating soft reset (ECONNRESET)\n");
42273af08d82Slm66018 			vd_need_reset(vd, B_FALSE);
42283af08d82Slm66018 			status = 0;
42293af08d82Slm66018 			break;
42303af08d82Slm66018 
42313af08d82Slm66018 		default:
4232d10e4ef2Snarayan 			/* Probably an LDC failure; arrange to reset it */
42333af08d82Slm66018 			PR0("initiating full reset (status=0x%x)", status);
4234d10e4ef2Snarayan 			vd_need_reset(vd, B_TRUE);
42353af08d82Slm66018 			break;
42360a55fbb7Slm66018 		}
42371ae08745Sheppo 	}
42383af08d82Slm66018 
4239d10e4ef2Snarayan 	PR2("Task finished");
42400a55fbb7Slm66018 }
42410a55fbb7Slm66018 
42420a55fbb7Slm66018 static uint_t
42431ae08745Sheppo vd_handle_ldc_events(uint64_t event, caddr_t arg)
42441ae08745Sheppo {
42451ae08745Sheppo 	vd_t	*vd = (vd_t *)(void *)arg;
42463af08d82Slm66018 	int	status;
42471ae08745Sheppo 
42481ae08745Sheppo 	ASSERT(vd != NULL);
4249d10e4ef2Snarayan 
4250d10e4ef2Snarayan 	if (!vd_enabled(vd))
4251d10e4ef2Snarayan 		return (LDC_SUCCESS);
4252d10e4ef2Snarayan 
42533af08d82Slm66018 	if (event & LDC_EVT_DOWN) {
425434683adeSsg70180 		PR0("LDC_EVT_DOWN: LDC channel went down");
42553af08d82Slm66018 
42563af08d82Slm66018 		vd_need_reset(vd, B_TRUE);
42573af08d82Slm66018 		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
42583af08d82Slm66018 		    DDI_SLEEP);
42593af08d82Slm66018 		if (status == DDI_FAILURE) {
42603af08d82Slm66018 			PR0("cannot schedule task to recv msg\n");
42613af08d82Slm66018 			vd_need_reset(vd, B_TRUE);
42623af08d82Slm66018 		}
42633af08d82Slm66018 	}
42643af08d82Slm66018 
4265d10e4ef2Snarayan 	if (event & LDC_EVT_RESET) {
42663af08d82Slm66018 		PR0("LDC_EVT_RESET: LDC channel was reset");
42673af08d82Slm66018 
42683af08d82Slm66018 		if (vd->state != VD_STATE_INIT) {
42693af08d82Slm66018 			PR0("scheduling full reset");
42703af08d82Slm66018 			vd_need_reset(vd, B_FALSE);
42713af08d82Slm66018 			status = ddi_taskq_dispatch(vd->startq, vd_recv_msg,
42723af08d82Slm66018 			    vd, DDI_SLEEP);
42733af08d82Slm66018 			if (status == DDI_FAILURE) {
42743af08d82Slm66018 				PR0("cannot schedule task to recv msg\n");
42753af08d82Slm66018 				vd_need_reset(vd, B_TRUE);
42763af08d82Slm66018 			}
42773af08d82Slm66018 
42783af08d82Slm66018 		} else {
42793af08d82Slm66018 			PR0("channel already reset, ignoring...\n");
42803af08d82Slm66018 			PR0("doing ldc up...\n");
42813af08d82Slm66018 			(void) ldc_up(vd->ldc_handle);
42823af08d82Slm66018 		}
42833af08d82Slm66018 
4284d10e4ef2Snarayan 		return (LDC_SUCCESS);
4285d10e4ef2Snarayan 	}
4286d10e4ef2Snarayan 
4287d10e4ef2Snarayan 	if (event & LDC_EVT_UP) {
42883af08d82Slm66018 		PR0("EVT_UP: LDC is up\nResetting client connection state");
42893af08d82Slm66018 		PR0("initiating soft reset");
4290d10e4ef2Snarayan 		vd_need_reset(vd, B_FALSE);
42913af08d82Slm66018 		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg,
42923af08d82Slm66018 		    vd, DDI_SLEEP);
42933af08d82Slm66018 		if (status == DDI_FAILURE) {
42943af08d82Slm66018 			PR0("cannot schedule task to recv msg\n");
42953af08d82Slm66018 			vd_need_reset(vd, B_TRUE);
42963af08d82Slm66018 			return (LDC_SUCCESS);
42973af08d82Slm66018 		}
4298d10e4ef2Snarayan 	}
4299d10e4ef2Snarayan 
4300d10e4ef2Snarayan 	if (event & LDC_EVT_READ) {
4301d10e4ef2Snarayan 		int	status;
4302d10e4ef2Snarayan 
4303d10e4ef2Snarayan 		PR1("New data available");
4304d10e4ef2Snarayan 		/* Queue a task to receive the new data */
4305d10e4ef2Snarayan 		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
4306d10e4ef2Snarayan 		    DDI_SLEEP);
43073af08d82Slm66018 
43083af08d82Slm66018 		if (status == DDI_FAILURE) {
43093af08d82Slm66018 			PR0("cannot schedule task to recv msg\n");
43103af08d82Slm66018 			vd_need_reset(vd, B_TRUE);
43113af08d82Slm66018 		}
4312d10e4ef2Snarayan 	}
4313d10e4ef2Snarayan 
4314d10e4ef2Snarayan 	return (LDC_SUCCESS);
43151ae08745Sheppo }
43161ae08745Sheppo 
43171ae08745Sheppo static uint_t
43181ae08745Sheppo vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
43191ae08745Sheppo {
43201ae08745Sheppo 	_NOTE(ARGUNUSED(key, val))
43211ae08745Sheppo 	(*((uint_t *)arg))++;
43221ae08745Sheppo 	return (MH_WALK_TERMINATE);
43231ae08745Sheppo }
43241ae08745Sheppo 
43251ae08745Sheppo 
43261ae08745Sheppo static int
43271ae08745Sheppo vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
43281ae08745Sheppo {
43291ae08745Sheppo 	uint_t	vd_present = 0;
43301ae08745Sheppo 	minor_t	instance;
43311ae08745Sheppo 	vds_t	*vds;
43321ae08745Sheppo 
43331ae08745Sheppo 
43341ae08745Sheppo 	switch (cmd) {
43351ae08745Sheppo 	case DDI_DETACH:
43361ae08745Sheppo 		/* the real work happens below */
43371ae08745Sheppo 		break;
43381ae08745Sheppo 	case DDI_SUSPEND:
4339d10e4ef2Snarayan 		PR0("No action required for DDI_SUSPEND");
43401ae08745Sheppo 		return (DDI_SUCCESS);
43411ae08745Sheppo 	default:
43423af08d82Slm66018 		PR0("Unrecognized \"cmd\"");
43431ae08745Sheppo 		return (DDI_FAILURE);
43441ae08745Sheppo 	}
43451ae08745Sheppo 
43461ae08745Sheppo 	ASSERT(cmd == DDI_DETACH);
43471ae08745Sheppo 	instance = ddi_get_instance(dip);
43481ae08745Sheppo 	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
43493af08d82Slm66018 		PR0("Could not get state for instance %u", instance);
43501ae08745Sheppo 		ddi_soft_state_free(vds_state, instance);
43511ae08745Sheppo 		return (DDI_FAILURE);
43521ae08745Sheppo 	}
43531ae08745Sheppo 
43541ae08745Sheppo 	/* Do no detach when serving any vdisks */
43551ae08745Sheppo 	mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present);
43561ae08745Sheppo 	if (vd_present) {
43571ae08745Sheppo 		PR0("Not detaching because serving vdisks");
43581ae08745Sheppo 		return (DDI_FAILURE);
43591ae08745Sheppo 	}
43601ae08745Sheppo 
43611ae08745Sheppo 	PR0("Detaching");
4362445b4c2eSsb155480 	if (vds->initialized & VDS_MDEG) {
43631ae08745Sheppo 		(void) mdeg_unregister(vds->mdeg);
4364445b4c2eSsb155480 		kmem_free(vds->ispecp->specp, sizeof (vds_prop_template));
4365445b4c2eSsb155480 		kmem_free(vds->ispecp, sizeof (mdeg_node_spec_t));
4366445b4c2eSsb155480 		vds->ispecp = NULL;
4367445b4c2eSsb155480 		vds->mdeg = NULL;
4368445b4c2eSsb155480 	}
4369445b4c2eSsb155480 
43701ae08745Sheppo 	if (vds->initialized & VDS_LDI)
43711ae08745Sheppo 		(void) ldi_ident_release(vds->ldi_ident);
43721ae08745Sheppo 	mod_hash_destroy_hash(vds->vd_table);
43731ae08745Sheppo 	ddi_soft_state_free(vds_state, instance);
43741ae08745Sheppo 	return (DDI_SUCCESS);
43751ae08745Sheppo }
43761ae08745Sheppo 
43771ae08745Sheppo static boolean_t
43781ae08745Sheppo is_pseudo_device(dev_info_t *dip)
43791ae08745Sheppo {
43801ae08745Sheppo 	dev_info_t	*parent, *root = ddi_root_node();
43811ae08745Sheppo 
43821ae08745Sheppo 
43831ae08745Sheppo 	for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root);
43841ae08745Sheppo 	    parent = ddi_get_parent(parent)) {
43851ae08745Sheppo 		if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0)
43861ae08745Sheppo 			return (B_TRUE);
43871ae08745Sheppo 	}
43881ae08745Sheppo 
43891ae08745Sheppo 	return (B_FALSE);
43901ae08745Sheppo }
43911ae08745Sheppo 
439217cadca8Slm66018 /*
439317cadca8Slm66018  * Description:
439417cadca8Slm66018  *	This function checks to see if the file being used as a
439517cadca8Slm66018  *	virtual disk is an ISO image. An ISO image is a special
439617cadca8Slm66018  *	case which can be booted/installed from like a CD/DVD
439717cadca8Slm66018  *
439817cadca8Slm66018  * Parameters:
439917cadca8Slm66018  *	vd		- disk on which the operation is performed.
440017cadca8Slm66018  *
440117cadca8Slm66018  * Return Code:
440217cadca8Slm66018  *	B_TRUE		- The file is an ISO 9660 compliant image
440317cadca8Slm66018  *	B_FALSE		- just a regular disk image file
440417cadca8Slm66018  */
440517cadca8Slm66018 static boolean_t
440617cadca8Slm66018 vd_file_is_iso_image(vd_t *vd)
440717cadca8Slm66018 {
440817cadca8Slm66018 	char	iso_buf[ISO_SECTOR_SIZE];
440917cadca8Slm66018 	int	i, rv;
441017cadca8Slm66018 	uint_t	sec;
441117cadca8Slm66018 
441217cadca8Slm66018 	ASSERT(vd->file);
441317cadca8Slm66018 
441417cadca8Slm66018 	/*
441517cadca8Slm66018 	 * If we have already discovered and saved this info we can
441617cadca8Slm66018 	 * short-circuit the check and avoid reading the file.
441717cadca8Slm66018 	 */
441817cadca8Slm66018 	if (vd->vdisk_media == VD_MEDIA_DVD || vd->vdisk_media == VD_MEDIA_CD)
441917cadca8Slm66018 		return (B_TRUE);
442017cadca8Slm66018 
442117cadca8Slm66018 	/*
442217cadca8Slm66018 	 * We wish to read the sector that should contain the 2nd ISO volume
442317cadca8Slm66018 	 * descriptor. The second field in this descriptor is called the
442417cadca8Slm66018 	 * Standard Identifier and is set to CD001 for a CD-ROM compliant
442517cadca8Slm66018 	 * to the ISO 9660 standard.
442617cadca8Slm66018 	 */
442717cadca8Slm66018 	sec = (ISO_VOLDESC_SEC * ISO_SECTOR_SIZE) / vd->vdisk_block_size;
442817cadca8Slm66018 	rv = vd_file_rw(vd, VD_SLICE_NONE, VD_OP_BREAD, (caddr_t)iso_buf,
442917cadca8Slm66018 	    sec, ISO_SECTOR_SIZE);
443017cadca8Slm66018 
443117cadca8Slm66018 	if (rv < 0)
443217cadca8Slm66018 		return (B_FALSE);
443317cadca8Slm66018 
443417cadca8Slm66018 	for (i = 0; i < ISO_ID_STRLEN; i++) {
443517cadca8Slm66018 		if (ISO_STD_ID(iso_buf)[i] != ISO_ID_STRING[i])
443617cadca8Slm66018 			return (B_FALSE);
443717cadca8Slm66018 	}
443817cadca8Slm66018 
443917cadca8Slm66018 	return (B_TRUE);
444017cadca8Slm66018 }
444117cadca8Slm66018 
444217cadca8Slm66018 /*
444317cadca8Slm66018  * Description:
444417cadca8Slm66018  *	This function checks to see if the virtual device is an ATAPI
444517cadca8Slm66018  *	device. ATAPI devices use Group 1 Read/Write commands, so
444617cadca8Slm66018  *	any USCSI calls vds makes need to take this into account.
444717cadca8Slm66018  *
444817cadca8Slm66018  * Parameters:
444917cadca8Slm66018  *	vd		- disk on which the operation is performed.
445017cadca8Slm66018  *
445117cadca8Slm66018  * Return Code:
445217cadca8Slm66018  *	B_TRUE		- The virtual disk is backed by an ATAPI device
445317cadca8Slm66018  *	B_FALSE		- not an ATAPI device (presumably SCSI)
445417cadca8Slm66018  */
445517cadca8Slm66018 static boolean_t
445617cadca8Slm66018 vd_is_atapi_device(vd_t *vd)
445717cadca8Slm66018 {
445817cadca8Slm66018 	boolean_t	is_atapi = B_FALSE;
445917cadca8Slm66018 	char		*variantp;
446017cadca8Slm66018 	int		rv;
446117cadca8Slm66018 
446217cadca8Slm66018 	ASSERT(vd->ldi_handle[0] != NULL);
446317cadca8Slm66018 	ASSERT(!vd->file);
446417cadca8Slm66018 
446517cadca8Slm66018 	rv = ldi_prop_lookup_string(vd->ldi_handle[0],
446617cadca8Slm66018 	    (LDI_DEV_T_ANY | DDI_PROP_DONTPASS), "variant", &variantp);
446717cadca8Slm66018 	if (rv == DDI_PROP_SUCCESS) {
446817cadca8Slm66018 		PR0("'variant' property exists for %s", vd->device_path);
446917cadca8Slm66018 		if (strcmp(variantp, "atapi") == 0)
447017cadca8Slm66018 			is_atapi = B_TRUE;
447117cadca8Slm66018 		ddi_prop_free(variantp);
447217cadca8Slm66018 	}
447317cadca8Slm66018 
447417cadca8Slm66018 	rv = ldi_prop_exists(vd->ldi_handle[0], LDI_DEV_T_ANY, "atapi");
447517cadca8Slm66018 	if (rv) {
447617cadca8Slm66018 		PR0("'atapi' property exists for %s", vd->device_path);
447717cadca8Slm66018 		is_atapi = B_TRUE;
447817cadca8Slm66018 	}
447917cadca8Slm66018 
448017cadca8Slm66018 	return (is_atapi);
448117cadca8Slm66018 }
448217cadca8Slm66018 
44831ae08745Sheppo static int
44842f5224aeSachartre vd_setup_mediainfo(vd_t *vd)
44850a55fbb7Slm66018 {
44862f5224aeSachartre 	int status, rval;
44874bac2208Snarayan 	struct dk_minfo	dk_minfo;
44880a55fbb7Slm66018 
44892f5224aeSachartre 	ASSERT(vd->ldi_handle[0] != NULL);
44902f5224aeSachartre 	ASSERT(vd->vdisk_block_size != 0);
44912f5224aeSachartre 
44922f5224aeSachartre 	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO,
44932f5224aeSachartre 	    (intptr_t)&dk_minfo, (vd->open_flags | FKIOCTL),
44942f5224aeSachartre 	    kcred, &rval)) != 0)
44952f5224aeSachartre 		return (status);
44962f5224aeSachartre 
44972f5224aeSachartre 	ASSERT(dk_minfo.dki_lbsize % vd->vdisk_block_size == 0);
44982f5224aeSachartre 
44992f5224aeSachartre 	vd->block_size = dk_minfo.dki_lbsize;
45002f5224aeSachartre 	vd->vdisk_size = (dk_minfo.dki_capacity * dk_minfo.dki_lbsize) /
45012f5224aeSachartre 	    vd->vdisk_block_size;
45022f5224aeSachartre 	vd->vdisk_media = DK_MEDIATYPE2VD_MEDIATYPE(dk_minfo.dki_media_type);
45032f5224aeSachartre 	return (0);
45042f5224aeSachartre }
45052f5224aeSachartre 
45062f5224aeSachartre static int
45072f5224aeSachartre vd_setup_full_disk(vd_t *vd)
45082f5224aeSachartre {
45092f5224aeSachartre 	int		status;
45102f5224aeSachartre 	major_t		major = getmajor(vd->dev[0]);
45112f5224aeSachartre 	minor_t		minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE;
45122f5224aeSachartre 
4513047ba61eSachartre 	ASSERT(vd->vdisk_type == VD_DISK_TYPE_DISK);
4514047ba61eSachartre 
45152f5224aeSachartre 	vd->vdisk_block_size = DEV_BSIZE;
45162f5224aeSachartre 
45174bac2208Snarayan 	/*
45184bac2208Snarayan 	 * At this point, vdisk_size is set to the size of partition 2 but
45194bac2208Snarayan 	 * this does not represent the size of the disk because partition 2
45204bac2208Snarayan 	 * may not cover the entire disk and its size does not include reserved
45212f5224aeSachartre 	 * blocks. So we call vd_get_mediainfo to udpate this information and
45222f5224aeSachartre 	 * set the block size and the media type of the disk.
45234bac2208Snarayan 	 */
45242f5224aeSachartre 	status = vd_setup_mediainfo(vd);
45252f5224aeSachartre 
45262f5224aeSachartre 	if (status != 0) {
45272f5224aeSachartre 		if (!vd->scsi) {
45282f5224aeSachartre 			/* unexpected failure */
4529690555a1Sachartre 			PRN("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d",
45304bac2208Snarayan 			    status);
45310a55fbb7Slm66018 			return (status);
45320a55fbb7Slm66018 		}
45332f5224aeSachartre 
45342f5224aeSachartre 		/*
45352f5224aeSachartre 		 * The function can fail for SCSI disks which are present but
45362f5224aeSachartre 		 * reserved by another system. In that case, we don't know the
45372f5224aeSachartre 		 * size of the disk and the block size.
45382f5224aeSachartre 		 */
45392f5224aeSachartre 		vd->vdisk_size = VD_SIZE_UNKNOWN;
45402f5224aeSachartre 		vd->block_size = 0;
45412f5224aeSachartre 		vd->vdisk_media = VD_MEDIA_FIXED;
45422f5224aeSachartre 	}
45430a55fbb7Slm66018 
45440a55fbb7Slm66018 	/* Move dev number and LDI handle to entire-disk-slice array elements */
45450a55fbb7Slm66018 	vd->dev[VD_ENTIRE_DISK_SLICE]		= vd->dev[0];
45460a55fbb7Slm66018 	vd->dev[0]				= 0;
45470a55fbb7Slm66018 	vd->ldi_handle[VD_ENTIRE_DISK_SLICE]	= vd->ldi_handle[0];
45480a55fbb7Slm66018 	vd->ldi_handle[0]			= NULL;
45490a55fbb7Slm66018 
45500a55fbb7Slm66018 	/* Initialize device numbers for remaining slices and open them */
45510a55fbb7Slm66018 	for (int slice = 0; slice < vd->nslices; slice++) {
45520a55fbb7Slm66018 		/*
45530a55fbb7Slm66018 		 * Skip the entire-disk slice, as it's already open and its
45540a55fbb7Slm66018 		 * device known
45550a55fbb7Slm66018 		 */
45560a55fbb7Slm66018 		if (slice == VD_ENTIRE_DISK_SLICE)
45570a55fbb7Slm66018 			continue;
45580a55fbb7Slm66018 		ASSERT(vd->dev[slice] == 0);
45590a55fbb7Slm66018 		ASSERT(vd->ldi_handle[slice] == NULL);
45600a55fbb7Slm66018 
45610a55fbb7Slm66018 		/*
45620a55fbb7Slm66018 		 * Construct the device number for the current slice
45630a55fbb7Slm66018 		 */
45640a55fbb7Slm66018 		vd->dev[slice] = makedevice(major, (minor + slice));
45650a55fbb7Slm66018 
45660a55fbb7Slm66018 		/*
456734683adeSsg70180 		 * Open all slices of the disk to serve them to the client.
456834683adeSsg70180 		 * Slices are opened exclusively to prevent other threads or
456934683adeSsg70180 		 * processes in the service domain from performing I/O to
457034683adeSsg70180 		 * slices being accessed by a client.  Failure to open a slice
457134683adeSsg70180 		 * results in vds not serving this disk, as the client could
457234683adeSsg70180 		 * attempt (and should be able) to access any slice immediately.
457334683adeSsg70180 		 * Any slices successfully opened before a failure will get
457434683adeSsg70180 		 * closed by vds_destroy_vd() as a result of the error returned
457534683adeSsg70180 		 * by this function.
457634683adeSsg70180 		 *
457734683adeSsg70180 		 * We need to do the open with FNDELAY so that opening an empty
457834683adeSsg70180 		 * slice does not fail.
45790a55fbb7Slm66018 		 */
45800a55fbb7Slm66018 		PR0("Opening device major %u, minor %u = slice %u",
45810a55fbb7Slm66018 		    major, minor, slice);
4582047ba61eSachartre 
4583047ba61eSachartre 		/*
4584047ba61eSachartre 		 * Try to open the device. This can fail for example if we are
4585047ba61eSachartre 		 * opening an empty slice. So in case of a failure, we try the
4586047ba61eSachartre 		 * open again but this time with the FNDELAY flag.
4587047ba61eSachartre 		 */
4588047ba61eSachartre 		status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
4589047ba61eSachartre 		    vd->open_flags, kcred, &vd->ldi_handle[slice],
4590047ba61eSachartre 		    vd->vds->ldi_ident);
4591047ba61eSachartre 
4592047ba61eSachartre 		if (status != 0) {
4593047ba61eSachartre 			status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
4594047ba61eSachartre 			    vd->open_flags | FNDELAY, kcred,
4595047ba61eSachartre 			    &vd->ldi_handle[slice], vd->vds->ldi_ident);
4596047ba61eSachartre 		}
4597047ba61eSachartre 
4598047ba61eSachartre 		if (status != 0) {
4599690555a1Sachartre 			PRN("ldi_open_by_dev() returned errno %d "
46000a55fbb7Slm66018 			    "for slice %u", status, slice);
46010a55fbb7Slm66018 			/* vds_destroy_vd() will close any open slices */
4602690555a1Sachartre 			vd->ldi_handle[slice] = NULL;
46030a55fbb7Slm66018 			return (status);
46040a55fbb7Slm66018 		}
46050a55fbb7Slm66018 	}
46060a55fbb7Slm66018 
46070a55fbb7Slm66018 	return (0);
46080a55fbb7Slm66018 }
46090a55fbb7Slm66018 
4610edcc0754Sachartre /*
4611edcc0754Sachartre  * When a slice or a volume is exported as a single-slice disk, we want
4612edcc0754Sachartre  * the disk backend (i.e. the slice or volume) to be entirely mapped as
4613edcc0754Sachartre  * a slice without the addition of any metadata.
4614edcc0754Sachartre  *
4615edcc0754Sachartre  * So when exporting the disk as a VTOC disk, we fake a disk with the following
4616edcc0754Sachartre  * layout:
4617edcc0754Sachartre  *
4618edcc0754Sachartre  *                 0 1                         N+1
4619edcc0754Sachartre  *                 +-+--------------------------+
4620edcc0754Sachartre  *  virtual disk:  |L|           slice 0        |
4621edcc0754Sachartre  *                 +-+--------------------------+
4622edcc0754Sachartre  *                  ^:                          :
4623edcc0754Sachartre  *                  |:                          :
4624edcc0754Sachartre  *      VTOC LABEL--+:                          :
4625edcc0754Sachartre  *                   +--------------------------+
4626edcc0754Sachartre  *  disk backend:    |       slice/volume       |
4627edcc0754Sachartre  *                   +--------------------------+
4628edcc0754Sachartre  *                   0                          N
4629edcc0754Sachartre  *
4630edcc0754Sachartre  * N is the number of blocks in the slice/volume.
4631edcc0754Sachartre  *
4632edcc0754Sachartre  * We simulate a disk with N+1 blocks. The first block (block 0) is faked and
4633edcc0754Sachartre  * can not be changed. The remaining blocks (1 to N+1) defines slice 0 and are
4634edcc0754Sachartre  * mapped to the exported slice or volume:
4635edcc0754Sachartre  *
4636edcc0754Sachartre  * - block 0 (L) can return a fake VTOC label if raw read was implemented.
4637edcc0754Sachartre  * - block 1 to N+1 is mapped to the exported slice or volume.
4638edcc0754Sachartre  *
4639edcc0754Sachartre  */
46400a55fbb7Slm66018 static int
464178fcd0a1Sachartre vd_setup_partition_vtoc(vd_t *vd)
464278fcd0a1Sachartre {
464378fcd0a1Sachartre 	int rval, status;
464478fcd0a1Sachartre 	char *device_path = vd->device_path;
464578fcd0a1Sachartre 
464678fcd0a1Sachartre 	status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM,
4647047ba61eSachartre 	    (intptr_t)&vd->dk_geom, (vd->open_flags | FKIOCTL), kcred, &rval);
464878fcd0a1Sachartre 
464978fcd0a1Sachartre 	if (status != 0) {
465078fcd0a1Sachartre 		PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s",
465178fcd0a1Sachartre 		    status, device_path);
465278fcd0a1Sachartre 		return (status);
465378fcd0a1Sachartre 	}
465478fcd0a1Sachartre 
465578fcd0a1Sachartre 	/* Initialize dk_geom structure for single-slice device */
465678fcd0a1Sachartre 	if (vd->dk_geom.dkg_nsect == 0) {
465778fcd0a1Sachartre 		PRN("%s geometry claims 0 sectors per track", device_path);
465878fcd0a1Sachartre 		return (EIO);
465978fcd0a1Sachartre 	}
466078fcd0a1Sachartre 	if (vd->dk_geom.dkg_nhead == 0) {
466178fcd0a1Sachartre 		PRN("%s geometry claims 0 heads", device_path);
466278fcd0a1Sachartre 		return (EIO);
466378fcd0a1Sachartre 	}
4664edcc0754Sachartre 	vd->dk_geom.dkg_ncyl = (vd->vdisk_size + 1) / vd->dk_geom.dkg_nsect /
466578fcd0a1Sachartre 	    vd->dk_geom.dkg_nhead;
466678fcd0a1Sachartre 	vd->dk_geom.dkg_acyl = 0;
466778fcd0a1Sachartre 	vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl;
466878fcd0a1Sachartre 
466978fcd0a1Sachartre 
467078fcd0a1Sachartre 	/* Initialize vtoc structure for single-slice device */
467178fcd0a1Sachartre 	bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume,
467278fcd0a1Sachartre 	    MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume)));
467378fcd0a1Sachartre 	bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part));
467478fcd0a1Sachartre 	vd->vtoc.v_nparts = 1;
467578fcd0a1Sachartre 	vd->vtoc.v_part[0].p_tag = V_UNASSIGNED;
467678fcd0a1Sachartre 	vd->vtoc.v_part[0].p_flag = 0;
4677edcc0754Sachartre 	vd->vtoc.v_part[0].p_start = 1;
467878fcd0a1Sachartre 	vd->vtoc.v_part[0].p_size = vd->vdisk_size;
467978fcd0a1Sachartre 	bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel,
468078fcd0a1Sachartre 	    MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel)));
468178fcd0a1Sachartre 
4682edcc0754Sachartre 	/* adjust the vdisk_size, we emulate the first block */
4683edcc0754Sachartre 	vd->vdisk_size += 1;
4684edcc0754Sachartre 
468578fcd0a1Sachartre 	return (0);
468678fcd0a1Sachartre }
468778fcd0a1Sachartre 
4688edcc0754Sachartre /*
4689edcc0754Sachartre  * When a slice, volume or file is exported as a single-slice disk, we want
4690edcc0754Sachartre  * the disk backend (i.e. the slice, volume or file) to be entirely mapped
4691edcc0754Sachartre  * as a slice without the addition of any metadata.
4692edcc0754Sachartre  *
4693edcc0754Sachartre  * So when exporting the disk as an EFI disk, we fake a disk with the following
4694edcc0754Sachartre  * layout:
4695edcc0754Sachartre  *
4696edcc0754Sachartre  *                 0 1 2 3      34                        34+N
4697edcc0754Sachartre  *                 +-+-+-+-------+--------------------------+
4698edcc0754Sachartre  *  virtual disk:  |X|T|E|XXXXXXX|           slice 0        |
4699edcc0754Sachartre  *                 +-+-+-+-------+--------------------------+
4700edcc0754Sachartre  *                    ^ ^        :                          :
4701edcc0754Sachartre  *                    | |        :                          :
4702edcc0754Sachartre  *                GPT-+ +-GPE    :                          :
4703edcc0754Sachartre  *                               +--------------------------+
4704edcc0754Sachartre  *  disk backend:                |     slice/volume/file    |
4705edcc0754Sachartre  *                               +--------------------------+
4706edcc0754Sachartre  *                               0                          N
4707edcc0754Sachartre  *
4708edcc0754Sachartre  * N is the number of blocks in the slice/volume/file.
4709edcc0754Sachartre  *
4710edcc0754Sachartre  * We simulate a disk with 34+N blocks. The first 34 blocks (0 to 33) are
4711edcc0754Sachartre  * emulated and can not be changed. The remaining blocks (34 to 34+N) defines
4712edcc0754Sachartre  * slice 0 and are mapped to the exported slice, volume or file:
4713edcc0754Sachartre  *
4714edcc0754Sachartre  * - block 0 (X) is unused and can return 0 if raw read was implemented.
4715edcc0754Sachartre  * - block 1 (T) returns a fake EFI GPT (via DKIOCGETEFI)
4716edcc0754Sachartre  * - block 2 (E) returns a fake EFI GPE (via DKIOCGETEFI)
4717edcc0754Sachartre  * - block 3 to 33 (X) are unused and return 0 if raw read is implemented.
4718edcc0754Sachartre  * - block 34 to 34+N is mapped to the exported slice, volume or file.
4719edcc0754Sachartre  *
4720edcc0754Sachartre  */
472178fcd0a1Sachartre static int
47224bac2208Snarayan vd_setup_partition_efi(vd_t *vd)
47234bac2208Snarayan {
47244bac2208Snarayan 	efi_gpt_t *gpt;
47254bac2208Snarayan 	efi_gpe_t *gpe;
4726edcc0754Sachartre 	struct uuid uuid = EFI_USR;
47274bac2208Snarayan 	uint32_t crc;
47284bac2208Snarayan 
4729edcc0754Sachartre 	gpt = &vd->efi_gpt;
4730edcc0754Sachartre 	gpe = &vd->efi_gpe;
47314bac2208Snarayan 
4732edcc0754Sachartre 	bzero(gpt, sizeof (efi_gpt_t));
4733edcc0754Sachartre 	bzero(gpe, sizeof (efi_gpe_t));
4734edcc0754Sachartre 
4735edcc0754Sachartre 	/* adjust the vdisk_size, we emulate the first 34 blocks */
4736edcc0754Sachartre 	vd->vdisk_size += 34;
47374bac2208Snarayan 
47384bac2208Snarayan 	gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE);
47394bac2208Snarayan 	gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
47404bac2208Snarayan 	gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t));
4741edcc0754Sachartre 	gpt->efi_gpt_FirstUsableLBA = LE_64(34ULL);
47424bac2208Snarayan 	gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1);
47434bac2208Snarayan 	gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1);
4744edcc0754Sachartre 	gpt->efi_gpt_PartitionEntryLBA = LE_64(2ULL);
47454bac2208Snarayan 	gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t));
47464bac2208Snarayan 
47474bac2208Snarayan 	UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid);
47484bac2208Snarayan 	gpe->efi_gpe_StartingLBA = gpt->efi_gpt_FirstUsableLBA;
47494bac2208Snarayan 	gpe->efi_gpe_EndingLBA = gpt->efi_gpt_LastUsableLBA;
47504bac2208Snarayan 
47514bac2208Snarayan 	CRC32(crc, gpe, sizeof (efi_gpe_t), -1U, crc32_table);
47524bac2208Snarayan 	gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
47534bac2208Snarayan 
47544bac2208Snarayan 	CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table);
47554bac2208Snarayan 	gpt->efi_gpt_HeaderCRC32 = LE_32(~crc);
47564bac2208Snarayan 
47574bac2208Snarayan 	return (0);
47584bac2208Snarayan }
47594bac2208Snarayan 
4760047ba61eSachartre /*
4761047ba61eSachartre  * Setup for a virtual disk whose backend is a file (exported as a single slice
4762047ba61eSachartre  * or as a full disk) or a pseudo device (for example a ZFS, SVM or VxVM volume)
4763047ba61eSachartre  * exported as a full disk. In these cases, the backend is accessed using the
4764047ba61eSachartre  * vnode interface.
4765047ba61eSachartre  */
47664bac2208Snarayan static int
4767047ba61eSachartre vd_setup_backend_vnode(vd_t *vd)
47683c96341aSnarayan {
476978fcd0a1Sachartre 	int 		rval, status;
47703c96341aSnarayan 	vattr_t		vattr;
47713c96341aSnarayan 	dev_t		dev;
47723c96341aSnarayan 	char		*file_path = vd->device_path;
47733c96341aSnarayan 	char		dev_path[MAXPATHLEN + 1];
47743c96341aSnarayan 	ldi_handle_t	lhandle;
47753c96341aSnarayan 	struct dk_cinfo	dk_cinfo;
47763c96341aSnarayan 
4777047ba61eSachartre 	if ((status = vn_open(file_path, UIO_SYSSPACE, vd->open_flags | FOFFMAX,
47783c96341aSnarayan 	    0, &vd->file_vnode, 0, 0)) != 0) {
4779690555a1Sachartre 		PRN("vn_open(%s) = errno %d", file_path, status);
47803c96341aSnarayan 		return (status);
47813c96341aSnarayan 	}
47823c96341aSnarayan 
4783690555a1Sachartre 	/*
4784690555a1Sachartre 	 * We set vd->file now so that vds_destroy_vd will take care of
4785690555a1Sachartre 	 * closing the file and releasing the vnode in case of an error.
4786690555a1Sachartre 	 */
4787690555a1Sachartre 	vd->file = B_TRUE;
4788690555a1Sachartre 
47893c96341aSnarayan 	vattr.va_mask = AT_SIZE;
4790da6c28aaSamw 	if ((status = VOP_GETATTR(vd->file_vnode, &vattr, 0, kcred, NULL))
4791da6c28aaSamw 	    != 0) {
4792690555a1Sachartre 		PRN("VOP_GETATTR(%s) = errno %d", file_path, status);
47933c96341aSnarayan 		return (EIO);
47943c96341aSnarayan 	}
47953c96341aSnarayan 
47963c96341aSnarayan 	vd->file_size = vattr.va_size;
47973c96341aSnarayan 	/* size should be at least sizeof(dk_label) */
47983c96341aSnarayan 	if (vd->file_size < sizeof (struct dk_label)) {
47993c96341aSnarayan 		PRN("Size of file has to be at least %ld bytes",
48003c96341aSnarayan 		    sizeof (struct dk_label));
48013c96341aSnarayan 		return (EIO);
48023c96341aSnarayan 	}
48033c96341aSnarayan 
4804690555a1Sachartre 	if (vd->file_vnode->v_flag & VNOMAP) {
4805690555a1Sachartre 		PRN("File %s cannot be mapped", file_path);
48063c96341aSnarayan 		return (EIO);
48073c96341aSnarayan 	}
48083c96341aSnarayan 
48093c96341aSnarayan 	/* sector size = block size = DEV_BSIZE */
481017cadca8Slm66018 	vd->block_size = DEV_BSIZE;
481117cadca8Slm66018 	vd->vdisk_block_size = DEV_BSIZE;
481287a7269eSachartre 	vd->vdisk_size = vd->file_size / DEV_BSIZE;
48133c96341aSnarayan 	vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */
48143c96341aSnarayan 
4815047ba61eSachartre 	/*
4816047ba61eSachartre 	 * Get max_xfer_sz from the device where the file is or from the device
4817047ba61eSachartre 	 * itself if we have a pseudo device.
4818047ba61eSachartre 	 */
4819047ba61eSachartre 	dev_path[0] = '\0';
4820047ba61eSachartre 
4821047ba61eSachartre 	if (vd->pseudo) {
4822047ba61eSachartre 		status = ldi_open_by_name(file_path, FREAD, kcred, &lhandle,
4823047ba61eSachartre 		    vd->vds->ldi_ident);
4824047ba61eSachartre 	} else {
48253c96341aSnarayan 		dev = vd->file_vnode->v_vfsp->vfs_dev;
48263c96341aSnarayan 		if (ddi_dev_pathname(dev, S_IFBLK, dev_path) == DDI_SUCCESS) {
48273c96341aSnarayan 			PR0("underlying device = %s\n", dev_path);
48283c96341aSnarayan 		}
48293c96341aSnarayan 
4830047ba61eSachartre 		status = ldi_open_by_dev(&dev, OTYP_BLK, FREAD, kcred, &lhandle,
4831047ba61eSachartre 		    vd->vds->ldi_ident);
4832047ba61eSachartre 	}
4833047ba61eSachartre 
4834047ba61eSachartre 	if (status != 0) {
4835047ba61eSachartre 		PR0("ldi_open() returned errno %d for device %s",
4836047ba61eSachartre 		    status, (dev_path[0] == '\0')? file_path : dev_path);
48373c96341aSnarayan 	} else {
48383c96341aSnarayan 		if ((status = ldi_ioctl(lhandle, DKIOCINFO,
4839047ba61eSachartre 		    (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred,
48403c96341aSnarayan 		    &rval)) != 0) {
48413c96341aSnarayan 			PR0("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
48423c96341aSnarayan 			    status, dev_path);
48433c96341aSnarayan 		} else {
48443c96341aSnarayan 			/*
48453c96341aSnarayan 			 * Store the device's max transfer size for
48463c96341aSnarayan 			 * return to the client
48473c96341aSnarayan 			 */
48483c96341aSnarayan 			vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
48493c96341aSnarayan 		}
48503c96341aSnarayan 
48513c96341aSnarayan 		PR0("close the device %s", dev_path);
48523c96341aSnarayan 		(void) ldi_close(lhandle, FREAD, kcred);
48533c96341aSnarayan 	}
48543c96341aSnarayan 
4855205eeb1aSlm66018 	PR0("using file %s, dev %s, max_xfer = %u blks",
48563c96341aSnarayan 	    file_path, dev_path, vd->max_xfer_sz);
48573c96341aSnarayan 
4858edcc0754Sachartre 	if (vd->vdisk_type == VD_DISK_TYPE_SLICE) {
4859edcc0754Sachartre 		ASSERT(!vd->pseudo);
4860edcc0754Sachartre 		vd->vdisk_label = VD_DISK_LABEL_EFI;
4861edcc0754Sachartre 		status = vd_setup_partition_efi(vd);
4862047ba61eSachartre 		return (0);
4863edcc0754Sachartre 	}
4864edcc0754Sachartre 
4865edcc0754Sachartre 	/*
4866edcc0754Sachartre 	 * Find and validate the geometry of a disk image.
4867edcc0754Sachartre 	 */
4868edcc0754Sachartre 	status = vd_file_validate_geometry(vd);
4869edcc0754Sachartre 	if (status != 0 && status != EINVAL && status != ENOTSUP) {
4870edcc0754Sachartre 		PRN("Failed to read label from %s", file_path);
4871edcc0754Sachartre 		return (EIO);
4872edcc0754Sachartre 	}
4873edcc0754Sachartre 
4874edcc0754Sachartre 	if (vd_file_is_iso_image(vd)) {
4875edcc0754Sachartre 		/*
4876edcc0754Sachartre 		 * Indicate whether to call this a CD or DVD from the size
4877edcc0754Sachartre 		 * of the ISO image (images for both drive types are stored
4878edcc0754Sachartre 		 * in the ISO-9600 format). CDs can store up to just under 1Gb
4879edcc0754Sachartre 		 */
4880edcc0754Sachartre 		if ((vd->vdisk_size * vd->vdisk_block_size) >
4881edcc0754Sachartre 		    (1024 * 1024 * 1024))
4882edcc0754Sachartre 			vd->vdisk_media = VD_MEDIA_DVD;
4883edcc0754Sachartre 		else
4884edcc0754Sachartre 			vd->vdisk_media = VD_MEDIA_CD;
4885edcc0754Sachartre 	} else {
4886edcc0754Sachartre 		vd->vdisk_media = VD_MEDIA_FIXED;
4887edcc0754Sachartre 	}
4888edcc0754Sachartre 
4889edcc0754Sachartre 	/* Setup devid for the disk image */
4890047ba61eSachartre 
489178fcd0a1Sachartre 	if (vd->vdisk_label != VD_DISK_LABEL_UNK) {
489278fcd0a1Sachartre 
489387a7269eSachartre 		status = vd_file_read_devid(vd, &vd->file_devid);
489487a7269eSachartre 
489587a7269eSachartre 		if (status == 0) {
489687a7269eSachartre 			/* a valid devid was found */
489787a7269eSachartre 			return (0);
489887a7269eSachartre 		}
489987a7269eSachartre 
490087a7269eSachartre 		if (status != EINVAL) {
490187a7269eSachartre 			/*
490278fcd0a1Sachartre 			 * There was an error while trying to read the devid.
490378fcd0a1Sachartre 			 * So this disk image may have a devid but we are
490478fcd0a1Sachartre 			 * unable to read it.
490587a7269eSachartre 			 */
490687a7269eSachartre 			PR0("can not read devid for %s", file_path);
490787a7269eSachartre 			vd->file_devid = NULL;
490887a7269eSachartre 			return (0);
490987a7269eSachartre 		}
491078fcd0a1Sachartre 	}
491187a7269eSachartre 
491287a7269eSachartre 	/*
491387a7269eSachartre 	 * No valid device id was found so we create one. Note that a failure
491487a7269eSachartre 	 * to create a device id is not fatal and does not prevent the disk
491587a7269eSachartre 	 * image from being attached.
491687a7269eSachartre 	 */
491787a7269eSachartre 	PR1("creating devid for %s", file_path);
491887a7269eSachartre 
491987a7269eSachartre 	if (ddi_devid_init(vd->vds->dip, DEVID_FAB, NULL, 0,
492087a7269eSachartre 	    &vd->file_devid) != DDI_SUCCESS) {
492187a7269eSachartre 		PR0("fail to create devid for %s", file_path);
492287a7269eSachartre 		vd->file_devid = NULL;
492387a7269eSachartre 		return (0);
492487a7269eSachartre 	}
492587a7269eSachartre 
492678fcd0a1Sachartre 	/*
492778fcd0a1Sachartre 	 * Write devid to the disk image. The devid is stored into the disk
492878fcd0a1Sachartre 	 * image if we have a valid label; otherwise the devid will be stored
492978fcd0a1Sachartre 	 * when the user writes a valid label.
493078fcd0a1Sachartre 	 */
493178fcd0a1Sachartre 	if (vd->vdisk_label != VD_DISK_LABEL_UNK) {
493287a7269eSachartre 		if (vd_file_write_devid(vd, vd->file_devid) != 0) {
493387a7269eSachartre 			PR0("fail to write devid for %s", file_path);
493487a7269eSachartre 			ddi_devid_free(vd->file_devid);
493587a7269eSachartre 			vd->file_devid = NULL;
493687a7269eSachartre 		}
493778fcd0a1Sachartre 	}
493887a7269eSachartre 
49393c96341aSnarayan 	return (0);
49403c96341aSnarayan }
49413c96341aSnarayan 
494217cadca8Slm66018 
494317cadca8Slm66018 /*
494417cadca8Slm66018  * Description:
494517cadca8Slm66018  *	Open a device using its device path (supplied by ldm(1m))
494617cadca8Slm66018  *
494717cadca8Slm66018  * Parameters:
494817cadca8Slm66018  *	vd 	- pointer to structure containing the vDisk info
494917cadca8Slm66018  *
495017cadca8Slm66018  * Return Value
495117cadca8Slm66018  *	0	- success
495217cadca8Slm66018  *	EIO	- Invalid number of partitions
495317cadca8Slm66018  *	!= 0	- some other non-zero return value from ldi(9F) functions
495417cadca8Slm66018  */
495517cadca8Slm66018 static int
495617cadca8Slm66018 vd_open_using_ldi_by_name(vd_t *vd)
495717cadca8Slm66018 {
495817cadca8Slm66018 	int		rval, status, open_flags;
495917cadca8Slm66018 	struct dk_cinfo	dk_cinfo;
496017cadca8Slm66018 	char		*device_path = vd->device_path;
496117cadca8Slm66018 
496217cadca8Slm66018 	/*
496317cadca8Slm66018 	 * Try to open the device. If the flags indicate that the device should
496417cadca8Slm66018 	 * be opened write-enabled, we first we try to open it "read-only"
496517cadca8Slm66018 	 * to see if we have an optical device such as a CD-ROM which, for
496617cadca8Slm66018 	 * now, we do not permit writes to and thus should not export write
496717cadca8Slm66018 	 * operations to the client.
496817cadca8Slm66018 	 *
496917cadca8Slm66018 	 * Future: if/when we implement support for guest domains writing to
497017cadca8Slm66018 	 * optical devices we will need to do further checking of the media type
497117cadca8Slm66018 	 * to distinguish between read-only and writable discs.
497217cadca8Slm66018 	 */
497317cadca8Slm66018 	if (vd->open_flags & FWRITE) {
497417cadca8Slm66018 		open_flags = vd->open_flags & ~FWRITE;
497517cadca8Slm66018 		status = ldi_open_by_name(device_path, open_flags, kcred,
497617cadca8Slm66018 		    &vd->ldi_handle[0], vd->vds->ldi_ident);
497717cadca8Slm66018 
497817cadca8Slm66018 		if (status == 0) {
497917cadca8Slm66018 			/* Verify backing device supports dk_cinfo */
498017cadca8Slm66018 			status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO,
498117cadca8Slm66018 			    (intptr_t)&dk_cinfo, (open_flags | FKIOCTL),
498217cadca8Slm66018 			    kcred, &rval);
498317cadca8Slm66018 			if (status != 0) {
498417cadca8Slm66018 				PRN("ldi_ioctl(DKIOCINFO) returned errno %d for"
498517cadca8Slm66018 				    " %s opened as RO", status, device_path);
498617cadca8Slm66018 				return (status);
498717cadca8Slm66018 			}
498817cadca8Slm66018 
498917cadca8Slm66018 			if (dk_cinfo.dki_partition >= V_NUMPAR) {
499017cadca8Slm66018 				PRN("slice %u >= maximum slice %u for %s",
499117cadca8Slm66018 				    dk_cinfo.dki_partition, V_NUMPAR,
499217cadca8Slm66018 				    device_path);
499317cadca8Slm66018 				return (EIO);
499417cadca8Slm66018 			}
499517cadca8Slm66018 
499617cadca8Slm66018 			/*
499717cadca8Slm66018 			 * If this is an optical device then we disable
499817cadca8Slm66018 			 * write access and return, otherwise we close
499917cadca8Slm66018 			 * the device and try again with writes enabled.
500017cadca8Slm66018 			 */
500117cadca8Slm66018 			if (dk_cinfo.dki_ctype == DKC_CDROM) {
500217cadca8Slm66018 				vd->open_flags = open_flags;
500317cadca8Slm66018 				return (0);
500417cadca8Slm66018 			} else {
500517cadca8Slm66018 				(void) ldi_close(vd->ldi_handle[0],
500617cadca8Slm66018 				    open_flags, kcred);
500717cadca8Slm66018 			}
500817cadca8Slm66018 		}
500917cadca8Slm66018 	}
501017cadca8Slm66018 
501117cadca8Slm66018 	/* Attempt to (re)open device */
501217cadca8Slm66018 	status = ldi_open_by_name(device_path, open_flags, kcred,
501317cadca8Slm66018 	    &vd->ldi_handle[0], vd->vds->ldi_ident);
501417cadca8Slm66018 
501517cadca8Slm66018 	/*
501617cadca8Slm66018 	 * The open can fail for example if we are opening an empty slice.
501717cadca8Slm66018 	 * In case of a failure, we try the open again but this time with
501817cadca8Slm66018 	 * the FNDELAY flag.
501917cadca8Slm66018 	 */
502017cadca8Slm66018 	if (status != 0)
502117cadca8Slm66018 		status = ldi_open_by_name(device_path, vd->open_flags | FNDELAY,
502217cadca8Slm66018 		    kcred, &vd->ldi_handle[0], vd->vds->ldi_ident);
502317cadca8Slm66018 
502417cadca8Slm66018 	if (status != 0) {
502517cadca8Slm66018 		PR0("ldi_open_by_name(%s) = errno %d", device_path, status);
502617cadca8Slm66018 		vd->ldi_handle[0] = NULL;
502717cadca8Slm66018 		return (status);
502817cadca8Slm66018 	}
502917cadca8Slm66018 
503017cadca8Slm66018 	/* Verify backing device supports dk_cinfo */
503117cadca8Slm66018 	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO,
503217cadca8Slm66018 	    (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred,
503317cadca8Slm66018 	    &rval)) != 0) {
503417cadca8Slm66018 		PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
503517cadca8Slm66018 		    status, device_path);
503617cadca8Slm66018 		return (status);
503717cadca8Slm66018 	}
503817cadca8Slm66018 	if (dk_cinfo.dki_partition >= V_NUMPAR) {
503917cadca8Slm66018 		PRN("slice %u >= maximum slice %u for %s",
504017cadca8Slm66018 		    dk_cinfo.dki_partition, V_NUMPAR, device_path);
504117cadca8Slm66018 		return (EIO);
504217cadca8Slm66018 	}
504317cadca8Slm66018 
504417cadca8Slm66018 	return (0);
504517cadca8Slm66018 }
504617cadca8Slm66018 
504717cadca8Slm66018 
5048047ba61eSachartre /*
5049047ba61eSachartre  * Setup for a virtual disk which backend is a device (a physical disk,
5050047ba61eSachartre  * slice or pseudo device) that is directly exported either as a full disk
5051047ba61eSachartre  * for a physical disk or as a slice for a pseudo device or a disk slice.
5052047ba61eSachartre  * In these cases, the backend is accessed using the LDI interface.
5053047ba61eSachartre  */
50543c96341aSnarayan static int
5055047ba61eSachartre vd_setup_backend_ldi(vd_t *vd)
50561ae08745Sheppo {
5057e1ebb9ecSlm66018 	int		rval, status;
50581ae08745Sheppo 	struct dk_cinfo	dk_cinfo;
50593c96341aSnarayan 	char		*device_path = vd->device_path;
50601ae08745Sheppo 
506117cadca8Slm66018 	status = vd_open_using_ldi_by_name(vd);
5062047ba61eSachartre 	if (status != 0) {
506317cadca8Slm66018 		PR0("Failed to open (%s) = errno %d", device_path, status);
50640a55fbb7Slm66018 		return (status);
50650a55fbb7Slm66018 	}
50660a55fbb7Slm66018 
50673c96341aSnarayan 	vd->file = B_FALSE;
50684bac2208Snarayan 
5069047ba61eSachartre 	/* Get device number of backing device */
50700a55fbb7Slm66018 	if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) {
50711ae08745Sheppo 		PRN("ldi_get_dev() returned errno %d for %s",
5072e1ebb9ecSlm66018 		    status, device_path);
50731ae08745Sheppo 		return (status);
50741ae08745Sheppo 	}
50751ae08745Sheppo 
507678fcd0a1Sachartre 	/* Verify backing device supports dk_cinfo */
5077e1ebb9ecSlm66018 	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO,
5078047ba61eSachartre 	    (intptr_t)&dk_cinfo, (vd->open_flags | FKIOCTL), kcred,
5079e1ebb9ecSlm66018 	    &rval)) != 0) {
5080e1ebb9ecSlm66018 		PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
5081e1ebb9ecSlm66018 		    status, device_path);
5082e1ebb9ecSlm66018 		return (status);
5083e1ebb9ecSlm66018 	}
5084e1ebb9ecSlm66018 	if (dk_cinfo.dki_partition >= V_NUMPAR) {
5085e1ebb9ecSlm66018 		PRN("slice %u >= maximum slice %u for %s",
5086e1ebb9ecSlm66018 		    dk_cinfo.dki_partition, V_NUMPAR, device_path);
5087e1ebb9ecSlm66018 		return (EIO);
5088e1ebb9ecSlm66018 	}
50894bac2208Snarayan 
5090e1ebb9ecSlm66018 	/* Store the device's max transfer size for return to the client */
5091e1ebb9ecSlm66018 	vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
5092e1ebb9ecSlm66018 
5093047ba61eSachartre 	/*
509417cadca8Slm66018 	 * We need to work out if it's an ATAPI (IDE CD-ROM) or SCSI device so
509517cadca8Slm66018 	 * that we can use the correct CDB group when sending USCSI commands.
509617cadca8Slm66018 	 */
509717cadca8Slm66018 	vd->is_atapi_dev = vd_is_atapi_device(vd);
509817cadca8Slm66018 
509917cadca8Slm66018 	/*
5100047ba61eSachartre 	 * Export a full disk.
5101047ba61eSachartre 	 *
5102047ba61eSachartre 	 * When we use the LDI interface, we export a device as a full disk
5103047ba61eSachartre 	 * if we have an entire disk slice (slice 2) and if this slice is
5104047ba61eSachartre 	 * exported as a full disk and not as a single slice disk.
510517cadca8Slm66018 	 * Similarly, we want to use LDI if we are accessing a CD or DVD
510617cadca8Slm66018 	 * device (even if it isn't s2)
5107047ba61eSachartre 	 *
5108047ba61eSachartre 	 * Note that pseudo devices are exported as full disks using the vnode
5109047ba61eSachartre 	 * interface, not the LDI interface.
5110047ba61eSachartre 	 */
511117cadca8Slm66018 	if ((dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE &&
511217cadca8Slm66018 	    vd->vdisk_type == VD_DISK_TYPE_DISK) ||
511317cadca8Slm66018 	    dk_cinfo.dki_ctype == DKC_CDROM) {
5114047ba61eSachartre 		ASSERT(!vd->pseudo);
51152f5224aeSachartre 		if (dk_cinfo.dki_ctype == DKC_SCSI_CCS)
51162f5224aeSachartre 			vd->scsi = B_TRUE;
5117047ba61eSachartre 		return (vd_setup_full_disk(vd));
5118047ba61eSachartre 	}
5119047ba61eSachartre 
5120047ba61eSachartre 	/*
5121047ba61eSachartre 	 * Export a single slice disk.
5122047ba61eSachartre 	 *
5123047ba61eSachartre 	 * The exported device can be either a pseudo device or a disk slice. If
5124047ba61eSachartre 	 * it is a disk slice different from slice 2 then it is always exported
5125047ba61eSachartre 	 * as a single slice disk even if the "slice" option is not specified.
5126047ba61eSachartre 	 * If it is disk slice 2 or a pseudo device then it is exported as a
5127047ba61eSachartre 	 * single slice disk only if the "slice" option is specified.
5128047ba61eSachartre 	 */
5129047ba61eSachartre 	return (vd_setup_single_slice_disk(vd));
5130047ba61eSachartre }
5131047ba61eSachartre 
5132047ba61eSachartre static int
5133047ba61eSachartre vd_setup_single_slice_disk(vd_t *vd)
5134047ba61eSachartre {
5135edcc0754Sachartre 	int status, rval;
5136047ba61eSachartre 	char *device_path = vd->device_path;
5137047ba61eSachartre 
5138047ba61eSachartre 	/* Get size of backing device */
5139047ba61eSachartre 	if (ldi_get_size(vd->ldi_handle[0], &vd->vdisk_size) != DDI_SUCCESS) {
5140047ba61eSachartre 		PRN("ldi_get_size() failed for %s", device_path);
51411ae08745Sheppo 		return (EIO);
51421ae08745Sheppo 	}
5143047ba61eSachartre 	vd->vdisk_size = lbtodb(vd->vdisk_size);	/* convert to blocks */
514417cadca8Slm66018 	vd->block_size = DEV_BSIZE;
514517cadca8Slm66018 	vd->vdisk_block_size = DEV_BSIZE;
514617cadca8Slm66018 	vd->vdisk_media = VD_MEDIA_FIXED;
5147047ba61eSachartre 
51481ae08745Sheppo 	if (vd->pseudo) {
5149047ba61eSachartre 		ASSERT(vd->vdisk_type == VD_DISK_TYPE_SLICE);
515078fcd0a1Sachartre 	}
51510a55fbb7Slm66018 
5152047ba61eSachartre 	/*
5153047ba61eSachartre 	 * We export the slice as a single slice disk even if the "slice"
5154047ba61eSachartre 	 * option was not specified.
5155047ba61eSachartre 	 */
51561ae08745Sheppo 	vd->vdisk_type  = VD_DISK_TYPE_SLICE;
51571ae08745Sheppo 	vd->nslices	= 1;
51581ae08745Sheppo 
5159edcc0754Sachartre 	/*
5160edcc0754Sachartre 	 * When exporting a slice or a device as a single slice disk, we don't
5161edcc0754Sachartre 	 * care about any partitioning exposed by the backend. The goal is just
5162edcc0754Sachartre 	 * to export the backend as a flat storage. We provide a fake partition
5163edcc0754Sachartre 	 * table (either a VTOC or EFI), which presents only one slice, to
5164edcc0754Sachartre 	 * accommodate tools expecting a disk label.
5165edcc0754Sachartre 	 *
5166edcc0754Sachartre 	 * We check the label of the backend to export the device as a slice
5167edcc0754Sachartre 	 * using the same type of label (VTOC or EFI). If there is no label
5168edcc0754Sachartre 	 * then we create a fake EFI label.
5169edcc0754Sachartre 	 *
5170edcc0754Sachartre 	 * Note that the partition table we are creating could also be faked
5171edcc0754Sachartre 	 * by the client based on the size of the backend device.
5172edcc0754Sachartre 	 */
5173edcc0754Sachartre 	status = ldi_ioctl(vd->ldi_handle[0], DKIOCGVTOC, (intptr_t)&vd->vtoc,
5174edcc0754Sachartre 	    (vd->open_flags | FKIOCTL), kcred, &rval);
5175edcc0754Sachartre 
5176edcc0754Sachartre 	if (status == 0) {
5177edcc0754Sachartre 		/* export with a fake VTOC label */
5178edcc0754Sachartre 		vd->vdisk_label = VD_DISK_LABEL_VTOC;
517978fcd0a1Sachartre 		status = vd_setup_partition_vtoc(vd);
5180edcc0754Sachartre 	} else {
5181edcc0754Sachartre 		/* export with a fake EFI label */
5182edcc0754Sachartre 		vd->vdisk_label = VD_DISK_LABEL_EFI;
5183edcc0754Sachartre 		status = vd_setup_partition_efi(vd);
518478fcd0a1Sachartre 	}
518578fcd0a1Sachartre 
51864bac2208Snarayan 	return (status);
51874bac2208Snarayan }
51881ae08745Sheppo 
51891ae08745Sheppo static int
5190047ba61eSachartre vd_setup_vd(vd_t *vd)
5191047ba61eSachartre {
5192047ba61eSachartre 	int		status;
5193047ba61eSachartre 	dev_info_t	*dip;
5194047ba61eSachartre 	vnode_t 	*vnp;
5195047ba61eSachartre 	char		*path = vd->device_path;
5196047ba61eSachartre 
5197047ba61eSachartre 	/* make sure the vdisk backend is valid */
5198047ba61eSachartre 	if ((status = lookupname(path, UIO_SYSSPACE,
5199047ba61eSachartre 	    FOLLOW, NULLVPP, &vnp)) != 0) {
5200047ba61eSachartre 		PR0("Cannot lookup %s errno %d", path, status);
5201047ba61eSachartre 		goto done;
5202047ba61eSachartre 	}
5203047ba61eSachartre 
5204047ba61eSachartre 	switch (vnp->v_type) {
5205047ba61eSachartre 	case VREG:
5206047ba61eSachartre 		/*
5207047ba61eSachartre 		 * Backend is a file so it is exported as a full disk or as a
5208047ba61eSachartre 		 * single slice disk using the vnode interface.
5209047ba61eSachartre 		 */
5210047ba61eSachartre 		VN_RELE(vnp);
5211047ba61eSachartre 		vd->pseudo = B_FALSE;
5212047ba61eSachartre 		status = vd_setup_backend_vnode(vd);
5213047ba61eSachartre 		break;
5214047ba61eSachartre 
5215047ba61eSachartre 	case VBLK:
5216047ba61eSachartre 	case VCHR:
5217047ba61eSachartre 		/*
5218047ba61eSachartre 		 * Backend is a device. The way it is exported depends on the
5219047ba61eSachartre 		 * type of the device.
5220047ba61eSachartre 		 *
5221047ba61eSachartre 		 * - A pseudo device is exported as a full disk using the vnode
5222047ba61eSachartre 		 *   interface or as a single slice disk using the LDI
5223047ba61eSachartre 		 *   interface.
5224047ba61eSachartre 		 *
5225047ba61eSachartre 		 * - A disk (represented by the slice 2 of that disk) is
5226047ba61eSachartre 		 *   exported as a full disk using the LDI interface.
5227047ba61eSachartre 		 *
5228047ba61eSachartre 		 * - A disk slice (different from slice 2) is always exported
5229047ba61eSachartre 		 *   as a single slice disk using the LDI interface.
5230047ba61eSachartre 		 *
5231047ba61eSachartre 		 * - The slice 2 of a disk is exported as a single slice disk
5232047ba61eSachartre 		 *   if the "slice" option is specified, otherwise the entire
5233047ba61eSachartre 		 *   disk will be exported. In any case, the LDI interface is
5234047ba61eSachartre 		 *   used.
5235047ba61eSachartre 		 */
5236047ba61eSachartre 
5237047ba61eSachartre 		/* check if this is a pseudo device */
5238047ba61eSachartre 		if ((dip = ddi_hold_devi_by_instance(getmajor(vnp->v_rdev),
5239047ba61eSachartre 		    dev_to_instance(vnp->v_rdev), 0))  == NULL) {
5240047ba61eSachartre 			PRN("%s is no longer accessible", path);
5241047ba61eSachartre 			VN_RELE(vnp);
5242047ba61eSachartre 			status = EIO;
5243047ba61eSachartre 			break;
5244047ba61eSachartre 		}
5245047ba61eSachartre 		vd->pseudo = is_pseudo_device(dip);
5246047ba61eSachartre 		ddi_release_devi(dip);
5247047ba61eSachartre 		VN_RELE(vnp);
5248047ba61eSachartre 
52492f5224aeSachartre 		if (!vd->pseudo) {
52502f5224aeSachartre 			status = vd_setup_backend_ldi(vd);
52512f5224aeSachartre 			break;
52522f5224aeSachartre 		}
52532f5224aeSachartre 
5254047ba61eSachartre 		/*
5255047ba61eSachartre 		 * If this is a pseudo device then its usage depends if the
5256047ba61eSachartre 		 * "slice" option is set or not. If the "slice" option is set
5257047ba61eSachartre 		 * then the pseudo device will be exported as a single slice,
5258047ba61eSachartre 		 * otherwise it will be exported as a full disk.
52592f5224aeSachartre 		 *
52602f5224aeSachartre 		 * For backward compatibility, if vd_volume_force_slice is set
52612f5224aeSachartre 		 * then we always export pseudo devices as slices.
5262047ba61eSachartre 		 */
52632f5224aeSachartre 		if (vd_volume_force_slice) {
52642f5224aeSachartre 			vd->vdisk_type = VD_DISK_TYPE_SLICE;
52652f5224aeSachartre 			vd->nslices = 1;
52662f5224aeSachartre 		}
52672f5224aeSachartre 
52682f5224aeSachartre 		if (vd->vdisk_type == VD_DISK_TYPE_DISK)
5269047ba61eSachartre 			status = vd_setup_backend_vnode(vd);
5270047ba61eSachartre 		else
5271047ba61eSachartre 			status = vd_setup_backend_ldi(vd);
5272047ba61eSachartre 		break;
5273047ba61eSachartre 
5274047ba61eSachartre 	default:
5275047ba61eSachartre 		PRN("Unsupported vdisk backend %s", path);
5276047ba61eSachartre 		VN_RELE(vnp);
5277047ba61eSachartre 		status = EBADF;
5278047ba61eSachartre 	}
5279047ba61eSachartre 
5280047ba61eSachartre done:
5281047ba61eSachartre 	if (status != 0) {
5282047ba61eSachartre 		/*
5283047ba61eSachartre 		 * If the error is retryable print an error message only
5284047ba61eSachartre 		 * during the first try.
5285047ba61eSachartre 		 */
5286047ba61eSachartre 		if (status == ENXIO || status == ENODEV ||
5287047ba61eSachartre 		    status == ENOENT || status == EROFS) {
5288047ba61eSachartre 			if (!(vd->initialized & VD_SETUP_ERROR)) {
5289047ba61eSachartre 				PRN("%s is currently inaccessible (error %d)",
5290047ba61eSachartre 				    path, status);
5291047ba61eSachartre 			}
5292047ba61eSachartre 			status = EAGAIN;
5293047ba61eSachartre 		} else {
5294047ba61eSachartre 			PRN("%s can not be exported as a virtual disk "
5295047ba61eSachartre 			    "(error %d)", path, status);
5296047ba61eSachartre 		}
5297047ba61eSachartre 		vd->initialized |= VD_SETUP_ERROR;
5298047ba61eSachartre 
5299047ba61eSachartre 	} else if (vd->initialized & VD_SETUP_ERROR) {
5300047ba61eSachartre 		/* print a message only if we previously had an error */
5301047ba61eSachartre 		PRN("%s is now online", path);
5302047ba61eSachartre 		vd->initialized &= ~VD_SETUP_ERROR;
5303047ba61eSachartre 	}
5304047ba61eSachartre 
5305047ba61eSachartre 	return (status);
5306047ba61eSachartre }
5307047ba61eSachartre 
5308047ba61eSachartre static int
5309047ba61eSachartre vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t options,
5310047ba61eSachartre     uint64_t ldc_id, vd_t **vdp)
53111ae08745Sheppo {
53121ae08745Sheppo 	char			tq_name[TASKQ_NAMELEN];
53130a55fbb7Slm66018 	int			status;
53141ae08745Sheppo 	ddi_iblock_cookie_t	iblock = NULL;
53151ae08745Sheppo 	ldc_attr_t		ldc_attr;
53161ae08745Sheppo 	vd_t			*vd;
53171ae08745Sheppo 
53181ae08745Sheppo 
53191ae08745Sheppo 	ASSERT(vds != NULL);
5320e1ebb9ecSlm66018 	ASSERT(device_path != NULL);
53211ae08745Sheppo 	ASSERT(vdp != NULL);
5322e1ebb9ecSlm66018 	PR0("Adding vdisk for %s", device_path);
53231ae08745Sheppo 
53241ae08745Sheppo 	if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) {
53251ae08745Sheppo 		PRN("No memory for virtual disk");
53261ae08745Sheppo 		return (EAGAIN);
53271ae08745Sheppo 	}
53281ae08745Sheppo 	*vdp = vd;	/* assign here so vds_destroy_vd() can cleanup later */
53291ae08745Sheppo 	vd->vds = vds;
53303c96341aSnarayan 	(void) strncpy(vd->device_path, device_path, MAXPATHLEN);
53311ae08745Sheppo 
5332047ba61eSachartre 	/* Setup open flags */
5333047ba61eSachartre 	vd->open_flags = FREAD;
5334047ba61eSachartre 
5335047ba61eSachartre 	if (!(options & VD_OPT_RDONLY))
5336047ba61eSachartre 		vd->open_flags |= FWRITE;
5337047ba61eSachartre 
5338047ba61eSachartre 	if (options & VD_OPT_EXCLUSIVE)
5339047ba61eSachartre 		vd->open_flags |= FEXCL;
5340047ba61eSachartre 
5341047ba61eSachartre 	/* Setup disk type */
5342047ba61eSachartre 	if (options & VD_OPT_SLICE) {
5343047ba61eSachartre 		vd->vdisk_type = VD_DISK_TYPE_SLICE;
5344047ba61eSachartre 		vd->nslices = 1;
5345047ba61eSachartre 	} else {
5346047ba61eSachartre 		vd->vdisk_type = VD_DISK_TYPE_DISK;
5347047ba61eSachartre 		vd->nslices = V_NUMPAR;
5348047ba61eSachartre 	}
5349047ba61eSachartre 
5350047ba61eSachartre 	/* default disk label */
5351047ba61eSachartre 	vd->vdisk_label = VD_DISK_LABEL_UNK;
5352047ba61eSachartre 
53530a55fbb7Slm66018 	/* Open vdisk and initialize parameters */
53543c96341aSnarayan 	if ((status = vd_setup_vd(vd)) == 0) {
53553c96341aSnarayan 		vd->initialized |= VD_DISK_READY;
53561ae08745Sheppo 
53573c96341aSnarayan 		ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
53583c96341aSnarayan 		PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u",
53593c96341aSnarayan 		    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
53603c96341aSnarayan 		    (vd->pseudo ? "yes" : "no"), (vd->file ? "yes" : "no"),
53613c96341aSnarayan 		    vd->nslices);
53623c96341aSnarayan 	} else {
53633c96341aSnarayan 		if (status != EAGAIN)
53643c96341aSnarayan 			return (status);
53653c96341aSnarayan 	}
53661ae08745Sheppo 
53671ae08745Sheppo 	/* Initialize locking */
53681ae08745Sheppo 	if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED,
53691ae08745Sheppo 	    &iblock) != DDI_SUCCESS) {
53701ae08745Sheppo 		PRN("Could not get iblock cookie.");
53711ae08745Sheppo 		return (EIO);
53721ae08745Sheppo 	}
53731ae08745Sheppo 
53741ae08745Sheppo 	mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock);
53751ae08745Sheppo 	vd->initialized |= VD_LOCKING;
53761ae08745Sheppo 
53771ae08745Sheppo 
5378d10e4ef2Snarayan 	/* Create start and completion task queues for the vdisk */
5379d10e4ef2Snarayan 	(void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id);
53801ae08745Sheppo 	PR1("tq_name = %s", tq_name);
5381d10e4ef2Snarayan 	if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1,
53821ae08745Sheppo 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
53831ae08745Sheppo 		PRN("Could not create task queue");
53841ae08745Sheppo 		return (EIO);
53851ae08745Sheppo 	}
5386d10e4ef2Snarayan 	(void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id);
5387d10e4ef2Snarayan 	PR1("tq_name = %s", tq_name);
5388d10e4ef2Snarayan 	if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1,
5389d10e4ef2Snarayan 	    TASKQ_DEFAULTPRI, 0)) == NULL) {
5390d10e4ef2Snarayan 		PRN("Could not create task queue");
5391d10e4ef2Snarayan 		return (EIO);
5392d10e4ef2Snarayan 	}
5393d10e4ef2Snarayan 	vd->enabled = 1;	/* before callback can dispatch to startq */
53941ae08745Sheppo 
53951ae08745Sheppo 
53961ae08745Sheppo 	/* Bring up LDC */
53971ae08745Sheppo 	ldc_attr.devclass	= LDC_DEV_BLK_SVC;
53981ae08745Sheppo 	ldc_attr.instance	= ddi_get_instance(vds->dip);
53991ae08745Sheppo 	ldc_attr.mode		= LDC_MODE_UNRELIABLE;
5400e1ebb9ecSlm66018 	ldc_attr.mtu		= VD_LDC_MTU;
54011ae08745Sheppo 	if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) {
540217cadca8Slm66018 		PRN("Could not initialize LDC channel %lx, "
5403690555a1Sachartre 		    "init failed with error %d", ldc_id, status);
54041ae08745Sheppo 		return (status);
54051ae08745Sheppo 	}
54061ae08745Sheppo 	vd->initialized |= VD_LDC;
54071ae08745Sheppo 
54081ae08745Sheppo 	if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events,
54091ae08745Sheppo 	    (caddr_t)vd)) != 0) {
5410690555a1Sachartre 		PRN("Could not initialize LDC channel %lu,"
5411690555a1Sachartre 		    "reg_callback failed with error %d", ldc_id, status);
54121ae08745Sheppo 		return (status);
54131ae08745Sheppo 	}
54141ae08745Sheppo 
54151ae08745Sheppo 	if ((status = ldc_open(vd->ldc_handle)) != 0) {
5416690555a1Sachartre 		PRN("Could not initialize LDC channel %lu,"
5417690555a1Sachartre 		    "open failed with error %d", ldc_id, status);
54181ae08745Sheppo 		return (status);
54191ae08745Sheppo 	}
54201ae08745Sheppo 
54213af08d82Slm66018 	if ((status = ldc_up(vd->ldc_handle)) != 0) {
542234683adeSsg70180 		PR0("ldc_up() returned errno %d", status);
54233af08d82Slm66018 	}
54243af08d82Slm66018 
54254bac2208Snarayan 	/* Allocate the inband task memory handle */
54264bac2208Snarayan 	status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl));
54274bac2208Snarayan 	if (status) {
5428690555a1Sachartre 		PRN("Could not initialize LDC channel %lu,"
5429690555a1Sachartre 		    "alloc_handle failed with error %d", ldc_id, status);
54304bac2208Snarayan 		return (ENXIO);
54314bac2208Snarayan 	}
54321ae08745Sheppo 
54331ae08745Sheppo 	/* Add the successfully-initialized vdisk to the server's table */
54341ae08745Sheppo 	if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) {
54351ae08745Sheppo 		PRN("Error adding vdisk ID %lu to table", id);
54361ae08745Sheppo 		return (EIO);
54371ae08745Sheppo 	}
54381ae08745Sheppo 
54393af08d82Slm66018 	/* Allocate the staging buffer */
54403af08d82Slm66018 	vd->max_msglen	= sizeof (vio_msg_t);	/* baseline vio message size */
54413af08d82Slm66018 	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);
54423af08d82Slm66018 
54433af08d82Slm66018 	/* store initial state */
54443af08d82Slm66018 	vd->state = VD_STATE_INIT;
54453af08d82Slm66018 
54461ae08745Sheppo 	return (0);
54471ae08745Sheppo }
54481ae08745Sheppo 
54493af08d82Slm66018 static void
54503af08d82Slm66018 vd_free_dring_task(vd_t *vdp)
54513af08d82Slm66018 {
54523af08d82Slm66018 	if (vdp->dring_task != NULL) {
54533af08d82Slm66018 		ASSERT(vdp->dring_len != 0);
54543af08d82Slm66018 		/* Free all dring_task memory handles */
54553af08d82Slm66018 		for (int i = 0; i < vdp->dring_len; i++) {
54563af08d82Slm66018 			(void) ldc_mem_free_handle(vdp->dring_task[i].mhdl);
54573af08d82Slm66018 			kmem_free(vdp->dring_task[i].msg, vdp->max_msglen);
54583af08d82Slm66018 			vdp->dring_task[i].msg = NULL;
54593af08d82Slm66018 		}
54603af08d82Slm66018 		kmem_free(vdp->dring_task,
54613af08d82Slm66018 		    (sizeof (*vdp->dring_task)) * vdp->dring_len);
54623af08d82Slm66018 		vdp->dring_task = NULL;
54633af08d82Slm66018 	}
54643af08d82Slm66018 }
54653af08d82Slm66018 
54661ae08745Sheppo /*
54671ae08745Sheppo  * Destroy the state associated with a virtual disk
54681ae08745Sheppo  */
54691ae08745Sheppo static void
54701ae08745Sheppo vds_destroy_vd(void *arg)
54711ae08745Sheppo {
54721ae08745Sheppo 	vd_t	*vd = (vd_t *)arg;
547334683adeSsg70180 	int	retry = 0, rv;
54741ae08745Sheppo 
54751ae08745Sheppo 	if (vd == NULL)
54761ae08745Sheppo 		return;
54771ae08745Sheppo 
5478d10e4ef2Snarayan 	PR0("Destroying vdisk state");
5479d10e4ef2Snarayan 
54801ae08745Sheppo 	/* Disable queuing requests for the vdisk */
54811ae08745Sheppo 	if (vd->initialized & VD_LOCKING) {
54821ae08745Sheppo 		mutex_enter(&vd->lock);
54831ae08745Sheppo 		vd->enabled = 0;
54841ae08745Sheppo 		mutex_exit(&vd->lock);
54851ae08745Sheppo 	}
54861ae08745Sheppo 
5487d10e4ef2Snarayan 	/* Drain and destroy start queue (*before* destroying completionq) */
5488d10e4ef2Snarayan 	if (vd->startq != NULL)
5489d10e4ef2Snarayan 		ddi_taskq_destroy(vd->startq);	/* waits for queued tasks */
5490d10e4ef2Snarayan 
5491d10e4ef2Snarayan 	/* Drain and destroy completion queue (*before* shutting down LDC) */
5492d10e4ef2Snarayan 	if (vd->completionq != NULL)
5493d10e4ef2Snarayan 		ddi_taskq_destroy(vd->completionq);	/* waits for tasks */
5494d10e4ef2Snarayan 
54953af08d82Slm66018 	vd_free_dring_task(vd);
54963af08d82Slm66018 
549734683adeSsg70180 	/* Free the inband task memory handle */
549834683adeSsg70180 	(void) ldc_mem_free_handle(vd->inband_task.mhdl);
549934683adeSsg70180 
550034683adeSsg70180 	/* Shut down LDC */
550134683adeSsg70180 	if (vd->initialized & VD_LDC) {
550234683adeSsg70180 		/* unmap the dring */
550334683adeSsg70180 		if (vd->initialized & VD_DRING)
550434683adeSsg70180 			(void) ldc_mem_dring_unmap(vd->dring_handle);
550534683adeSsg70180 
550634683adeSsg70180 		/* close LDC channel - retry on EAGAIN */
550734683adeSsg70180 		while ((rv = ldc_close(vd->ldc_handle)) == EAGAIN) {
550834683adeSsg70180 			if (++retry > vds_ldc_retries) {
550934683adeSsg70180 				PR0("Timed out closing channel");
551034683adeSsg70180 				break;
551134683adeSsg70180 			}
551234683adeSsg70180 			drv_usecwait(vds_ldc_delay);
551334683adeSsg70180 		}
551434683adeSsg70180 		if (rv == 0) {
551534683adeSsg70180 			(void) ldc_unreg_callback(vd->ldc_handle);
551634683adeSsg70180 			(void) ldc_fini(vd->ldc_handle);
551734683adeSsg70180 		} else {
551834683adeSsg70180 			/*
551934683adeSsg70180 			 * Closing the LDC channel has failed. Ideally we should
552034683adeSsg70180 			 * fail here but there is no Zeus level infrastructure
552134683adeSsg70180 			 * to handle this. The MD has already been changed and
552234683adeSsg70180 			 * we have to do the close. So we try to do as much
552334683adeSsg70180 			 * clean up as we can.
552434683adeSsg70180 			 */
552534683adeSsg70180 			(void) ldc_set_cb_mode(vd->ldc_handle, LDC_CB_DISABLE);
552634683adeSsg70180 			while (ldc_unreg_callback(vd->ldc_handle) == EAGAIN)
552734683adeSsg70180 				drv_usecwait(vds_ldc_delay);
552834683adeSsg70180 		}
552934683adeSsg70180 	}
553034683adeSsg70180 
55313af08d82Slm66018 	/* Free the staging buffer for msgs */
55323af08d82Slm66018 	if (vd->vio_msgp != NULL) {
55333af08d82Slm66018 		kmem_free(vd->vio_msgp, vd->max_msglen);
55343af08d82Slm66018 		vd->vio_msgp = NULL;
55353af08d82Slm66018 	}
55363af08d82Slm66018 
55373af08d82Slm66018 	/* Free the inband message buffer */
55383af08d82Slm66018 	if (vd->inband_task.msg != NULL) {
55393af08d82Slm66018 		kmem_free(vd->inband_task.msg, vd->max_msglen);
55403af08d82Slm66018 		vd->inband_task.msg = NULL;
5541d10e4ef2Snarayan 	}
5542da6c28aaSamw 
55433c96341aSnarayan 	if (vd->file) {
5544690555a1Sachartre 		/* Close file */
5545047ba61eSachartre 		(void) VOP_CLOSE(vd->file_vnode, vd->open_flags, 1,
5546da6c28aaSamw 		    0, kcred, NULL);
55473c96341aSnarayan 		VN_RELE(vd->file_vnode);
554887a7269eSachartre 		if (vd->file_devid != NULL)
554987a7269eSachartre 			ddi_devid_free(vd->file_devid);
55503c96341aSnarayan 	} else {
55511ae08745Sheppo 		/* Close any open backing-device slices */
55521ae08745Sheppo 		for (uint_t slice = 0; slice < vd->nslices; slice++) {
55531ae08745Sheppo 			if (vd->ldi_handle[slice] != NULL) {
55541ae08745Sheppo 				PR0("Closing slice %u", slice);
55551ae08745Sheppo 				(void) ldi_close(vd->ldi_handle[slice],
5556047ba61eSachartre 				    vd->open_flags, kcred);
55571ae08745Sheppo 			}
55581ae08745Sheppo 		}
55593c96341aSnarayan 	}
55601ae08745Sheppo 
55611ae08745Sheppo 	/* Free lock */
55621ae08745Sheppo 	if (vd->initialized & VD_LOCKING)
55631ae08745Sheppo 		mutex_destroy(&vd->lock);
55641ae08745Sheppo 
55651ae08745Sheppo 	/* Finally, free the vdisk structure itself */
55661ae08745Sheppo 	kmem_free(vd, sizeof (*vd));
55671ae08745Sheppo }
55681ae08745Sheppo 
55691ae08745Sheppo static int
5570047ba61eSachartre vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t options,
5571047ba61eSachartre     uint64_t ldc_id)
55721ae08745Sheppo {
55731ae08745Sheppo 	int	status;
55741ae08745Sheppo 	vd_t	*vd = NULL;
55751ae08745Sheppo 
55761ae08745Sheppo 
5577047ba61eSachartre 	if ((status = vds_do_init_vd(vds, id, device_path, options,
5578047ba61eSachartre 	    ldc_id, &vd)) != 0)
55791ae08745Sheppo 		vds_destroy_vd(vd);
55801ae08745Sheppo 
55811ae08745Sheppo 	return (status);
55821ae08745Sheppo }
55831ae08745Sheppo 
55841ae08745Sheppo static int
55851ae08745Sheppo vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel,
55861ae08745Sheppo     uint64_t *ldc_id)
55871ae08745Sheppo {
55881ae08745Sheppo 	int	num_channels;
55891ae08745Sheppo 
55901ae08745Sheppo 
55911ae08745Sheppo 	/* Look for channel endpoint child(ren) of the vdisk MD node */
55921ae08745Sheppo 	if ((num_channels = md_scan_dag(md, vd_node,
55931ae08745Sheppo 	    md_find_name(md, VD_CHANNEL_ENDPOINT),
55941ae08745Sheppo 	    md_find_name(md, "fwd"), channel)) <= 0) {
55951ae08745Sheppo 		PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT);
55961ae08745Sheppo 		return (-1);
55971ae08745Sheppo 	}
55981ae08745Sheppo 
55991ae08745Sheppo 	/* Get the "id" value for the first channel endpoint node */
56001ae08745Sheppo 	if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) {
56011ae08745Sheppo 		PRN("No \"%s\" property found for \"%s\" of vdisk",
56021ae08745Sheppo 		    VD_ID_PROP, VD_CHANNEL_ENDPOINT);
56031ae08745Sheppo 		return (-1);
56041ae08745Sheppo 	}
56051ae08745Sheppo 
56061ae08745Sheppo 	if (num_channels > 1) {
56071ae08745Sheppo 		PRN("Using ID of first of multiple channels for this vdisk");
56081ae08745Sheppo 	}
56091ae08745Sheppo 
56101ae08745Sheppo 	return (0);
56111ae08745Sheppo }
56121ae08745Sheppo 
56131ae08745Sheppo static int
56141ae08745Sheppo vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id)
56151ae08745Sheppo {
56161ae08745Sheppo 	int		num_nodes, status;
56171ae08745Sheppo 	size_t		size;
56181ae08745Sheppo 	mde_cookie_t	*channel;
56191ae08745Sheppo 
56201ae08745Sheppo 
56211ae08745Sheppo 	if ((num_nodes = md_node_count(md)) <= 0) {
56221ae08745Sheppo 		PRN("Invalid node count in Machine Description subtree");
56231ae08745Sheppo 		return (-1);
56241ae08745Sheppo 	}
56251ae08745Sheppo 	size = num_nodes*(sizeof (*channel));
56261ae08745Sheppo 	channel = kmem_zalloc(size, KM_SLEEP);
56271ae08745Sheppo 	status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id);
56281ae08745Sheppo 	kmem_free(channel, size);
56291ae08745Sheppo 
56301ae08745Sheppo 	return (status);
56311ae08745Sheppo }
56321ae08745Sheppo 
5633047ba61eSachartre /*
5634047ba61eSachartre  * Function:
5635047ba61eSachartre  *	vds_get_options
5636047ba61eSachartre  *
5637047ba61eSachartre  * Description:
5638047ba61eSachartre  * 	Parse the options of a vds node. Options are defined as an array
5639047ba61eSachartre  *	of strings in the vds-block-device-opts property of the vds node
5640047ba61eSachartre  *	in the machine description. Options are returned as a bitmask. The
5641047ba61eSachartre  *	mapping between the bitmask options and the options strings from the
5642047ba61eSachartre  *	machine description is defined in the vd_bdev_options[] array.
5643047ba61eSachartre  *
5644047ba61eSachartre  *	The vds-block-device-opts property is optional. If a vds has no such
5645047ba61eSachartre  *	property then no option is defined.
5646047ba61eSachartre  *
5647047ba61eSachartre  * Parameters:
5648047ba61eSachartre  *	md		- machine description.
5649047ba61eSachartre  *	vd_node		- vds node in the machine description for which
5650047ba61eSachartre  *			  options have to be parsed.
5651047ba61eSachartre  *	options		- the returned options.
5652047ba61eSachartre  *
5653047ba61eSachartre  * Return Code:
5654047ba61eSachartre  *	none.
5655047ba61eSachartre  */
5656047ba61eSachartre static void
5657047ba61eSachartre vds_get_options(md_t *md, mde_cookie_t vd_node, uint64_t *options)
5658047ba61eSachartre {
5659047ba61eSachartre 	char	*optstr, *opt;
5660047ba61eSachartre 	int	len, n, i;
5661047ba61eSachartre 
5662047ba61eSachartre 	*options = 0;
5663047ba61eSachartre 
5664047ba61eSachartre 	if (md_get_prop_data(md, vd_node, VD_BLOCK_DEVICE_OPTS,
5665047ba61eSachartre 	    (uint8_t **)&optstr, &len) != 0) {
5666047ba61eSachartre 		PR0("No options found");
5667047ba61eSachartre 		return;
5668047ba61eSachartre 	}
5669047ba61eSachartre 
5670047ba61eSachartre 	/* parse options */
5671047ba61eSachartre 	opt = optstr;
5672047ba61eSachartre 	n = sizeof (vd_bdev_options) / sizeof (vd_option_t);
5673047ba61eSachartre 
5674047ba61eSachartre 	while (opt < optstr + len) {
5675047ba61eSachartre 		for (i = 0; i < n; i++) {
5676047ba61eSachartre 			if (strncmp(vd_bdev_options[i].vdo_name,
5677047ba61eSachartre 			    opt, VD_OPTION_NLEN) == 0) {
5678047ba61eSachartre 				*options |= vd_bdev_options[i].vdo_value;
5679047ba61eSachartre 				break;
5680047ba61eSachartre 			}
5681047ba61eSachartre 		}
5682047ba61eSachartre 
5683047ba61eSachartre 		if (i < n) {
5684047ba61eSachartre 			PR0("option: %s", opt);
5685047ba61eSachartre 		} else {
5686047ba61eSachartre 			PRN("option %s is unknown or unsupported", opt);
5687047ba61eSachartre 		}
5688047ba61eSachartre 
5689047ba61eSachartre 		opt += strlen(opt) + 1;
5690047ba61eSachartre 	}
5691047ba61eSachartre }
5692047ba61eSachartre 
56931ae08745Sheppo static void
56941ae08745Sheppo vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
56951ae08745Sheppo {
5696e1ebb9ecSlm66018 	char		*device_path = NULL;
5697047ba61eSachartre 	uint64_t	id = 0, ldc_id = 0, options = 0;
56981ae08745Sheppo 
56991ae08745Sheppo 	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
57001ae08745Sheppo 		PRN("Error getting vdisk \"%s\"", VD_ID_PROP);
57011ae08745Sheppo 		return;
57021ae08745Sheppo 	}
57031ae08745Sheppo 	PR0("Adding vdisk ID %lu", id);
57041ae08745Sheppo 	if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP,
5705e1ebb9ecSlm66018 	    &device_path) != 0) {
57061ae08745Sheppo 		PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
57071ae08745Sheppo 		return;
57081ae08745Sheppo 	}
57091ae08745Sheppo 
5710047ba61eSachartre 	vds_get_options(md, vd_node, &options);
5711047ba61eSachartre 
57121ae08745Sheppo 	if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) {
57131ae08745Sheppo 		PRN("Error getting LDC ID for vdisk %lu", id);
57141ae08745Sheppo 		return;
57151ae08745Sheppo 	}
57161ae08745Sheppo 
5717047ba61eSachartre 	if (vds_init_vd(vds, id, device_path, options, ldc_id) != 0) {
57181ae08745Sheppo 		PRN("Failed to add vdisk ID %lu", id);
571917cadca8Slm66018 		if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
572017cadca8Slm66018 			PRN("No vDisk entry found for vdisk ID %lu", id);
57211ae08745Sheppo 		return;
57221ae08745Sheppo 	}
57231ae08745Sheppo }
57241ae08745Sheppo 
57251ae08745Sheppo static void
57261ae08745Sheppo vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
57271ae08745Sheppo {
57281ae08745Sheppo 	uint64_t	id = 0;
57291ae08745Sheppo 
57301ae08745Sheppo 
57311ae08745Sheppo 	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
57321ae08745Sheppo 		PRN("Unable to get \"%s\" property from vdisk's MD node",
57331ae08745Sheppo 		    VD_ID_PROP);
57341ae08745Sheppo 		return;
57351ae08745Sheppo 	}
57361ae08745Sheppo 	PR0("Removing vdisk ID %lu", id);
57371ae08745Sheppo 	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
57381ae08745Sheppo 		PRN("No vdisk entry found for vdisk ID %lu", id);
57391ae08745Sheppo }
57401ae08745Sheppo 
57411ae08745Sheppo static void
57421ae08745Sheppo vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node,
57431ae08745Sheppo     md_t *curr_md, mde_cookie_t curr_vd_node)
57441ae08745Sheppo {
57451ae08745Sheppo 	char		*curr_dev, *prev_dev;
5746047ba61eSachartre 	uint64_t	curr_id = 0, curr_ldc_id = 0, curr_options = 0;
5747047ba61eSachartre 	uint64_t	prev_id = 0, prev_ldc_id = 0, prev_options = 0;
57481ae08745Sheppo 	size_t		len;
57491ae08745Sheppo 
57501ae08745Sheppo 
57511ae08745Sheppo 	/* Validate that vdisk ID has not changed */
57521ae08745Sheppo 	if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) {
57531ae08745Sheppo 		PRN("Error getting previous vdisk \"%s\" property",
57541ae08745Sheppo 		    VD_ID_PROP);
57551ae08745Sheppo 		return;
57561ae08745Sheppo 	}
57571ae08745Sheppo 	if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) {
57581ae08745Sheppo 		PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP);
57591ae08745Sheppo 		return;
57601ae08745Sheppo 	}
57611ae08745Sheppo 	if (curr_id != prev_id) {
57621ae08745Sheppo 		PRN("Not changing vdisk:  ID changed from %lu to %lu",
57631ae08745Sheppo 		    prev_id, curr_id);
57641ae08745Sheppo 		return;
57651ae08745Sheppo 	}
57661ae08745Sheppo 
57671ae08745Sheppo 	/* Validate that LDC ID has not changed */
57681ae08745Sheppo 	if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) {
57691ae08745Sheppo 		PRN("Error getting LDC ID for vdisk %lu", prev_id);
57701ae08745Sheppo 		return;
57711ae08745Sheppo 	}
57721ae08745Sheppo 
57731ae08745Sheppo 	if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) {
57741ae08745Sheppo 		PRN("Error getting LDC ID for vdisk %lu", curr_id);
57751ae08745Sheppo 		return;
57761ae08745Sheppo 	}
57771ae08745Sheppo 	if (curr_ldc_id != prev_ldc_id) {
57780a55fbb7Slm66018 		_NOTE(NOTREACHED);	/* lint is confused */
57791ae08745Sheppo 		PRN("Not changing vdisk:  "
57801ae08745Sheppo 		    "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id);
57811ae08745Sheppo 		return;
57821ae08745Sheppo 	}
57831ae08745Sheppo 
57841ae08745Sheppo 	/* Determine whether device path has changed */
57851ae08745Sheppo 	if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP,
57861ae08745Sheppo 	    &prev_dev) != 0) {
57871ae08745Sheppo 		PRN("Error getting previous vdisk \"%s\"",
57881ae08745Sheppo 		    VD_BLOCK_DEVICE_PROP);
57891ae08745Sheppo 		return;
57901ae08745Sheppo 	}
57911ae08745Sheppo 	if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP,
57921ae08745Sheppo 	    &curr_dev) != 0) {
57931ae08745Sheppo 		PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
57941ae08745Sheppo 		return;
57951ae08745Sheppo 	}
57961ae08745Sheppo 	if (((len = strlen(curr_dev)) == strlen(prev_dev)) &&
57971ae08745Sheppo 	    (strncmp(curr_dev, prev_dev, len) == 0))
57981ae08745Sheppo 		return;	/* no relevant (supported) change */
57991ae08745Sheppo 
5800047ba61eSachartre 	/* Validate that options have not changed */
5801047ba61eSachartre 	vds_get_options(prev_md, prev_vd_node, &prev_options);
5802047ba61eSachartre 	vds_get_options(curr_md, curr_vd_node, &curr_options);
5803047ba61eSachartre 	if (prev_options != curr_options) {
5804047ba61eSachartre 		PRN("Not changing vdisk:  options changed from %lx to %lx",
5805047ba61eSachartre 		    prev_options, curr_options);
5806047ba61eSachartre 		return;
5807047ba61eSachartre 	}
5808047ba61eSachartre 
58091ae08745Sheppo 	PR0("Changing vdisk ID %lu", prev_id);
58103af08d82Slm66018 
58111ae08745Sheppo 	/* Remove old state, which will close vdisk and reset */
58121ae08745Sheppo 	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0)
58131ae08745Sheppo 		PRN("No entry found for vdisk ID %lu", prev_id);
58143af08d82Slm66018 
58151ae08745Sheppo 	/* Re-initialize vdisk with new state */
5816047ba61eSachartre 	if (vds_init_vd(vds, curr_id, curr_dev, curr_options,
5817047ba61eSachartre 	    curr_ldc_id) != 0) {
58181ae08745Sheppo 		PRN("Failed to change vdisk ID %lu", curr_id);
58191ae08745Sheppo 		return;
58201ae08745Sheppo 	}
58211ae08745Sheppo }
58221ae08745Sheppo 
58231ae08745Sheppo static int
58241ae08745Sheppo vds_process_md(void *arg, mdeg_result_t *md)
58251ae08745Sheppo {
58261ae08745Sheppo 	int	i;
58271ae08745Sheppo 	vds_t	*vds = arg;
58281ae08745Sheppo 
58291ae08745Sheppo 
58301ae08745Sheppo 	if (md == NULL)
58311ae08745Sheppo 		return (MDEG_FAILURE);
58321ae08745Sheppo 	ASSERT(vds != NULL);
58331ae08745Sheppo 
58341ae08745Sheppo 	for (i = 0; i < md->removed.nelem; i++)
58351ae08745Sheppo 		vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]);
58361ae08745Sheppo 	for (i = 0; i < md->match_curr.nelem; i++)
58371ae08745Sheppo 		vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i],
58381ae08745Sheppo 		    md->match_curr.mdp, md->match_curr.mdep[i]);
58391ae08745Sheppo 	for (i = 0; i < md->added.nelem; i++)
58401ae08745Sheppo 		vds_add_vd(vds, md->added.mdp, md->added.mdep[i]);
58411ae08745Sheppo 
58421ae08745Sheppo 	return (MDEG_SUCCESS);
58431ae08745Sheppo }
58441ae08745Sheppo 
58453c96341aSnarayan 
58461ae08745Sheppo static int
58471ae08745Sheppo vds_do_attach(dev_info_t *dip)
58481ae08745Sheppo {
5849445b4c2eSsb155480 	int			status, sz;
5850445b4c2eSsb155480 	int			cfg_handle;
58511ae08745Sheppo 	minor_t			instance = ddi_get_instance(dip);
58521ae08745Sheppo 	vds_t			*vds;
5853445b4c2eSsb155480 	mdeg_prop_spec_t	*pspecp;
5854445b4c2eSsb155480 	mdeg_node_spec_t	*ispecp;
58551ae08745Sheppo 
58561ae08745Sheppo 	/*
58571ae08745Sheppo 	 * The "cfg-handle" property of a vds node in an MD contains the MD's
58581ae08745Sheppo 	 * notion of "instance", or unique identifier, for that node; OBP
58591ae08745Sheppo 	 * stores the value of the "cfg-handle" MD property as the value of
58601ae08745Sheppo 	 * the "reg" property on the node in the device tree it builds from
58611ae08745Sheppo 	 * the MD and passes to Solaris.  Thus, we look up the devinfo node's
58621ae08745Sheppo 	 * "reg" property value to uniquely identify this device instance when
58631ae08745Sheppo 	 * registering with the MD event-generation framework.  If the "reg"
58641ae08745Sheppo 	 * property cannot be found, the device tree state is presumably so
58651ae08745Sheppo 	 * broken that there is no point in continuing.
58661ae08745Sheppo 	 */
5867445b4c2eSsb155480 	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
5868445b4c2eSsb155480 	    VD_REG_PROP)) {
5869445b4c2eSsb155480 		PRN("vds \"%s\" property does not exist", VD_REG_PROP);
58701ae08745Sheppo 		return (DDI_FAILURE);
58711ae08745Sheppo 	}
58721ae08745Sheppo 
58731ae08745Sheppo 	/* Get the MD instance for later MDEG registration */
58741ae08745Sheppo 	cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
5875445b4c2eSsb155480 	    VD_REG_PROP, -1);
58761ae08745Sheppo 
58771ae08745Sheppo 	if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) {
58781ae08745Sheppo 		PRN("Could not allocate state for instance %u", instance);
58791ae08745Sheppo 		return (DDI_FAILURE);
58801ae08745Sheppo 	}
58811ae08745Sheppo 
58821ae08745Sheppo 	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
58831ae08745Sheppo 		PRN("Could not get state for instance %u", instance);
58841ae08745Sheppo 		ddi_soft_state_free(vds_state, instance);
58851ae08745Sheppo 		return (DDI_FAILURE);
58861ae08745Sheppo 	}
58871ae08745Sheppo 
58881ae08745Sheppo 	vds->dip	= dip;
58891ae08745Sheppo 	vds->vd_table	= mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS,
589087a7269eSachartre 	    vds_destroy_vd, sizeof (void *));
589187a7269eSachartre 
58921ae08745Sheppo 	ASSERT(vds->vd_table != NULL);
58931ae08745Sheppo 
58941ae08745Sheppo 	if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) {
58951ae08745Sheppo 		PRN("ldi_ident_from_dip() returned errno %d", status);
58961ae08745Sheppo 		return (DDI_FAILURE);
58971ae08745Sheppo 	}
58981ae08745Sheppo 	vds->initialized |= VDS_LDI;
58991ae08745Sheppo 
59001ae08745Sheppo 	/* Register for MD updates */
5901445b4c2eSsb155480 	sz = sizeof (vds_prop_template);
5902445b4c2eSsb155480 	pspecp = kmem_alloc(sz, KM_SLEEP);
5903445b4c2eSsb155480 	bcopy(vds_prop_template, pspecp, sz);
5904445b4c2eSsb155480 
5905445b4c2eSsb155480 	VDS_SET_MDEG_PROP_INST(pspecp, cfg_handle);
5906445b4c2eSsb155480 
5907445b4c2eSsb155480 	/* initialize the complete prop spec structure */
5908445b4c2eSsb155480 	ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
5909445b4c2eSsb155480 	ispecp->namep = "virtual-device";
5910445b4c2eSsb155480 	ispecp->specp = pspecp;
5911445b4c2eSsb155480 
5912445b4c2eSsb155480 	if (mdeg_register(ispecp, &vd_match, vds_process_md, vds,
59131ae08745Sheppo 	    &vds->mdeg) != MDEG_SUCCESS) {
59141ae08745Sheppo 		PRN("Unable to register for MD updates");
5915445b4c2eSsb155480 		kmem_free(ispecp, sizeof (mdeg_node_spec_t));
5916445b4c2eSsb155480 		kmem_free(pspecp, sz);
59171ae08745Sheppo 		return (DDI_FAILURE);
59181ae08745Sheppo 	}
5919445b4c2eSsb155480 
5920445b4c2eSsb155480 	vds->ispecp = ispecp;
59211ae08745Sheppo 	vds->initialized |= VDS_MDEG;
59221ae08745Sheppo 
59230a55fbb7Slm66018 	/* Prevent auto-detaching so driver is available whenever MD changes */
59240a55fbb7Slm66018 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
59250a55fbb7Slm66018 	    DDI_PROP_SUCCESS) {
59260a55fbb7Slm66018 		PRN("failed to set \"%s\" property for instance %u",
59270a55fbb7Slm66018 		    DDI_NO_AUTODETACH, instance);
59280a55fbb7Slm66018 	}
59290a55fbb7Slm66018 
59301ae08745Sheppo 	ddi_report_dev(dip);
59311ae08745Sheppo 	return (DDI_SUCCESS);
59321ae08745Sheppo }
59331ae08745Sheppo 
59341ae08745Sheppo static int
59351ae08745Sheppo vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
59361ae08745Sheppo {
59371ae08745Sheppo 	int	status;
59381ae08745Sheppo 
59391ae08745Sheppo 	switch (cmd) {
59401ae08745Sheppo 	case DDI_ATTACH:
5941d10e4ef2Snarayan 		PR0("Attaching");
59421ae08745Sheppo 		if ((status = vds_do_attach(dip)) != DDI_SUCCESS)
59431ae08745Sheppo 			(void) vds_detach(dip, DDI_DETACH);
59441ae08745Sheppo 		return (status);
59451ae08745Sheppo 	case DDI_RESUME:
5946d10e4ef2Snarayan 		PR0("No action required for DDI_RESUME");
59471ae08745Sheppo 		return (DDI_SUCCESS);
59481ae08745Sheppo 	default:
59491ae08745Sheppo 		return (DDI_FAILURE);
59501ae08745Sheppo 	}
59511ae08745Sheppo }
59521ae08745Sheppo 
59531ae08745Sheppo static struct dev_ops vds_ops = {
59541ae08745Sheppo 	DEVO_REV,	/* devo_rev */
59551ae08745Sheppo 	0,		/* devo_refcnt */
59561ae08745Sheppo 	ddi_no_info,	/* devo_getinfo */
59571ae08745Sheppo 	nulldev,	/* devo_identify */
59581ae08745Sheppo 	nulldev,	/* devo_probe */
59591ae08745Sheppo 	vds_attach,	/* devo_attach */
59601ae08745Sheppo 	vds_detach,	/* devo_detach */
59611ae08745Sheppo 	nodev,		/* devo_reset */
59621ae08745Sheppo 	NULL,		/* devo_cb_ops */
59631ae08745Sheppo 	NULL,		/* devo_bus_ops */
59641ae08745Sheppo 	nulldev		/* devo_power */
59651ae08745Sheppo };
59661ae08745Sheppo 
59671ae08745Sheppo static struct modldrv modldrv = {
59681ae08745Sheppo 	&mod_driverops,
5969205eeb1aSlm66018 	"virtual disk server",
59701ae08745Sheppo 	&vds_ops,
59711ae08745Sheppo };
59721ae08745Sheppo 
59731ae08745Sheppo static struct modlinkage modlinkage = {
59741ae08745Sheppo 	MODREV_1,
59751ae08745Sheppo 	&modldrv,
59761ae08745Sheppo 	NULL
59771ae08745Sheppo };
59781ae08745Sheppo 
59791ae08745Sheppo 
59801ae08745Sheppo int
59811ae08745Sheppo _init(void)
59821ae08745Sheppo {
598317cadca8Slm66018 	int		status;
5984d10e4ef2Snarayan 
59851ae08745Sheppo 	if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0)
59861ae08745Sheppo 		return (status);
598717cadca8Slm66018 
59881ae08745Sheppo 	if ((status = mod_install(&modlinkage)) != 0) {
59891ae08745Sheppo 		ddi_soft_state_fini(&vds_state);
59901ae08745Sheppo 		return (status);
59911ae08745Sheppo 	}
59921ae08745Sheppo 
59931ae08745Sheppo 	return (0);
59941ae08745Sheppo }
59951ae08745Sheppo 
59961ae08745Sheppo int
59971ae08745Sheppo _info(struct modinfo *modinfop)
59981ae08745Sheppo {
59991ae08745Sheppo 	return (mod_info(&modlinkage, modinfop));
60001ae08745Sheppo }
60011ae08745Sheppo 
60021ae08745Sheppo int
60031ae08745Sheppo _fini(void)
60041ae08745Sheppo {
60051ae08745Sheppo 	int	status;
60061ae08745Sheppo 
60071ae08745Sheppo 	if ((status = mod_remove(&modlinkage)) != 0)
60081ae08745Sheppo 		return (status);
60091ae08745Sheppo 	ddi_soft_state_fini(&vds_state);
60101ae08745Sheppo 	return (0);
60111ae08745Sheppo }
6012