xref: /titanic_51/usr/src/uts/sun4v/io/vds.c (revision 8162146132b0fb9b7c6dc3371ff205edc236ebfa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Virtual disk server
31  */
32 
33 
34 #include <sys/types.h>
35 #include <sys/conf.h>
36 #include <sys/crc32.h>
37 #include <sys/ddi.h>
38 #include <sys/dkio.h>
39 #include <sys/file.h>
40 #include <sys/mdeg.h>
41 #include <sys/modhash.h>
42 #include <sys/note.h>
43 #include <sys/pathname.h>
44 #include <sys/sunddi.h>
45 #include <sys/sunldi.h>
46 #include <sys/sysmacros.h>
47 #include <sys/vio_common.h>
48 #include <sys/vdsk_mailbox.h>
49 #include <sys/vdsk_common.h>
50 #include <sys/vtoc.h>
51 
52 
/*
 * Virtual disk server initialization flags; bit values recorded in the
 * "initialized" member of the vds_t soft state
 */
#define	VDS_LDI			0x01
#define	VDS_MDEG		0x02

/*
 * Virtual disk server tunable parameters; VDS_LDC_RETRIES and VDS_LDC_DELAY
 * are the initial values of the vds_ldc_retries and vds_ldc_delay variables
 * defined later in this file
 */
#define	VDS_LDC_RETRIES		5
#define	VDS_LDC_DELAY		1000 /* usec */
#define	VDS_NCHAINS		32

/* Identification parameters for MD, synthetic dkio(7i) structures, etc. */
#define	VDS_NAME		"virtual-disk-server"

#define	VD_NAME			"vd"
#define	VD_VOLUME_NAME		"vdisk"
#define	VD_ASCIILABEL		"Virtual Disk"

#define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
#define	VD_ID_PROP		"id"
#define	VD_BLOCK_DEVICE_PROP	"vds-block-device"
#define	VD_REG_PROP		"reg"

/*
 * Virtual disk initialization flags; bit values recorded in the
 * "initialized" member of the vd_t soft state
 */
#define	VD_LOCKING		0x01
#define	VD_LDC			0x02
#define	VD_DRING		0x04
#define	VD_SID			0x08
#define	VD_SEQ_NUM		0x10

/*
 * Flags for opening/closing backing devices via LDI; this is the initial
 * value of the vd_open_flags variable defined later in this file
 */
#define	VD_OPEN_FLAGS		(FEXCL | FREAD | FWRITE)

/*
 * By Solaris convention, slice/partition 2 represents the entire disk;
 * unfortunately, this convention does not appear to be codified.
 */
#define	VD_ENTIRE_DISK_SLICE	2
89 
/*
 * Return a cpp token as a string; the argument is stringized exactly as
 * spelled, i.e. without being macro-expanded first
 */
#define	STRINGIZE(token)	#token

/*
 * Print a message prefixed with the current function name to the message log
 * (and optionally to the console for verbose boots); these macros use cpp's
 * concatenation of string literals and C99 variable-length-argument-list
 * macros
 *
 * PRN() glues "?%s():  " onto the front of the caller's format, so the first
 * argument must be a string literal.  It also appends an empty-string
 * argument, matched by the "%s" that _PRN() adds to the end of the format,
 * so that _PRN() always receives at least one variadic argument even when
 * the caller passes nothing but a format.  __func__ is evaluated at the point
 * of use, so the name printed is that of the function containing the PRN().
 */
#define	PRN(...)	_PRN("?%s():  "__VA_ARGS__, "")
#define	_PRN(format, ...)					\
	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)

/*
 * Return a pointer to the "i"th vdisk dring element
 *
 * This macro refers to a variable named "vd" that must be in scope wherever
 * the macro is used; it performs no bounds check on "i".
 */
#define	VD_DRING_ELEM(i)	((vd_dring_entry_t *)(void *)	\
	    (vd->dring + (i)*vd->descriptor_size))

/*
 * Return the virtual disk client's type as a string (for use in messages);
 * the "vd" argument is evaluated up to three times
 */
#define	VD_CLIENT(vd)							\
	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
	    (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" :	\
		(((vd)->xfer_mode == 0) ? "null client" :		\
		    "unsupported client")))
113 
/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vds instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 *
 * Entry [1] ("cfg-handle") is the one whose value is left NULL here and is
 * filled in through VDS_SET_MDEG_PROP_INST() below.
 */
static mdeg_prop_spec_t	vds_prop_template[] = {
	{ MDET_PROP_STR,	"name",		VDS_NAME },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL, 		NULL }
};

/*
 * Store "val" as the value of entry [1] of the property specification
 * "specp", i.e. the "cfg-handle" slot when "specp" is a copy of
 * vds_prop_template.
 *
 * NOTE(review): the expansion already ends in ';', so this macro is a
 * complete statement on its own and is unsafe as the unbraced body of an
 * "if" that has an "else"; callers are outside this part of the file, so the
 * semicolon has been left in place.
 */
#define	VDS_SET_MDEG_PROP_INST(specp, val) (specp)[1].ps_val = (val);
127 
/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 *
 * vd_prop_match lists the single property (VD_ID_PROP) used for matching;
 * vd_match pairs the node name with that property list.
 */
static md_prop_match_t	vd_prop_match[] = {
	{ MDET_PROP_VAL,	VD_ID_PROP },
	{ MDET_LIST_END,	NULL }
};

static mdeg_node_match_t vd_match = {"virtual-device-port",
				    vd_prop_match};
140 
141 /* Debugging macros */
142 #ifdef DEBUG
143 
/*
 * Debug message verbosity: PR0 prints when this is greater than 0, PR1 when
 * greater than 1, PR2 when greater than 2
 */
static int	vd_msglevel = 0;


/*
 * Leveled wrappers around PRN().  Each expands to an unbraced "if" followed
 * by PRN, so a PRx(...) call must not be used as the unbraced body of an
 * "if" that has an "else" (the "else" would bind to the hidden "if").
 */
#define	PR0 if (vd_msglevel > 0)	PRN
#define	PR1 if (vd_msglevel > 1)	PRN
#define	PR2 if (vd_msglevel > 2)	PRN

/*
 * Print the header state and payload fields of a dring element; "elem" must
 * be a pointer expression that is safe to use unparenthesized, and the
 * expansion supplies its own terminating ';'
 */
#define	VD_DUMP_DRING_ELEM(elem)					\
	PRN("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
	    elem->hdr.dstate,						\
	    elem->payload.operation,					\
	    elem->payload.status,					\
	    elem->payload.nbytes,					\
	    elem->payload.addr,						\
	    elem->payload.ncookies);
159 
160 char *
161 vd_decode_state(int state)
162 {
163 	char *str;
164 
165 #define	CASE_STATE(_s)	case _s: str = #_s; break;
166 
167 	switch (state) {
168 	CASE_STATE(VD_STATE_INIT)
169 	CASE_STATE(VD_STATE_VER)
170 	CASE_STATE(VD_STATE_ATTR)
171 	CASE_STATE(VD_STATE_DRING)
172 	CASE_STATE(VD_STATE_RDX)
173 	CASE_STATE(VD_STATE_DATA)
174 	default: str = "unknown"; break;
175 	}
176 
177 #undef CASE_STATE
178 
179 	return (str);
180 }
181 
182 void
183 vd_decode_tag(vio_msg_t *msg)
184 {
185 	char *tstr, *sstr, *estr;
186 
187 #define	CASE_TYPE(_s)	case _s: tstr = #_s; break;
188 
189 	switch (msg->tag.vio_msgtype) {
190 	CASE_TYPE(VIO_TYPE_CTRL)
191 	CASE_TYPE(VIO_TYPE_DATA)
192 	CASE_TYPE(VIO_TYPE_ERR)
193 	default: tstr = "unknown"; break;
194 	}
195 
196 #undef CASE_TYPE
197 
198 #define	CASE_SUBTYPE(_s) case _s: sstr = #_s; break;
199 
200 	switch (msg->tag.vio_subtype) {
201 	CASE_SUBTYPE(VIO_SUBTYPE_INFO)
202 	CASE_SUBTYPE(VIO_SUBTYPE_ACK)
203 	CASE_SUBTYPE(VIO_SUBTYPE_NACK)
204 	default: sstr = "unknown"; break;
205 	}
206 
207 #undef CASE_SUBTYPE
208 
209 #define	CASE_ENV(_s)	case _s: estr = #_s; break;
210 
211 	switch (msg->tag.vio_subtype_env) {
212 	CASE_ENV(VIO_VER_INFO)
213 	CASE_ENV(VIO_ATTR_INFO)
214 	CASE_ENV(VIO_DRING_REG)
215 	CASE_ENV(VIO_DRING_UNREG)
216 	CASE_ENV(VIO_RDX)
217 	CASE_ENV(VIO_PKT_DATA)
218 	CASE_ENV(VIO_DESC_DATA)
219 	CASE_ENV(VIO_DRING_DATA)
220 	default: estr = "unknown"; break;
221 	}
222 
223 #undef CASE_ENV
224 
225 	PR1("(%x/%x/%x) message : (%s/%s/%s)",
226 	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
227 	    msg->tag.vio_subtype_env, tstr, sstr, estr);
228 }
229 
230 #else	/* !DEBUG */
231 
/* Without DEBUG, the leveled message macros expand to nothing */
#define	PR0(...)
#define	PR1(...)
#define	PR2(...)

/* Without DEBUG, dumping a dring element expands to nothing */
#define	VD_DUMP_DRING_ELEM(elem)

/*
 * Without DEBUG, the decode helpers are replaced by macros that discard
 * their argument and evaluate to (NULL)
 */
#define	vd_decode_state(_s)	(NULL)
#define	vd_decode_tag(_s)	(NULL)
240 
241 #endif	/* DEBUG */
242 
243 
/*
 * Soft state structure for a vds instance
 *
 * "initialized" holds the VDS_LDI/VDS_MDEG bit flags defined near the top of
 * this file.
 */
typedef struct vds {
	uint_t		initialized;	/* driver inst initialization flags */
	dev_info_t	*dip;		/* driver inst devinfo pointer */
	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
	mod_hash_t	*vd_table;	/* table of virtual disks served */
	mdeg_node_spec_t *ispecp;	/* mdeg node specification */
	mdeg_handle_t	mdeg;		/* handle for MDEG operations  */
} vds_t;
255 
/*
 * Types of descriptor-processing tasks
 *
 * vd_complete_bio() returns early for VD_NONFINAL_RANGE_TASK; only a
 * VD_FINAL_RANGE_TASK goes on to send the response message.
 */
typedef enum vd_task_type {
	VD_NONFINAL_RANGE_TASK,	/* task for intermediate descriptor in range */
	VD_FINAL_RANGE_TASK,	/* task for last in a range of descriptors */
} vd_task_type_t;
263 
/*
 * Structure describing the task for processing a descriptor
 *
 * The buf(9s) is embedded rather than allocated; vd_start_bio() initializes
 * it with bioinit() and it is torn down again with biofini().  "mhdl" is the
 * LDC memory handle through which the client's data buffer is mapped for the
 * duration of a block I/O.
 */
typedef struct vd_task {
	struct vd		*vd;		/* vd instance task is for */
	vd_task_type_t		type;		/* type of descriptor task */
	int			index;		/* dring elem index for task */
	vio_msg_t		*msg;		/* VIO message task is for */
	size_t			msglen;		/* length of message content */
	vd_dring_payload_t	*request;	/* request task will perform */
	struct buf		buf;		/* buf(9s) for I/O request */
	ldc_mem_handle_t	mhdl;		/* task memory handle */
} vd_task_t;
277 
/*
 * Soft state structure for a virtual disk instance
 *
 * "initialized" holds the VD_* initialization bit flags defined near the top
 * of this file.  Only the members that follow "lock" are declared as being
 * protected by it.
 *
 * NOTE(review): several functions in this file test "reset_state" without
 * holding "lock" (e.g. vd_mark_elem_done() and vd_complete_bio()); confirm
 * that those unlocked reads are intentional.
 */
typedef struct vd {
	uint_t			initialized;	/* vdisk initialization flags */
	vds_t			*vds;		/* server for this vdisk */
	ddi_taskq_t		*startq;	/* queue for I/O start tasks */
	ddi_taskq_t		*completionq;	/* queue for completion tasks */
	ldi_handle_t		ldi_handle[V_NUMPAR];	/* LDI slice handles */
	dev_t			dev[V_NUMPAR];	/* dev numbers for slices */
	uint_t			nslices;	/* number of slices */
	size_t			vdisk_size;	/* number of blocks in vdisk */
	vd_disk_type_t		vdisk_type;	/* slice or entire disk */
	vd_disk_label_t		vdisk_label;	/* EFI or VTOC label */
	ushort_t		max_xfer_sz;	/* max xfer size in DEV_BSIZE */
	boolean_t		pseudo;		/* underlying pseudo dev */
	struct dk_efi		dk_efi;		/* synthetic for slice type */
	struct dk_geom		dk_geom;	/* synthetic for slice type */
	struct vtoc		vtoc;		/* synthetic for slice type */
	ldc_status_t		ldc_state;	/* LDC connection state */
	ldc_handle_t		ldc_handle;	/* handle for LDC comm */
	size_t			max_msglen;	/* largest LDC message len */
	vd_state_t		state;		/* client handshake state */
	uint8_t			xfer_mode;	/* transfer mode with client */
	uint32_t		sid;		/* client's session ID */
	uint64_t		seq_num;	/* message sequence number */
	uint64_t		dring_ident;	/* identifier of dring */
	ldc_dring_handle_t	dring_handle;	/* handle for dring ops */
	uint32_t		descriptor_size;	/* num bytes in desc */
	uint32_t		dring_len;	/* number of dring elements */
	caddr_t			dring;		/* address of dring */
	caddr_t			vio_msgp;	/* vio msg staging buffer */
	vd_task_t		inband_task;	/* task for inband descriptor */
	vd_task_t		*dring_task;	/* tasks dring elements */

	kmutex_t		lock;		/* protects variables below */
	boolean_t		enabled;	/* is vdisk enabled? */
	boolean_t		reset_state;	/* reset connection state? */
	boolean_t		reset_ldc;	/* reset LDC channel? */
} vd_t;
318 
/*
 * Description of one supported vdisk operation; the vds_operation[] table
 * later in this file holds one of these per operation
 */
typedef struct vds_operation {
	char	*namep;			/* operation name, for messages */
	uint8_t	operation;		/* vdisk operation code (VD_OP_*) */
	/* begin the operation; vd_start_bio() returns EINPROGRESS if async */
	int	(*start)(vd_task_t *task);
	/* finish an asynchronous operation; NULL for all but block I/O */
	void	(*complete)(void *arg);
} vds_operation_t;
325 
/*
 * Description of how one vdisk operation maps onto a dkio ioctl
 *
 * "copyin" and "copyout" may each be NULL (no data moves in that direction),
 * VD_IDENTITY (the client's buffer is used directly as the ioctl argument),
 * or a conversion function.  vd_do_ioctl() overwrites "arg" with the client
 * buffer when "copyin" is VD_IDENTITY.
 */
typedef struct vd_ioctl {
	uint8_t		operation;		/* vdisk operation */
	const char	*operation_name;	/* vdisk operation name */
	size_t		nbytes;			/* size of operation buffer */
	int		cmd;			/* corresponding ioctl cmd */
	const char	*cmd_name;		/* ioctl cmd name */
	void		*arg;			/* ioctl cmd argument */
	/* convert input vd_buf to output ioctl_arg */
	void		(*copyin)(void *vd_buf, void *ioctl_arg);
	/* convert input ioctl_arg to output vd_buf */
	void		(*copyout)(void *ioctl_arg, void *vd_buf);
} vd_ioctl_t;
338 
/*
 * Define trivial copyin/copyout conversion function flag
 *
 * This is a sentinel, not a callable function: code must compare a
 * copyin/copyout pointer against VD_IDENTITY before calling through it.
 */
#define	VD_IDENTITY	((void (*)(void *, void *))-1)


static int	vds_ldc_retries = VDS_LDC_RETRIES;
/* delay, in microseconds, between ldc_write() retries in send_msg() */
static int	vds_ldc_delay = VDS_LDC_DELAY;
static void	*vds_state;
static uint64_t	vds_operations;	/* see vds_operation[] definition below */

/* flags passed (together with FKIOCTL) to ldi_ioctl() on backing devices */
static int	vd_open_flags = VD_OPEN_FLAGS;
349 
/*
 * Supported protocol version pairs, from highest (newest) to lowest (oldest)
 *
 * Each supported major version should appear only once, paired with (and only
 * with) its highest supported minor version number (as the protocol requires
 * supporting all lower minor version numbers as well)
 *
 * vds_num_versions is the number of entries in vds_version[].
 */
static const vio_ver_t	vds_version[] = {{1, 0}};
static const size_t	vds_num_versions =
    sizeof (vds_version)/sizeof (vds_version[0]);
360 
361 static void vd_free_dring_task(vd_t *vdp);
362 
363 static int
364 vd_start_bio(vd_task_t *task)
365 {
366 	int			rv, status = 0;
367 	vd_t			*vd		= task->vd;
368 	vd_dring_payload_t	*request	= task->request;
369 	struct buf		*buf		= &task->buf;
370 	uint8_t			mtype;
371 
372 
373 	ASSERT(vd != NULL);
374 	ASSERT(request != NULL);
375 	ASSERT(request->slice < vd->nslices);
376 	ASSERT((request->operation == VD_OP_BREAD) ||
377 	    (request->operation == VD_OP_BWRITE));
378 
379 	if (request->nbytes == 0)
380 		return (EINVAL);	/* no service for trivial requests */
381 
382 	PR1("%s %lu bytes at block %lu",
383 	    (request->operation == VD_OP_BREAD) ? "Read" : "Write",
384 	    request->nbytes, request->addr);
385 
386 	bioinit(buf);
387 	buf->b_flags		= B_BUSY;
388 	buf->b_bcount		= request->nbytes;
389 	buf->b_lblkno		= request->addr;
390 	buf->b_edev		= vd->dev[request->slice];
391 
392 	mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP;
393 
394 	/* Map memory exported by client */
395 	status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies,
396 	    mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R,
397 	    &(buf->b_un.b_addr), NULL);
398 	if (status != 0) {
399 		PR0("ldc_mem_map() returned err %d ", status);
400 		biofini(buf);
401 		return (status);
402 	}
403 
404 	status = ldc_mem_acquire(task->mhdl, 0, buf->b_bcount);
405 	if (status != 0) {
406 		(void) ldc_mem_unmap(task->mhdl);
407 		PR0("ldc_mem_acquire() returned err %d ", status);
408 		biofini(buf);
409 		return (status);
410 	}
411 
412 	buf->b_flags |= (request->operation == VD_OP_BREAD) ? B_READ : B_WRITE;
413 
414 	/* Start the block I/O */
415 	if ((status = ldi_strategy(vd->ldi_handle[request->slice], buf)) == 0)
416 		return (EINPROGRESS);	/* will complete on completionq */
417 
418 	/* Clean up after error */
419 	rv = ldc_mem_release(task->mhdl, 0, buf->b_bcount);
420 	if (rv) {
421 		PR0("ldc_mem_release() returned err %d ", rv);
422 	}
423 	rv = ldc_mem_unmap(task->mhdl);
424 	if (rv) {
425 		PR0("ldc_mem_unmap() returned err %d ", status);
426 	}
427 
428 	biofini(buf);
429 	return (status);
430 }
431 
432 static int
433 send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
434 {
435 	int	status;
436 	size_t	nbytes;
437 
438 	do {
439 		nbytes = msglen;
440 		status = ldc_write(ldc_handle, msg, &nbytes);
441 		if (status != EWOULDBLOCK)
442 			break;
443 		drv_usecwait(vds_ldc_delay);
444 	} while (status == EWOULDBLOCK);
445 
446 	if (status != 0) {
447 		if (status != ECONNRESET)
448 			PR0("ldc_write() returned errno %d", status);
449 		return (status);
450 	} else if (nbytes != msglen) {
451 		PR0("ldc_write() performed only partial write");
452 		return (EIO);
453 	}
454 
455 	PR1("SENT %lu bytes", msglen);
456 	return (0);
457 }
458 
459 static void
460 vd_need_reset(vd_t *vd, boolean_t reset_ldc)
461 {
462 	mutex_enter(&vd->lock);
463 	vd->reset_state	= B_TRUE;
464 	vd->reset_ldc	= reset_ldc;
465 	mutex_exit(&vd->lock);
466 }
467 
468 /*
469  * Reset the state of the connection with a client, if needed; reset the LDC
470  * transport as well, if needed.  This function should only be called from the
471  * "vd_recv_msg", as it waits for tasks - otherwise a deadlock can occur.
472  */
473 static void
474 vd_reset_if_needed(vd_t *vd)
475 {
476 	int	status = 0;
477 
478 	mutex_enter(&vd->lock);
479 	if (!vd->reset_state) {
480 		ASSERT(!vd->reset_ldc);
481 		mutex_exit(&vd->lock);
482 		return;
483 	}
484 	mutex_exit(&vd->lock);
485 
486 	PR0("Resetting connection state with %s", VD_CLIENT(vd));
487 
488 	/*
489 	 * Let any asynchronous I/O complete before possibly pulling the rug
490 	 * out from under it; defer checking vd->reset_ldc, as one of the
491 	 * asynchronous tasks might set it
492 	 */
493 	ddi_taskq_wait(vd->completionq);
494 
495 	if ((vd->initialized & VD_DRING) &&
496 	    ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
497 		PR0("ldc_mem_dring_unmap() returned errno %d", status);
498 
499 	vd_free_dring_task(vd);
500 
501 	/* Free the staging buffer for msgs */
502 	if (vd->vio_msgp != NULL) {
503 		kmem_free(vd->vio_msgp, vd->max_msglen);
504 		vd->vio_msgp = NULL;
505 	}
506 
507 	/* Free the inband message buffer */
508 	if (vd->inband_task.msg != NULL) {
509 		kmem_free(vd->inband_task.msg, vd->max_msglen);
510 		vd->inband_task.msg = NULL;
511 	}
512 
513 	mutex_enter(&vd->lock);
514 
515 	if (vd->reset_ldc)
516 		PR0("taking down LDC channel");
517 	if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0))
518 		PR0("ldc_down() returned errno %d", status);
519 
520 	vd->initialized	&= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
521 	vd->state	= VD_STATE_INIT;
522 	vd->max_msglen	= sizeof (vio_msg_t);	/* baseline vio message size */
523 
524 	/* Allocate the staging buffer */
525 	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);
526 
527 	PR0("calling ldc_up\n");
528 	(void) ldc_up(vd->ldc_handle);
529 
530 	vd->reset_state	= B_FALSE;
531 	vd->reset_ldc	= B_FALSE;
532 
533 	mutex_exit(&vd->lock);
534 }
535 
536 static void vd_recv_msg(void *arg);
537 
538 static void
539 vd_mark_in_reset(vd_t *vd)
540 {
541 	int status;
542 
543 	PR0("vd_mark_in_reset: marking vd in reset\n");
544 
545 	vd_need_reset(vd, B_FALSE);
546 	status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP);
547 	if (status == DDI_FAILURE) {
548 		PR0("cannot schedule task to recv msg\n");
549 		vd_need_reset(vd, B_TRUE);
550 		return;
551 	}
552 }
553 
554 static int
555 vd_mark_elem_done(vd_t *vd, int idx, int elem_status)
556 {
557 	boolean_t		accepted;
558 	int			status;
559 	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);
560 
561 	if (vd->reset_state)
562 		return (0);
563 
564 	/* Acquire the element */
565 	if (!vd->reset_state &&
566 	    (status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
567 		if (status == ECONNRESET) {
568 			vd_mark_in_reset(vd);
569 			return (0);
570 		} else {
571 			PR0("ldc_mem_dring_acquire() returned errno %d",
572 			    status);
573 			return (status);
574 		}
575 	}
576 
577 	/* Set the element's status and mark it done */
578 	accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED);
579 	if (accepted) {
580 		elem->payload.status	= elem_status;
581 		elem->hdr.dstate	= VIO_DESC_DONE;
582 	} else {
583 		/* Perhaps client timed out waiting for I/O... */
584 		PR0("element %u no longer \"accepted\"", idx);
585 		VD_DUMP_DRING_ELEM(elem);
586 	}
587 	/* Release the element */
588 	if (!vd->reset_state &&
589 	    (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
590 		if (status == ECONNRESET) {
591 			vd_mark_in_reset(vd);
592 			return (0);
593 		} else {
594 			PR0("ldc_mem_dring_release() returned errno %d",
595 			    status);
596 			return (status);
597 		}
598 	}
599 
600 	return (accepted ? 0 : EINVAL);
601 }
602 
603 static void
604 vd_complete_bio(void *arg)
605 {
606 	int			status		= 0;
607 	vd_task_t		*task		= (vd_task_t *)arg;
608 	vd_t			*vd		= task->vd;
609 	vd_dring_payload_t	*request	= task->request;
610 	struct buf		*buf		= &task->buf;
611 
612 
613 	ASSERT(vd != NULL);
614 	ASSERT(request != NULL);
615 	ASSERT(task->msg != NULL);
616 	ASSERT(task->msglen >= sizeof (*task->msg));
617 
618 	/* Wait for the I/O to complete */
619 	request->status = biowait(buf);
620 
621 	/* Release the buffer */
622 	if (!vd->reset_state)
623 		status = ldc_mem_release(task->mhdl, 0, buf->b_bcount);
624 	if (status) {
625 		PR0("ldc_mem_release() returned errno %d copying to "
626 		    "client", status);
627 		if (status == ECONNRESET) {
628 			vd_mark_in_reset(vd);
629 		}
630 	}
631 
632 	/* Unmap the memory, even if in reset */
633 	status = ldc_mem_unmap(task->mhdl);
634 	if (status) {
635 		PR0("ldc_mem_unmap() returned errno %d copying to client",
636 		    status);
637 		if (status == ECONNRESET) {
638 			vd_mark_in_reset(vd);
639 		}
640 	}
641 
642 	biofini(buf);
643 
644 	/* Update the dring element for a dring client */
645 	if (!vd->reset_state && (status == 0) &&
646 	    (vd->xfer_mode == VIO_DRING_MODE)) {
647 		status = vd_mark_elem_done(vd, task->index, request->status);
648 		if (status == ECONNRESET)
649 			vd_mark_in_reset(vd);
650 	}
651 
652 	/*
653 	 * If a transport error occurred, arrange to "nack" the message when
654 	 * the final task in the descriptor element range completes
655 	 */
656 	if (status != 0)
657 		task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
658 
659 	/*
660 	 * Only the final task for a range of elements will respond to and
661 	 * free the message
662 	 */
663 	if (task->type == VD_NONFINAL_RANGE_TASK) {
664 		return;
665 	}
666 
667 	/*
668 	 * Send the "ack" or "nack" back to the client; if sending the message
669 	 * via LDC fails, arrange to reset both the connection state and LDC
670 	 * itself
671 	 */
672 	PR1("Sending %s",
673 	    (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");
674 	if (!vd->reset_state) {
675 		status = send_msg(vd->ldc_handle, task->msg, task->msglen);
676 		switch (status) {
677 		case 0:
678 			break;
679 		case ECONNRESET:
680 			vd_mark_in_reset(vd);
681 			break;
682 		default:
683 			PR0("initiating full reset");
684 			vd_need_reset(vd, B_TRUE);
685 			break;
686 		}
687 	}
688 }
689 
690 static void
691 vd_geom2dk_geom(void *vd_buf, void *ioctl_arg)
692 {
693 	VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg);
694 }
695 
696 static void
697 vd_vtoc2vtoc(void *vd_buf, void *ioctl_arg)
698 {
699 	VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg);
700 }
701 
702 static void
703 dk_geom2vd_geom(void *ioctl_arg, void *vd_buf)
704 {
705 	DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf);
706 }
707 
708 static void
709 vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf)
710 {
711 	VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf);
712 }
713 
714 static void
715 vd_get_efi_in(void *vd_buf, void *ioctl_arg)
716 {
717 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
718 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
719 
720 	dk_efi->dki_lba = vd_efi->lba;
721 	dk_efi->dki_length = vd_efi->length;
722 	dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP);
723 }
724 
725 static void
726 vd_get_efi_out(void *ioctl_arg, void *vd_buf)
727 {
728 	int len;
729 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
730 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
731 
732 	len = vd_efi->length;
733 	DK_EFI2VD_EFI(dk_efi, vd_efi);
734 	kmem_free(dk_efi->dki_data, len);
735 }
736 
737 static void
738 vd_set_efi_in(void *vd_buf, void *ioctl_arg)
739 {
740 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
741 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
742 
743 	dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP);
744 	VD_EFI2DK_EFI(vd_efi, dk_efi);
745 }
746 
747 static void
748 vd_set_efi_out(void *ioctl_arg, void *vd_buf)
749 {
750 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
751 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
752 
753 	kmem_free(dk_efi->dki_data, vd_efi->length);
754 }
755 
756 static int
757 vd_read_vtoc(ldi_handle_t handle, struct vtoc *vtoc, vd_disk_label_t *label)
758 {
759 	int status, rval;
760 	struct dk_gpt *efi;
761 	size_t efi_len;
762 
763 	*label = VD_DISK_LABEL_UNK;
764 
765 	status = ldi_ioctl(handle, DKIOCGVTOC, (intptr_t)vtoc,
766 	    (vd_open_flags | FKIOCTL), kcred, &rval);
767 
768 	if (status == 0) {
769 		*label = VD_DISK_LABEL_VTOC;
770 		return (0);
771 	} else if (status != ENOTSUP) {
772 		PR0("ldi_ioctl(DKIOCGVTOC) returned error %d", status);
773 		return (status);
774 	}
775 
776 	status = vds_efi_alloc_and_read(handle, &efi, &efi_len);
777 
778 	if (status) {
779 		PR0("vds_efi_alloc_and_read returned error %d", status);
780 		return (status);
781 	}
782 
783 	*label = VD_DISK_LABEL_EFI;
784 	vd_efi_to_vtoc(efi, vtoc);
785 	vd_efi_free(efi, efi_len);
786 
787 	return (0);
788 }
789 
790 static int
791 vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
792 {
793 	dk_efi_t *dk_ioc;
794 
795 	switch (vd->vdisk_label) {
796 
797 	case VD_DISK_LABEL_VTOC:
798 
799 		switch (cmd) {
800 		case DKIOCGGEOM:
801 			ASSERT(ioctl_arg != NULL);
802 			bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom));
803 			return (0);
804 		case DKIOCGVTOC:
805 			ASSERT(ioctl_arg != NULL);
806 			bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc));
807 			return (0);
808 		default:
809 			return (ENOTSUP);
810 		}
811 
812 	case VD_DISK_LABEL_EFI:
813 
814 		switch (cmd) {
815 		case DKIOCGETEFI:
816 			ASSERT(ioctl_arg != NULL);
817 			dk_ioc = (dk_efi_t *)ioctl_arg;
818 			if (dk_ioc->dki_length < vd->dk_efi.dki_length)
819 				return (EINVAL);
820 			bcopy(vd->dk_efi.dki_data, dk_ioc->dki_data,
821 			    vd->dk_efi.dki_length);
822 			return (0);
823 		default:
824 			return (ENOTSUP);
825 		}
826 
827 	default:
828 		return (ENOTSUP);
829 	}
830 }
831 
832 static int
833 vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl)
834 {
835 	int	rval = 0, status;
836 	size_t	nbytes = request->nbytes;	/* modifiable copy */
837 
838 
839 	ASSERT(request->slice < vd->nslices);
840 	PR0("Performing %s", ioctl->operation_name);
841 
842 	/* Get data from client and convert, if necessary */
843 	if (ioctl->copyin != NULL)  {
844 		ASSERT(nbytes != 0 && buf != NULL);
845 		PR1("Getting \"arg\" data from client");
846 		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
847 			    request->cookie, request->ncookies,
848 			    LDC_COPY_IN)) != 0) {
849 			PR0("ldc_mem_copy() returned errno %d "
850 			    "copying from client", status);
851 			return (status);
852 		}
853 
854 		/* Convert client's data, if necessary */
855 		if (ioctl->copyin == VD_IDENTITY)	/* use client buffer */
856 			ioctl->arg = buf;
857 		else	/* convert client vdisk operation data to ioctl data */
858 			(ioctl->copyin)(buf, (void *)ioctl->arg);
859 	}
860 
861 	/*
862 	 * Handle single-slice block devices internally; otherwise, have the
863 	 * real driver perform the ioctl()
864 	 */
865 	if (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo) {
866 		if ((status = vd_do_slice_ioctl(vd, ioctl->cmd,
867 			    (void *)ioctl->arg)) != 0)
868 			return (status);
869 	} else if ((status = ldi_ioctl(vd->ldi_handle[request->slice],
870 		    ioctl->cmd, (intptr_t)ioctl->arg, (vd_open_flags | FKIOCTL),
871 		    kcred, &rval)) != 0) {
872 		PR0("ldi_ioctl(%s) = errno %d", ioctl->cmd_name, status);
873 		return (status);
874 	}
875 #ifdef DEBUG
876 	if (rval != 0) {
877 		PR0("%s set rval = %d, which is not being returned to client",
878 		    ioctl->cmd_name, rval);
879 	}
880 #endif /* DEBUG */
881 
882 	/* Convert data and send to client, if necessary */
883 	if (ioctl->copyout != NULL)  {
884 		ASSERT(nbytes != 0 && buf != NULL);
885 		PR1("Sending \"arg\" data to client");
886 
887 		/* Convert ioctl data to vdisk operation data, if necessary */
888 		if (ioctl->copyout != VD_IDENTITY)
889 			(ioctl->copyout)((void *)ioctl->arg, buf);
890 
891 		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
892 			    request->cookie, request->ncookies,
893 			    LDC_COPY_OUT)) != 0) {
894 			PR0("ldc_mem_copy() returned errno %d "
895 			    "copying to client", status);
896 			return (status);
897 		}
898 	}
899 
900 	return (status);
901 }
902 
903 #define	RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t))
904 static int
905 vd_ioctl(vd_task_t *task)
906 {
907 	int			i, status, rc;
908 	void			*buf = NULL;
909 	struct dk_geom		dk_geom = {0};
910 	struct vtoc		vtoc = {0};
911 	struct dk_efi		dk_efi = {0};
912 	vd_t			*vd		= task->vd;
913 	vd_dring_payload_t	*request	= task->request;
914 	vd_ioctl_t		ioctl[] = {
915 		/* Command (no-copy) operations */
916 		{VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0,
917 		    DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE),
918 		    NULL, NULL, NULL},
919 
920 		/* "Get" (copy-out) operations */
921 		{VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int),
922 		    DKIOCGETWCE, STRINGIZE(DKIOCGETWCE),
923 		    NULL, VD_IDENTITY, VD_IDENTITY},
924 		{VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM),
925 		    RNDSIZE(vd_geom_t),
926 		    DKIOCGGEOM, STRINGIZE(DKIOCGGEOM),
927 		    &dk_geom, NULL, dk_geom2vd_geom},
928 		{VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t),
929 		    DKIOCGVTOC, STRINGIZE(DKIOCGVTOC),
930 		    &vtoc, NULL, vtoc2vd_vtoc},
931 		{VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t),
932 		    DKIOCGETEFI, STRINGIZE(DKIOCGETEFI),
933 		    &dk_efi, vd_get_efi_in, vd_get_efi_out},
934 
935 		/* "Set" (copy-in) operations */
936 		{VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int),
937 		    DKIOCSETWCE, STRINGIZE(DKIOCSETWCE),
938 		    NULL, VD_IDENTITY, VD_IDENTITY},
939 		{VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM),
940 		    RNDSIZE(vd_geom_t),
941 		    DKIOCSGEOM, STRINGIZE(DKIOCSGEOM),
942 		    &dk_geom, vd_geom2dk_geom, NULL},
943 		{VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t),
944 		    DKIOCSVTOC, STRINGIZE(DKIOCSVTOC),
945 		    &vtoc, vd_vtoc2vtoc, NULL},
946 		{VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t),
947 		    DKIOCSETEFI, STRINGIZE(DKIOCSETEFI),
948 		    &dk_efi, vd_set_efi_in, vd_set_efi_out},
949 	};
950 	size_t		nioctls = (sizeof (ioctl))/(sizeof (ioctl[0]));
951 
952 
953 	ASSERT(vd != NULL);
954 	ASSERT(request != NULL);
955 	ASSERT(request->slice < vd->nslices);
956 
957 	/*
958 	 * Determine ioctl corresponding to caller's "operation" and
959 	 * validate caller's "nbytes"
960 	 */
961 	for (i = 0; i < nioctls; i++) {
962 		if (request->operation == ioctl[i].operation) {
963 			/* LDC memory operations require 8-byte multiples */
964 			ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0);
965 
966 			if (request->operation == VD_OP_GET_EFI ||
967 			    request->operation == VD_OP_SET_EFI) {
968 				if (request->nbytes >= ioctl[i].nbytes)
969 					break;
970 				PR0("%s:  Expected at least nbytes = %lu, "
971 				    "got %lu", ioctl[i].operation_name,
972 				    ioctl[i].nbytes, request->nbytes);
973 				return (EINVAL);
974 			}
975 
976 			if (request->nbytes != ioctl[i].nbytes) {
977 				PR0("%s:  Expected nbytes = %lu, got %lu",
978 				    ioctl[i].operation_name, ioctl[i].nbytes,
979 				    request->nbytes);
980 				return (EINVAL);
981 			}
982 
983 			break;
984 		}
985 	}
986 	ASSERT(i < nioctls);	/* because "operation" already validated */
987 
988 	if (request->nbytes)
989 		buf = kmem_zalloc(request->nbytes, KM_SLEEP);
990 	status = vd_do_ioctl(vd, request, buf, &ioctl[i]);
991 	if (request->nbytes)
992 		kmem_free(buf, request->nbytes);
993 	if (vd->vdisk_type == VD_DISK_TYPE_DISK &&
994 	    (request->operation == VD_OP_SET_VTOC ||
995 	    request->operation == VD_OP_SET_EFI)) {
996 		/* update disk information */
997 		rc = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc,
998 		    &vd->vdisk_label);
999 		if (rc != 0)
1000 			PR0("vd_read_vtoc return error %d", rc);
1001 	}
1002 	PR0("Returning %d", status);
1003 	return (status);
1004 }
1005 
1006 static int
1007 vd_get_devid(vd_task_t *task)
1008 {
1009 	vd_t *vd = task->vd;
1010 	vd_dring_payload_t *request = task->request;
1011 	vd_devid_t *vd_devid;
1012 	impl_devid_t *devid;
1013 	int status, bufid_len, devid_len, len;
1014 	int bufbytes;
1015 
1016 	PR1("Get Device ID, nbytes=%ld", request->nbytes);
1017 
1018 	if (ddi_lyr_get_devid(vd->dev[request->slice],
1019 	    (ddi_devid_t *)&devid) != DDI_SUCCESS) {
1020 		/* the most common failure is that no devid is available */
1021 		PR2("No Device ID");
1022 		return (ENOENT);
1023 	}
1024 
1025 	bufid_len = request->nbytes - sizeof (vd_devid_t) + 1;
1026 	devid_len = DEVID_GETLEN(devid);
1027 
1028 	/*
1029 	 * Save the buffer size here for use in deallocation.
1030 	 * The actual number of bytes copied is returned in
1031 	 * the 'nbytes' field of the request structure.
1032 	 */
1033 	bufbytes = request->nbytes;
1034 
1035 	vd_devid = kmem_zalloc(bufbytes, KM_SLEEP);
1036 	vd_devid->length = devid_len;
1037 	vd_devid->type = DEVID_GETTYPE(devid);
1038 
1039 	len = (devid_len > bufid_len)? bufid_len : devid_len;
1040 
1041 	bcopy(devid->did_id, vd_devid->id, len);
1042 
1043 	/* LDC memory operations require 8-byte multiples */
1044 	ASSERT(request->nbytes % sizeof (uint64_t) == 0);
1045 
1046 	if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0,
1047 	    &request->nbytes, request->cookie, request->ncookies,
1048 	    LDC_COPY_OUT)) != 0) {
1049 		PR0("ldc_mem_copy() returned errno %d copying to client",
1050 		    status);
1051 	}
1052 	PR1("post mem_copy: nbytes=%ld", request->nbytes);
1053 
1054 	kmem_free(vd_devid, bufbytes);
1055 	ddi_devid_free((ddi_devid_t)devid);
1056 
1057 	return (status);
1058 }
1059 
1060 /*
1061  * Define the supported operations once the functions for performing them have
1062  * been defined
1063  */
1064 static const vds_operation_t	vds_operation[] = {
1065 #define	X(_s)	#_s, _s
1066 	{X(VD_OP_BREAD),	vd_start_bio,	vd_complete_bio},
1067 	{X(VD_OP_BWRITE),	vd_start_bio,	vd_complete_bio},
1068 	{X(VD_OP_FLUSH),	vd_ioctl,	NULL},
1069 	{X(VD_OP_GET_WCE),	vd_ioctl,	NULL},
1070 	{X(VD_OP_SET_WCE),	vd_ioctl,	NULL},
1071 	{X(VD_OP_GET_VTOC),	vd_ioctl,	NULL},
1072 	{X(VD_OP_SET_VTOC),	vd_ioctl,	NULL},
1073 	{X(VD_OP_GET_DISKGEOM),	vd_ioctl,	NULL},
1074 	{X(VD_OP_SET_DISKGEOM),	vd_ioctl,	NULL},
1075 	{X(VD_OP_GET_EFI),	vd_ioctl,	NULL},
1076 	{X(VD_OP_SET_EFI),	vd_ioctl,	NULL},
1077 	{X(VD_OP_GET_DEVID),	vd_get_devid,	NULL},
1078 #undef	X
1079 };
1080 
1081 static const size_t	vds_noperations =
1082 	(sizeof (vds_operation))/(sizeof (vds_operation[0]));
1083 
1084 /*
1085  * Process a task specifying a client I/O request
1086  */
1087 static int
1088 vd_process_task(vd_task_t *task)
1089 {
1090 	int			i, status;
1091 	vd_t			*vd		= task->vd;
1092 	vd_dring_payload_t	*request	= task->request;
1093 
1094 
1095 	ASSERT(vd != NULL);
1096 	ASSERT(request != NULL);
1097 
1098 	/* Find the requested operation */
1099 	for (i = 0; i < vds_noperations; i++)
1100 		if (request->operation == vds_operation[i].operation)
1101 			break;
1102 	if (i == vds_noperations) {
1103 		PR0("Unsupported operation %u", request->operation);
1104 		return (ENOTSUP);
1105 	}
1106 
1107 	/* Handle client using absolute disk offsets */
1108 	if ((vd->vdisk_type == VD_DISK_TYPE_DISK) &&
1109 	    (request->slice == UINT8_MAX))
1110 		request->slice = VD_ENTIRE_DISK_SLICE;
1111 
1112 	/* Range-check slice */
1113 	if (request->slice >= vd->nslices) {
1114 		PR0("Invalid \"slice\" %u (max %u) for virtual disk",
1115 		    request->slice, (vd->nslices - 1));
1116 		return (EINVAL);
1117 	}
1118 
1119 	PR1("operation : %s", vds_operation[i].namep);
1120 
1121 	/* Start the operation */
1122 	if ((status = vds_operation[i].start(task)) != EINPROGRESS) {
1123 		PR0("operation : %s returned status %d",
1124 			vds_operation[i].namep, status);
1125 		request->status = status;	/* op succeeded or failed */
1126 		return (0);			/* but request completed */
1127 	}
1128 
1129 	ASSERT(vds_operation[i].complete != NULL);	/* debug case */
1130 	if (vds_operation[i].complete == NULL) {	/* non-debug case */
1131 		PR0("Unexpected return of EINPROGRESS "
1132 		    "with no I/O completion handler");
1133 		request->status = EIO;	/* operation failed */
1134 		return (0);		/* but request completed */
1135 	}
1136 
1137 	PR1("operation : kick off taskq entry for %s", vds_operation[i].namep);
1138 
1139 	/* Queue a task to complete the operation */
1140 	status = ddi_taskq_dispatch(vd->completionq, vds_operation[i].complete,
1141 	    task, DDI_SLEEP);
1142 	/* ddi_taskq_dispatch(9f) guarantees success with DDI_SLEEP */
1143 	ASSERT(status == DDI_SUCCESS);
1144 
1145 	PR1("Operation in progress");
1146 	return (EINPROGRESS);	/* completion handler will finish request */
1147 }
1148 
1149 /*
1150  * Return true if the "type", "subtype", and "env" fields of the "tag" first
1151  * argument match the corresponding remaining arguments; otherwise, return false
1152  */
1153 boolean_t
1154 vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env)
1155 {
1156 	return ((tag->vio_msgtype == type) &&
1157 		(tag->vio_subtype == subtype) &&
1158 		(tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE;
1159 }
1160 
1161 /*
1162  * Check whether the major/minor version specified in "ver_msg" is supported
1163  * by this server.
1164  */
1165 static boolean_t
1166 vds_supported_version(vio_ver_msg_t *ver_msg)
1167 {
1168 	for (int i = 0; i < vds_num_versions; i++) {
1169 		ASSERT(vds_version[i].major > 0);
1170 		ASSERT((i == 0) ||
1171 		    (vds_version[i].major < vds_version[i-1].major));
1172 
1173 		/*
1174 		 * If the major versions match, adjust the minor version, if
1175 		 * necessary, down to the highest value supported by this
1176 		 * server and return true so this message will get "ack"ed;
1177 		 * the client should also support all minor versions lower
1178 		 * than the value it sent
1179 		 */
1180 		if (ver_msg->ver_major == vds_version[i].major) {
1181 			if (ver_msg->ver_minor > vds_version[i].minor) {
1182 				PR0("Adjusting minor version from %u to %u",
1183 				    ver_msg->ver_minor, vds_version[i].minor);
1184 				ver_msg->ver_minor = vds_version[i].minor;
1185 			}
1186 			return (B_TRUE);
1187 		}
1188 
1189 		/*
1190 		 * If the message contains a higher major version number, set
1191 		 * the message's major/minor versions to the current values
1192 		 * and return false, so this message will get "nack"ed with
1193 		 * these values, and the client will potentially try again
1194 		 * with the same or a lower version
1195 		 */
1196 		if (ver_msg->ver_major > vds_version[i].major) {
1197 			ver_msg->ver_major = vds_version[i].major;
1198 			ver_msg->ver_minor = vds_version[i].minor;
1199 			return (B_FALSE);
1200 		}
1201 
1202 		/*
1203 		 * Otherwise, the message's major version is less than the
1204 		 * current major version, so continue the loop to the next
1205 		 * (lower) supported version
1206 		 */
1207 	}
1208 
1209 	/*
1210 	 * No common version was found; "ground" the version pair in the
1211 	 * message to terminate negotiation
1212 	 */
1213 	ver_msg->ver_major = 0;
1214 	ver_msg->ver_minor = 0;
1215 	return (B_FALSE);
1216 }
1217 
1218 /*
1219  * Process a version message from a client.  vds expects to receive version
1220  * messages from clients seeking service, but never issues version messages
1221  * itself; therefore, vds can ACK or NACK client version messages, but does
1222  * not expect to receive version-message ACKs or NACKs (and will treat such
1223  * messages as invalid).
1224  */
1225 static int
1226 vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1227 {
1228 	vio_ver_msg_t	*ver_msg = (vio_ver_msg_t *)msg;
1229 
1230 
1231 	ASSERT(msglen >= sizeof (msg->tag));
1232 
1233 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
1234 		VIO_VER_INFO)) {
1235 		return (ENOMSG);	/* not a version message */
1236 	}
1237 
1238 	if (msglen != sizeof (*ver_msg)) {
1239 		PR0("Expected %lu-byte version message; "
1240 		    "received %lu bytes", sizeof (*ver_msg), msglen);
1241 		return (EBADMSG);
1242 	}
1243 
1244 	if (ver_msg->dev_class != VDEV_DISK) {
1245 		PR0("Expected device class %u (disk); received %u",
1246 		    VDEV_DISK, ver_msg->dev_class);
1247 		return (EBADMSG);
1248 	}
1249 
1250 	/*
1251 	 * We're talking to the expected kind of client; set our device class
1252 	 * for "ack/nack" back to the client
1253 	 */
1254 	ver_msg->dev_class = VDEV_DISK_SERVER;
1255 
1256 	/*
1257 	 * Check whether the (valid) version message specifies a version
1258 	 * supported by this server.  If the version is not supported, return
1259 	 * EBADMSG so the message will get "nack"ed; vds_supported_version()
1260 	 * will have updated the message with a supported version for the
1261 	 * client to consider
1262 	 */
1263 	if (!vds_supported_version(ver_msg))
1264 		return (EBADMSG);
1265 
1266 
1267 	/*
1268 	 * A version has been agreed upon; use the client's SID for
1269 	 * communication on this channel now
1270 	 */
1271 	ASSERT(!(vd->initialized & VD_SID));
1272 	vd->sid = ver_msg->tag.vio_sid;
1273 	vd->initialized |= VD_SID;
1274 
1275 	/*
1276 	 * When multiple versions are supported, this function should store
1277 	 * the negotiated major and minor version values in the "vd" data
1278 	 * structure to govern further communication; in particular, note that
1279 	 * the client might have specified a lower minor version for the
1280 	 * agreed major version than specifed in the vds_version[] array.  The
1281 	 * following assertions should help remind future maintainers to make
1282 	 * the appropriate changes to support multiple versions.
1283 	 */
1284 	ASSERT(vds_num_versions == 1);
1285 	ASSERT(ver_msg->ver_major == vds_version[0].major);
1286 	ASSERT(ver_msg->ver_minor == vds_version[0].minor);
1287 
1288 	PR0("Using major version %u, minor version %u",
1289 	    ver_msg->ver_major, ver_msg->ver_minor);
1290 	return (0);
1291 }
1292 
1293 static int
1294 vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1295 {
1296 	vd_attr_msg_t	*attr_msg = (vd_attr_msg_t *)msg;
1297 
1298 
1299 	ASSERT(msglen >= sizeof (msg->tag));
1300 
1301 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
1302 		VIO_ATTR_INFO)) {
1303 		PR0("Message is not an attribute message");
1304 		return (ENOMSG);
1305 	}
1306 
1307 	if (msglen != sizeof (*attr_msg)) {
1308 		PR0("Expected %lu-byte attribute message; "
1309 		    "received %lu bytes", sizeof (*attr_msg), msglen);
1310 		return (EBADMSG);
1311 	}
1312 
1313 	if (attr_msg->max_xfer_sz == 0) {
1314 		PR0("Received maximum transfer size of 0 from client");
1315 		return (EBADMSG);
1316 	}
1317 
1318 	if ((attr_msg->xfer_mode != VIO_DESC_MODE) &&
1319 	    (attr_msg->xfer_mode != VIO_DRING_MODE)) {
1320 		PR0("Client requested unsupported transfer mode");
1321 		return (EBADMSG);
1322 	}
1323 
1324 	/* Success:  valid message and transfer mode */
1325 	vd->xfer_mode = attr_msg->xfer_mode;
1326 
1327 	if (vd->xfer_mode == VIO_DESC_MODE) {
1328 
1329 		/*
1330 		 * The vd_dring_inband_msg_t contains one cookie; need room
1331 		 * for up to n-1 more cookies, where "n" is the number of full
1332 		 * pages plus possibly one partial page required to cover
1333 		 * "max_xfer_sz".  Add room for one more cookie if
1334 		 * "max_xfer_sz" isn't an integral multiple of the page size.
1335 		 * Must first get the maximum transfer size in bytes.
1336 		 */
1337 		size_t	max_xfer_bytes = attr_msg->vdisk_block_size ?
1338 		    attr_msg->vdisk_block_size*attr_msg->max_xfer_sz :
1339 		    attr_msg->max_xfer_sz;
1340 		size_t	max_inband_msglen =
1341 		    sizeof (vd_dring_inband_msg_t) +
1342 		    ((max_xfer_bytes/PAGESIZE +
1343 			((max_xfer_bytes % PAGESIZE) ? 1 : 0))*
1344 			(sizeof (ldc_mem_cookie_t)));
1345 
1346 		/*
1347 		 * Set the maximum expected message length to
1348 		 * accommodate in-band-descriptor messages with all
1349 		 * their cookies
1350 		 */
1351 		vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen);
1352 
1353 		/*
1354 		 * Initialize the data structure for processing in-band I/O
1355 		 * request descriptors
1356 		 */
1357 		vd->inband_task.vd	= vd;
1358 		vd->inband_task.msg	= kmem_alloc(vd->max_msglen, KM_SLEEP);
1359 		vd->inband_task.index	= 0;
1360 		vd->inband_task.type	= VD_FINAL_RANGE_TASK;	/* range == 1 */
1361 	}
1362 
1363 	/* Return the device's block size and max transfer size to the client */
1364 	attr_msg->vdisk_block_size	= DEV_BSIZE;
1365 	attr_msg->max_xfer_sz		= vd->max_xfer_sz;
1366 
1367 	attr_msg->vdisk_size = vd->vdisk_size;
1368 	attr_msg->vdisk_type = vd->vdisk_type;
1369 	attr_msg->operations = vds_operations;
1370 	PR0("%s", VD_CLIENT(vd));
1371 
1372 	ASSERT(vd->dring_task == NULL);
1373 
1374 	return (0);
1375 }
1376 
1377 static int
1378 vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1379 {
1380 	int			status;
1381 	size_t			expected;
1382 	ldc_mem_info_t		dring_minfo;
1383 	vio_dring_reg_msg_t	*reg_msg = (vio_dring_reg_msg_t *)msg;
1384 
1385 
1386 	ASSERT(msglen >= sizeof (msg->tag));
1387 
1388 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
1389 		VIO_DRING_REG)) {
1390 		PR0("Message is not a register-dring message");
1391 		return (ENOMSG);
1392 	}
1393 
1394 	if (msglen < sizeof (*reg_msg)) {
1395 		PR0("Expected at least %lu-byte register-dring message; "
1396 		    "received %lu bytes", sizeof (*reg_msg), msglen);
1397 		return (EBADMSG);
1398 	}
1399 
1400 	expected = sizeof (*reg_msg) +
1401 	    (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0]));
1402 	if (msglen != expected) {
1403 		PR0("Expected %lu-byte register-dring message; "
1404 		    "received %lu bytes", expected, msglen);
1405 		return (EBADMSG);
1406 	}
1407 
1408 	if (vd->initialized & VD_DRING) {
1409 		PR0("A dring was previously registered; only support one");
1410 		return (EBADMSG);
1411 	}
1412 
1413 	if (reg_msg->num_descriptors > INT32_MAX) {
1414 		PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)",
1415 		    reg_msg->ncookies, INT32_MAX, STRINGIZE(INT32_MAX));
1416 		return (EBADMSG);
1417 	}
1418 
1419 	if (reg_msg->ncookies != 1) {
1420 		/*
1421 		 * In addition to fixing the assertion in the success case
1422 		 * below, supporting drings which require more than one
1423 		 * "cookie" requires increasing the value of vd->max_msglen
1424 		 * somewhere in the code path prior to receiving the message
1425 		 * which results in calling this function.  Note that without
1426 		 * making this change, the larger message size required to
1427 		 * accommodate multiple cookies cannot be successfully
1428 		 * received, so this function will not even get called.
1429 		 * Gracefully accommodating more dring cookies might
1430 		 * reasonably demand exchanging an additional attribute or
1431 		 * making a minor protocol adjustment
1432 		 */
1433 		PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies);
1434 		return (EBADMSG);
1435 	}
1436 
1437 	status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie,
1438 	    reg_msg->ncookies, reg_msg->num_descriptors,
1439 	    reg_msg->descriptor_size, LDC_DIRECT_MAP, &vd->dring_handle);
1440 	if (status != 0) {
1441 		PR0("ldc_mem_dring_map() returned errno %d", status);
1442 		return (status);
1443 	}
1444 
1445 	/*
1446 	 * To remove the need for this assertion, must call
1447 	 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a
1448 	 * successful call to ldc_mem_dring_map()
1449 	 */
1450 	ASSERT(reg_msg->ncookies == 1);
1451 
1452 	if ((status =
1453 		ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) {
1454 		PR0("ldc_mem_dring_info() returned errno %d", status);
1455 		if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)
1456 			PR0("ldc_mem_dring_unmap() returned errno %d", status);
1457 		return (status);
1458 	}
1459 
1460 	if (dring_minfo.vaddr == NULL) {
1461 		PR0("Descriptor ring virtual address is NULL");
1462 		return (ENXIO);
1463 	}
1464 
1465 
1466 	/* Initialize for valid message and mapped dring */
1467 	PR1("descriptor size = %u, dring length = %u",
1468 	    vd->descriptor_size, vd->dring_len);
1469 	vd->initialized |= VD_DRING;
1470 	vd->dring_ident = 1;	/* "There Can Be Only One" */
1471 	vd->dring = dring_minfo.vaddr;
1472 	vd->descriptor_size = reg_msg->descriptor_size;
1473 	vd->dring_len = reg_msg->num_descriptors;
1474 	reg_msg->dring_ident = vd->dring_ident;
1475 
1476 	/*
1477 	 * Allocate and initialize a "shadow" array of data structures for
1478 	 * tasks to process I/O requests in dring elements
1479 	 */
1480 	vd->dring_task =
1481 	    kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP);
1482 	for (int i = 0; i < vd->dring_len; i++) {
1483 		vd->dring_task[i].vd		= vd;
1484 		vd->dring_task[i].index		= i;
1485 		vd->dring_task[i].request	= &VD_DRING_ELEM(i)->payload;
1486 
1487 		status = ldc_mem_alloc_handle(vd->ldc_handle,
1488 		    &(vd->dring_task[i].mhdl));
1489 		if (status) {
1490 			PR0("ldc_mem_alloc_handle() returned err %d ", status);
1491 			return (ENXIO);
1492 		}
1493 
1494 		vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP);
1495 	}
1496 
1497 	return (0);
1498 }
1499 
1500 static int
1501 vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1502 {
1503 	vio_dring_unreg_msg_t	*unreg_msg = (vio_dring_unreg_msg_t *)msg;
1504 
1505 
1506 	ASSERT(msglen >= sizeof (msg->tag));
1507 
1508 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
1509 		VIO_DRING_UNREG)) {
1510 		PR0("Message is not an unregister-dring message");
1511 		return (ENOMSG);
1512 	}
1513 
1514 	if (msglen != sizeof (*unreg_msg)) {
1515 		PR0("Expected %lu-byte unregister-dring message; "
1516 		    "received %lu bytes", sizeof (*unreg_msg), msglen);
1517 		return (EBADMSG);
1518 	}
1519 
1520 	if (unreg_msg->dring_ident != vd->dring_ident) {
1521 		PR0("Expected dring ident %lu; received %lu",
1522 		    vd->dring_ident, unreg_msg->dring_ident);
1523 		return (EBADMSG);
1524 	}
1525 
1526 	return (0);
1527 }
1528 
1529 static int
1530 process_rdx_msg(vio_msg_t *msg, size_t msglen)
1531 {
1532 	ASSERT(msglen >= sizeof (msg->tag));
1533 
1534 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) {
1535 		PR0("Message is not an RDX message");
1536 		return (ENOMSG);
1537 	}
1538 
1539 	if (msglen != sizeof (vio_rdx_msg_t)) {
1540 		PR0("Expected %lu-byte RDX message; received %lu bytes",
1541 		    sizeof (vio_rdx_msg_t), msglen);
1542 		return (EBADMSG);
1543 	}
1544 
1545 	PR0("Valid RDX message");
1546 	return (0);
1547 }
1548 
1549 static int
1550 vd_check_seq_num(vd_t *vd, uint64_t seq_num)
1551 {
1552 	if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) {
1553 		PR0("Received seq_num %lu; expected %lu",
1554 		    seq_num, (vd->seq_num + 1));
1555 		PR0("initiating soft reset");
1556 		vd_need_reset(vd, B_FALSE);
1557 		return (1);
1558 	}
1559 
1560 	vd->seq_num = seq_num;
1561 	vd->initialized |= VD_SEQ_NUM;	/* superfluous after first time... */
1562 	return (0);
1563 }
1564 
1565 /*
1566  * Return the expected size of an inband-descriptor message with all the
1567  * cookies it claims to include
1568  */
1569 static size_t
1570 expected_inband_size(vd_dring_inband_msg_t *msg)
1571 {
1572 	return ((sizeof (*msg)) +
1573 	    (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0])));
1574 }
1575 
1576 /*
1577  * Process an in-band descriptor message:  used with clients like OBP, with
1578  * which vds exchanges descriptors within VIO message payloads, rather than
1579  * operating on them within a descriptor ring
1580  */
1581 static int
1582 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1583 {
1584 	size_t			expected;
1585 	vd_dring_inband_msg_t	*desc_msg = (vd_dring_inband_msg_t *)msg;
1586 
1587 
1588 	ASSERT(msglen >= sizeof (msg->tag));
1589 
1590 	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
1591 		VIO_DESC_DATA)) {
1592 		PR1("Message is not an in-band-descriptor message");
1593 		return (ENOMSG);
1594 	}
1595 
1596 	if (msglen < sizeof (*desc_msg)) {
1597 		PR0("Expected at least %lu-byte descriptor message; "
1598 		    "received %lu bytes", sizeof (*desc_msg), msglen);
1599 		return (EBADMSG);
1600 	}
1601 
1602 	if (msglen != (expected = expected_inband_size(desc_msg))) {
1603 		PR0("Expected %lu-byte descriptor message; "
1604 		    "received %lu bytes", expected, msglen);
1605 		return (EBADMSG);
1606 	}
1607 
1608 	if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0)
1609 		return (EBADMSG);
1610 
1611 	/*
1612 	 * Valid message:  Set up the in-band descriptor task and process the
1613 	 * request.  Arrange to acknowledge the client's message, unless an
1614 	 * error processing the descriptor task results in setting
1615 	 * VIO_SUBTYPE_NACK
1616 	 */
1617 	PR1("Valid in-band-descriptor message");
1618 	msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
1619 
1620 	ASSERT(vd->inband_task.msg != NULL);
1621 
1622 	bcopy(msg, vd->inband_task.msg, msglen);
1623 	vd->inband_task.msglen	= msglen;
1624 
1625 	/*
1626 	 * The task request is now the payload of the message
1627 	 * that was just copied into the body of the task.
1628 	 */
1629 	desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg;
1630 	vd->inband_task.request	= &desc_msg->payload;
1631 
1632 	return (vd_process_task(&vd->inband_task));
1633 }
1634 
1635 static int
1636 vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx,
1637     vio_msg_t *msg, size_t msglen)
1638 {
1639 	int			status;
1640 	boolean_t		ready;
1641 	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);
1642 
1643 
1644 	/* Accept the updated dring element */
1645 	if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
1646 		PR0("ldc_mem_dring_acquire() returned errno %d", status);
1647 		return (status);
1648 	}
1649 	ready = (elem->hdr.dstate == VIO_DESC_READY);
1650 	if (ready) {
1651 		elem->hdr.dstate = VIO_DESC_ACCEPTED;
1652 	} else {
1653 		PR0("descriptor %u not ready", idx);
1654 		VD_DUMP_DRING_ELEM(elem);
1655 	}
1656 	if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
1657 		PR0("ldc_mem_dring_release() returned errno %d", status);
1658 		return (status);
1659 	}
1660 	if (!ready)
1661 		return (EBUSY);
1662 
1663 
1664 	/* Initialize a task and process the accepted element */
1665 	PR1("Processing dring element %u", idx);
1666 	vd->dring_task[idx].type	= type;
1667 
1668 	/* duplicate msg buf for cookies etc. */
1669 	bcopy(msg, vd->dring_task[idx].msg, msglen);
1670 
1671 	vd->dring_task[idx].msglen	= msglen;
1672 	if ((status = vd_process_task(&vd->dring_task[idx])) != EINPROGRESS)
1673 		status = vd_mark_elem_done(vd, idx, elem->payload.status);
1674 
1675 	return (status);
1676 }
1677 
1678 static int
1679 vd_process_element_range(vd_t *vd, int start, int end,
1680     vio_msg_t *msg, size_t msglen)
1681 {
1682 	int		i, n, nelem, status = 0;
1683 	boolean_t	inprogress = B_FALSE;
1684 	vd_task_type_t	type;
1685 
1686 
1687 	ASSERT(start >= 0);
1688 	ASSERT(end >= 0);
1689 
1690 	/*
1691 	 * Arrange to acknowledge the client's message, unless an error
1692 	 * processing one of the dring elements results in setting
1693 	 * VIO_SUBTYPE_NACK
1694 	 */
1695 	msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
1696 
1697 	/*
1698 	 * Process the dring elements in the range
1699 	 */
1700 	nelem = ((end < start) ? end + vd->dring_len : end) - start + 1;
1701 	for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) {
1702 		((vio_dring_msg_t *)msg)->end_idx = i;
1703 		type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK;
1704 		status = vd_process_element(vd, type, i, msg, msglen);
1705 		if (status == EINPROGRESS)
1706 			inprogress = B_TRUE;
1707 		else if (status != 0)
1708 			break;
1709 	}
1710 
1711 	/*
1712 	 * If some, but not all, operations of a multi-element range are in
1713 	 * progress, wait for other operations to complete before returning
1714 	 * (which will result in "ack" or "nack" of the message).  Note that
1715 	 * all outstanding operations will need to complete, not just the ones
1716 	 * corresponding to the current range of dring elements; howevever, as
1717 	 * this situation is an error case, performance is less critical.
1718 	 */
1719 	if ((nelem > 1) && (status != EINPROGRESS) && inprogress)
1720 		ddi_taskq_wait(vd->completionq);
1721 
1722 	return (status);
1723 }
1724 
1725 static int
1726 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1727 {
1728 	vio_dring_msg_t	*dring_msg = (vio_dring_msg_t *)msg;
1729 
1730 
1731 	ASSERT(msglen >= sizeof (msg->tag));
1732 
1733 	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
1734 		VIO_DRING_DATA)) {
1735 		PR1("Message is not a dring-data message");
1736 		return (ENOMSG);
1737 	}
1738 
1739 	if (msglen != sizeof (*dring_msg)) {
1740 		PR0("Expected %lu-byte dring message; received %lu bytes",
1741 		    sizeof (*dring_msg), msglen);
1742 		return (EBADMSG);
1743 	}
1744 
1745 	if (vd_check_seq_num(vd, dring_msg->seq_num) != 0)
1746 		return (EBADMSG);
1747 
1748 	if (dring_msg->dring_ident != vd->dring_ident) {
1749 		PR0("Expected dring ident %lu; received ident %lu",
1750 		    vd->dring_ident, dring_msg->dring_ident);
1751 		return (EBADMSG);
1752 	}
1753 
1754 	if (dring_msg->start_idx >= vd->dring_len) {
1755 		PR0("\"start_idx\" = %u; must be less than %u",
1756 		    dring_msg->start_idx, vd->dring_len);
1757 		return (EBADMSG);
1758 	}
1759 
1760 	if ((dring_msg->end_idx < 0) ||
1761 	    (dring_msg->end_idx >= vd->dring_len)) {
1762 		PR0("\"end_idx\" = %u; must be >= 0 and less than %u",
1763 		    dring_msg->end_idx, vd->dring_len);
1764 		return (EBADMSG);
1765 	}
1766 
1767 	/* Valid message; process range of updated dring elements */
1768 	PR1("Processing descriptor range, start = %u, end = %u",
1769 	    dring_msg->start_idx, dring_msg->end_idx);
1770 	return (vd_process_element_range(vd, dring_msg->start_idx,
1771 		dring_msg->end_idx, msg, msglen));
1772 }
1773 
1774 static int
1775 recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes)
1776 {
1777 	int	retry, status;
1778 	size_t	size = *nbytes;
1779 
1780 
1781 	for (retry = 0, status = ETIMEDOUT;
1782 	    retry < vds_ldc_retries && status == ETIMEDOUT;
1783 	    retry++) {
1784 		PR1("ldc_read() attempt %d", (retry + 1));
1785 		*nbytes = size;
1786 		status = ldc_read(ldc_handle, msg, nbytes);
1787 	}
1788 
1789 	if (status) {
1790 		PR0("ldc_read() returned errno %d", status);
1791 		if (status != ECONNRESET)
1792 			return (ENOMSG);
1793 		return (status);
1794 	} else if (*nbytes == 0) {
1795 		PR1("ldc_read() returned 0 and no message read");
1796 		return (ENOMSG);
1797 	}
1798 
1799 	PR1("RCVD %lu-byte message", *nbytes);
1800 	return (0);
1801 }
1802 
1803 static int
1804 vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1805 {
1806 	int		status;
1807 
1808 
1809 	PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype,
1810 	    msg->tag.vio_subtype, msg->tag.vio_subtype_env);
1811 #ifdef	DEBUG
1812 	vd_decode_tag(msg);
1813 #endif
1814 
1815 	/*
1816 	 * Validate session ID up front, since it applies to all messages
1817 	 * once set
1818 	 */
1819 	if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) {
1820 		PR0("Expected SID %u, received %u", vd->sid,
1821 		    msg->tag.vio_sid);
1822 		return (EBADMSG);
1823 	}
1824 
1825 	PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state));
1826 
1827 	/*
1828 	 * Process the received message based on connection state
1829 	 */
1830 	switch (vd->state) {
1831 	case VD_STATE_INIT:	/* expect version message */
1832 		if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0)
1833 			return (status);
1834 
1835 		/* Version negotiated, move to that state */
1836 		vd->state = VD_STATE_VER;
1837 		return (0);
1838 
1839 	case VD_STATE_VER:	/* expect attribute message */
1840 		if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0)
1841 			return (status);
1842 
1843 		/* Attributes exchanged, move to that state */
1844 		vd->state = VD_STATE_ATTR;
1845 		return (0);
1846 
1847 	case VD_STATE_ATTR:
1848 		switch (vd->xfer_mode) {
1849 		case VIO_DESC_MODE:	/* expect RDX message */
1850 			if ((status = process_rdx_msg(msg, msglen)) != 0)
1851 				return (status);
1852 
1853 			/* Ready to receive in-band descriptors */
1854 			vd->state = VD_STATE_DATA;
1855 			return (0);
1856 
1857 		case VIO_DRING_MODE:	/* expect register-dring message */
1858 			if ((status =
1859 				vd_process_dring_reg_msg(vd, msg, msglen)) != 0)
1860 				return (status);
1861 
1862 			/* One dring negotiated, move to that state */
1863 			vd->state = VD_STATE_DRING;
1864 			return (0);
1865 
1866 		default:
1867 			ASSERT("Unsupported transfer mode");
1868 			PR0("Unsupported transfer mode");
1869 			return (ENOTSUP);
1870 		}
1871 
1872 	case VD_STATE_DRING:	/* expect RDX, register-dring, or unreg-dring */
1873 		if ((status = process_rdx_msg(msg, msglen)) == 0) {
1874 			/* Ready to receive data */
1875 			vd->state = VD_STATE_DATA;
1876 			return (0);
1877 		} else if (status != ENOMSG) {
1878 			return (status);
1879 		}
1880 
1881 
1882 		/*
1883 		 * If another register-dring message is received, stay in
1884 		 * dring state in case the client sends RDX; although the
1885 		 * protocol allows multiple drings, this server does not
1886 		 * support using more than one
1887 		 */
1888 		if ((status =
1889 			vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG)
1890 			return (status);
1891 
1892 		/*
1893 		 * Acknowledge an unregister-dring message, but reset the
1894 		 * connection anyway:  Although the protocol allows
1895 		 * unregistering drings, this server cannot serve a vdisk
1896 		 * without its only dring
1897 		 */
1898 		status = vd_process_dring_unreg_msg(vd, msg, msglen);
1899 		return ((status == 0) ? ENOTSUP : status);
1900 
1901 	case VD_STATE_DATA:
1902 		switch (vd->xfer_mode) {
1903 		case VIO_DESC_MODE:	/* expect in-band-descriptor message */
1904 			return (vd_process_desc_msg(vd, msg, msglen));
1905 
1906 		case VIO_DRING_MODE:	/* expect dring-data or unreg-dring */
1907 			/*
1908 			 * Typically expect dring-data messages, so handle
1909 			 * them first
1910 			 */
1911 			if ((status = vd_process_dring_msg(vd, msg,
1912 				    msglen)) != ENOMSG)
1913 				return (status);
1914 
1915 			/*
1916 			 * Acknowledge an unregister-dring message, but reset
1917 			 * the connection anyway:  Although the protocol
1918 			 * allows unregistering drings, this server cannot
1919 			 * serve a vdisk without its only dring
1920 			 */
1921 			status = vd_process_dring_unreg_msg(vd, msg, msglen);
1922 			return ((status == 0) ? ENOTSUP : status);
1923 
1924 		default:
1925 			ASSERT("Unsupported transfer mode");
1926 			PR0("Unsupported transfer mode");
1927 			return (ENOTSUP);
1928 		}
1929 
1930 	default:
1931 		ASSERT("Invalid client connection state");
1932 		PR0("Invalid client connection state");
1933 		return (ENOTSUP);
1934 	}
1935 }
1936 
1937 static int
1938 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1939 {
1940 	int		status;
1941 	boolean_t	reset_ldc = B_FALSE;
1942 
1943 
1944 	/*
1945 	 * Check that the message is at least big enough for a "tag", so that
1946 	 * message processing can proceed based on tag-specified message type
1947 	 */
1948 	if (msglen < sizeof (vio_msg_tag_t)) {
1949 		PR0("Received short (%lu-byte) message", msglen);
1950 		/* Can't "nack" short message, so drop the big hammer */
1951 		PR0("initiating full reset");
1952 		vd_need_reset(vd, B_TRUE);
1953 		return (EBADMSG);
1954 	}
1955 
1956 	/*
1957 	 * Process the message
1958 	 */
1959 	switch (status = vd_do_process_msg(vd, msg, msglen)) {
1960 	case 0:
1961 		/* "ack" valid, successfully-processed messages */
1962 		msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
1963 		break;
1964 
1965 	case EINPROGRESS:
1966 		/* The completion handler will "ack" or "nack" the message */
1967 		return (EINPROGRESS);
1968 	case ENOMSG:
1969 		PR0("Received unexpected message");
1970 		_NOTE(FALLTHROUGH);
1971 	case EBADMSG:
1972 	case ENOTSUP:
1973 		/* "nack" invalid messages */
1974 		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
1975 		break;
1976 
1977 	default:
1978 		/* "nack" failed messages */
1979 		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
1980 		/* An LDC error probably occurred, so try resetting it */
1981 		reset_ldc = B_TRUE;
1982 		break;
1983 	}
1984 
1985 	PR1("\tResulting in state %d (%s)", vd->state,
1986 		vd_decode_state(vd->state));
1987 
1988 	/* Send the "ack" or "nack" to the client */
1989 	PR1("Sending %s",
1990 	    (msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");
1991 	if (send_msg(vd->ldc_handle, msg, msglen) != 0)
1992 		reset_ldc = B_TRUE;
1993 
1994 	/* Arrange to reset the connection for nack'ed or failed messages */
1995 	if ((status != 0) || reset_ldc) {
1996 		PR0("initiating %s reset",
1997 		    (reset_ldc) ? "full" : "soft");
1998 		vd_need_reset(vd, reset_ldc);
1999 	}
2000 
2001 	return (status);
2002 }
2003 
2004 static boolean_t
2005 vd_enabled(vd_t *vd)
2006 {
2007 	boolean_t	enabled;
2008 
2009 
2010 	mutex_enter(&vd->lock);
2011 	enabled = vd->enabled;
2012 	mutex_exit(&vd->lock);
2013 	return (enabled);
2014 }
2015 
2016 static void
2017 vd_recv_msg(void *arg)
2018 {
2019 	vd_t	*vd = (vd_t *)arg;
2020 	int	rv = 0, status = 0;
2021 
2022 	ASSERT(vd != NULL);
2023 
2024 	PR2("New task to receive incoming message(s)");
2025 
2026 
2027 	while (vd_enabled(vd) && status == 0) {
2028 		size_t		msglen, msgsize;
2029 		ldc_status_t	lstatus;
2030 
2031 		/*
2032 		 * Receive and process a message
2033 		 */
2034 		vd_reset_if_needed(vd);	/* can change vd->max_msglen */
2035 
2036 		/*
2037 		 * check if channel is UP - else break out of loop
2038 		 */
2039 		status = ldc_status(vd->ldc_handle, &lstatus);
2040 		if (lstatus != LDC_UP) {
2041 			PR0("channel not up (status=%d), exiting recv loop\n",
2042 			    lstatus);
2043 			break;
2044 		}
2045 
2046 		ASSERT(vd->max_msglen != 0);
2047 
2048 		msgsize = vd->max_msglen; /* stable copy for alloc/free */
2049 		msglen	= msgsize;	  /* actual len after recv_msg() */
2050 
2051 		status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen);
2052 		switch (status) {
2053 		case 0:
2054 			rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp,
2055 				msglen);
2056 			/* check if max_msglen changed */
2057 			if (msgsize != vd->max_msglen) {
2058 				PR0("max_msglen changed 0x%lx to 0x%lx bytes\n",
2059 				    msgsize, vd->max_msglen);
2060 				kmem_free(vd->vio_msgp, msgsize);
2061 				vd->vio_msgp =
2062 					kmem_alloc(vd->max_msglen, KM_SLEEP);
2063 			}
2064 			if (rv == EINPROGRESS)
2065 				continue;
2066 			break;
2067 
2068 		case ENOMSG:
2069 			break;
2070 
2071 		case ECONNRESET:
2072 			PR0("initiating soft reset (ECONNRESET)\n");
2073 			vd_need_reset(vd, B_FALSE);
2074 			status = 0;
2075 			break;
2076 
2077 		default:
2078 			/* Probably an LDC failure; arrange to reset it */
2079 			PR0("initiating full reset (status=0x%x)", status);
2080 			vd_need_reset(vd, B_TRUE);
2081 			break;
2082 		}
2083 	}
2084 
2085 	PR2("Task finished");
2086 }
2087 
2088 static uint_t
2089 vd_handle_ldc_events(uint64_t event, caddr_t arg)
2090 {
2091 	vd_t	*vd = (vd_t *)(void *)arg;
2092 	int	status;
2093 
2094 	ASSERT(vd != NULL);
2095 
2096 	if (!vd_enabled(vd))
2097 		return (LDC_SUCCESS);
2098 
2099 	if (event & LDC_EVT_DOWN) {
2100 		PR0("LDC_EVT_DOWN: LDC channel went down");
2101 
2102 		vd_need_reset(vd, B_TRUE);
2103 		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
2104 		    DDI_SLEEP);
2105 		if (status == DDI_FAILURE) {
2106 			PR0("cannot schedule task to recv msg\n");
2107 			vd_need_reset(vd, B_TRUE);
2108 		}
2109 	}
2110 
2111 	if (event & LDC_EVT_RESET) {
2112 		PR0("LDC_EVT_RESET: LDC channel was reset");
2113 
2114 		if (vd->state != VD_STATE_INIT) {
2115 			PR0("scheduling full reset");
2116 			vd_need_reset(vd, B_FALSE);
2117 			status = ddi_taskq_dispatch(vd->startq, vd_recv_msg,
2118 			    vd, DDI_SLEEP);
2119 			if (status == DDI_FAILURE) {
2120 				PR0("cannot schedule task to recv msg\n");
2121 				vd_need_reset(vd, B_TRUE);
2122 			}
2123 
2124 		} else {
2125 			PR0("channel already reset, ignoring...\n");
2126 			PR0("doing ldc up...\n");
2127 			(void) ldc_up(vd->ldc_handle);
2128 		}
2129 
2130 		return (LDC_SUCCESS);
2131 	}
2132 
2133 	if (event & LDC_EVT_UP) {
2134 		PR0("EVT_UP: LDC is up\nResetting client connection state");
2135 		PR0("initiating soft reset");
2136 		vd_need_reset(vd, B_FALSE);
2137 		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg,
2138 		    vd, DDI_SLEEP);
2139 		if (status == DDI_FAILURE) {
2140 			PR0("cannot schedule task to recv msg\n");
2141 			vd_need_reset(vd, B_TRUE);
2142 			return (LDC_SUCCESS);
2143 		}
2144 	}
2145 
2146 	if (event & LDC_EVT_READ) {
2147 		int	status;
2148 
2149 		PR1("New data available");
2150 		/* Queue a task to receive the new data */
2151 		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
2152 		    DDI_SLEEP);
2153 
2154 		if (status == DDI_FAILURE) {
2155 			PR0("cannot schedule task to recv msg\n");
2156 			vd_need_reset(vd, B_TRUE);
2157 		}
2158 	}
2159 
2160 	return (LDC_SUCCESS);
2161 }
2162 
2163 static uint_t
2164 vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
2165 {
2166 	_NOTE(ARGUNUSED(key, val))
2167 	(*((uint_t *)arg))++;
2168 	return (MH_WALK_TERMINATE);
2169 }
2170 
2171 
2172 static int
2173 vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2174 {
2175 	uint_t	vd_present = 0;
2176 	minor_t	instance;
2177 	vds_t	*vds;
2178 
2179 
2180 	switch (cmd) {
2181 	case DDI_DETACH:
2182 		/* the real work happens below */
2183 		break;
2184 	case DDI_SUSPEND:
2185 		PR0("No action required for DDI_SUSPEND");
2186 		return (DDI_SUCCESS);
2187 	default:
2188 		PR0("Unrecognized \"cmd\"");
2189 		return (DDI_FAILURE);
2190 	}
2191 
2192 	ASSERT(cmd == DDI_DETACH);
2193 	instance = ddi_get_instance(dip);
2194 	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
2195 		PR0("Could not get state for instance %u", instance);
2196 		ddi_soft_state_free(vds_state, instance);
2197 		return (DDI_FAILURE);
2198 	}
2199 
2200 	/* Do no detach when serving any vdisks */
2201 	mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present);
2202 	if (vd_present) {
2203 		PR0("Not detaching because serving vdisks");
2204 		return (DDI_FAILURE);
2205 	}
2206 
2207 	PR0("Detaching");
2208 	if (vds->initialized & VDS_MDEG) {
2209 		(void) mdeg_unregister(vds->mdeg);
2210 		kmem_free(vds->ispecp->specp, sizeof (vds_prop_template));
2211 		kmem_free(vds->ispecp, sizeof (mdeg_node_spec_t));
2212 		vds->ispecp = NULL;
2213 		vds->mdeg = NULL;
2214 	}
2215 
2216 	if (vds->initialized & VDS_LDI)
2217 		(void) ldi_ident_release(vds->ldi_ident);
2218 	mod_hash_destroy_hash(vds->vd_table);
2219 	ddi_soft_state_free(vds_state, instance);
2220 	return (DDI_SUCCESS);
2221 }
2222 
2223 static boolean_t
2224 is_pseudo_device(dev_info_t *dip)
2225 {
2226 	dev_info_t	*parent, *root = ddi_root_node();
2227 
2228 
2229 	for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root);
2230 	    parent = ddi_get_parent(parent)) {
2231 		if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0)
2232 			return (B_TRUE);
2233 	}
2234 
2235 	return (B_FALSE);
2236 }
2237 
2238 static int
2239 vd_setup_full_disk(vd_t *vd)
2240 {
2241 	int		rval, status;
2242 	major_t		major = getmajor(vd->dev[0]);
2243 	minor_t		minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE;
2244 	struct dk_minfo	dk_minfo;
2245 
2246 	/*
2247 	 * At this point, vdisk_size is set to the size of partition 2 but
2248 	 * this does not represent the size of the disk because partition 2
2249 	 * may not cover the entire disk and its size does not include reserved
2250 	 * blocks. So we update vdisk_size to be the size of the entire disk.
2251 	 */
2252 	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO,
2253 	    (intptr_t)&dk_minfo, (vd_open_flags | FKIOCTL),
2254 	    kcred, &rval)) != 0) {
2255 		PR0("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d",
2256 		    status);
2257 		return (status);
2258 	}
2259 	vd->vdisk_size = dk_minfo.dki_capacity;
2260 
2261 	/* Set full-disk parameters */
2262 	vd->vdisk_type	= VD_DISK_TYPE_DISK;
2263 	vd->nslices	= (sizeof (vd->dev))/(sizeof (vd->dev[0]));
2264 
2265 	/* Move dev number and LDI handle to entire-disk-slice array elements */
2266 	vd->dev[VD_ENTIRE_DISK_SLICE]		= vd->dev[0];
2267 	vd->dev[0]				= 0;
2268 	vd->ldi_handle[VD_ENTIRE_DISK_SLICE]	= vd->ldi_handle[0];
2269 	vd->ldi_handle[0]			= NULL;
2270 
2271 	/* Initialize device numbers for remaining slices and open them */
2272 	for (int slice = 0; slice < vd->nslices; slice++) {
2273 		/*
2274 		 * Skip the entire-disk slice, as it's already open and its
2275 		 * device known
2276 		 */
2277 		if (slice == VD_ENTIRE_DISK_SLICE)
2278 			continue;
2279 		ASSERT(vd->dev[slice] == 0);
2280 		ASSERT(vd->ldi_handle[slice] == NULL);
2281 
2282 		/*
2283 		 * Construct the device number for the current slice
2284 		 */
2285 		vd->dev[slice] = makedevice(major, (minor + slice));
2286 
2287 		/*
2288 		 * Open all slices of the disk to serve them to the client.
2289 		 * Slices are opened exclusively to prevent other threads or
2290 		 * processes in the service domain from performing I/O to
2291 		 * slices being accessed by a client.  Failure to open a slice
2292 		 * results in vds not serving this disk, as the client could
2293 		 * attempt (and should be able) to access any slice immediately.
2294 		 * Any slices successfully opened before a failure will get
2295 		 * closed by vds_destroy_vd() as a result of the error returned
2296 		 * by this function.
2297 		 *
2298 		 * We need to do the open with FNDELAY so that opening an empty
2299 		 * slice does not fail.
2300 		 */
2301 		PR0("Opening device major %u, minor %u = slice %u",
2302 		    major, minor, slice);
2303 		if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
2304 		    vd_open_flags | FNDELAY, kcred, &vd->ldi_handle[slice],
2305 		    vd->vds->ldi_ident)) != 0) {
2306 			PR0("ldi_open_by_dev() returned errno %d "
2307 			    "for slice %u", status, slice);
2308 			/* vds_destroy_vd() will close any open slices */
2309 			return (status);
2310 		}
2311 	}
2312 
2313 	return (0);
2314 }
2315 
2316 static int
2317 vd_setup_partition_efi(vd_t *vd)
2318 {
2319 	efi_gpt_t *gpt;
2320 	efi_gpe_t *gpe;
2321 	struct uuid uuid = EFI_RESERVED;
2322 	uint32_t crc;
2323 	int length;
2324 
2325 	length = sizeof (efi_gpt_t) + sizeof (efi_gpe_t);
2326 
2327 	gpt = kmem_zalloc(length, KM_SLEEP);
2328 	gpe = (efi_gpe_t *)(gpt + 1);
2329 
2330 	gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE);
2331 	gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
2332 	gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t));
2333 	gpt->efi_gpt_FirstUsableLBA = LE_64(0ULL);
2334 	gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1);
2335 	gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1);
2336 	gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t));
2337 
2338 	UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid);
2339 	gpe->efi_gpe_StartingLBA = gpt->efi_gpt_FirstUsableLBA;
2340 	gpe->efi_gpe_EndingLBA = gpt->efi_gpt_LastUsableLBA;
2341 
2342 	CRC32(crc, gpe, sizeof (efi_gpe_t), -1U, crc32_table);
2343 	gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
2344 
2345 	CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table);
2346 	gpt->efi_gpt_HeaderCRC32 = LE_32(~crc);
2347 
2348 	vd->dk_efi.dki_lba = 0;
2349 	vd->dk_efi.dki_length = length;
2350 	vd->dk_efi.dki_data = gpt;
2351 
2352 	return (0);
2353 }
2354 
2355 static int
2356 vd_setup_vd(char *device_path, vd_t *vd)
2357 {
2358 	int		rval, status;
2359 	dev_info_t	*dip;
2360 	struct dk_cinfo	dk_cinfo;
2361 
2362 	/*
2363 	 * We need to open with FNDELAY so that opening an empty partition
2364 	 * does not fail.
2365 	 */
2366 	if ((status = ldi_open_by_name(device_path, vd_open_flags | FNDELAY,
2367 	    kcred, &vd->ldi_handle[0], vd->vds->ldi_ident)) != 0) {
2368 		PRN("ldi_open_by_name(%s) = errno %d", device_path, status);
2369 		return (status);
2370 	}
2371 
2372 	/*
2373 	 * nslices must be updated now so that vds_destroy_vd() will close
2374 	 * the slice we have just opened in case of an error.
2375 	 */
2376 	vd->nslices = 1;
2377 
2378 	/* Get device number and size of backing device */
2379 	if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) {
2380 		PRN("ldi_get_dev() returned errno %d for %s",
2381 		    status, device_path);
2382 		return (status);
2383 	}
2384 	if (ldi_get_size(vd->ldi_handle[0], &vd->vdisk_size) != DDI_SUCCESS) {
2385 		PRN("ldi_get_size() failed for %s", device_path);
2386 		return (EIO);
2387 	}
2388 	vd->vdisk_size = lbtodb(vd->vdisk_size);	/* convert to blocks */
2389 
2390 	/* Verify backing device supports dk_cinfo, dk_geom, and vtoc */
2391 	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO,
2392 		    (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred,
2393 		    &rval)) != 0) {
2394 		PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
2395 		    status, device_path);
2396 		return (status);
2397 	}
2398 	if (dk_cinfo.dki_partition >= V_NUMPAR) {
2399 		PRN("slice %u >= maximum slice %u for %s",
2400 		    dk_cinfo.dki_partition, V_NUMPAR, device_path);
2401 		return (EIO);
2402 	}
2403 
2404 	status = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc, &vd->vdisk_label);
2405 
2406 	if (status != 0) {
2407 		PRN("vd_read_vtoc returned errno %d for %s",
2408 		    status, device_path);
2409 		return (status);
2410 	}
2411 
2412 	if (vd->vdisk_label == VD_DISK_LABEL_VTOC &&
2413 	    (status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM,
2414 	    (intptr_t)&vd->dk_geom, (vd_open_flags | FKIOCTL),
2415 	    kcred, &rval)) != 0) {
2416 		    PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s",
2417 			status, device_path);
2418 		    return (status);
2419 	}
2420 
2421 	/* Store the device's max transfer size for return to the client */
2422 	vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
2423 
2424 
2425 	/* Determine if backing device is a pseudo device */
2426 	if ((dip = ddi_hold_devi_by_instance(getmajor(vd->dev[0]),
2427 		    dev_to_instance(vd->dev[0]), 0))  == NULL) {
2428 		PRN("%s is no longer accessible", device_path);
2429 		return (EIO);
2430 	}
2431 	vd->pseudo = is_pseudo_device(dip);
2432 	ddi_release_devi(dip);
2433 	if (vd->pseudo) {
2434 		vd->vdisk_type	= VD_DISK_TYPE_SLICE;
2435 		vd->nslices	= 1;
2436 		return (0);	/* ...and we're done */
2437 	}
2438 
2439 
2440 	/* If slice is entire-disk slice, initialize for full disk */
2441 	if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE)
2442 		return (vd_setup_full_disk(vd));
2443 
2444 
2445 	/* Otherwise, we have a non-entire slice of a device */
2446 	vd->vdisk_type	= VD_DISK_TYPE_SLICE;
2447 	vd->nslices	= 1;
2448 
2449 	if (vd->vdisk_label == VD_DISK_LABEL_EFI) {
2450 		status = vd_setup_partition_efi(vd);
2451 		return (status);
2452 	}
2453 
2454 	/* Initialize dk_geom structure for single-slice device */
2455 	if (vd->dk_geom.dkg_nsect == 0) {
2456 		PR0("%s geometry claims 0 sectors per track", device_path);
2457 		return (EIO);
2458 	}
2459 	if (vd->dk_geom.dkg_nhead == 0) {
2460 		PR0("%s geometry claims 0 heads", device_path);
2461 		return (EIO);
2462 	}
2463 	vd->dk_geom.dkg_ncyl =
2464 	    vd->vdisk_size/vd->dk_geom.dkg_nsect/vd->dk_geom.dkg_nhead;
2465 	vd->dk_geom.dkg_acyl = 0;
2466 	vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl;
2467 
2468 
2469 	/* Initialize vtoc structure for single-slice device */
2470 	bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume,
2471 	    MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume)));
2472 	bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part));
2473 	vd->vtoc.v_nparts = 1;
2474 	vd->vtoc.v_part[0].p_tag = V_UNASSIGNED;
2475 	vd->vtoc.v_part[0].p_flag = 0;
2476 	vd->vtoc.v_part[0].p_start = 0;
2477 	vd->vtoc.v_part[0].p_size = vd->vdisk_size;
2478 	bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel,
2479 	    MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel)));
2480 
2481 
2482 	return (0);
2483 }
2484 
2485 static int
2486 vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id,
2487     vd_t **vdp)
2488 {
2489 	char			tq_name[TASKQ_NAMELEN];
2490 	int			status;
2491 	ddi_iblock_cookie_t	iblock = NULL;
2492 	ldc_attr_t		ldc_attr;
2493 	vd_t			*vd;
2494 
2495 
2496 	ASSERT(vds != NULL);
2497 	ASSERT(device_path != NULL);
2498 	ASSERT(vdp != NULL);
2499 	PR0("Adding vdisk for %s", device_path);
2500 
2501 	if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) {
2502 		PRN("No memory for virtual disk");
2503 		return (EAGAIN);
2504 	}
2505 	*vdp = vd;	/* assign here so vds_destroy_vd() can cleanup later */
2506 	vd->vds = vds;
2507 
2508 
2509 	/* Open vdisk and initialize parameters */
2510 	if ((status = vd_setup_vd(device_path, vd)) != 0)
2511 		return (status);
2512 	ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
2513 	PR0("vdisk_type = %s, pseudo = %s, nslices = %u",
2514 	    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
2515 	    (vd->pseudo ? "yes" : "no"), vd->nslices);
2516 
2517 
2518 	/* Initialize locking */
2519 	if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED,
2520 		&iblock) != DDI_SUCCESS) {
2521 		PRN("Could not get iblock cookie.");
2522 		return (EIO);
2523 	}
2524 
2525 	mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock);
2526 	vd->initialized |= VD_LOCKING;
2527 
2528 
2529 	/* Create start and completion task queues for the vdisk */
2530 	(void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id);
2531 	PR1("tq_name = %s", tq_name);
2532 	if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1,
2533 		    TASKQ_DEFAULTPRI, 0)) == NULL) {
2534 		PRN("Could not create task queue");
2535 		return (EIO);
2536 	}
2537 	(void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id);
2538 	PR1("tq_name = %s", tq_name);
2539 	if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1,
2540 		    TASKQ_DEFAULTPRI, 0)) == NULL) {
2541 		PRN("Could not create task queue");
2542 		return (EIO);
2543 	}
2544 	vd->enabled = 1;	/* before callback can dispatch to startq */
2545 
2546 
2547 	/* Bring up LDC */
2548 	ldc_attr.devclass	= LDC_DEV_BLK_SVC;
2549 	ldc_attr.instance	= ddi_get_instance(vds->dip);
2550 	ldc_attr.mode		= LDC_MODE_UNRELIABLE;
2551 	ldc_attr.mtu		= VD_LDC_MTU;
2552 	if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) {
2553 		PR0("ldc_init(%lu) = errno %d", ldc_id, status);
2554 		return (status);
2555 	}
2556 	vd->initialized |= VD_LDC;
2557 
2558 	if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events,
2559 		(caddr_t)vd)) != 0) {
2560 		PR0("ldc_reg_callback() returned errno %d", status);
2561 		return (status);
2562 	}
2563 
2564 	if ((status = ldc_open(vd->ldc_handle)) != 0) {
2565 		PR0("ldc_open() returned errno %d", status);
2566 		return (status);
2567 	}
2568 
2569 	if ((status = ldc_up(vd->ldc_handle)) != 0) {
2570 		PR0("ldc_up() returned errno %d", status);
2571 	}
2572 
2573 	/* Allocate the inband task memory handle */
2574 	status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl));
2575 	if (status) {
2576 		PR0("ldc_mem_alloc_handle() returned err %d ", status);
2577 		return (ENXIO);
2578 	}
2579 
2580 	/* Add the successfully-initialized vdisk to the server's table */
2581 	if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) {
2582 		PRN("Error adding vdisk ID %lu to table", id);
2583 		return (EIO);
2584 	}
2585 
2586 	/* Allocate the staging buffer */
2587 	vd->max_msglen	= sizeof (vio_msg_t);	/* baseline vio message size */
2588 	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);
2589 
2590 	/* store initial state */
2591 	vd->state = VD_STATE_INIT;
2592 
2593 	return (0);
2594 }
2595 
2596 static void
2597 vd_free_dring_task(vd_t *vdp)
2598 {
2599 	if (vdp->dring_task != NULL) {
2600 		ASSERT(vdp->dring_len != 0);
2601 		/* Free all dring_task memory handles */
2602 		for (int i = 0; i < vdp->dring_len; i++) {
2603 			(void) ldc_mem_free_handle(vdp->dring_task[i].mhdl);
2604 			kmem_free(vdp->dring_task[i].msg, vdp->max_msglen);
2605 			vdp->dring_task[i].msg = NULL;
2606 		}
2607 		kmem_free(vdp->dring_task,
2608 		    (sizeof (*vdp->dring_task)) * vdp->dring_len);
2609 		vdp->dring_task = NULL;
2610 	}
2611 }
2612 
2613 /*
2614  * Destroy the state associated with a virtual disk
2615  */
2616 static void
2617 vds_destroy_vd(void *arg)
2618 {
2619 	vd_t	*vd = (vd_t *)arg;
2620 	int	retry = 0, rv;
2621 
2622 	if (vd == NULL)
2623 		return;
2624 
2625 	PR0("Destroying vdisk state");
2626 
2627 	if (vd->dk_efi.dki_data != NULL)
2628 		kmem_free(vd->dk_efi.dki_data, vd->dk_efi.dki_length);
2629 
2630 	/* Disable queuing requests for the vdisk */
2631 	if (vd->initialized & VD_LOCKING) {
2632 		mutex_enter(&vd->lock);
2633 		vd->enabled = 0;
2634 		mutex_exit(&vd->lock);
2635 	}
2636 
2637 	/* Drain and destroy start queue (*before* destroying completionq) */
2638 	if (vd->startq != NULL)
2639 		ddi_taskq_destroy(vd->startq);	/* waits for queued tasks */
2640 
2641 	/* Drain and destroy completion queue (*before* shutting down LDC) */
2642 	if (vd->completionq != NULL)
2643 		ddi_taskq_destroy(vd->completionq);	/* waits for tasks */
2644 
2645 	vd_free_dring_task(vd);
2646 
2647 	/* Free the inband task memory handle */
2648 	(void) ldc_mem_free_handle(vd->inband_task.mhdl);
2649 
2650 	/* Shut down LDC */
2651 	if (vd->initialized & VD_LDC) {
2652 		/* unmap the dring */
2653 		if (vd->initialized & VD_DRING)
2654 			(void) ldc_mem_dring_unmap(vd->dring_handle);
2655 
2656 		/* close LDC channel - retry on EAGAIN */
2657 		while ((rv = ldc_close(vd->ldc_handle)) == EAGAIN) {
2658 			if (++retry > vds_ldc_retries) {
2659 				PR0("Timed out closing channel");
2660 				break;
2661 			}
2662 			drv_usecwait(vds_ldc_delay);
2663 		}
2664 		if (rv == 0) {
2665 			(void) ldc_unreg_callback(vd->ldc_handle);
2666 			(void) ldc_fini(vd->ldc_handle);
2667 		} else {
2668 			/*
2669 			 * Closing the LDC channel has failed. Ideally we should
2670 			 * fail here but there is no Zeus level infrastructure
2671 			 * to handle this. The MD has already been changed and
2672 			 * we have to do the close. So we try to do as much
2673 			 * clean up as we can.
2674 			 */
2675 			(void) ldc_set_cb_mode(vd->ldc_handle, LDC_CB_DISABLE);
2676 			while (ldc_unreg_callback(vd->ldc_handle) == EAGAIN)
2677 				drv_usecwait(vds_ldc_delay);
2678 		}
2679 	}
2680 
2681 	/* Free the staging buffer for msgs */
2682 	if (vd->vio_msgp != NULL) {
2683 		kmem_free(vd->vio_msgp, vd->max_msglen);
2684 		vd->vio_msgp = NULL;
2685 	}
2686 
2687 	/* Free the inband message buffer */
2688 	if (vd->inband_task.msg != NULL) {
2689 		kmem_free(vd->inband_task.msg, vd->max_msglen);
2690 		vd->inband_task.msg = NULL;
2691 	}
2692 
2693 	/* Close any open backing-device slices */
2694 	for (uint_t slice = 0; slice < vd->nslices; slice++) {
2695 		if (vd->ldi_handle[slice] != NULL) {
2696 			PR0("Closing slice %u", slice);
2697 			(void) ldi_close(vd->ldi_handle[slice],
2698 			    vd_open_flags | FNDELAY, kcred);
2699 		}
2700 	}
2701 
2702 	/* Free lock */
2703 	if (vd->initialized & VD_LOCKING)
2704 		mutex_destroy(&vd->lock);
2705 
2706 	/* Finally, free the vdisk structure itself */
2707 	kmem_free(vd, sizeof (*vd));
2708 }
2709 
2710 static int
2711 vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id)
2712 {
2713 	int	status;
2714 	vd_t	*vd = NULL;
2715 
2716 
2717 	if ((status = vds_do_init_vd(vds, id, device_path, ldc_id, &vd)) != 0)
2718 		vds_destroy_vd(vd);
2719 
2720 	return (status);
2721 }
2722 
2723 static int
2724 vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel,
2725     uint64_t *ldc_id)
2726 {
2727 	int	num_channels;
2728 
2729 
2730 	/* Look for channel endpoint child(ren) of the vdisk MD node */
2731 	if ((num_channels = md_scan_dag(md, vd_node,
2732 		    md_find_name(md, VD_CHANNEL_ENDPOINT),
2733 		    md_find_name(md, "fwd"), channel)) <= 0) {
2734 		PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT);
2735 		return (-1);
2736 	}
2737 
2738 	/* Get the "id" value for the first channel endpoint node */
2739 	if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) {
2740 		PRN("No \"%s\" property found for \"%s\" of vdisk",
2741 		    VD_ID_PROP, VD_CHANNEL_ENDPOINT);
2742 		return (-1);
2743 	}
2744 
2745 	if (num_channels > 1) {
2746 		PRN("Using ID of first of multiple channels for this vdisk");
2747 	}
2748 
2749 	return (0);
2750 }
2751 
2752 static int
2753 vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id)
2754 {
2755 	int		num_nodes, status;
2756 	size_t		size;
2757 	mde_cookie_t	*channel;
2758 
2759 
2760 	if ((num_nodes = md_node_count(md)) <= 0) {
2761 		PRN("Invalid node count in Machine Description subtree");
2762 		return (-1);
2763 	}
2764 	size = num_nodes*(sizeof (*channel));
2765 	channel = kmem_zalloc(size, KM_SLEEP);
2766 	status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id);
2767 	kmem_free(channel, size);
2768 
2769 	return (status);
2770 }
2771 
2772 static void
2773 vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
2774 {
2775 	char		*device_path = NULL;
2776 	uint64_t	id = 0, ldc_id = 0;
2777 
2778 
2779 	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
2780 		PRN("Error getting vdisk \"%s\"", VD_ID_PROP);
2781 		return;
2782 	}
2783 	PR0("Adding vdisk ID %lu", id);
2784 	if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP,
2785 		&device_path) != 0) {
2786 		PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
2787 		return;
2788 	}
2789 
2790 	if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) {
2791 		PRN("Error getting LDC ID for vdisk %lu", id);
2792 		return;
2793 	}
2794 
2795 	if (vds_init_vd(vds, id, device_path, ldc_id) != 0) {
2796 		PRN("Failed to add vdisk ID %lu", id);
2797 		return;
2798 	}
2799 }
2800 
2801 static void
2802 vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
2803 {
2804 	uint64_t	id = 0;
2805 
2806 
2807 	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
2808 		PRN("Unable to get \"%s\" property from vdisk's MD node",
2809 		    VD_ID_PROP);
2810 		return;
2811 	}
2812 	PR0("Removing vdisk ID %lu", id);
2813 	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
2814 		PRN("No vdisk entry found for vdisk ID %lu", id);
2815 }
2816 
2817 static void
2818 vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node,
2819     md_t *curr_md, mde_cookie_t curr_vd_node)
2820 {
2821 	char		*curr_dev, *prev_dev;
2822 	uint64_t	curr_id = 0, curr_ldc_id = 0;
2823 	uint64_t	prev_id = 0, prev_ldc_id = 0;
2824 	size_t		len;
2825 
2826 
2827 	/* Validate that vdisk ID has not changed */
2828 	if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) {
2829 		PRN("Error getting previous vdisk \"%s\" property",
2830 		    VD_ID_PROP);
2831 		return;
2832 	}
2833 	if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) {
2834 		PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP);
2835 		return;
2836 	}
2837 	if (curr_id != prev_id) {
2838 		PRN("Not changing vdisk:  ID changed from %lu to %lu",
2839 		    prev_id, curr_id);
2840 		return;
2841 	}
2842 
2843 	/* Validate that LDC ID has not changed */
2844 	if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) {
2845 		PRN("Error getting LDC ID for vdisk %lu", prev_id);
2846 		return;
2847 	}
2848 
2849 	if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) {
2850 		PRN("Error getting LDC ID for vdisk %lu", curr_id);
2851 		return;
2852 	}
2853 	if (curr_ldc_id != prev_ldc_id) {
2854 		_NOTE(NOTREACHED);	/* lint is confused */
2855 		PRN("Not changing vdisk:  "
2856 		    "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id);
2857 		return;
2858 	}
2859 
2860 	/* Determine whether device path has changed */
2861 	if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP,
2862 		&prev_dev) != 0) {
2863 		PRN("Error getting previous vdisk \"%s\"",
2864 		    VD_BLOCK_DEVICE_PROP);
2865 		return;
2866 	}
2867 	if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP,
2868 		&curr_dev) != 0) {
2869 		PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
2870 		return;
2871 	}
2872 	if (((len = strlen(curr_dev)) == strlen(prev_dev)) &&
2873 	    (strncmp(curr_dev, prev_dev, len) == 0))
2874 		return;	/* no relevant (supported) change */
2875 
2876 	PR0("Changing vdisk ID %lu", prev_id);
2877 
2878 	/* Remove old state, which will close vdisk and reset */
2879 	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0)
2880 		PRN("No entry found for vdisk ID %lu", prev_id);
2881 
2882 	/* Re-initialize vdisk with new state */
2883 	if (vds_init_vd(vds, curr_id, curr_dev, curr_ldc_id) != 0) {
2884 		PRN("Failed to change vdisk ID %lu", curr_id);
2885 		return;
2886 	}
2887 }
2888 
2889 static int
2890 vds_process_md(void *arg, mdeg_result_t *md)
2891 {
2892 	int	i;
2893 	vds_t	*vds = arg;
2894 
2895 
2896 	if (md == NULL)
2897 		return (MDEG_FAILURE);
2898 	ASSERT(vds != NULL);
2899 
2900 	for (i = 0; i < md->removed.nelem; i++)
2901 		vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]);
2902 	for (i = 0; i < md->match_curr.nelem; i++)
2903 		vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i],
2904 		    md->match_curr.mdp, md->match_curr.mdep[i]);
2905 	for (i = 0; i < md->added.nelem; i++)
2906 		vds_add_vd(vds, md->added.mdp, md->added.mdep[i]);
2907 
2908 	return (MDEG_SUCCESS);
2909 }
2910 
2911 static int
2912 vds_do_attach(dev_info_t *dip)
2913 {
2914 	int			status, sz;
2915 	int			cfg_handle;
2916 	minor_t			instance = ddi_get_instance(dip);
2917 	vds_t			*vds;
2918 	mdeg_prop_spec_t	*pspecp;
2919 	mdeg_node_spec_t	*ispecp;
2920 
2921 	/*
2922 	 * The "cfg-handle" property of a vds node in an MD contains the MD's
2923 	 * notion of "instance", or unique identifier, for that node; OBP
2924 	 * stores the value of the "cfg-handle" MD property as the value of
2925 	 * the "reg" property on the node in the device tree it builds from
2926 	 * the MD and passes to Solaris.  Thus, we look up the devinfo node's
2927 	 * "reg" property value to uniquely identify this device instance when
2928 	 * registering with the MD event-generation framework.  If the "reg"
2929 	 * property cannot be found, the device tree state is presumably so
2930 	 * broken that there is no point in continuing.
2931 	 */
2932 	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2933 		VD_REG_PROP)) {
2934 		PRN("vds \"%s\" property does not exist", VD_REG_PROP);
2935 		return (DDI_FAILURE);
2936 	}
2937 
2938 	/* Get the MD instance for later MDEG registration */
2939 	cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
2940 	    VD_REG_PROP, -1);
2941 
2942 	if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) {
2943 		PRN("Could not allocate state for instance %u", instance);
2944 		return (DDI_FAILURE);
2945 	}
2946 
2947 	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
2948 		PRN("Could not get state for instance %u", instance);
2949 		ddi_soft_state_free(vds_state, instance);
2950 		return (DDI_FAILURE);
2951 	}
2952 
2953 	vds->dip	= dip;
2954 	vds->vd_table	= mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS,
2955 							vds_destroy_vd,
2956 							sizeof (void *));
2957 	ASSERT(vds->vd_table != NULL);
2958 
2959 	if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) {
2960 		PRN("ldi_ident_from_dip() returned errno %d", status);
2961 		return (DDI_FAILURE);
2962 	}
2963 	vds->initialized |= VDS_LDI;
2964 
2965 	/* Register for MD updates */
2966 	sz = sizeof (vds_prop_template);
2967 	pspecp = kmem_alloc(sz, KM_SLEEP);
2968 	bcopy(vds_prop_template, pspecp, sz);
2969 
2970 	VDS_SET_MDEG_PROP_INST(pspecp, cfg_handle);
2971 
2972 	/* initialize the complete prop spec structure */
2973 	ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
2974 	ispecp->namep = "virtual-device";
2975 	ispecp->specp = pspecp;
2976 
2977 	if (mdeg_register(ispecp, &vd_match, vds_process_md, vds,
2978 		&vds->mdeg) != MDEG_SUCCESS) {
2979 		PRN("Unable to register for MD updates");
2980 		kmem_free(ispecp, sizeof (mdeg_node_spec_t));
2981 		kmem_free(pspecp, sz);
2982 		return (DDI_FAILURE);
2983 	}
2984 
2985 	vds->ispecp = ispecp;
2986 	vds->initialized |= VDS_MDEG;
2987 
2988 	/* Prevent auto-detaching so driver is available whenever MD changes */
2989 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
2990 	    DDI_PROP_SUCCESS) {
2991 		PRN("failed to set \"%s\" property for instance %u",
2992 		    DDI_NO_AUTODETACH, instance);
2993 	}
2994 
2995 	ddi_report_dev(dip);
2996 	return (DDI_SUCCESS);
2997 }
2998 
2999 static int
3000 vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3001 {
3002 	int	status;
3003 
3004 	switch (cmd) {
3005 	case DDI_ATTACH:
3006 		PR0("Attaching");
3007 		if ((status = vds_do_attach(dip)) != DDI_SUCCESS)
3008 			(void) vds_detach(dip, DDI_DETACH);
3009 		return (status);
3010 	case DDI_RESUME:
3011 		PR0("No action required for DDI_RESUME");
3012 		return (DDI_SUCCESS);
3013 	default:
3014 		return (DDI_FAILURE);
3015 	}
3016 }
3017 
3018 static struct dev_ops vds_ops = {
3019 	DEVO_REV,	/* devo_rev */
3020 	0,		/* devo_refcnt */
3021 	ddi_no_info,	/* devo_getinfo */
3022 	nulldev,	/* devo_identify */
3023 	nulldev,	/* devo_probe */
3024 	vds_attach,	/* devo_attach */
3025 	vds_detach,	/* devo_detach */
3026 	nodev,		/* devo_reset */
3027 	NULL,		/* devo_cb_ops */
3028 	NULL,		/* devo_bus_ops */
3029 	nulldev		/* devo_power */
3030 };
3031 
3032 static struct modldrv modldrv = {
3033 	&mod_driverops,
3034 	"virtual disk server v%I%",
3035 	&vds_ops,
3036 };
3037 
3038 static struct modlinkage modlinkage = {
3039 	MODREV_1,
3040 	&modldrv,
3041 	NULL
3042 };
3043 
3044 
3045 int
3046 _init(void)
3047 {
3048 	int		i, status;
3049 
3050 
3051 	if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0)
3052 		return (status);
3053 	if ((status = mod_install(&modlinkage)) != 0) {
3054 		ddi_soft_state_fini(&vds_state);
3055 		return (status);
3056 	}
3057 
3058 	/* Fill in the bit-mask of server-supported operations */
3059 	for (i = 0; i < vds_noperations; i++)
3060 		vds_operations |= 1 << (vds_operation[i].operation - 1);
3061 
3062 	return (0);
3063 }
3064 
3065 int
3066 _info(struct modinfo *modinfop)
3067 {
3068 	return (mod_info(&modlinkage, modinfop));
3069 }
3070 
3071 int
3072 _fini(void)
3073 {
3074 	int	status;
3075 
3076 
3077 	if ((status = mod_remove(&modlinkage)) != 0)
3078 		return (status);
3079 	ddi_soft_state_fini(&vds_state);
3080 	return (0);
3081 }
3082