xref: /titanic_41/usr/src/uts/sun4v/io/vds.c (revision 867ad6ccd534f8cc8a833f2a852036a33af5d522)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Virtual disk server
31  */
32 
33 
34 #include <sys/types.h>
35 #include <sys/conf.h>
36 #include <sys/crc32.h>
37 #include <sys/ddi.h>
38 #include <sys/dkio.h>
39 #include <sys/file.h>
40 #include <sys/mdeg.h>
41 #include <sys/modhash.h>
42 #include <sys/note.h>
43 #include <sys/pathname.h>
44 #include <sys/sunddi.h>
45 #include <sys/sunldi.h>
46 #include <sys/sysmacros.h>
47 #include <sys/vio_common.h>
48 #include <sys/vdsk_mailbox.h>
49 #include <sys/vdsk_common.h>
50 #include <sys/vtoc.h>
51 #include <sys/vfs.h>
52 #include <sys/stat.h>
53 
/* Virtual disk server initialization flags */
#define	VDS_LDI			0x01
#define	VDS_MDEG		0x02

/*
 * Virtual disk server tunable parameters.  The delays are in microseconds:
 * VDS_LDC_DELAY is the initial value of vds_ldc_delay, which send_msg()
 * passes to drv_usecwait() between ldc_write() attempts.
 */
#define	VDS_RETRIES		5
#define	VDS_LDC_DELAY		1000 /* 1 msec */
#define	VDS_DEV_DELAY		10000000 /* 10 secs */
#define	VDS_NCHAINS		32
63 
/* Identification parameters for MD, synthetic dkio(7i) structures, etc. */
#define	VDS_NAME		"virtual-disk-server"

#define	VD_NAME			"vd"
#define	VD_VOLUME_NAME		"vdisk"
#define	VD_ASCIILABEL		"Virtual Disk"

/* Names of machine-description nodes and properties */
#define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
#define	VD_ID_PROP		"id"
#define	VD_BLOCK_DEVICE_PROP	"vds-block-device"
#define	VD_REG_PROP		"reg"

/* Virtual disk initialization flags (bits of vd_t's "initialized" field) */
#define	VD_DISK_READY		0x01
#define	VD_LOCKING		0x02
#define	VD_LDC			0x04
#define	VD_DRING		0x08
#define	VD_SID			0x10
#define	VD_SEQ_NUM		0x20

/*
 * Flags for opening/closing backing devices via LDI; also the initial value
 * of vd_open_flags, which is OR-ed with FKIOCTL for ldi_ioctl() calls
 */
#define	VD_OPEN_FLAGS		(FEXCL | FREAD | FWRITE)

/*
 * By Solaris convention, slice/partition 2 represents the entire disk;
 * unfortunately, this convention does not appear to be codified.
 */
#define	VD_ENTIRE_DISK_SLICE	2
/*
 * Return a cpp token as a string.  The '#' operator stringizes its operand
 * exactly as written, without macro-expanding it, so STRINGIZE(FOO) yields
 * "FOO" even when FOO is itself a macro.
 */
#define	STRINGIZE(token)	#token
95 
/*
 * Print a message prefixed with the current function name to the message log
 * (and optionally to the console for verbose boots); these macros use cpp's
 * concatenation of string literals and C99 variable-length-argument-list
 * macros
 *
 * The first argument to PRN() must be a string literal: it is pasted onto
 * the "?%s():  " prefix by literal concatenation.  PRN() appends an empty
 * string argument and _PRN() appends a matching "%s" to the format, so that
 * _PRN()'s __VA_ARGS__ is never empty even when the caller supplies only a
 * format string.
 */
#define	PRN(...)	_PRN("?%s():  "__VA_ARGS__, "")
#define	_PRN(format, ...)					\
	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)
105 
/*
 * Return a pointer to the "i"th vdisk dring element.  This macro takes no
 * vdisk argument: it expects a variable named "vd" (with "dring" and
 * "descriptor_size" members) to be in scope at the point of use.
 */
#define	VD_DRING_ELEM(i)	((vd_dring_entry_t *)(void *)	\
	    (vd->dring + (i)*vd->descriptor_size))
109 
/*
 * Return the virtual disk client's type as a string (for use in messages),
 * chosen from the vdisk's xfer_mode.  The argument may be evaluated more
 * than once, so it must not have side effects.
 */
#define	VD_CLIENT(vd)							\
	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
	    (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" :	\
		(((vd)->xfer_mode == 0) ? "null client" :		\
		    "unsupported client")))
116 
/*
 * Specification of an MD node passed to the MDEG to filter any
 * 'vport' nodes that do not belong to the specified node. This
 * template is copied for each vds instance and filled in with
 * the appropriate 'cfg-handle' value before being passed to the MDEG.
 *
 * The 'cfg-handle' entry is element [1]; VDS_SET_MDEG_PROP_INST() relies on
 * that position, so keep the two in sync if entries are reordered.
 */
static mdeg_prop_spec_t	vds_prop_template[] = {
	{ MDET_PROP_STR,	"name",		VDS_NAME },
	{ MDET_PROP_VAL,	"cfg-handle",	NULL },
	{ MDET_LIST_END,	NULL, 		NULL }
};
128 
/*
 * Store "val" as the value of element [1] of an MDEG property specification
 * array (the 'cfg-handle' slot of a copy of vds_prop_template).  The macro
 * expands to a single parenthesized expression with no trailing semicolon,
 * so it behaves like an ordinary statement when followed by ';', including
 * as the body of an unbraced if/else.
 */
#define	VDS_SET_MDEG_PROP_INST(specp, val) ((specp)[1].ps_val = (val))
130 
/*
 * Matching criteria passed to the MDEG to register interest
 * in changes to 'virtual-device-port' nodes identified by their
 * 'id' property.
 */
static md_prop_match_t	vd_prop_match[] = {
	{ MDET_PROP_VAL,	VD_ID_PROP },
	{ MDET_LIST_END,	NULL }
};

/* Node name to match, paired with the property list above */
static mdeg_node_match_t vd_match = {"virtual-device-port",
				    vd_prop_match};
143 
144 /* Debugging macros */
145 #ifdef DEBUG
146 
/* Debug verbosity: PRn() messages are printed only when this exceeds n */
static int	vd_msglevel = 0;

/*
 * Level-gated debug messages.  Each macro expands to exactly one
 * do { ... } while (0) statement, so it can be used as the body of an
 * unbraced "if" without capturing a following "else", and so that DEBUG and
 * non-DEBUG builds parse the surrounding code the same way.  As with PRN(),
 * the first argument must be a string literal.
 */
#define	PR0(...)	\
	do { if (vd_msglevel > 0) PRN(__VA_ARGS__); } while (0)
#define	PR1(...)	\
	do { if (vd_msglevel > 1) PRN(__VA_ARGS__); } while (0)
#define	PR2(...)	\
	do { if (vd_msglevel > 2) PRN(__VA_ARGS__); } while (0)
152 
/*
 * Log (at PR0 level) the state, operation, status, byte count, address and
 * cookie count of the dring element "elem" points to.  The argument is
 * parenthesized so that any pointer expression may be passed, and the
 * expansion carries no trailing semicolon: callers supply their own.
 */
#define	VD_DUMP_DRING_ELEM(elem)					\
	PR0("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
	    (elem)->hdr.dstate,						\
	    (elem)->payload.operation,					\
	    (elem)->payload.status,					\
	    (elem)->payload.nbytes,					\
	    (elem)->payload.addr,					\
	    (elem)->payload.ncookies)
161 
162 char *
163 vd_decode_state(int state)
164 {
165 	char *str;
166 
167 #define	CASE_STATE(_s)	case _s: str = #_s; break;
168 
169 	switch (state) {
170 	CASE_STATE(VD_STATE_INIT)
171 	CASE_STATE(VD_STATE_VER)
172 	CASE_STATE(VD_STATE_ATTR)
173 	CASE_STATE(VD_STATE_DRING)
174 	CASE_STATE(VD_STATE_RDX)
175 	CASE_STATE(VD_STATE_DATA)
176 	default: str = "unknown"; break;
177 	}
178 
179 #undef CASE_STATE
180 
181 	return (str);
182 }
183 
184 void
185 vd_decode_tag(vio_msg_t *msg)
186 {
187 	char *tstr, *sstr, *estr;
188 
189 #define	CASE_TYPE(_s)	case _s: tstr = #_s; break;
190 
191 	switch (msg->tag.vio_msgtype) {
192 	CASE_TYPE(VIO_TYPE_CTRL)
193 	CASE_TYPE(VIO_TYPE_DATA)
194 	CASE_TYPE(VIO_TYPE_ERR)
195 	default: tstr = "unknown"; break;
196 	}
197 
198 #undef CASE_TYPE
199 
200 #define	CASE_SUBTYPE(_s) case _s: sstr = #_s; break;
201 
202 	switch (msg->tag.vio_subtype) {
203 	CASE_SUBTYPE(VIO_SUBTYPE_INFO)
204 	CASE_SUBTYPE(VIO_SUBTYPE_ACK)
205 	CASE_SUBTYPE(VIO_SUBTYPE_NACK)
206 	default: sstr = "unknown"; break;
207 	}
208 
209 #undef CASE_SUBTYPE
210 
211 #define	CASE_ENV(_s)	case _s: estr = #_s; break;
212 
213 	switch (msg->tag.vio_subtype_env) {
214 	CASE_ENV(VIO_VER_INFO)
215 	CASE_ENV(VIO_ATTR_INFO)
216 	CASE_ENV(VIO_DRING_REG)
217 	CASE_ENV(VIO_DRING_UNREG)
218 	CASE_ENV(VIO_RDX)
219 	CASE_ENV(VIO_PKT_DATA)
220 	CASE_ENV(VIO_DESC_DATA)
221 	CASE_ENV(VIO_DRING_DATA)
222 	default: estr = "unknown"; break;
223 	}
224 
225 #undef CASE_ENV
226 
227 	PR1("(%x/%x/%x) message : (%s/%s/%s)",
228 	    msg->tag.vio_msgtype, msg->tag.vio_subtype,
229 	    msg->tag.vio_subtype_env, tstr, sstr, estr);
230 }
231 
232 #else	/* !DEBUG */
233 
/*
 * Non-DEBUG builds: the message macros and the dring-element dump expand to
 * nothing, and the decode helpers collapse to the constant NULL.
 */
#define	PR0(...)
#define	PR1(...)
#define	PR2(...)

#define	VD_DUMP_DRING_ELEM(elem)

#define	vd_decode_state(_s)	(NULL)
#define	vd_decode_tag(_s)	(NULL)
242 
243 #endif	/* DEBUG */
244 
245 
/*
 * Soft state structure for a vds instance
 *
 * "initialized" is a bit mask; presumably it holds the VDS_LDI / VDS_MDEG
 * initialization flags defined at the top of this file -- confirm against
 * the attach/detach code.
 */
typedef struct vds {
	uint_t		initialized;	/* driver inst initialization flags */
	dev_info_t	*dip;		/* driver inst devinfo pointer */
	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
	mod_hash_t	*vd_table;	/* table of virtual disks served */
	mdeg_node_spec_t *ispecp;	/* mdeg node specification */
	mdeg_handle_t	mdeg;		/* handle for MDEG operations  */
} vds_t;
257 
/*
 * Types of descriptor-processing tasks
 *
 * vd_complete_bio() returns early for a VD_NONFINAL_RANGE_TASK; only a
 * final-range task goes on to send the ack/nack message to the client.
 */
typedef enum vd_task_type {
	VD_NONFINAL_RANGE_TASK,	/* task for intermediate descriptor in range */
	VD_FINAL_RANGE_TASK,	/* task for last in a range of descriptors */
} vd_task_type_t;
265 
/*
 * Structure describing the task for processing a descriptor
 *
 * The buf(9s) is embedded rather than allocated: vd_start_bio() initializes
 * it with bioinit() and it is torn down with biofini() once the I/O is over.
 * "mhdl" is the LDC memory handle used to map the client's data buffer for
 * the duration of the request.
 */
typedef struct vd_task {
	struct vd		*vd;		/* vd instance task is for */
	vd_task_type_t		type;		/* type of descriptor task */
	int			index;		/* dring elem index for task */
	vio_msg_t		*msg;		/* VIO message task is for */
	size_t			msglen;		/* length of message content */
	vd_dring_payload_t	*request;	/* request task will perform */
	struct buf		buf;		/* buf(9s) for I/O request */
	ldc_mem_handle_t	mhdl;		/* task memory handle */
} vd_task_t;
279 
/*
 * Soft state structure for a virtual disk instance
 *
 * "initialized" is a mask of the VD_* initialization flags (for example,
 * vd_reset_if_needed() clears VD_SID, VD_SEQ_NUM and VD_DRING in it).
 *
 * The file_* members are used when "file" is set: vd_start_bio() then
 * services reads and writes by copying to or from file_maddr, bounded by
 * file_size, instead of going through ldi_strategy().
 *
 * "lock" is documented as protecting the three members that follow it.
 * NOTE(review): several functions in this file (e.g. vd_mark_elem_done(),
 * vd_complete_bio()) read reset_state without taking the lock.
 */
typedef struct vd {
	uint_t			initialized;	/* vdisk initialization flags */
	vds_t			*vds;		/* server for this vdisk */
	ddi_taskq_t		*startq;	/* queue for I/O start tasks */
	ddi_taskq_t		*completionq;	/* queue for completion tasks */
	ldi_handle_t		ldi_handle[V_NUMPAR];	/* LDI slice handles */
	char			device_path[MAXPATHLEN + 1]; /* vdisk device */
	dev_t			dev[V_NUMPAR];	/* dev numbers for slices */
	uint_t			nslices;	/* number of slices */
	size_t			vdisk_size;	/* number of blocks in vdisk */
	vd_disk_type_t		vdisk_type;	/* slice or entire disk */
	vd_disk_label_t		vdisk_label;	/* EFI or VTOC label */
	ushort_t		max_xfer_sz;	/* max xfer size in DEV_BSIZE */
	boolean_t		pseudo;		/* underlying pseudo dev */
	boolean_t		file;		/* underlying file */
	char			*file_maddr;	/* file mapping address */
	vnode_t			*file_vnode;	/* file vnode */
	size_t			file_size;	/* file size */
	struct dk_efi		dk_efi;		/* synthetic for slice type */
	struct dk_geom		dk_geom;	/* synthetic for slice type */
	struct vtoc		vtoc;		/* synthetic for slice type */
	ldc_status_t		ldc_state;	/* LDC connection state */
	ldc_handle_t		ldc_handle;	/* handle for LDC comm */
	size_t			max_msglen;	/* largest LDC message len */
	vd_state_t		state;		/* client handshake state */
	uint8_t			xfer_mode;	/* transfer mode with client */
	uint32_t		sid;		/* client's session ID */
	uint64_t		seq_num;	/* message sequence number */
	uint64_t		dring_ident;	/* identifier of dring */
	ldc_dring_handle_t	dring_handle;	/* handle for dring ops */
	uint32_t		descriptor_size;	/* num bytes in desc */
	uint32_t		dring_len;	/* number of dring elements */
	caddr_t			dring;		/* address of dring */
	caddr_t			vio_msgp;	/* vio msg staging buffer */
	vd_task_t		inband_task;	/* task for inband descriptor */
	vd_task_t		*dring_task;	/* tasks dring elements */

	kmutex_t		lock;		/* protects variables below */
	boolean_t		enabled;	/* is vdisk enabled? */
	boolean_t		reset_state;	/* reset connection state? */
	boolean_t		reset_ldc;	/* reset LDC channel? */
} vd_t;
325 
/*
 * Description of one vdisk operation: its name, its operation code, and the
 * pair of handlers that start and complete it.  vd_start_bio() and
 * vd_complete_bio() have the signatures of "start" and "complete"
 * respectively.
 */
typedef struct vds_operation {
	char	*namep;
	uint8_t	operation;
	int	(*start)(vd_task_t *task);
	void	(*complete)(void *arg);
} vds_operation_t;
332 
/*
 * Mapping of a vdisk operation onto a disk ioctl.  vd_do_ioctl() uses
 * "copyin" to turn the client's buffer into the ioctl argument before the
 * ioctl is issued, and "copyout" to turn the ioctl argument back into the
 * client's buffer afterwards.  Either hook may be NULL (no data moves in
 * that direction) or VD_IDENTITY (no conversion is needed).
 */
typedef struct vd_ioctl {
	uint8_t		operation;		/* vdisk operation */
	const char	*operation_name;	/* vdisk operation name */
	size_t		nbytes;			/* size of operation buffer */
	int		cmd;			/* corresponding ioctl cmd */
	const char	*cmd_name;		/* ioctl cmd name */
	void		*arg;			/* ioctl cmd argument */
	/* convert input vd_buf to output ioctl_arg */
	void		(*copyin)(void *vd_buf, void *ioctl_arg);
	/* convert input ioctl_arg to output vd_buf */
	void		(*copyout)(void *ioctl_arg, void *vd_buf);
} vd_ioctl_t;
345 
/*
 * Define trivial copyin/copyout conversion function flag.  This is a
 * sentinel value, not a callable function: vd_do_ioctl() compares the
 * copyin/copyout pointers against it and, on a match, uses the client
 * buffer directly instead of calling a conversion routine.
 */
#define	VD_IDENTITY	((void (*)(void *, void *))-1)
348 
349 
/*
 * Run-time copies of the tunables, initialized from the VDS_* defaults.
 * vds_ldc_delay is the drv_usecwait() interval send_msg() uses while
 * ldc_write() keeps returning EWOULDBLOCK.
 */
static int	vds_ldc_retries = VDS_RETRIES;
static int	vds_ldc_delay = VDS_LDC_DELAY;
static int	vds_dev_retries = VDS_RETRIES;
static int	vds_dev_delay = VDS_DEV_DELAY;
static void	*vds_state;
static uint64_t	vds_operations;	/* see vds_operation[] definition below */

/* Flags used (with FKIOCTL added) for ldi_ioctl() on backing devices */
static int	vd_open_flags = VD_OPEN_FLAGS;
358 
/*
 * Supported protocol version pairs, from highest (newest) to lowest (oldest)
 *
 * Each supported major version should appear only once, paired with (and only
 * with) its highest supported minor version number (as the protocol requires
 * supporting all lower minor version numbers as well)
 *
 * vds_num_versions is the number of entries in vds_version[].
 */
static const vio_ver_t	vds_version[] = {{1, 0}};
static const size_t	vds_num_versions =
    sizeof (vds_version)/sizeof (vds_version[0]);
369 
370 static void vd_free_dring_task(vd_t *vdp);
371 static int vd_setup_vd(vd_t *vd);
372 static boolean_t vd_enabled(vd_t *vd);
373 
374 static int
375 vd_start_bio(vd_task_t *task)
376 {
377 	int			rv, status = 0;
378 	vd_t			*vd		= task->vd;
379 	vd_dring_payload_t	*request	= task->request;
380 	struct buf		*buf		= &task->buf;
381 	uint8_t			mtype;
382 	caddr_t			addr;
383 	size_t			offset, maxlen;
384 	int 			slice;
385 
386 	ASSERT(vd != NULL);
387 	ASSERT(request != NULL);
388 
389 	slice = request->slice;
390 
391 	ASSERT(slice < vd->nslices);
392 	ASSERT((request->operation == VD_OP_BREAD) ||
393 	    (request->operation == VD_OP_BWRITE));
394 
395 	if (request->nbytes == 0)
396 		return (EINVAL);	/* no service for trivial requests */
397 
398 	PR1("%s %lu bytes at block %lu",
399 	    (request->operation == VD_OP_BREAD) ? "Read" : "Write",
400 	    request->nbytes, request->addr);
401 
402 	bioinit(buf);
403 	buf->b_flags		= B_BUSY;
404 	buf->b_bcount		= request->nbytes;
405 	buf->b_lblkno		= request->addr;
406 	buf->b_edev		= vd->dev[slice];
407 
408 	mtype = (&vd->inband_task == task) ? LDC_SHADOW_MAP : LDC_DIRECT_MAP;
409 
410 	/* Map memory exported by client */
411 	status = ldc_mem_map(task->mhdl, request->cookie, request->ncookies,
412 	    mtype, (request->operation == VD_OP_BREAD) ? LDC_MEM_W : LDC_MEM_R,
413 	    &(buf->b_un.b_addr), NULL);
414 	if (status != 0) {
415 		PR0("ldc_mem_map() returned err %d ", status);
416 		biofini(buf);
417 		return (status);
418 	}
419 
420 	status = ldc_mem_acquire(task->mhdl, 0, buf->b_bcount);
421 	if (status != 0) {
422 		(void) ldc_mem_unmap(task->mhdl);
423 		PR0("ldc_mem_acquire() returned err %d ", status);
424 		biofini(buf);
425 		return (status);
426 	}
427 
428 	buf->b_flags |= (request->operation == VD_OP_BREAD) ? B_READ : B_WRITE;
429 
430 	/* Start the block I/O */
431 	if (vd->file) {
432 
433 		if (request->addr >= vd->vtoc.v_part[slice].p_size) {
434 			/* address past the end of the slice */
435 			PR0("req_addr (0x%lx) > psize (0x%lx)",
436 			    request->addr, vd->vtoc.v_part[slice].p_size);
437 			request->nbytes = 0;
438 			status = 0;
439 			goto cleanup;
440 		}
441 
442 		offset = (vd->vtoc.v_part[slice].p_start +
443 		    request->addr) * DEV_BSIZE;
444 
445 		/*
446 		 * If the requested size is greater than the size
447 		 * of the partition, truncate the read/write.
448 		 */
449 		maxlen = (vd->vtoc.v_part[slice].p_size -
450 		    request->addr) * DEV_BSIZE;
451 
452 		if (request->nbytes > maxlen) {
453 			PR0("I/O size truncated to %lu bytes from %lu bytes",
454 			    maxlen, request->nbytes);
455 			request->nbytes = maxlen;
456 		}
457 
458 		/*
459 		 * We have to ensure that we are reading/writing into the mmap
460 		 * range. If we have a partial disk image (e.g. an image of
461 		 * s0 instead s2) the system can try to access slices that
462 		 * are not included into the disk image.
463 		 */
464 		if ((offset + request->nbytes) >= vd->file_size) {
465 			PR0("offset + nbytes (0x%lx + 0x%lx) >= "
466 			    "file_size (0x%lx)", offset, request->nbytes,
467 			    vd->file_size);
468 			request->nbytes = 0;
469 			status = EIO;
470 			goto cleanup;
471 		}
472 
473 		addr = vd->file_maddr + offset;
474 
475 		if (request->operation == VD_OP_BREAD)
476 			bcopy(addr, buf->b_un.b_addr, request->nbytes);
477 		else
478 			bcopy(buf->b_un.b_addr, addr, request->nbytes);
479 
480 	} else {
481 		status = ldi_strategy(vd->ldi_handle[slice], buf);
482 		if (status == 0)
483 			return (EINPROGRESS); /* will complete on completionq */
484 	}
485 
486 cleanup:
487 	/* Clean up after error */
488 	rv = ldc_mem_release(task->mhdl, 0, buf->b_bcount);
489 	if (rv) {
490 		PR0("ldc_mem_release() returned err %d ", rv);
491 	}
492 	rv = ldc_mem_unmap(task->mhdl);
493 	if (rv) {
494 		PR0("ldc_mem_unmap() returned err %d ", status);
495 	}
496 
497 	biofini(buf);
498 	return (status);
499 }
500 
501 static int
502 send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
503 {
504 	int	status;
505 	size_t	nbytes;
506 
507 	do {
508 		nbytes = msglen;
509 		status = ldc_write(ldc_handle, msg, &nbytes);
510 		if (status != EWOULDBLOCK)
511 			break;
512 		drv_usecwait(vds_ldc_delay);
513 	} while (status == EWOULDBLOCK);
514 
515 	if (status != 0) {
516 		if (status != ECONNRESET)
517 			PR0("ldc_write() returned errno %d", status);
518 		return (status);
519 	} else if (nbytes != msglen) {
520 		PR0("ldc_write() performed only partial write");
521 		return (EIO);
522 	}
523 
524 	PR1("SENT %lu bytes", msglen);
525 	return (0);
526 }
527 
/*
 * Flag that the connection state with the client must be reset and record
 * whether the LDC channel itself must be reset too.  The flags are only
 * recorded here (under vd->lock); vd_reset_if_needed() acts on them.
 *
 * Note that reset_ldc is overwritten, not OR-ed: a call with B_FALSE
 * replaces an earlier, still-pending request made with B_TRUE.
 */
static void
vd_need_reset(vd_t *vd, boolean_t reset_ldc)
{
	mutex_enter(&vd->lock);
	vd->reset_state	= B_TRUE;
	vd->reset_ldc	= reset_ldc;
	mutex_exit(&vd->lock);
}
536 
537 /*
538  * Reset the state of the connection with a client, if needed; reset the LDC
539  * transport as well, if needed.  This function should only be called from the
540  * "vd_recv_msg", as it waits for tasks - otherwise a deadlock can occur.
541  */
542 static void
543 vd_reset_if_needed(vd_t *vd)
544 {
545 	int	status = 0;
546 
547 	mutex_enter(&vd->lock);
548 	if (!vd->reset_state) {
549 		ASSERT(!vd->reset_ldc);
550 		mutex_exit(&vd->lock);
551 		return;
552 	}
553 	mutex_exit(&vd->lock);
554 
555 	PR0("Resetting connection state with %s", VD_CLIENT(vd));
556 
557 	/*
558 	 * Let any asynchronous I/O complete before possibly pulling the rug
559 	 * out from under it; defer checking vd->reset_ldc, as one of the
560 	 * asynchronous tasks might set it
561 	 */
562 	ddi_taskq_wait(vd->completionq);
563 
564 	if (vd->file) {
565 		status = VOP_FSYNC(vd->file_vnode, FSYNC, kcred);
566 		if (status) {
567 			PR0("VOP_FSYNC returned errno %d", status);
568 		}
569 	}
570 
571 	if ((vd->initialized & VD_DRING) &&
572 	    ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
573 		PR0("ldc_mem_dring_unmap() returned errno %d", status);
574 
575 	vd_free_dring_task(vd);
576 
577 	/* Free the staging buffer for msgs */
578 	if (vd->vio_msgp != NULL) {
579 		kmem_free(vd->vio_msgp, vd->max_msglen);
580 		vd->vio_msgp = NULL;
581 	}
582 
583 	/* Free the inband message buffer */
584 	if (vd->inband_task.msg != NULL) {
585 		kmem_free(vd->inband_task.msg, vd->max_msglen);
586 		vd->inband_task.msg = NULL;
587 	}
588 
589 	mutex_enter(&vd->lock);
590 
591 	if (vd->reset_ldc)
592 		PR0("taking down LDC channel");
593 	if (vd->reset_ldc && ((status = ldc_down(vd->ldc_handle)) != 0))
594 		PR0("ldc_down() returned errno %d", status);
595 
596 	vd->initialized	&= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
597 	vd->state	= VD_STATE_INIT;
598 	vd->max_msglen	= sizeof (vio_msg_t);	/* baseline vio message size */
599 
600 	/* Allocate the staging buffer */
601 	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);
602 
603 	PR0("calling ldc_up\n");
604 	(void) ldc_up(vd->ldc_handle);
605 
606 	vd->reset_state	= B_FALSE;
607 	vd->reset_ldc	= B_FALSE;
608 
609 	mutex_exit(&vd->lock);
610 }
611 
612 static void vd_recv_msg(void *arg);
613 
614 static void
615 vd_mark_in_reset(vd_t *vd)
616 {
617 	int status;
618 
619 	PR0("vd_mark_in_reset: marking vd in reset\n");
620 
621 	vd_need_reset(vd, B_FALSE);
622 	status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd, DDI_SLEEP);
623 	if (status == DDI_FAILURE) {
624 		PR0("cannot schedule task to recv msg\n");
625 		vd_need_reset(vd, B_TRUE);
626 		return;
627 	}
628 }
629 
630 static int
631 vd_mark_elem_done(vd_t *vd, int idx, int elem_status, int elem_nbytes)
632 {
633 	boolean_t		accepted;
634 	int			status;
635 	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);
636 
637 	if (vd->reset_state)
638 		return (0);
639 
640 	/* Acquire the element */
641 	if (!vd->reset_state &&
642 	    (status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
643 		if (status == ECONNRESET) {
644 			vd_mark_in_reset(vd);
645 			return (0);
646 		} else {
647 			PR0("ldc_mem_dring_acquire() returned errno %d",
648 			    status);
649 			return (status);
650 		}
651 	}
652 
653 	/* Set the element's status and mark it done */
654 	accepted = (elem->hdr.dstate == VIO_DESC_ACCEPTED);
655 	if (accepted) {
656 		elem->payload.nbytes	= elem_nbytes;
657 		elem->payload.status	= elem_status;
658 		elem->hdr.dstate	= VIO_DESC_DONE;
659 	} else {
660 		/* Perhaps client timed out waiting for I/O... */
661 		PR0("element %u no longer \"accepted\"", idx);
662 		VD_DUMP_DRING_ELEM(elem);
663 	}
664 	/* Release the element */
665 	if (!vd->reset_state &&
666 	    (status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
667 		if (status == ECONNRESET) {
668 			vd_mark_in_reset(vd);
669 			return (0);
670 		} else {
671 			PR0("ldc_mem_dring_release() returned errno %d",
672 			    status);
673 			return (status);
674 		}
675 	}
676 
677 	return (accepted ? 0 : EINVAL);
678 }
679 
/*
 * Completion handler for a block I/O started by vd_start_bio() through
 * ldi_strategy(); "arg" is the vd_task_t for the request.  Never used for
 * file-backed vdisks (see the ASSERT below).
 *
 * Waits for the I/O, records its status and byte count in the request,
 * releases and unmaps the client's buffer, updates the dring element for a
 * dring client and, for the final task of a descriptor range only, sends
 * the ack/nack message back to the client.
 *
 * vd->reset_state is sampled repeatedly, without vd->lock, so that LDC
 * operations are skipped once a reset has been requested.
 */
static void
vd_complete_bio(void *arg)
{
	int			status		= 0;
	vd_task_t		*task		= (vd_task_t *)arg;
	vd_t			*vd		= task->vd;
	vd_dring_payload_t	*request	= task->request;
	struct buf		*buf		= &task->buf;


	ASSERT(vd != NULL);
	ASSERT(request != NULL);
	ASSERT(task->msg != NULL);
	ASSERT(task->msglen >= sizeof (*task->msg));
	ASSERT(!vd->file);

	/* Wait for the I/O to complete */
	request->status = biowait(buf);

	/* return back the number of bytes read/written */
	request->nbytes = buf->b_bcount - buf->b_resid;

	/* Release the buffer */
	if (!vd->reset_state)
		status = ldc_mem_release(task->mhdl, 0, buf->b_bcount);
	if (status) {
		PR0("ldc_mem_release() returned errno %d copying to "
		    "client", status);
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
		}
	}

	/*
	 * Unmap the memory, even if in reset
	 *
	 * NOTE(review): this assignment overwrites any error returned by
	 * ldc_mem_release() above, so a release failure followed by a
	 * successful unmap does not cause a "nack" below -- confirm that
	 * this is intended.
	 */
	status = ldc_mem_unmap(task->mhdl);
	if (status) {
		PR0("ldc_mem_unmap() returned errno %d copying to client",
		    status);
		if (status == ECONNRESET) {
			vd_mark_in_reset(vd);
		}
	}

	biofini(buf);

	/* Update the dring element for a dring client */
	if (!vd->reset_state && (status == 0) &&
	    (vd->xfer_mode == VIO_DRING_MODE)) {
		status = vd_mark_elem_done(vd, task->index,
		    request->status, request->nbytes);
		if (status == ECONNRESET)
			vd_mark_in_reset(vd);
	}

	/*
	 * If a transport error occurred, arrange to "nack" the message when
	 * the final task in the descriptor element range completes
	 */
	if (status != 0)
		task->msg->tag.vio_subtype = VIO_SUBTYPE_NACK;

	/*
	 * Only the final task for a range of elements will respond to and
	 * free the message
	 */
	if (task->type == VD_NONFINAL_RANGE_TASK) {
		return;
	}

	/*
	 * Send the "ack" or "nack" back to the client; if sending the message
	 * via LDC fails, arrange to reset both the connection state and LDC
	 * itself
	 */
	PR1("Sending %s",
	    (task->msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");
	if (!vd->reset_state) {
		status = send_msg(vd->ldc_handle, task->msg, task->msglen);
		switch (status) {
		case 0:
			break;
		case ECONNRESET:
			/* connection reset: reset state only, not the LDC */
			vd_mark_in_reset(vd);
			break;
		default:
			PR0("initiating full reset");
			vd_need_reset(vd, B_TRUE);
			break;
		}
	}
}
771 
772 static void
773 vd_geom2dk_geom(void *vd_buf, void *ioctl_arg)
774 {
775 	VD_GEOM2DK_GEOM((vd_geom_t *)vd_buf, (struct dk_geom *)ioctl_arg);
776 }
777 
778 static void
779 vd_vtoc2vtoc(void *vd_buf, void *ioctl_arg)
780 {
781 	VD_VTOC2VTOC((vd_vtoc_t *)vd_buf, (struct vtoc *)ioctl_arg);
782 }
783 
784 static void
785 dk_geom2vd_geom(void *ioctl_arg, void *vd_buf)
786 {
787 	DK_GEOM2VD_GEOM((struct dk_geom *)ioctl_arg, (vd_geom_t *)vd_buf);
788 }
789 
790 static void
791 vtoc2vd_vtoc(void *ioctl_arg, void *vd_buf)
792 {
793 	VTOC2VD_VTOC((struct vtoc *)ioctl_arg, (vd_vtoc_t *)vd_buf);
794 }
795 
796 static void
797 vd_get_efi_in(void *vd_buf, void *ioctl_arg)
798 {
799 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
800 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
801 
802 	dk_efi->dki_lba = vd_efi->lba;
803 	dk_efi->dki_length = vd_efi->length;
804 	dk_efi->dki_data = kmem_zalloc(vd_efi->length, KM_SLEEP);
805 }
806 
807 static void
808 vd_get_efi_out(void *ioctl_arg, void *vd_buf)
809 {
810 	int len;
811 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
812 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
813 
814 	len = vd_efi->length;
815 	DK_EFI2VD_EFI(dk_efi, vd_efi);
816 	kmem_free(dk_efi->dki_data, len);
817 }
818 
819 static void
820 vd_set_efi_in(void *vd_buf, void *ioctl_arg)
821 {
822 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
823 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
824 
825 	dk_efi->dki_data = kmem_alloc(vd_efi->length, KM_SLEEP);
826 	VD_EFI2DK_EFI(vd_efi, dk_efi);
827 }
828 
829 static void
830 vd_set_efi_out(void *ioctl_arg, void *vd_buf)
831 {
832 	vd_efi_t *vd_efi = (vd_efi_t *)vd_buf;
833 	dk_efi_t *dk_efi = (dk_efi_t *)ioctl_arg;
834 
835 	kmem_free(dk_efi->dki_data, vd_efi->length);
836 }
837 
838 static int
839 vd_read_vtoc(ldi_handle_t handle, struct vtoc *vtoc, vd_disk_label_t *label)
840 {
841 	int status, rval;
842 	struct dk_gpt *efi;
843 	size_t efi_len;
844 
845 	*label = VD_DISK_LABEL_UNK;
846 
847 	status = ldi_ioctl(handle, DKIOCGVTOC, (intptr_t)vtoc,
848 	    (vd_open_flags | FKIOCTL), kcred, &rval);
849 
850 	if (status == 0) {
851 		*label = VD_DISK_LABEL_VTOC;
852 		return (0);
853 	} else if (status != ENOTSUP) {
854 		PR0("ldi_ioctl(DKIOCGVTOC) returned error %d", status);
855 		return (status);
856 	}
857 
858 	status = vds_efi_alloc_and_read(handle, &efi, &efi_len);
859 
860 	if (status) {
861 		PR0("vds_efi_alloc_and_read returned error %d", status);
862 		return (status);
863 	}
864 
865 	*label = VD_DISK_LABEL_EFI;
866 	vd_efi_to_vtoc(efi, vtoc);
867 	vd_efi_free(efi, efi_len);
868 
869 	return (0);
870 }
871 
872 static short
873 vd_lbl2cksum(struct dk_label *label)
874 {
875 	int	count;
876 	short	sum, *sp;
877 
878 	count =	(sizeof (struct dk_label)) / (sizeof (short)) - 1;
879 	sp = (short *)label;
880 	sum = 0;
881 	while (count--) {
882 		sum ^= *sp++;
883 	}
884 
885 	return (sum);
886 }
887 
888 static int
889 vd_do_slice_ioctl(vd_t *vd, int cmd, void *ioctl_arg)
890 {
891 	dk_efi_t *dk_ioc;
892 	struct dk_label *label;
893 	int i;
894 
895 	switch (vd->vdisk_label) {
896 
897 	case VD_DISK_LABEL_VTOC:
898 
899 		switch (cmd) {
900 		case DKIOCGGEOM:
901 			ASSERT(ioctl_arg != NULL);
902 			bcopy(&vd->dk_geom, ioctl_arg, sizeof (vd->dk_geom));
903 			return (0);
904 		case DKIOCGVTOC:
905 			ASSERT(ioctl_arg != NULL);
906 			bcopy(&vd->vtoc, ioctl_arg, sizeof (vd->vtoc));
907 			return (0);
908 		case DKIOCSVTOC:
909 			if (!vd->file)
910 				return (ENOTSUP);
911 			ASSERT(ioctl_arg != NULL);
912 			bcopy(ioctl_arg, &vd->vtoc, sizeof (vd->vtoc));
913 			/* write new VTOC to file */
914 			label = (struct dk_label *)vd->file_maddr;
915 			label->dkl_vtoc.v_nparts = vd->vtoc.v_nparts;
916 			label->dkl_vtoc.v_sanity = vd->vtoc.v_sanity;
917 			label->dkl_vtoc.v_version = vd->vtoc.v_version;
918 			bcopy(vd->vtoc.v_volume, label->dkl_vtoc.v_volume,
919 			    LEN_DKL_VVOL);
920 			for (i = 0; i < vd->vtoc.v_nparts; i++) {
921 				label->dkl_vtoc.v_timestamp[i] =
922 				    vd->vtoc.timestamp[i];
923 				label->dkl_vtoc.v_part[i].p_tag =
924 				    vd->vtoc.v_part[i].p_tag;
925 				label->dkl_vtoc.v_part[i].p_flag =
926 				    vd->vtoc.v_part[i].p_flag;
927 				label->dkl_map[i].dkl_cylno =
928 				    vd->vtoc.v_part[i].p_start /
929 				    (label->dkl_nhead * label->dkl_nsect);
930 				label->dkl_map[i].dkl_nblk =
931 				    vd->vtoc.v_part[i].p_size;
932 			}
933 
934 			/* re-compute checksum */
935 			label->dkl_cksum = vd_lbl2cksum(label);
936 
937 			return (0);
938 		default:
939 			return (ENOTSUP);
940 		}
941 
942 	case VD_DISK_LABEL_EFI:
943 
944 		switch (cmd) {
945 		case DKIOCGETEFI:
946 			ASSERT(ioctl_arg != NULL);
947 			dk_ioc = (dk_efi_t *)ioctl_arg;
948 			if (dk_ioc->dki_length < vd->dk_efi.dki_length)
949 				return (EINVAL);
950 			bcopy(vd->dk_efi.dki_data, dk_ioc->dki_data,
951 			    vd->dk_efi.dki_length);
952 			return (0);
953 		default:
954 			return (ENOTSUP);
955 		}
956 
957 	default:
958 		return (ENOTSUP);
959 	}
960 }
961 
962 static int
963 vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, vd_ioctl_t *ioctl)
964 {
965 	int	rval = 0, status;
966 	size_t	nbytes = request->nbytes;	/* modifiable copy */
967 
968 
969 	ASSERT(request->slice < vd->nslices);
970 	PR0("Performing %s", ioctl->operation_name);
971 
972 	/* Get data from client and convert, if necessary */
973 	if (ioctl->copyin != NULL)  {
974 		ASSERT(nbytes != 0 && buf != NULL);
975 		PR1("Getting \"arg\" data from client");
976 		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
977 			    request->cookie, request->ncookies,
978 			    LDC_COPY_IN)) != 0) {
979 			PR0("ldc_mem_copy() returned errno %d "
980 			    "copying from client", status);
981 			return (status);
982 		}
983 
984 		/* Convert client's data, if necessary */
985 		if (ioctl->copyin == VD_IDENTITY)	/* use client buffer */
986 			ioctl->arg = buf;
987 		else	/* convert client vdisk operation data to ioctl data */
988 			(ioctl->copyin)(buf, (void *)ioctl->arg);
989 	}
990 
991 	/*
992 	 * Handle single-slice block devices internally; otherwise, have the
993 	 * real driver perform the ioctl()
994 	 */
995 	if (vd->file || (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo)) {
996 		if ((status = vd_do_slice_ioctl(vd, ioctl->cmd,
997 			    (void *)ioctl->arg)) != 0)
998 			return (status);
999 	} else if ((status = ldi_ioctl(vd->ldi_handle[request->slice],
1000 		    ioctl->cmd, (intptr_t)ioctl->arg, (vd_open_flags | FKIOCTL),
1001 		    kcred, &rval)) != 0) {
1002 		PR0("ldi_ioctl(%s) = errno %d", ioctl->cmd_name, status);
1003 		return (status);
1004 	}
1005 #ifdef DEBUG
1006 	if (rval != 0) {
1007 		PR0("%s set rval = %d, which is not being returned to client",
1008 		    ioctl->cmd_name, rval);
1009 	}
1010 #endif /* DEBUG */
1011 
1012 	/* Convert data and send to client, if necessary */
1013 	if (ioctl->copyout != NULL)  {
1014 		ASSERT(nbytes != 0 && buf != NULL);
1015 		PR1("Sending \"arg\" data to client");
1016 
1017 		/* Convert ioctl data to vdisk operation data, if necessary */
1018 		if (ioctl->copyout != VD_IDENTITY)
1019 			(ioctl->copyout)((void *)ioctl->arg, buf);
1020 
1021 		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
1022 			    request->cookie, request->ncookies,
1023 			    LDC_COPY_OUT)) != 0) {
1024 			PR0("ldc_mem_copy() returned errno %d "
1025 			    "copying to client", status);
1026 			return (status);
1027 		}
1028 	}
1029 
1030 	return (status);
1031 }
1032 
/*
 * Size of a type (or expression) rounded up to a multiple of
 * sizeof (uint64_t); vd_ioctl() asserts that every operation buffer size is
 * such a multiple because LDC memory operations require 8-byte multiples.
 */
#define	RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t))
1034 static int
1035 vd_ioctl(vd_task_t *task)
1036 {
1037 	int			i, status, rc;
1038 	void			*buf = NULL;
1039 	struct dk_geom		dk_geom = {0};
1040 	struct vtoc		vtoc = {0};
1041 	struct dk_efi		dk_efi = {0};
1042 	vd_t			*vd		= task->vd;
1043 	vd_dring_payload_t	*request	= task->request;
1044 	vd_ioctl_t		ioctl[] = {
1045 		/* Command (no-copy) operations */
1046 		{VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), 0,
1047 		    DKIOCFLUSHWRITECACHE, STRINGIZE(DKIOCFLUSHWRITECACHE),
1048 		    NULL, NULL, NULL},
1049 
1050 		/* "Get" (copy-out) operations */
1051 		{VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), RNDSIZE(int),
1052 		    DKIOCGETWCE, STRINGIZE(DKIOCGETWCE),
1053 		    NULL, VD_IDENTITY, VD_IDENTITY},
1054 		{VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM),
1055 		    RNDSIZE(vd_geom_t),
1056 		    DKIOCGGEOM, STRINGIZE(DKIOCGGEOM),
1057 		    &dk_geom, NULL, dk_geom2vd_geom},
1058 		{VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), RNDSIZE(vd_vtoc_t),
1059 		    DKIOCGVTOC, STRINGIZE(DKIOCGVTOC),
1060 		    &vtoc, NULL, vtoc2vd_vtoc},
1061 		{VD_OP_GET_EFI, STRINGIZE(VD_OP_GET_EFI), RNDSIZE(vd_efi_t),
1062 		    DKIOCGETEFI, STRINGIZE(DKIOCGETEFI),
1063 		    &dk_efi, vd_get_efi_in, vd_get_efi_out},
1064 
1065 		/* "Set" (copy-in) operations */
1066 		{VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), RNDSIZE(int),
1067 		    DKIOCSETWCE, STRINGIZE(DKIOCSETWCE),
1068 		    NULL, VD_IDENTITY, VD_IDENTITY},
1069 		{VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM),
1070 		    RNDSIZE(vd_geom_t),
1071 		    DKIOCSGEOM, STRINGIZE(DKIOCSGEOM),
1072 		    &dk_geom, vd_geom2dk_geom, NULL},
1073 		{VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), RNDSIZE(vd_vtoc_t),
1074 		    DKIOCSVTOC, STRINGIZE(DKIOCSVTOC),
1075 		    &vtoc, vd_vtoc2vtoc, NULL},
1076 		{VD_OP_SET_EFI, STRINGIZE(VD_OP_SET_EFI), RNDSIZE(vd_efi_t),
1077 		    DKIOCSETEFI, STRINGIZE(DKIOCSETEFI),
1078 		    &dk_efi, vd_set_efi_in, vd_set_efi_out},
1079 	};
1080 	size_t		nioctls = (sizeof (ioctl))/(sizeof (ioctl[0]));
1081 
1082 
1083 	ASSERT(vd != NULL);
1084 	ASSERT(request != NULL);
1085 	ASSERT(request->slice < vd->nslices);
1086 
1087 	/*
1088 	 * Determine ioctl corresponding to caller's "operation" and
1089 	 * validate caller's "nbytes"
1090 	 */
1091 	for (i = 0; i < nioctls; i++) {
1092 		if (request->operation == ioctl[i].operation) {
1093 			/* LDC memory operations require 8-byte multiples */
1094 			ASSERT(ioctl[i].nbytes % sizeof (uint64_t) == 0);
1095 
1096 			if (request->operation == VD_OP_GET_EFI ||
1097 			    request->operation == VD_OP_SET_EFI) {
1098 				if (request->nbytes >= ioctl[i].nbytes)
1099 					break;
1100 				PR0("%s:  Expected at least nbytes = %lu, "
1101 				    "got %lu", ioctl[i].operation_name,
1102 				    ioctl[i].nbytes, request->nbytes);
1103 				return (EINVAL);
1104 			}
1105 
1106 			if (request->nbytes != ioctl[i].nbytes) {
1107 				PR0("%s:  Expected nbytes = %lu, got %lu",
1108 				    ioctl[i].operation_name, ioctl[i].nbytes,
1109 				    request->nbytes);
1110 				return (EINVAL);
1111 			}
1112 
1113 			break;
1114 		}
1115 	}
1116 	ASSERT(i < nioctls);	/* because "operation" already validated */
1117 
1118 	if (request->nbytes)
1119 		buf = kmem_zalloc(request->nbytes, KM_SLEEP);
1120 	status = vd_do_ioctl(vd, request, buf, &ioctl[i]);
1121 	if (request->nbytes)
1122 		kmem_free(buf, request->nbytes);
1123 	if (!vd->file && vd->vdisk_type == VD_DISK_TYPE_DISK &&
1124 	    (request->operation == VD_OP_SET_VTOC ||
1125 	    request->operation == VD_OP_SET_EFI)) {
1126 		/* update disk information */
1127 		rc = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc,
1128 		    &vd->vdisk_label);
1129 		if (rc != 0)
1130 			PR0("vd_read_vtoc return error %d", rc);
1131 	}
1132 	PR0("Returning %d", status);
1133 	return (status);
1134 }
1135 
1136 static int
1137 vd_get_devid(vd_task_t *task)
1138 {
1139 	vd_t *vd = task->vd;
1140 	vd_dring_payload_t *request = task->request;
1141 	vd_devid_t *vd_devid;
1142 	impl_devid_t *devid;
1143 	int status, bufid_len, devid_len, len;
1144 	int bufbytes;
1145 
1146 	PR1("Get Device ID, nbytes=%ld", request->nbytes);
1147 
1148 	if (vd->file) {
1149 		/* no devid for disk on file */
1150 		return (ENOENT);
1151 	}
1152 
1153 	if (ddi_lyr_get_devid(vd->dev[request->slice],
1154 	    (ddi_devid_t *)&devid) != DDI_SUCCESS) {
1155 		/* the most common failure is that no devid is available */
1156 		PR2("No Device ID");
1157 		return (ENOENT);
1158 	}
1159 
1160 	bufid_len = request->nbytes - sizeof (vd_devid_t) + 1;
1161 	devid_len = DEVID_GETLEN(devid);
1162 
1163 	/*
1164 	 * Save the buffer size here for use in deallocation.
1165 	 * The actual number of bytes copied is returned in
1166 	 * the 'nbytes' field of the request structure.
1167 	 */
1168 	bufbytes = request->nbytes;
1169 
1170 	vd_devid = kmem_zalloc(bufbytes, KM_SLEEP);
1171 	vd_devid->length = devid_len;
1172 	vd_devid->type = DEVID_GETTYPE(devid);
1173 
1174 	len = (devid_len > bufid_len)? bufid_len : devid_len;
1175 
1176 	bcopy(devid->did_id, vd_devid->id, len);
1177 
1178 	/* LDC memory operations require 8-byte multiples */
1179 	ASSERT(request->nbytes % sizeof (uint64_t) == 0);
1180 
1181 	if ((status = ldc_mem_copy(vd->ldc_handle, (caddr_t)vd_devid, 0,
1182 	    &request->nbytes, request->cookie, request->ncookies,
1183 	    LDC_COPY_OUT)) != 0) {
1184 		PR0("ldc_mem_copy() returned errno %d copying to client",
1185 		    status);
1186 	}
1187 	PR1("post mem_copy: nbytes=%ld", request->nbytes);
1188 
1189 	kmem_free(vd_devid, bufbytes);
1190 	ddi_devid_free((ddi_devid_t)devid);
1191 
1192 	return (status);
1193 }
1194 
1195 /*
1196  * Define the supported operations once the functions for performing them have
1197  * been defined
1198  */
1199 static const vds_operation_t	vds_operation[] = {
1200 #define	X(_s)	#_s, _s
1201 	{X(VD_OP_BREAD),	vd_start_bio,	vd_complete_bio},
1202 	{X(VD_OP_BWRITE),	vd_start_bio,	vd_complete_bio},
1203 	{X(VD_OP_FLUSH),	vd_ioctl,	NULL},
1204 	{X(VD_OP_GET_WCE),	vd_ioctl,	NULL},
1205 	{X(VD_OP_SET_WCE),	vd_ioctl,	NULL},
1206 	{X(VD_OP_GET_VTOC),	vd_ioctl,	NULL},
1207 	{X(VD_OP_SET_VTOC),	vd_ioctl,	NULL},
1208 	{X(VD_OP_GET_DISKGEOM),	vd_ioctl,	NULL},
1209 	{X(VD_OP_SET_DISKGEOM),	vd_ioctl,	NULL},
1210 	{X(VD_OP_GET_EFI),	vd_ioctl,	NULL},
1211 	{X(VD_OP_SET_EFI),	vd_ioctl,	NULL},
1212 	{X(VD_OP_GET_DEVID),	vd_get_devid,	NULL},
1213 #undef	X
1214 };
1215 
1216 static const size_t	vds_noperations =
1217 	(sizeof (vds_operation))/(sizeof (vds_operation[0]));
1218 
1219 /*
1220  * Process a task specifying a client I/O request
1221  */
1222 static int
1223 vd_process_task(vd_task_t *task)
1224 {
1225 	int			i, status;
1226 	vd_t			*vd		= task->vd;
1227 	vd_dring_payload_t	*request	= task->request;
1228 
1229 
1230 	ASSERT(vd != NULL);
1231 	ASSERT(request != NULL);
1232 
1233 	/* Find the requested operation */
1234 	for (i = 0; i < vds_noperations; i++)
1235 		if (request->operation == vds_operation[i].operation)
1236 			break;
1237 	if (i == vds_noperations) {
1238 		PR0("Unsupported operation %u", request->operation);
1239 		return (ENOTSUP);
1240 	}
1241 
1242 	/* Handle client using absolute disk offsets */
1243 	if ((vd->vdisk_type == VD_DISK_TYPE_DISK) &&
1244 	    (request->slice == UINT8_MAX))
1245 		request->slice = VD_ENTIRE_DISK_SLICE;
1246 
1247 	/* Range-check slice */
1248 	if (request->slice >= vd->nslices) {
1249 		PR0("Invalid \"slice\" %u (max %u) for virtual disk",
1250 		    request->slice, (vd->nslices - 1));
1251 		return (EINVAL);
1252 	}
1253 
1254 	PR1("operation : %s", vds_operation[i].namep);
1255 
1256 	/* Start the operation */
1257 	if ((status = vds_operation[i].start(task)) != EINPROGRESS) {
1258 		PR0("operation : %s returned status %d",
1259 			vds_operation[i].namep, status);
1260 		request->status = status;	/* op succeeded or failed */
1261 		return (0);			/* but request completed */
1262 	}
1263 
1264 	ASSERT(vds_operation[i].complete != NULL);	/* debug case */
1265 	if (vds_operation[i].complete == NULL) {	/* non-debug case */
1266 		PR0("Unexpected return of EINPROGRESS "
1267 		    "with no I/O completion handler");
1268 		request->status = EIO;	/* operation failed */
1269 		return (0);		/* but request completed */
1270 	}
1271 
1272 	PR1("operation : kick off taskq entry for %s", vds_operation[i].namep);
1273 
1274 	/* Queue a task to complete the operation */
1275 	status = ddi_taskq_dispatch(vd->completionq, vds_operation[i].complete,
1276 	    task, DDI_SLEEP);
1277 	/* ddi_taskq_dispatch(9f) guarantees success with DDI_SLEEP */
1278 	ASSERT(status == DDI_SUCCESS);
1279 
1280 	PR1("Operation in progress");
1281 	return (EINPROGRESS);	/* completion handler will finish request */
1282 }
1283 
1284 /*
1285  * Return true if the "type", "subtype", and "env" fields of the "tag" first
1286  * argument match the corresponding remaining arguments; otherwise, return false
1287  */
1288 boolean_t
1289 vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env)
1290 {
1291 	return ((tag->vio_msgtype == type) &&
1292 		(tag->vio_subtype == subtype) &&
1293 		(tag->vio_subtype_env == env)) ? B_TRUE : B_FALSE;
1294 }
1295 
1296 /*
1297  * Check whether the major/minor version specified in "ver_msg" is supported
1298  * by this server.
1299  */
1300 static boolean_t
1301 vds_supported_version(vio_ver_msg_t *ver_msg)
1302 {
1303 	for (int i = 0; i < vds_num_versions; i++) {
1304 		ASSERT(vds_version[i].major > 0);
1305 		ASSERT((i == 0) ||
1306 		    (vds_version[i].major < vds_version[i-1].major));
1307 
1308 		/*
1309 		 * If the major versions match, adjust the minor version, if
1310 		 * necessary, down to the highest value supported by this
1311 		 * server and return true so this message will get "ack"ed;
1312 		 * the client should also support all minor versions lower
1313 		 * than the value it sent
1314 		 */
1315 		if (ver_msg->ver_major == vds_version[i].major) {
1316 			if (ver_msg->ver_minor > vds_version[i].minor) {
1317 				PR0("Adjusting minor version from %u to %u",
1318 				    ver_msg->ver_minor, vds_version[i].minor);
1319 				ver_msg->ver_minor = vds_version[i].minor;
1320 			}
1321 			return (B_TRUE);
1322 		}
1323 
1324 		/*
1325 		 * If the message contains a higher major version number, set
1326 		 * the message's major/minor versions to the current values
1327 		 * and return false, so this message will get "nack"ed with
1328 		 * these values, and the client will potentially try again
1329 		 * with the same or a lower version
1330 		 */
1331 		if (ver_msg->ver_major > vds_version[i].major) {
1332 			ver_msg->ver_major = vds_version[i].major;
1333 			ver_msg->ver_minor = vds_version[i].minor;
1334 			return (B_FALSE);
1335 		}
1336 
1337 		/*
1338 		 * Otherwise, the message's major version is less than the
1339 		 * current major version, so continue the loop to the next
1340 		 * (lower) supported version
1341 		 */
1342 	}
1343 
1344 	/*
1345 	 * No common version was found; "ground" the version pair in the
1346 	 * message to terminate negotiation
1347 	 */
1348 	ver_msg->ver_major = 0;
1349 	ver_msg->ver_minor = 0;
1350 	return (B_FALSE);
1351 }
1352 
1353 /*
1354  * Process a version message from a client.  vds expects to receive version
1355  * messages from clients seeking service, but never issues version messages
1356  * itself; therefore, vds can ACK or NACK client version messages, but does
1357  * not expect to receive version-message ACKs or NACKs (and will treat such
1358  * messages as invalid).
1359  */
1360 static int
1361 vd_process_ver_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1362 {
1363 	vio_ver_msg_t	*ver_msg = (vio_ver_msg_t *)msg;
1364 
1365 
1366 	ASSERT(msglen >= sizeof (msg->tag));
1367 
1368 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
1369 		VIO_VER_INFO)) {
1370 		return (ENOMSG);	/* not a version message */
1371 	}
1372 
1373 	if (msglen != sizeof (*ver_msg)) {
1374 		PR0("Expected %lu-byte version message; "
1375 		    "received %lu bytes", sizeof (*ver_msg), msglen);
1376 		return (EBADMSG);
1377 	}
1378 
1379 	if (ver_msg->dev_class != VDEV_DISK) {
1380 		PR0("Expected device class %u (disk); received %u",
1381 		    VDEV_DISK, ver_msg->dev_class);
1382 		return (EBADMSG);
1383 	}
1384 
1385 	/*
1386 	 * We're talking to the expected kind of client; set our device class
1387 	 * for "ack/nack" back to the client
1388 	 */
1389 	ver_msg->dev_class = VDEV_DISK_SERVER;
1390 
1391 	/*
1392 	 * Check whether the (valid) version message specifies a version
1393 	 * supported by this server.  If the version is not supported, return
1394 	 * EBADMSG so the message will get "nack"ed; vds_supported_version()
1395 	 * will have updated the message with a supported version for the
1396 	 * client to consider
1397 	 */
1398 	if (!vds_supported_version(ver_msg))
1399 		return (EBADMSG);
1400 
1401 
1402 	/*
1403 	 * A version has been agreed upon; use the client's SID for
1404 	 * communication on this channel now
1405 	 */
1406 	ASSERT(!(vd->initialized & VD_SID));
1407 	vd->sid = ver_msg->tag.vio_sid;
1408 	vd->initialized |= VD_SID;
1409 
1410 	/*
1411 	 * When multiple versions are supported, this function should store
1412 	 * the negotiated major and minor version values in the "vd" data
1413 	 * structure to govern further communication; in particular, note that
1414 	 * the client might have specified a lower minor version for the
1415 	 * agreed major version than specifed in the vds_version[] array.  The
1416 	 * following assertions should help remind future maintainers to make
1417 	 * the appropriate changes to support multiple versions.
1418 	 */
1419 	ASSERT(vds_num_versions == 1);
1420 	ASSERT(ver_msg->ver_major == vds_version[0].major);
1421 	ASSERT(ver_msg->ver_minor == vds_version[0].minor);
1422 
1423 	PR0("Using major version %u, minor version %u",
1424 	    ver_msg->ver_major, ver_msg->ver_minor);
1425 	return (0);
1426 }
1427 
1428 static int
1429 vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1430 {
1431 	vd_attr_msg_t	*attr_msg = (vd_attr_msg_t *)msg;
1432 	int		status, retry = 0;
1433 
1434 
1435 	ASSERT(msglen >= sizeof (msg->tag));
1436 
1437 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
1438 		VIO_ATTR_INFO)) {
1439 		PR0("Message is not an attribute message");
1440 		return (ENOMSG);
1441 	}
1442 
1443 	if (msglen != sizeof (*attr_msg)) {
1444 		PR0("Expected %lu-byte attribute message; "
1445 		    "received %lu bytes", sizeof (*attr_msg), msglen);
1446 		return (EBADMSG);
1447 	}
1448 
1449 	if (attr_msg->max_xfer_sz == 0) {
1450 		PR0("Received maximum transfer size of 0 from client");
1451 		return (EBADMSG);
1452 	}
1453 
1454 	if ((attr_msg->xfer_mode != VIO_DESC_MODE) &&
1455 	    (attr_msg->xfer_mode != VIO_DRING_MODE)) {
1456 		PR0("Client requested unsupported transfer mode");
1457 		return (EBADMSG);
1458 	}
1459 
1460 	/*
1461 	 * check if the underlying disk is ready, if not try accessing
1462 	 * the device again. Open the vdisk device and extract info
1463 	 * about it, as this is needed to respond to the attr info msg
1464 	 */
1465 	if ((vd->initialized & VD_DISK_READY) == 0) {
1466 		PR0("Retry setting up disk (%s)", vd->device_path);
1467 		do {
1468 			status = vd_setup_vd(vd);
1469 			if (status != EAGAIN || ++retry > vds_dev_retries)
1470 				break;
1471 
1472 			/* incremental delay */
1473 			delay(drv_usectohz(vds_dev_delay));
1474 
1475 			/* if vdisk is no longer enabled - return error */
1476 			if (!vd_enabled(vd))
1477 				return (ENXIO);
1478 
1479 		} while (status == EAGAIN);
1480 
1481 		if (status)
1482 			return (ENXIO);
1483 
1484 		vd->initialized |= VD_DISK_READY;
1485 		ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
1486 		PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u",
1487 		    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
1488 		    (vd->pseudo ? "yes" : "no"),
1489 		    (vd->file ? "yes" : "no"),
1490 		    vd->nslices);
1491 	}
1492 
1493 	/* Success:  valid message and transfer mode */
1494 	vd->xfer_mode = attr_msg->xfer_mode;
1495 
1496 	if (vd->xfer_mode == VIO_DESC_MODE) {
1497 
1498 		/*
1499 		 * The vd_dring_inband_msg_t contains one cookie; need room
1500 		 * for up to n-1 more cookies, where "n" is the number of full
1501 		 * pages plus possibly one partial page required to cover
1502 		 * "max_xfer_sz".  Add room for one more cookie if
1503 		 * "max_xfer_sz" isn't an integral multiple of the page size.
1504 		 * Must first get the maximum transfer size in bytes.
1505 		 */
1506 		size_t	max_xfer_bytes = attr_msg->vdisk_block_size ?
1507 		    attr_msg->vdisk_block_size*attr_msg->max_xfer_sz :
1508 		    attr_msg->max_xfer_sz;
1509 		size_t	max_inband_msglen =
1510 		    sizeof (vd_dring_inband_msg_t) +
1511 		    ((max_xfer_bytes/PAGESIZE +
1512 			((max_xfer_bytes % PAGESIZE) ? 1 : 0))*
1513 			(sizeof (ldc_mem_cookie_t)));
1514 
1515 		/*
1516 		 * Set the maximum expected message length to
1517 		 * accommodate in-band-descriptor messages with all
1518 		 * their cookies
1519 		 */
1520 		vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen);
1521 
1522 		/*
1523 		 * Initialize the data structure for processing in-band I/O
1524 		 * request descriptors
1525 		 */
1526 		vd->inband_task.vd	= vd;
1527 		vd->inband_task.msg	= kmem_alloc(vd->max_msglen, KM_SLEEP);
1528 		vd->inband_task.index	= 0;
1529 		vd->inband_task.type	= VD_FINAL_RANGE_TASK;	/* range == 1 */
1530 	}
1531 
1532 	/* Return the device's block size and max transfer size to the client */
1533 	attr_msg->vdisk_block_size	= DEV_BSIZE;
1534 	attr_msg->max_xfer_sz		= vd->max_xfer_sz;
1535 
1536 	attr_msg->vdisk_size = vd->vdisk_size;
1537 	attr_msg->vdisk_type = vd->vdisk_type;
1538 	attr_msg->operations = vds_operations;
1539 	PR0("%s", VD_CLIENT(vd));
1540 
1541 	ASSERT(vd->dring_task == NULL);
1542 
1543 	return (0);
1544 }
1545 
1546 static int
1547 vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1548 {
1549 	int			status;
1550 	size_t			expected;
1551 	ldc_mem_info_t		dring_minfo;
1552 	vio_dring_reg_msg_t	*reg_msg = (vio_dring_reg_msg_t *)msg;
1553 
1554 
1555 	ASSERT(msglen >= sizeof (msg->tag));
1556 
1557 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
1558 		VIO_DRING_REG)) {
1559 		PR0("Message is not a register-dring message");
1560 		return (ENOMSG);
1561 	}
1562 
1563 	if (msglen < sizeof (*reg_msg)) {
1564 		PR0("Expected at least %lu-byte register-dring message; "
1565 		    "received %lu bytes", sizeof (*reg_msg), msglen);
1566 		return (EBADMSG);
1567 	}
1568 
1569 	expected = sizeof (*reg_msg) +
1570 	    (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0]));
1571 	if (msglen != expected) {
1572 		PR0("Expected %lu-byte register-dring message; "
1573 		    "received %lu bytes", expected, msglen);
1574 		return (EBADMSG);
1575 	}
1576 
1577 	if (vd->initialized & VD_DRING) {
1578 		PR0("A dring was previously registered; only support one");
1579 		return (EBADMSG);
1580 	}
1581 
1582 	if (reg_msg->num_descriptors > INT32_MAX) {
1583 		PR0("reg_msg->num_descriptors = %u; must be <= %u (%s)",
1584 		    reg_msg->ncookies, INT32_MAX, STRINGIZE(INT32_MAX));
1585 		return (EBADMSG);
1586 	}
1587 
1588 	if (reg_msg->ncookies != 1) {
1589 		/*
1590 		 * In addition to fixing the assertion in the success case
1591 		 * below, supporting drings which require more than one
1592 		 * "cookie" requires increasing the value of vd->max_msglen
1593 		 * somewhere in the code path prior to receiving the message
1594 		 * which results in calling this function.  Note that without
1595 		 * making this change, the larger message size required to
1596 		 * accommodate multiple cookies cannot be successfully
1597 		 * received, so this function will not even get called.
1598 		 * Gracefully accommodating more dring cookies might
1599 		 * reasonably demand exchanging an additional attribute or
1600 		 * making a minor protocol adjustment
1601 		 */
1602 		PR0("reg_msg->ncookies = %u != 1", reg_msg->ncookies);
1603 		return (EBADMSG);
1604 	}
1605 
1606 	status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie,
1607 	    reg_msg->ncookies, reg_msg->num_descriptors,
1608 	    reg_msg->descriptor_size, LDC_DIRECT_MAP, &vd->dring_handle);
1609 	if (status != 0) {
1610 		PR0("ldc_mem_dring_map() returned errno %d", status);
1611 		return (status);
1612 	}
1613 
1614 	/*
1615 	 * To remove the need for this assertion, must call
1616 	 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a
1617 	 * successful call to ldc_mem_dring_map()
1618 	 */
1619 	ASSERT(reg_msg->ncookies == 1);
1620 
1621 	if ((status =
1622 		ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) {
1623 		PR0("ldc_mem_dring_info() returned errno %d", status);
1624 		if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)
1625 			PR0("ldc_mem_dring_unmap() returned errno %d", status);
1626 		return (status);
1627 	}
1628 
1629 	if (dring_minfo.vaddr == NULL) {
1630 		PR0("Descriptor ring virtual address is NULL");
1631 		return (ENXIO);
1632 	}
1633 
1634 
1635 	/* Initialize for valid message and mapped dring */
1636 	PR1("descriptor size = %u, dring length = %u",
1637 	    vd->descriptor_size, vd->dring_len);
1638 	vd->initialized |= VD_DRING;
1639 	vd->dring_ident = 1;	/* "There Can Be Only One" */
1640 	vd->dring = dring_minfo.vaddr;
1641 	vd->descriptor_size = reg_msg->descriptor_size;
1642 	vd->dring_len = reg_msg->num_descriptors;
1643 	reg_msg->dring_ident = vd->dring_ident;
1644 
1645 	/*
1646 	 * Allocate and initialize a "shadow" array of data structures for
1647 	 * tasks to process I/O requests in dring elements
1648 	 */
1649 	vd->dring_task =
1650 	    kmem_zalloc((sizeof (*vd->dring_task)) * vd->dring_len, KM_SLEEP);
1651 	for (int i = 0; i < vd->dring_len; i++) {
1652 		vd->dring_task[i].vd		= vd;
1653 		vd->dring_task[i].index		= i;
1654 		vd->dring_task[i].request	= &VD_DRING_ELEM(i)->payload;
1655 
1656 		status = ldc_mem_alloc_handle(vd->ldc_handle,
1657 		    &(vd->dring_task[i].mhdl));
1658 		if (status) {
1659 			PR0("ldc_mem_alloc_handle() returned err %d ", status);
1660 			return (ENXIO);
1661 		}
1662 
1663 		vd->dring_task[i].msg = kmem_alloc(vd->max_msglen, KM_SLEEP);
1664 	}
1665 
1666 	return (0);
1667 }
1668 
1669 static int
1670 vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1671 {
1672 	vio_dring_unreg_msg_t	*unreg_msg = (vio_dring_unreg_msg_t *)msg;
1673 
1674 
1675 	ASSERT(msglen >= sizeof (msg->tag));
1676 
1677 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
1678 		VIO_DRING_UNREG)) {
1679 		PR0("Message is not an unregister-dring message");
1680 		return (ENOMSG);
1681 	}
1682 
1683 	if (msglen != sizeof (*unreg_msg)) {
1684 		PR0("Expected %lu-byte unregister-dring message; "
1685 		    "received %lu bytes", sizeof (*unreg_msg), msglen);
1686 		return (EBADMSG);
1687 	}
1688 
1689 	if (unreg_msg->dring_ident != vd->dring_ident) {
1690 		PR0("Expected dring ident %lu; received %lu",
1691 		    vd->dring_ident, unreg_msg->dring_ident);
1692 		return (EBADMSG);
1693 	}
1694 
1695 	return (0);
1696 }
1697 
1698 static int
1699 process_rdx_msg(vio_msg_t *msg, size_t msglen)
1700 {
1701 	ASSERT(msglen >= sizeof (msg->tag));
1702 
1703 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX)) {
1704 		PR0("Message is not an RDX message");
1705 		return (ENOMSG);
1706 	}
1707 
1708 	if (msglen != sizeof (vio_rdx_msg_t)) {
1709 		PR0("Expected %lu-byte RDX message; received %lu bytes",
1710 		    sizeof (vio_rdx_msg_t), msglen);
1711 		return (EBADMSG);
1712 	}
1713 
1714 	PR0("Valid RDX message");
1715 	return (0);
1716 }
1717 
1718 static int
1719 vd_check_seq_num(vd_t *vd, uint64_t seq_num)
1720 {
1721 	if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) {
1722 		PR0("Received seq_num %lu; expected %lu",
1723 		    seq_num, (vd->seq_num + 1));
1724 		PR0("initiating soft reset");
1725 		vd_need_reset(vd, B_FALSE);
1726 		return (1);
1727 	}
1728 
1729 	vd->seq_num = seq_num;
1730 	vd->initialized |= VD_SEQ_NUM;	/* superfluous after first time... */
1731 	return (0);
1732 }
1733 
1734 /*
1735  * Return the expected size of an inband-descriptor message with all the
1736  * cookies it claims to include
1737  */
1738 static size_t
1739 expected_inband_size(vd_dring_inband_msg_t *msg)
1740 {
1741 	return ((sizeof (*msg)) +
1742 	    (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0])));
1743 }
1744 
1745 /*
1746  * Process an in-band descriptor message:  used with clients like OBP, with
1747  * which vds exchanges descriptors within VIO message payloads, rather than
1748  * operating on them within a descriptor ring
1749  */
1750 static int
1751 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1752 {
1753 	size_t			expected;
1754 	vd_dring_inband_msg_t	*desc_msg = (vd_dring_inband_msg_t *)msg;
1755 
1756 
1757 	ASSERT(msglen >= sizeof (msg->tag));
1758 
1759 	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
1760 		VIO_DESC_DATA)) {
1761 		PR1("Message is not an in-band-descriptor message");
1762 		return (ENOMSG);
1763 	}
1764 
1765 	if (msglen < sizeof (*desc_msg)) {
1766 		PR0("Expected at least %lu-byte descriptor message; "
1767 		    "received %lu bytes", sizeof (*desc_msg), msglen);
1768 		return (EBADMSG);
1769 	}
1770 
1771 	if (msglen != (expected = expected_inband_size(desc_msg))) {
1772 		PR0("Expected %lu-byte descriptor message; "
1773 		    "received %lu bytes", expected, msglen);
1774 		return (EBADMSG);
1775 	}
1776 
1777 	if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0)
1778 		return (EBADMSG);
1779 
1780 	/*
1781 	 * Valid message:  Set up the in-band descriptor task and process the
1782 	 * request.  Arrange to acknowledge the client's message, unless an
1783 	 * error processing the descriptor task results in setting
1784 	 * VIO_SUBTYPE_NACK
1785 	 */
1786 	PR1("Valid in-band-descriptor message");
1787 	msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
1788 
1789 	ASSERT(vd->inband_task.msg != NULL);
1790 
1791 	bcopy(msg, vd->inband_task.msg, msglen);
1792 	vd->inband_task.msglen	= msglen;
1793 
1794 	/*
1795 	 * The task request is now the payload of the message
1796 	 * that was just copied into the body of the task.
1797 	 */
1798 	desc_msg = (vd_dring_inband_msg_t *)vd->inband_task.msg;
1799 	vd->inband_task.request	= &desc_msg->payload;
1800 
1801 	return (vd_process_task(&vd->inband_task));
1802 }
1803 
1804 static int
1805 vd_process_element(vd_t *vd, vd_task_type_t type, uint32_t idx,
1806     vio_msg_t *msg, size_t msglen)
1807 {
1808 	int			status;
1809 	boolean_t		ready;
1810 	vd_dring_entry_t	*elem = VD_DRING_ELEM(idx);
1811 
1812 
1813 	/* Accept the updated dring element */
1814 	if ((status = ldc_mem_dring_acquire(vd->dring_handle, idx, idx)) != 0) {
1815 		PR0("ldc_mem_dring_acquire() returned errno %d", status);
1816 		return (status);
1817 	}
1818 	ready = (elem->hdr.dstate == VIO_DESC_READY);
1819 	if (ready) {
1820 		elem->hdr.dstate = VIO_DESC_ACCEPTED;
1821 	} else {
1822 		PR0("descriptor %u not ready", idx);
1823 		VD_DUMP_DRING_ELEM(elem);
1824 	}
1825 	if ((status = ldc_mem_dring_release(vd->dring_handle, idx, idx)) != 0) {
1826 		PR0("ldc_mem_dring_release() returned errno %d", status);
1827 		return (status);
1828 	}
1829 	if (!ready)
1830 		return (EBUSY);
1831 
1832 
1833 	/* Initialize a task and process the accepted element */
1834 	PR1("Processing dring element %u", idx);
1835 	vd->dring_task[idx].type	= type;
1836 
1837 	/* duplicate msg buf for cookies etc. */
1838 	bcopy(msg, vd->dring_task[idx].msg, msglen);
1839 
1840 	vd->dring_task[idx].msglen	= msglen;
1841 	if ((status = vd_process_task(&vd->dring_task[idx])) != EINPROGRESS)
1842 		status = vd_mark_elem_done(vd, idx,
1843 		    vd->dring_task[idx].request->status,
1844 		    vd->dring_task[idx].request->nbytes);
1845 
1846 	return (status);
1847 }
1848 
1849 static int
1850 vd_process_element_range(vd_t *vd, int start, int end,
1851     vio_msg_t *msg, size_t msglen)
1852 {
1853 	int		i, n, nelem, status = 0;
1854 	boolean_t	inprogress = B_FALSE;
1855 	vd_task_type_t	type;
1856 
1857 
1858 	ASSERT(start >= 0);
1859 	ASSERT(end >= 0);
1860 
1861 	/*
1862 	 * Arrange to acknowledge the client's message, unless an error
1863 	 * processing one of the dring elements results in setting
1864 	 * VIO_SUBTYPE_NACK
1865 	 */
1866 	msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
1867 
1868 	/*
1869 	 * Process the dring elements in the range
1870 	 */
1871 	nelem = ((end < start) ? end + vd->dring_len : end) - start + 1;
1872 	for (i = start, n = nelem; n > 0; i = (i + 1) % vd->dring_len, n--) {
1873 		((vio_dring_msg_t *)msg)->end_idx = i;
1874 		type = (n == 1) ? VD_FINAL_RANGE_TASK : VD_NONFINAL_RANGE_TASK;
1875 		status = vd_process_element(vd, type, i, msg, msglen);
1876 		if (status == EINPROGRESS)
1877 			inprogress = B_TRUE;
1878 		else if (status != 0)
1879 			break;
1880 	}
1881 
1882 	/*
1883 	 * If some, but not all, operations of a multi-element range are in
1884 	 * progress, wait for other operations to complete before returning
1885 	 * (which will result in "ack" or "nack" of the message).  Note that
1886 	 * all outstanding operations will need to complete, not just the ones
1887 	 * corresponding to the current range of dring elements; howevever, as
1888 	 * this situation is an error case, performance is less critical.
1889 	 */
1890 	if ((nelem > 1) && (status != EINPROGRESS) && inprogress)
1891 		ddi_taskq_wait(vd->completionq);
1892 
1893 	return (status);
1894 }
1895 
1896 static int
1897 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1898 {
1899 	vio_dring_msg_t	*dring_msg = (vio_dring_msg_t *)msg;
1900 
1901 
1902 	ASSERT(msglen >= sizeof (msg->tag));
1903 
1904 	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
1905 		VIO_DRING_DATA)) {
1906 		PR1("Message is not a dring-data message");
1907 		return (ENOMSG);
1908 	}
1909 
1910 	if (msglen != sizeof (*dring_msg)) {
1911 		PR0("Expected %lu-byte dring message; received %lu bytes",
1912 		    sizeof (*dring_msg), msglen);
1913 		return (EBADMSG);
1914 	}
1915 
1916 	if (vd_check_seq_num(vd, dring_msg->seq_num) != 0)
1917 		return (EBADMSG);
1918 
1919 	if (dring_msg->dring_ident != vd->dring_ident) {
1920 		PR0("Expected dring ident %lu; received ident %lu",
1921 		    vd->dring_ident, dring_msg->dring_ident);
1922 		return (EBADMSG);
1923 	}
1924 
1925 	if (dring_msg->start_idx >= vd->dring_len) {
1926 		PR0("\"start_idx\" = %u; must be less than %u",
1927 		    dring_msg->start_idx, vd->dring_len);
1928 		return (EBADMSG);
1929 	}
1930 
1931 	if ((dring_msg->end_idx < 0) ||
1932 	    (dring_msg->end_idx >= vd->dring_len)) {
1933 		PR0("\"end_idx\" = %u; must be >= 0 and less than %u",
1934 		    dring_msg->end_idx, vd->dring_len);
1935 		return (EBADMSG);
1936 	}
1937 
1938 	/* Valid message; process range of updated dring elements */
1939 	PR1("Processing descriptor range, start = %u, end = %u",
1940 	    dring_msg->start_idx, dring_msg->end_idx);
1941 	return (vd_process_element_range(vd, dring_msg->start_idx,
1942 		dring_msg->end_idx, msg, msglen));
1943 }
1944 
1945 static int
1946 recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes)
1947 {
1948 	int	retry, status;
1949 	size_t	size = *nbytes;
1950 
1951 
1952 	for (retry = 0, status = ETIMEDOUT;
1953 	    retry < vds_ldc_retries && status == ETIMEDOUT;
1954 	    retry++) {
1955 		PR1("ldc_read() attempt %d", (retry + 1));
1956 		*nbytes = size;
1957 		status = ldc_read(ldc_handle, msg, nbytes);
1958 	}
1959 
1960 	if (status) {
1961 		PR0("ldc_read() returned errno %d", status);
1962 		if (status != ECONNRESET)
1963 			return (ENOMSG);
1964 		return (status);
1965 	} else if (*nbytes == 0) {
1966 		PR1("ldc_read() returned 0 and no message read");
1967 		return (ENOMSG);
1968 	}
1969 
1970 	PR1("RCVD %lu-byte message", *nbytes);
1971 	return (0);
1972 }
1973 
/*
 * Dispatch a received VIO message according to the current connection
 * state (vd->state) and, where relevant, the negotiated transfer mode
 * (vd->xfer_mode).
 *
 * Once a session ID has been recorded (VD_SID set in vd->initialized), a
 * message whose tag carries a different SID is rejected with EBADMSG before
 * any state-specific processing.
 *
 * On success of a handshake step, vd->state is advanced:
 *	INIT -> VER -> ATTR -> (DRING ->) DATA
 *
 * Returns 0 on success; otherwise the status of the handler that rejected
 * the message, or ENOTSUP for an unsupported transfer mode, an invalid
 * connection state, or a successfully processed unregister-dring message
 * (which this server answers but cannot continue after).
 *
 * In the DRING and DATA states several handlers are tried in turn; a
 * handler returning ENOMSG is treated as "not my kind of message" and the
 * next handler is tried.
 *
 * NOTE(review): the ASSERT() calls in the default cases pass a string
 * literal, which is a non-NULL pointer and therefore always true, so these
 * assertions presumably never fire even on DEBUG kernels -- confirm whether
 * ASSERT(!"...") was intended.
 */
static int
vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
{
	int		status;


	PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype,
	    msg->tag.vio_subtype, msg->tag.vio_subtype_env);
#ifdef	DEBUG
	vd_decode_tag(msg);
#endif

	/*
	 * Validate session ID up front, since it applies to all messages
	 * once set
	 */
	if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) {
		PR0("Expected SID %u, received %u", vd->sid,
		    msg->tag.vio_sid);
		return (EBADMSG);
	}

	PR1("\tWhile in state %d (%s)", vd->state, vd_decode_state(vd->state));

	/*
	 * Process the received message based on connection state
	 */
	switch (vd->state) {
	case VD_STATE_INIT:	/* expect version message */
		if ((status = vd_process_ver_msg(vd, msg, msglen)) != 0)
			return (status);

		/* Version negotiated, move to that state */
		vd->state = VD_STATE_VER;
		return (0);

	case VD_STATE_VER:	/* expect attribute message */
		if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0)
			return (status);

		/* Attributes exchanged, move to that state */
		vd->state = VD_STATE_ATTR;
		return (0);

	case VD_STATE_ATTR:
		switch (vd->xfer_mode) {
		case VIO_DESC_MODE:	/* expect RDX message */
			if ((status = process_rdx_msg(msg, msglen)) != 0)
				return (status);

			/* Ready to receive in-band descriptors */
			vd->state = VD_STATE_DATA;
			return (0);

		case VIO_DRING_MODE:	/* expect register-dring message */
			if ((status =
				vd_process_dring_reg_msg(vd, msg, msglen)) != 0)
				return (status);

			/* One dring negotiated, move to that state */
			vd->state = VD_STATE_DRING;
			return (0);

		default:
			ASSERT("Unsupported transfer mode");
			PR0("Unsupported transfer mode");
			return (ENOTSUP);
		}

	case VD_STATE_DRING:	/* expect RDX, register-dring, or unreg-dring */
		/* Try RDX first; ENOMSG means "not an RDX", so fall through */
		if ((status = process_rdx_msg(msg, msglen)) == 0) {
			/* Ready to receive data */
			vd->state = VD_STATE_DATA;
			return (0);
		} else if (status != ENOMSG) {
			return (status);
		}


		/*
		 * If another register-dring message is received, stay in
		 * dring state in case the client sends RDX; although the
		 * protocol allows multiple drings, this server does not
		 * support using more than one
		 */
		if ((status =
			vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG)
			return (status);

		/*
		 * Acknowledge an unregister-dring message, but reset the
		 * connection anyway:  Although the protocol allows
		 * unregistering drings, this server cannot serve a vdisk
		 * without its only dring
		 */
		status = vd_process_dring_unreg_msg(vd, msg, msglen);
		return ((status == 0) ? ENOTSUP : status);

	case VD_STATE_DATA:
		switch (vd->xfer_mode) {
		case VIO_DESC_MODE:	/* expect in-band-descriptor message */
			return (vd_process_desc_msg(vd, msg, msglen));

		case VIO_DRING_MODE:	/* expect dring-data or unreg-dring */
			/*
			 * Typically expect dring-data messages, so handle
			 * them first
			 */
			if ((status = vd_process_dring_msg(vd, msg,
				    msglen)) != ENOMSG)
				return (status);

			/*
			 * Acknowledge an unregister-dring message, but reset
			 * the connection anyway:  Although the protocol
			 * allows unregistering drings, this server cannot
			 * serve a vdisk without its only dring
			 */
			status = vd_process_dring_unreg_msg(vd, msg, msglen);
			return ((status == 0) ? ENOTSUP : status);

		default:
			ASSERT("Unsupported transfer mode");
			PR0("Unsupported transfer mode");
			return (ENOTSUP);
		}

	default:
		ASSERT("Invalid client connection state");
		PR0("Invalid client connection state");
		return (ENOTSUP);
	}
}
2107 
2108 static int
2109 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
2110 {
2111 	int		status;
2112 	boolean_t	reset_ldc = B_FALSE;
2113 
2114 
2115 	/*
2116 	 * Check that the message is at least big enough for a "tag", so that
2117 	 * message processing can proceed based on tag-specified message type
2118 	 */
2119 	if (msglen < sizeof (vio_msg_tag_t)) {
2120 		PR0("Received short (%lu-byte) message", msglen);
2121 		/* Can't "nack" short message, so drop the big hammer */
2122 		PR0("initiating full reset");
2123 		vd_need_reset(vd, B_TRUE);
2124 		return (EBADMSG);
2125 	}
2126 
2127 	/*
2128 	 * Process the message
2129 	 */
2130 	switch (status = vd_do_process_msg(vd, msg, msglen)) {
2131 	case 0:
2132 		/* "ack" valid, successfully-processed messages */
2133 		msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
2134 		break;
2135 
2136 	case EINPROGRESS:
2137 		/* The completion handler will "ack" or "nack" the message */
2138 		return (EINPROGRESS);
2139 	case ENOMSG:
2140 		PR0("Received unexpected message");
2141 		_NOTE(FALLTHROUGH);
2142 	case EBADMSG:
2143 	case ENOTSUP:
2144 		/* "nack" invalid messages */
2145 		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
2146 		break;
2147 
2148 	default:
2149 		/* "nack" failed messages */
2150 		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
2151 		/* An LDC error probably occurred, so try resetting it */
2152 		reset_ldc = B_TRUE;
2153 		break;
2154 	}
2155 
2156 	PR1("\tResulting in state %d (%s)", vd->state,
2157 		vd_decode_state(vd->state));
2158 
2159 	/* Send the "ack" or "nack" to the client */
2160 	PR1("Sending %s",
2161 	    (msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");
2162 	if (send_msg(vd->ldc_handle, msg, msglen) != 0)
2163 		reset_ldc = B_TRUE;
2164 
2165 	/* Arrange to reset the connection for nack'ed or failed messages */
2166 	if ((status != 0) || reset_ldc) {
2167 		PR0("initiating %s reset",
2168 		    (reset_ldc) ? "full" : "soft");
2169 		vd_need_reset(vd, reset_ldc);
2170 	}
2171 
2172 	return (status);
2173 }
2174 
2175 static boolean_t
2176 vd_enabled(vd_t *vd)
2177 {
2178 	boolean_t	enabled;
2179 
2180 
2181 	mutex_enter(&vd->lock);
2182 	enabled = vd->enabled;
2183 	mutex_exit(&vd->lock);
2184 	return (enabled);
2185 }
2186 
2187 static void
2188 vd_recv_msg(void *arg)
2189 {
2190 	vd_t	*vd = (vd_t *)arg;
2191 	int	rv = 0, status = 0;
2192 
2193 	ASSERT(vd != NULL);
2194 
2195 	PR2("New task to receive incoming message(s)");
2196 
2197 
2198 	while (vd_enabled(vd) && status == 0) {
2199 		size_t		msglen, msgsize;
2200 		ldc_status_t	lstatus;
2201 
2202 		/*
2203 		 * Receive and process a message
2204 		 */
2205 		vd_reset_if_needed(vd);	/* can change vd->max_msglen */
2206 
2207 		/*
2208 		 * check if channel is UP - else break out of loop
2209 		 */
2210 		status = ldc_status(vd->ldc_handle, &lstatus);
2211 		if (lstatus != LDC_UP) {
2212 			PR0("channel not up (status=%d), exiting recv loop\n",
2213 			    lstatus);
2214 			break;
2215 		}
2216 
2217 		ASSERT(vd->max_msglen != 0);
2218 
2219 		msgsize = vd->max_msglen; /* stable copy for alloc/free */
2220 		msglen	= msgsize;	  /* actual len after recv_msg() */
2221 
2222 		status = recv_msg(vd->ldc_handle, vd->vio_msgp, &msglen);
2223 		switch (status) {
2224 		case 0:
2225 			rv = vd_process_msg(vd, (vio_msg_t *)vd->vio_msgp,
2226 				msglen);
2227 			/* check if max_msglen changed */
2228 			if (msgsize != vd->max_msglen) {
2229 				PR0("max_msglen changed 0x%lx to 0x%lx bytes\n",
2230 				    msgsize, vd->max_msglen);
2231 				kmem_free(vd->vio_msgp, msgsize);
2232 				vd->vio_msgp =
2233 					kmem_alloc(vd->max_msglen, KM_SLEEP);
2234 			}
2235 			if (rv == EINPROGRESS)
2236 				continue;
2237 			break;
2238 
2239 		case ENOMSG:
2240 			break;
2241 
2242 		case ECONNRESET:
2243 			PR0("initiating soft reset (ECONNRESET)\n");
2244 			vd_need_reset(vd, B_FALSE);
2245 			status = 0;
2246 			break;
2247 
2248 		default:
2249 			/* Probably an LDC failure; arrange to reset it */
2250 			PR0("initiating full reset (status=0x%x)", status);
2251 			vd_need_reset(vd, B_TRUE);
2252 			break;
2253 		}
2254 	}
2255 
2256 	PR2("Task finished");
2257 }
2258 
2259 static uint_t
2260 vd_handle_ldc_events(uint64_t event, caddr_t arg)
2261 {
2262 	vd_t	*vd = (vd_t *)(void *)arg;
2263 	int	status;
2264 
2265 	ASSERT(vd != NULL);
2266 
2267 	if (!vd_enabled(vd))
2268 		return (LDC_SUCCESS);
2269 
2270 	if (event & LDC_EVT_DOWN) {
2271 		PR0("LDC_EVT_DOWN: LDC channel went down");
2272 
2273 		vd_need_reset(vd, B_TRUE);
2274 		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
2275 		    DDI_SLEEP);
2276 		if (status == DDI_FAILURE) {
2277 			PR0("cannot schedule task to recv msg\n");
2278 			vd_need_reset(vd, B_TRUE);
2279 		}
2280 	}
2281 
2282 	if (event & LDC_EVT_RESET) {
2283 		PR0("LDC_EVT_RESET: LDC channel was reset");
2284 
2285 		if (vd->state != VD_STATE_INIT) {
2286 			PR0("scheduling full reset");
2287 			vd_need_reset(vd, B_FALSE);
2288 			status = ddi_taskq_dispatch(vd->startq, vd_recv_msg,
2289 			    vd, DDI_SLEEP);
2290 			if (status == DDI_FAILURE) {
2291 				PR0("cannot schedule task to recv msg\n");
2292 				vd_need_reset(vd, B_TRUE);
2293 			}
2294 
2295 		} else {
2296 			PR0("channel already reset, ignoring...\n");
2297 			PR0("doing ldc up...\n");
2298 			(void) ldc_up(vd->ldc_handle);
2299 		}
2300 
2301 		return (LDC_SUCCESS);
2302 	}
2303 
2304 	if (event & LDC_EVT_UP) {
2305 		PR0("EVT_UP: LDC is up\nResetting client connection state");
2306 		PR0("initiating soft reset");
2307 		vd_need_reset(vd, B_FALSE);
2308 		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg,
2309 		    vd, DDI_SLEEP);
2310 		if (status == DDI_FAILURE) {
2311 			PR0("cannot schedule task to recv msg\n");
2312 			vd_need_reset(vd, B_TRUE);
2313 			return (LDC_SUCCESS);
2314 		}
2315 	}
2316 
2317 	if (event & LDC_EVT_READ) {
2318 		int	status;
2319 
2320 		PR1("New data available");
2321 		/* Queue a task to receive the new data */
2322 		status = ddi_taskq_dispatch(vd->startq, vd_recv_msg, vd,
2323 		    DDI_SLEEP);
2324 
2325 		if (status == DDI_FAILURE) {
2326 			PR0("cannot schedule task to recv msg\n");
2327 			vd_need_reset(vd, B_TRUE);
2328 		}
2329 	}
2330 
2331 	return (LDC_SUCCESS);
2332 }
2333 
2334 static uint_t
2335 vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
2336 {
2337 	_NOTE(ARGUNUSED(key, val))
2338 	(*((uint_t *)arg))++;
2339 	return (MH_WALK_TERMINATE);
2340 }
2341 
2342 
2343 static int
2344 vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2345 {
2346 	uint_t	vd_present = 0;
2347 	minor_t	instance;
2348 	vds_t	*vds;
2349 
2350 
2351 	switch (cmd) {
2352 	case DDI_DETACH:
2353 		/* the real work happens below */
2354 		break;
2355 	case DDI_SUSPEND:
2356 		PR0("No action required for DDI_SUSPEND");
2357 		return (DDI_SUCCESS);
2358 	default:
2359 		PR0("Unrecognized \"cmd\"");
2360 		return (DDI_FAILURE);
2361 	}
2362 
2363 	ASSERT(cmd == DDI_DETACH);
2364 	instance = ddi_get_instance(dip);
2365 	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
2366 		PR0("Could not get state for instance %u", instance);
2367 		ddi_soft_state_free(vds_state, instance);
2368 		return (DDI_FAILURE);
2369 	}
2370 
2371 	/* Do no detach when serving any vdisks */
2372 	mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present);
2373 	if (vd_present) {
2374 		PR0("Not detaching because serving vdisks");
2375 		return (DDI_FAILURE);
2376 	}
2377 
2378 	PR0("Detaching");
2379 	if (vds->initialized & VDS_MDEG) {
2380 		(void) mdeg_unregister(vds->mdeg);
2381 		kmem_free(vds->ispecp->specp, sizeof (vds_prop_template));
2382 		kmem_free(vds->ispecp, sizeof (mdeg_node_spec_t));
2383 		vds->ispecp = NULL;
2384 		vds->mdeg = NULL;
2385 	}
2386 
2387 	if (vds->initialized & VDS_LDI)
2388 		(void) ldi_ident_release(vds->ldi_ident);
2389 	mod_hash_destroy_hash(vds->vd_table);
2390 	ddi_soft_state_free(vds_state, instance);
2391 	return (DDI_SUCCESS);
2392 }
2393 
2394 static boolean_t
2395 is_pseudo_device(dev_info_t *dip)
2396 {
2397 	dev_info_t	*parent, *root = ddi_root_node();
2398 
2399 
2400 	for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root);
2401 	    parent = ddi_get_parent(parent)) {
2402 		if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0)
2403 			return (B_TRUE);
2404 	}
2405 
2406 	return (B_FALSE);
2407 }
2408 
2409 static int
2410 vd_setup_full_disk(vd_t *vd)
2411 {
2412 	int		rval, status;
2413 	major_t		major = getmajor(vd->dev[0]);
2414 	minor_t		minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE;
2415 	struct dk_minfo	dk_minfo;
2416 
2417 	/*
2418 	 * At this point, vdisk_size is set to the size of partition 2 but
2419 	 * this does not represent the size of the disk because partition 2
2420 	 * may not cover the entire disk and its size does not include reserved
2421 	 * blocks. So we update vdisk_size to be the size of the entire disk.
2422 	 */
2423 	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCGMEDIAINFO,
2424 	    (intptr_t)&dk_minfo, (vd_open_flags | FKIOCTL),
2425 	    kcred, &rval)) != 0) {
2426 		PR0("ldi_ioctl(DKIOCGMEDIAINFO) returned errno %d",
2427 		    status);
2428 		return (status);
2429 	}
2430 	vd->vdisk_size = dk_minfo.dki_capacity;
2431 
2432 	/* Set full-disk parameters */
2433 	vd->vdisk_type	= VD_DISK_TYPE_DISK;
2434 	vd->nslices	= (sizeof (vd->dev))/(sizeof (vd->dev[0]));
2435 
2436 	/* Move dev number and LDI handle to entire-disk-slice array elements */
2437 	vd->dev[VD_ENTIRE_DISK_SLICE]		= vd->dev[0];
2438 	vd->dev[0]				= 0;
2439 	vd->ldi_handle[VD_ENTIRE_DISK_SLICE]	= vd->ldi_handle[0];
2440 	vd->ldi_handle[0]			= NULL;
2441 
2442 	/* Initialize device numbers for remaining slices and open them */
2443 	for (int slice = 0; slice < vd->nslices; slice++) {
2444 		/*
2445 		 * Skip the entire-disk slice, as it's already open and its
2446 		 * device known
2447 		 */
2448 		if (slice == VD_ENTIRE_DISK_SLICE)
2449 			continue;
2450 		ASSERT(vd->dev[slice] == 0);
2451 		ASSERT(vd->ldi_handle[slice] == NULL);
2452 
2453 		/*
2454 		 * Construct the device number for the current slice
2455 		 */
2456 		vd->dev[slice] = makedevice(major, (minor + slice));
2457 
2458 		/*
2459 		 * Open all slices of the disk to serve them to the client.
2460 		 * Slices are opened exclusively to prevent other threads or
2461 		 * processes in the service domain from performing I/O to
2462 		 * slices being accessed by a client.  Failure to open a slice
2463 		 * results in vds not serving this disk, as the client could
2464 		 * attempt (and should be able) to access any slice immediately.
2465 		 * Any slices successfully opened before a failure will get
2466 		 * closed by vds_destroy_vd() as a result of the error returned
2467 		 * by this function.
2468 		 *
2469 		 * We need to do the open with FNDELAY so that opening an empty
2470 		 * slice does not fail.
2471 		 */
2472 		PR0("Opening device major %u, minor %u = slice %u",
2473 		    major, minor, slice);
2474 		if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
2475 		    vd_open_flags | FNDELAY, kcred, &vd->ldi_handle[slice],
2476 		    vd->vds->ldi_ident)) != 0) {
2477 			PR0("ldi_open_by_dev() returned errno %d "
2478 			    "for slice %u", status, slice);
2479 			/* vds_destroy_vd() will close any open slices */
2480 			return (status);
2481 		}
2482 	}
2483 
2484 	return (0);
2485 }
2486 
2487 static int
2488 vd_setup_partition_efi(vd_t *vd)
2489 {
2490 	efi_gpt_t *gpt;
2491 	efi_gpe_t *gpe;
2492 	struct uuid uuid = EFI_RESERVED;
2493 	uint32_t crc;
2494 	int length;
2495 
2496 	length = sizeof (efi_gpt_t) + sizeof (efi_gpe_t);
2497 
2498 	gpt = kmem_zalloc(length, KM_SLEEP);
2499 	gpe = (efi_gpe_t *)(gpt + 1);
2500 
2501 	gpt->efi_gpt_Signature = LE_64(EFI_SIGNATURE);
2502 	gpt->efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
2503 	gpt->efi_gpt_HeaderSize = LE_32(sizeof (efi_gpt_t));
2504 	gpt->efi_gpt_FirstUsableLBA = LE_64(0ULL);
2505 	gpt->efi_gpt_LastUsableLBA = LE_64(vd->vdisk_size - 1);
2506 	gpt->efi_gpt_NumberOfPartitionEntries = LE_32(1);
2507 	gpt->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (efi_gpe_t));
2508 
2509 	UUID_LE_CONVERT(gpe->efi_gpe_PartitionTypeGUID, uuid);
2510 	gpe->efi_gpe_StartingLBA = gpt->efi_gpt_FirstUsableLBA;
2511 	gpe->efi_gpe_EndingLBA = gpt->efi_gpt_LastUsableLBA;
2512 
2513 	CRC32(crc, gpe, sizeof (efi_gpe_t), -1U, crc32_table);
2514 	gpt->efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
2515 
2516 	CRC32(crc, gpt, sizeof (efi_gpt_t), -1U, crc32_table);
2517 	gpt->efi_gpt_HeaderCRC32 = LE_32(~crc);
2518 
2519 	vd->dk_efi.dki_lba = 0;
2520 	vd->dk_efi.dki_length = length;
2521 	vd->dk_efi.dki_data = gpt;
2522 
2523 	return (0);
2524 }
2525 
2526 static int
2527 vd_setup_file(vd_t *vd)
2528 {
2529 	int 		i, rval, status;
2530 	short		sum;
2531 	vattr_t		vattr;
2532 	dev_t		dev;
2533 	char		*file_path = vd->device_path;
2534 	char		dev_path[MAXPATHLEN + 1];
2535 	ldi_handle_t	lhandle;
2536 	struct dk_cinfo	dk_cinfo;
2537 	struct dk_label *label;
2538 
2539 	/* make sure the file is valid */
2540 	if ((status = lookupname(file_path, UIO_SYSSPACE, FOLLOW,
2541 		NULLVPP, &vd->file_vnode)) != 0) {
2542 		PR0("Cannot lookup file(%s) errno %d", file_path, status);
2543 		return (status);
2544 	}
2545 
2546 	if (vd->file_vnode->v_type != VREG) {
2547 		PR0("Invalid file type (%s)\n", file_path);
2548 		VN_RELE(vd->file_vnode);
2549 		return (EBADF);
2550 	}
2551 	VN_RELE(vd->file_vnode);
2552 
2553 	if ((status = vn_open(file_path, UIO_SYSSPACE, vd_open_flags | FOFFMAX,
2554 	    0, &vd->file_vnode, 0, 0)) != 0) {
2555 		PR0("vn_open(%s) = errno %d", file_path, status);
2556 		return (status);
2557 	}
2558 
2559 	vattr.va_mask = AT_SIZE;
2560 	if ((status = VOP_GETATTR(vd->file_vnode, &vattr, 0, kcred)) != 0) {
2561 		PR0("VOP_GETATTR(%s) = errno %d", file_path, status);
2562 		(void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1, 0, kcred);
2563 		VN_RELE(vd->file_vnode);
2564 		return (EIO);
2565 	}
2566 
2567 	vd->file_size = vattr.va_size;
2568 	/* size should be at least sizeof(dk_label) */
2569 	if (vd->file_size < sizeof (struct dk_label)) {
2570 		PRN("Size of file has to be at least %ld bytes",
2571 		    sizeof (struct dk_label));
2572 		(void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1, 0, kcred);
2573 		VN_RELE(vd->file_vnode);
2574 		return (EIO);
2575 	}
2576 
2577 	if ((status = VOP_MAP(vd->file_vnode, 0, &kas, &vd->file_maddr,
2578 	    vd->file_size, PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
2579 	    MAP_SHARED, kcred)) != 0) {
2580 		PR0("VOP_MAP(%s) = errno %d", file_path, status);
2581 		(void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1, 0, kcred);
2582 		VN_RELE(vd->file_vnode);
2583 		return (EIO);
2584 	}
2585 
2586 	label = (struct dk_label *)vd->file_maddr;
2587 
2588 	/* label checksum */
2589 	sum = vd_lbl2cksum(label);
2590 
2591 	if (label->dkl_magic != DKL_MAGIC || label->dkl_cksum != sum) {
2592 		PR0("%s has an invalid disk label "
2593 		    "(magic=%x cksum=%x (expect %x))",
2594 		    file_path, label->dkl_magic, label->dkl_cksum, sum);
2595 
2596 		/* default label */
2597 		bzero(label, sizeof (struct dk_label));
2598 
2599 		/*
2600 		 * We must have a resonable number of cylinders and sectors so
2601 		 * that newfs can run using default values.
2602 		 *
2603 		 * if (disk_size < 2MB)
2604 		 * 	phys_cylinders = disk_size / 100K
2605 		 * else
2606 		 * 	phys_cylinders = disk_size / 300K
2607 		 *
2608 		 * phys_cylinders = (phys_cylinders == 0) ? 1 : phys_cylinders
2609 		 * alt_cylinders = (phys_cylinders > 2) ? 2 : 0;
2610 		 * data_cylinders = phys_cylinders - alt_cylinders
2611 		 *
2612 		 * sectors = disk_size / (phys_cylinders * blk_size)
2613 		 */
2614 		if (vd->file_size < (2 * 1024 * 1024))
2615 			label->dkl_pcyl = vd->file_size / (100 * 1024);
2616 		else
2617 			label->dkl_pcyl = vd->file_size / (300 * 1024);
2618 
2619 		if (label->dkl_pcyl == 0)
2620 			label->dkl_pcyl = 1;
2621 
2622 		if (label->dkl_pcyl > 2)
2623 			label->dkl_acyl = 2;
2624 		else
2625 			label->dkl_acyl = 0;
2626 
2627 		label->dkl_nsect = vd->file_size /
2628 			(DEV_BSIZE * label->dkl_pcyl);
2629 		label->dkl_ncyl = label->dkl_pcyl - label->dkl_acyl;
2630 		label->dkl_nhead = 1;
2631 		label->dkl_write_reinstruct = 0;
2632 		label->dkl_read_reinstruct = 0;
2633 		label->dkl_rpm = 7200;
2634 		label->dkl_apc = 0;
2635 		label->dkl_intrlv = 0;
2636 		label->dkl_magic = DKL_MAGIC;
2637 
2638 		PR0("requested disk size: %ld bytes\n", vd->file_size);
2639 		PR0("setup: ncyl=%d nhead=%d nsec=%d\n", label->dkl_pcyl,
2640 		    label->dkl_nhead, label->dkl_nsect);
2641 		PR0("provided disk size: %ld bytes\n", (uint64_t)
2642 		    (label->dkl_pcyl *
2643 			label->dkl_nhead * label->dkl_nsect * DEV_BSIZE));
2644 
2645 		/*
2646 		 * We must have a correct label name otherwise format(1m) will
2647 		 * not recognized the disk as labeled.
2648 		 */
2649 		(void) snprintf(label->dkl_asciilabel, LEN_DKL_ASCII,
2650 		    "SUNVDSK cyl %d alt %d hd %d sec %d",
2651 		    label->dkl_ncyl, label->dkl_acyl, label->dkl_nhead,
2652 		    label->dkl_nsect);
2653 
2654 		/* default VTOC */
2655 		label->dkl_vtoc.v_version = V_VERSION;
2656 		label->dkl_vtoc.v_nparts = 8;
2657 		label->dkl_vtoc.v_sanity = VTOC_SANE;
2658 		label->dkl_vtoc.v_part[2].p_tag = V_BACKUP;
2659 		label->dkl_map[2].dkl_cylno = 0;
2660 		label->dkl_map[2].dkl_nblk = label->dkl_ncyl *
2661 			label->dkl_nhead * label->dkl_nsect;
2662 		label->dkl_map[0] = label->dkl_map[2];
2663 		label->dkl_map[0] = label->dkl_map[2];
2664 		label->dkl_cksum = vd_lbl2cksum(label);
2665 	}
2666 
2667 	vd->nslices = label->dkl_vtoc.v_nparts;
2668 
2669 	/* sector size = block size = DEV_BSIZE */
2670 	vd->vdisk_size = (label->dkl_pcyl *
2671 	    label->dkl_nhead * label->dkl_nsect) / DEV_BSIZE;
2672 	vd->vdisk_type = VD_DISK_TYPE_DISK;
2673 	vd->vdisk_label = VD_DISK_LABEL_VTOC;
2674 	vd->max_xfer_sz = maxphys / DEV_BSIZE; /* default transfer size */
2675 
2676 	/* Get max_xfer_sz from the device where the file is */
2677 	dev = vd->file_vnode->v_vfsp->vfs_dev;
2678 	dev_path[0] = NULL;
2679 	if (ddi_dev_pathname(dev, S_IFBLK, dev_path) == DDI_SUCCESS) {
2680 		PR0("underlying device = %s\n", dev_path);
2681 	}
2682 
2683 	if ((status = ldi_open_by_dev(&dev, OTYP_BLK, FREAD,
2684 		kcred, &lhandle, vd->vds->ldi_ident)) != 0) {
2685 		PR0("ldi_open_by_dev() returned errno %d for device %s",
2686 		    status, dev_path);
2687 	} else {
2688 		if ((status = ldi_ioctl(lhandle, DKIOCINFO,
2689 		    (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred,
2690 		    &rval)) != 0) {
2691 			PR0("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
2692 			    status, dev_path);
2693 		} else {
2694 			/*
2695 			 * Store the device's max transfer size for
2696 			 * return to the client
2697 			 */
2698 			vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;
2699 		}
2700 
2701 		PR0("close the device %s", dev_path);
2702 		(void) ldi_close(lhandle, FREAD, kcred);
2703 	}
2704 
2705 	PR0("using for file %s, dev %s, max_xfer = %u blks",
2706 	    file_path, dev_path, vd->max_xfer_sz);
2707 
2708 	vd->pseudo = B_FALSE;
2709 	vd->file = B_TRUE;
2710 
2711 	vd->dk_geom.dkg_ncyl = label->dkl_ncyl;
2712 	vd->dk_geom.dkg_acyl = label->dkl_acyl;
2713 	vd->dk_geom.dkg_pcyl = label->dkl_pcyl;
2714 	vd->dk_geom.dkg_nhead = label->dkl_nhead;
2715 	vd->dk_geom.dkg_nsect = label->dkl_nsect;
2716 	vd->dk_geom.dkg_intrlv = label->dkl_intrlv;
2717 	vd->dk_geom.dkg_apc = label->dkl_apc;
2718 	vd->dk_geom.dkg_rpm = label->dkl_rpm;
2719 	vd->dk_geom.dkg_write_reinstruct = label->dkl_write_reinstruct;
2720 	vd->dk_geom.dkg_read_reinstruct = label->dkl_read_reinstruct;
2721 
2722 	vd->vtoc.v_sanity = label->dkl_vtoc.v_sanity;
2723 	vd->vtoc.v_version = label->dkl_vtoc.v_version;
2724 	vd->vtoc.v_sectorsz = DEV_BSIZE;
2725 	vd->vtoc.v_nparts = label->dkl_vtoc.v_nparts;
2726 
2727 	bcopy(label->dkl_vtoc.v_volume, vd->vtoc.v_volume,
2728 	    LEN_DKL_VVOL);
2729 	bcopy(label->dkl_asciilabel, vd->vtoc.v_asciilabel,
2730 	    LEN_DKL_ASCII);
2731 
2732 	for (i = 0; i < vd->nslices; i++) {
2733 		vd->vtoc.timestamp[i] = label->dkl_vtoc.v_timestamp[i];
2734 		vd->vtoc.v_part[i].p_tag = label->dkl_vtoc.v_part[i].p_tag;
2735 		vd->vtoc.v_part[i].p_flag = label->dkl_vtoc.v_part[i].p_flag;
2736 		vd->vtoc.v_part[i].p_start = label->dkl_map[i].dkl_cylno *
2737 			label->dkl_nhead * label->dkl_nsect;
2738 		vd->vtoc.v_part[i].p_size = label->dkl_map[i].dkl_nblk;
2739 		vd->ldi_handle[i] = NULL;
2740 		vd->dev[i] = NULL;
2741 	}
2742 
2743 	return (0);
2744 }
2745 
/*
 * Open the backing store named by vd->device_path and initialize the
 * vdisk's type, size, geometry and VTOC.
 *
 * The path is first opened as a device through LDI.  If that fails with
 * ENXIO or ENODEV the path is retried as a regular file via
 * vd_setup_file().  When the final failure is ENXIO, ENODEV or ENOENT the
 * function returns EAGAIN instead of that errno.
 *
 * For a device, the outcome depends on what was opened:
 *   - a device below the pseudo nexus: exported as a single slice;
 *   - the entire-disk slice (VD_ENTIRE_DISK_SLICE): handed to
 *     vd_setup_full_disk();
 *   - any other slice with an EFI label: handed to
 *     vd_setup_partition_efi();
 *   - any other slice otherwise: a one-partition geometry and VTOC are
 *     synthesized from the slice size.
 *
 * Returns 0 on success or an errno value.  Handles opened before a failure
 * are left open; the comment on nslices below says vds_destroy_vd() closes
 * them.
 */
static int
vd_setup_vd(vd_t *vd)
{
	int		rval, status;
	dev_info_t	*dip;
	struct dk_cinfo	dk_cinfo;
	char		*device_path = vd->device_path;

	/*
	 * We need to open with FNDELAY so that opening an empty partition
	 * does not fail.
	 */
	if ((status = ldi_open_by_name(device_path, vd_open_flags | FNDELAY,
	    kcred, &vd->ldi_handle[0], vd->vds->ldi_ident)) != 0) {
		PR0("ldi_open_by_name(%s) = errno %d", device_path, status);

		/* this may not be a device; try opening it as a file */
		if (status == ENXIO || status == ENODEV)
			status = vd_setup_file(vd);
		if (status) {
			PR0("Cannot use device/file (%s), errno=%d\n",
			    device_path, status);
			/* "not there" errors are reported as EAGAIN */
			if (status == ENXIO || status == ENODEV ||
			    status == ENOENT) {
				return (EAGAIN);
			}
		}
		/* 0 if the file setup succeeded, else the original errno */
		return (status);
	}

	/*
	 * nslices must be updated now so that vds_destroy_vd() will close
	 * the slice we have just opened in case of an error.
	 */
	vd->nslices = 1;
	vd->file = B_FALSE;

	/* Get device number and size of backing device */
	if ((status = ldi_get_dev(vd->ldi_handle[0], &vd->dev[0])) != 0) {
		PRN("ldi_get_dev() returned errno %d for %s",
		    status, device_path);
		return (status);
	}
	if (ldi_get_size(vd->ldi_handle[0], &vd->vdisk_size) != DDI_SUCCESS) {
		PRN("ldi_get_size() failed for %s", device_path);
		return (EIO);
	}
	vd->vdisk_size = lbtodb(vd->vdisk_size);	/* convert to blocks */

	/* Verify backing device supports dk_cinfo, dk_geom, and vtoc */
	if ((status = ldi_ioctl(vd->ldi_handle[0], DKIOCINFO,
		    (intptr_t)&dk_cinfo, (vd_open_flags | FKIOCTL), kcred,
		    &rval)) != 0) {
		PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
		    status, device_path);
		return (status);
	}
	if (dk_cinfo.dki_partition >= V_NUMPAR) {
		PRN("slice %u >= maximum slice %u for %s",
		    dk_cinfo.dki_partition, V_NUMPAR, device_path);
		return (EIO);
	}

	/* Reads the VTOC and records the label type in vd->vdisk_label */
	status = vd_read_vtoc(vd->ldi_handle[0], &vd->vtoc, &vd->vdisk_label);

	if (status != 0) {
		PRN("vd_read_vtoc returned errno %d for %s",
		    status, device_path);
		return (status);
	}

	/* The geometry is only queried for VTOC-labeled devices */
	if (vd->vdisk_label == VD_DISK_LABEL_VTOC &&
	    (status = ldi_ioctl(vd->ldi_handle[0], DKIOCGGEOM,
	    (intptr_t)&vd->dk_geom, (vd_open_flags | FKIOCTL),
	    kcred, &rval)) != 0) {
		    PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s",
			status, device_path);
		    return (status);
	}

	/* Store the device's max transfer size for return to the client */
	vd->max_xfer_sz = dk_cinfo.dki_maxtransfer;

	/* Determine if backing device is a pseudo device */
	if ((dip = ddi_hold_devi_by_instance(getmajor(vd->dev[0]),
		    dev_to_instance(vd->dev[0]), 0))  == NULL) {
		PRN("%s is no longer accessible", device_path);
		return (EIO);
	}
	vd->pseudo = is_pseudo_device(dip);
	ddi_release_devi(dip);
	if (vd->pseudo) {
		vd->vdisk_type	= VD_DISK_TYPE_SLICE;
		vd->nslices	= 1;
		return (0);	/* ...and we're done */
	}

	/* If slice is entire-disk slice, initialize for full disk */
	if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE)
		return (vd_setup_full_disk(vd));


	/* Otherwise, we have a non-entire slice of a device */
	vd->vdisk_type	= VD_DISK_TYPE_SLICE;
	vd->nslices	= 1;

	if (vd->vdisk_label == VD_DISK_LABEL_EFI) {
		status = vd_setup_partition_efi(vd);
		return (status);
	}

	/*
	 * Initialize dk_geom structure for single-slice device.  Both
	 * values are used as divisors below, so zero must be rejected.
	 */
	if (vd->dk_geom.dkg_nsect == 0) {
		PR0("%s geometry claims 0 sectors per track", device_path);
		return (EIO);
	}
	if (vd->dk_geom.dkg_nhead == 0) {
		PR0("%s geometry claims 0 heads", device_path);
		return (EIO);
	}
	/* Cylinder count is derived from the slice size in blocks */
	vd->dk_geom.dkg_ncyl =
	    vd->vdisk_size/vd->dk_geom.dkg_nsect/vd->dk_geom.dkg_nhead;
	vd->dk_geom.dkg_acyl = 0;
	vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl;


	/* Initialize vtoc structure for single-slice device */
	bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume,
	    MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume)));
	bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part));
	vd->vtoc.v_nparts = 1;
	vd->vtoc.v_part[0].p_tag = V_UNASSIGNED;
	vd->vtoc.v_part[0].p_flag = 0;
	vd->vtoc.v_part[0].p_start = 0;
	vd->vtoc.v_part[0].p_size = vd->vdisk_size;
	bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel,
	    MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel)));


	return (0);
}
2887 
/*
 * Allocate and initialize the state for one virtual disk.
 *
 *	vds		server instance the vdisk belongs to
 *	id		vdisk ID; used in the task queue names and as the
 *			key in vds->vd_table
 *	device_path	path of the backing device or file
 *	ldc_id		ID of the LDC channel used to talk to the client
 *	vdp		out: set to the new vd_t as soon as it is allocated
 *
 * The steps are: allocate the vd_t, set up the backing store
 * (vd_setup_vd()), initialize the lock, create the start and completion
 * task queues, initialize/open/bring up the LDC channel with
 * vd_handle_ldc_events() as its callback, allocate the in-band task memory
 * handle, insert the vdisk into the server's table and allocate the message
 * staging buffer.
 *
 * An EAGAIN result from vd_setup_vd() is not fatal: initialization
 * continues, but VD_DISK_READY is not set in vd->initialized.  A failure of
 * ldc_up() is only logged.
 *
 * Returns 0 on success or an errno value.  On failure nothing is undone
 * here; *vdp is assigned early so that vds_destroy_vd() can clean up
 * (see the comment at the assignment).
 *
 * NOTE(review): the LDC callback is registered and the channel brought up
 * before vd->max_msglen and vd->vio_msgp are set at the end of this
 * function, while vd_recv_msg() asserts max_msglen != 0 and reads into
 * vio_msgp.  Confirm that the callback cannot dispatch vd_recv_msg() in
 * that window.
 */
static int
vds_do_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id,
    vd_t **vdp)
{
	char			tq_name[TASKQ_NAMELEN];
	int			status;
	ddi_iblock_cookie_t	iblock = NULL;
	ldc_attr_t		ldc_attr;
	vd_t			*vd;


	ASSERT(vds != NULL);
	ASSERT(device_path != NULL);
	ASSERT(vdp != NULL);
	PR0("Adding vdisk for %s", device_path);

	if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) {
		PRN("No memory for virtual disk");
		return (EAGAIN);
	}
	*vdp = vd;	/* assign here so vds_destroy_vd() can cleanup later */
	vd->vds = vds;
	(void) strncpy(vd->device_path, device_path, MAXPATHLEN);

	/* Open vdisk and initialize parameters */
	if ((status = vd_setup_vd(vd)) == 0) {
		vd->initialized |= VD_DISK_READY;

		ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
		PR0("vdisk_type = %s, pseudo = %s, file = %s, nslices = %u",
		    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
		    (vd->pseudo ? "yes" : "no"), (vd->file ? "yes" : "no"),
		    vd->nslices);
	} else {
		/* EAGAIN: carry on without marking the disk ready */
		if (status != EAGAIN)
			return (status);
	}

	/* Initialize locking */
	if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED,
		&iblock) != DDI_SUCCESS) {
		PRN("Could not get iblock cookie.");
		return (EIO);
	}

	mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock);
	vd->initialized |= VD_LOCKING;


	/* Create start and completion task queues for the vdisk */
	(void) snprintf(tq_name, sizeof (tq_name), "vd_startq%lu", id);
	PR1("tq_name = %s", tq_name);
	if ((vd->startq = ddi_taskq_create(vds->dip, tq_name, 1,
		    TASKQ_DEFAULTPRI, 0)) == NULL) {
		PRN("Could not create task queue");
		return (EIO);
	}
	(void) snprintf(tq_name, sizeof (tq_name), "vd_completionq%lu", id);
	PR1("tq_name = %s", tq_name);
	if ((vd->completionq = ddi_taskq_create(vds->dip, tq_name, 1,
		    TASKQ_DEFAULTPRI, 0)) == NULL) {
		PRN("Could not create task queue");
		return (EIO);
	}
	vd->enabled = 1;	/* before callback can dispatch to startq */


	/* Bring up LDC */
	ldc_attr.devclass	= LDC_DEV_BLK_SVC;
	ldc_attr.instance	= ddi_get_instance(vds->dip);
	ldc_attr.mode		= LDC_MODE_UNRELIABLE;
	ldc_attr.mtu		= VD_LDC_MTU;
	if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) {
		PR0("ldc_init(%lu) = errno %d", ldc_id, status);
		return (status);
	}
	vd->initialized |= VD_LDC;

	if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events,
		(caddr_t)vd)) != 0) {
		PR0("ldc_reg_callback() returned errno %d", status);
		return (status);
	}

	if ((status = ldc_open(vd->ldc_handle)) != 0) {
		PR0("ldc_open() returned errno %d", status);
		return (status);
	}

	/* A failure to bring the channel up is logged but not fatal */
	if ((status = ldc_up(vd->ldc_handle)) != 0) {
		PR0("ldc_up() returned errno %d", status);
	}

	/* Allocate the inband task memory handle */
	status = ldc_mem_alloc_handle(vd->ldc_handle, &(vd->inband_task.mhdl));
	if (status) {
		PR0("ldc_mem_alloc_handle() returned err %d ", status);
		return (ENXIO);
	}

	/* Add the successfully-initialized vdisk to the server's table */
	if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) {
		PRN("Error adding vdisk ID %lu to table", id);
		return (EIO);
	}

	/* Allocate the staging buffer */
	vd->max_msglen	= sizeof (vio_msg_t);	/* baseline vio message size */
	vd->vio_msgp = kmem_alloc(vd->max_msglen, KM_SLEEP);

	/* store initial state */
	vd->state = VD_STATE_INIT;

	return (0);
}
3003 
3004 static void
3005 vd_free_dring_task(vd_t *vdp)
3006 {
3007 	if (vdp->dring_task != NULL) {
3008 		ASSERT(vdp->dring_len != 0);
3009 		/* Free all dring_task memory handles */
3010 		for (int i = 0; i < vdp->dring_len; i++) {
3011 			(void) ldc_mem_free_handle(vdp->dring_task[i].mhdl);
3012 			kmem_free(vdp->dring_task[i].msg, vdp->max_msglen);
3013 			vdp->dring_task[i].msg = NULL;
3014 		}
3015 		kmem_free(vdp->dring_task,
3016 		    (sizeof (*vdp->dring_task)) * vdp->dring_len);
3017 		vdp->dring_task = NULL;
3018 	}
3019 }
3020 
3021 /*
3022  * Destroy the state associated with a virtual disk
3023  */
3024 static void
3025 vds_destroy_vd(void *arg)
3026 {
3027 	vd_t	*vd = (vd_t *)arg;
3028 	int	retry = 0, rv;
3029 
3030 	if (vd == NULL)
3031 		return;
3032 
3033 	PR0("Destroying vdisk state");
3034 
3035 	if (vd->dk_efi.dki_data != NULL)
3036 		kmem_free(vd->dk_efi.dki_data, vd->dk_efi.dki_length);
3037 
3038 	/* Disable queuing requests for the vdisk */
3039 	if (vd->initialized & VD_LOCKING) {
3040 		mutex_enter(&vd->lock);
3041 		vd->enabled = 0;
3042 		mutex_exit(&vd->lock);
3043 	}
3044 
3045 	/* Drain and destroy start queue (*before* destroying completionq) */
3046 	if (vd->startq != NULL)
3047 		ddi_taskq_destroy(vd->startq);	/* waits for queued tasks */
3048 
3049 	/* Drain and destroy completion queue (*before* shutting down LDC) */
3050 	if (vd->completionq != NULL)
3051 		ddi_taskq_destroy(vd->completionq);	/* waits for tasks */
3052 
3053 	vd_free_dring_task(vd);
3054 
3055 	/* Free the inband task memory handle */
3056 	(void) ldc_mem_free_handle(vd->inband_task.mhdl);
3057 
3058 	/* Shut down LDC */
3059 	if (vd->initialized & VD_LDC) {
3060 		/* unmap the dring */
3061 		if (vd->initialized & VD_DRING)
3062 			(void) ldc_mem_dring_unmap(vd->dring_handle);
3063 
3064 		/* close LDC channel - retry on EAGAIN */
3065 		while ((rv = ldc_close(vd->ldc_handle)) == EAGAIN) {
3066 			if (++retry > vds_ldc_retries) {
3067 				PR0("Timed out closing channel");
3068 				break;
3069 			}
3070 			drv_usecwait(vds_ldc_delay);
3071 		}
3072 		if (rv == 0) {
3073 			(void) ldc_unreg_callback(vd->ldc_handle);
3074 			(void) ldc_fini(vd->ldc_handle);
3075 		} else {
3076 			/*
3077 			 * Closing the LDC channel has failed. Ideally we should
3078 			 * fail here but there is no Zeus level infrastructure
3079 			 * to handle this. The MD has already been changed and
3080 			 * we have to do the close. So we try to do as much
3081 			 * clean up as we can.
3082 			 */
3083 			(void) ldc_set_cb_mode(vd->ldc_handle, LDC_CB_DISABLE);
3084 			while (ldc_unreg_callback(vd->ldc_handle) == EAGAIN)
3085 				drv_usecwait(vds_ldc_delay);
3086 		}
3087 	}
3088 
3089 	/* Free the staging buffer for msgs */
3090 	if (vd->vio_msgp != NULL) {
3091 		kmem_free(vd->vio_msgp, vd->max_msglen);
3092 		vd->vio_msgp = NULL;
3093 	}
3094 
3095 	/* Free the inband message buffer */
3096 	if (vd->inband_task.msg != NULL) {
3097 		kmem_free(vd->inband_task.msg, vd->max_msglen);
3098 		vd->inband_task.msg = NULL;
3099 	}
3100 	if (vd->initialized & VD_DISK_READY) {
3101 		if (vd->file) {
3102 			/* Unmap and close file */
3103 			(void) as_unmap(&kas, vd->file_maddr, vd->file_size);
3104 			(void) VOP_CLOSE(vd->file_vnode, vd_open_flags, 1,
3105 			    0, kcred);
3106 			VN_RELE(vd->file_vnode);
3107 		} else {
3108 			/* Close any open backing-device slices */
3109 			for (uint_t slice = 0; slice < vd->nslices; slice++) {
3110 				if (vd->ldi_handle[slice] != NULL) {
3111 					PR0("Closing slice %u", slice);
3112 					(void) ldi_close(vd->ldi_handle[slice],
3113 					    vd_open_flags | FNDELAY, kcred);
3114 				}
3115 			}
3116 		}
3117 	}
3118 
3119 	/* Free lock */
3120 	if (vd->initialized & VD_LOCKING)
3121 		mutex_destroy(&vd->lock);
3122 
3123 	/* Finally, free the vdisk structure itself */
3124 	kmem_free(vd, sizeof (*vd));
3125 }
3126 
3127 static int
3128 vds_init_vd(vds_t *vds, uint64_t id, char *device_path, uint64_t ldc_id)
3129 {
3130 	int	status;
3131 	vd_t	*vd = NULL;
3132 
3133 
3134 	if ((status = vds_do_init_vd(vds, id, device_path, ldc_id, &vd)) != 0)
3135 		vds_destroy_vd(vd);
3136 
3137 	return (status);
3138 }
3139 
3140 static int
3141 vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel,
3142     uint64_t *ldc_id)
3143 {
3144 	int	num_channels;
3145 
3146 
3147 	/* Look for channel endpoint child(ren) of the vdisk MD node */
3148 	if ((num_channels = md_scan_dag(md, vd_node,
3149 		    md_find_name(md, VD_CHANNEL_ENDPOINT),
3150 		    md_find_name(md, "fwd"), channel)) <= 0) {
3151 		PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT);
3152 		return (-1);
3153 	}
3154 
3155 	/* Get the "id" value for the first channel endpoint node */
3156 	if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) {
3157 		PRN("No \"%s\" property found for \"%s\" of vdisk",
3158 		    VD_ID_PROP, VD_CHANNEL_ENDPOINT);
3159 		return (-1);
3160 	}
3161 
3162 	if (num_channels > 1) {
3163 		PRN("Using ID of first of multiple channels for this vdisk");
3164 	}
3165 
3166 	return (0);
3167 }
3168 
3169 static int
3170 vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id)
3171 {
3172 	int		num_nodes, status;
3173 	size_t		size;
3174 	mde_cookie_t	*channel;
3175 
3176 
3177 	if ((num_nodes = md_node_count(md)) <= 0) {
3178 		PRN("Invalid node count in Machine Description subtree");
3179 		return (-1);
3180 	}
3181 	size = num_nodes*(sizeof (*channel));
3182 	channel = kmem_zalloc(size, KM_SLEEP);
3183 	status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id);
3184 	kmem_free(channel, size);
3185 
3186 	return (status);
3187 }
3188 
3189 static void
3190 vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
3191 {
3192 	char		*device_path = NULL;
3193 	uint64_t	id = 0, ldc_id = 0;
3194 
3195 
3196 	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
3197 		PRN("Error getting vdisk \"%s\"", VD_ID_PROP);
3198 		return;
3199 	}
3200 	PR0("Adding vdisk ID %lu", id);
3201 	if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP,
3202 		&device_path) != 0) {
3203 		PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
3204 		return;
3205 	}
3206 
3207 	if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) {
3208 		PRN("Error getting LDC ID for vdisk %lu", id);
3209 		return;
3210 	}
3211 
3212 	if (vds_init_vd(vds, id, device_path, ldc_id) != 0) {
3213 		PRN("Failed to add vdisk ID %lu", id);
3214 		return;
3215 	}
3216 }
3217 
3218 static void
3219 vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
3220 {
3221 	uint64_t	id = 0;
3222 
3223 
3224 	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
3225 		PRN("Unable to get \"%s\" property from vdisk's MD node",
3226 		    VD_ID_PROP);
3227 		return;
3228 	}
3229 	PR0("Removing vdisk ID %lu", id);
3230 	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
3231 		PRN("No vdisk entry found for vdisk ID %lu", id);
3232 }
3233 
3234 static void
3235 vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node,
3236     md_t *curr_md, mde_cookie_t curr_vd_node)
3237 {
3238 	char		*curr_dev, *prev_dev;
3239 	uint64_t	curr_id = 0, curr_ldc_id = 0;
3240 	uint64_t	prev_id = 0, prev_ldc_id = 0;
3241 	size_t		len;
3242 
3243 
3244 	/* Validate that vdisk ID has not changed */
3245 	if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) {
3246 		PRN("Error getting previous vdisk \"%s\" property",
3247 		    VD_ID_PROP);
3248 		return;
3249 	}
3250 	if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) {
3251 		PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP);
3252 		return;
3253 	}
3254 	if (curr_id != prev_id) {
3255 		PRN("Not changing vdisk:  ID changed from %lu to %lu",
3256 		    prev_id, curr_id);
3257 		return;
3258 	}
3259 
3260 	/* Validate that LDC ID has not changed */
3261 	if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) {
3262 		PRN("Error getting LDC ID for vdisk %lu", prev_id);
3263 		return;
3264 	}
3265 
3266 	if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) {
3267 		PRN("Error getting LDC ID for vdisk %lu", curr_id);
3268 		return;
3269 	}
3270 	if (curr_ldc_id != prev_ldc_id) {
3271 		_NOTE(NOTREACHED);	/* lint is confused */
3272 		PRN("Not changing vdisk:  "
3273 		    "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id);
3274 		return;
3275 	}
3276 
3277 	/* Determine whether device path has changed */
3278 	if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP,
3279 		&prev_dev) != 0) {
3280 		PRN("Error getting previous vdisk \"%s\"",
3281 		    VD_BLOCK_DEVICE_PROP);
3282 		return;
3283 	}
3284 	if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP,
3285 		&curr_dev) != 0) {
3286 		PRN("Error getting current vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
3287 		return;
3288 	}
3289 	if (((len = strlen(curr_dev)) == strlen(prev_dev)) &&
3290 	    (strncmp(curr_dev, prev_dev, len) == 0))
3291 		return;	/* no relevant (supported) change */
3292 
3293 	PR0("Changing vdisk ID %lu", prev_id);
3294 
3295 	/* Remove old state, which will close vdisk and reset */
3296 	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0)
3297 		PRN("No entry found for vdisk ID %lu", prev_id);
3298 
3299 	/* Re-initialize vdisk with new state */
3300 	if (vds_init_vd(vds, curr_id, curr_dev, curr_ldc_id) != 0) {
3301 		PRN("Failed to change vdisk ID %lu", curr_id);
3302 		return;
3303 	}
3304 }
3305 
3306 static int
3307 vds_process_md(void *arg, mdeg_result_t *md)
3308 {
3309 	int	i;
3310 	vds_t	*vds = arg;
3311 
3312 
3313 	if (md == NULL)
3314 		return (MDEG_FAILURE);
3315 	ASSERT(vds != NULL);
3316 
3317 	for (i = 0; i < md->removed.nelem; i++)
3318 		vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]);
3319 	for (i = 0; i < md->match_curr.nelem; i++)
3320 		vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i],
3321 		    md->match_curr.mdp, md->match_curr.mdep[i]);
3322 	for (i = 0; i < md->added.nelem; i++)
3323 		vds_add_vd(vds, md->added.mdp, md->added.mdep[i]);
3324 
3325 	return (MDEG_SUCCESS);
3326 }
3327 
3328 
3329 static int
3330 vds_do_attach(dev_info_t *dip)
3331 {
3332 	int			status, sz;
3333 	int			cfg_handle;
3334 	minor_t			instance = ddi_get_instance(dip);
3335 	vds_t			*vds;
3336 	mdeg_prop_spec_t	*pspecp;
3337 	mdeg_node_spec_t	*ispecp;
3338 
3339 	/*
3340 	 * The "cfg-handle" property of a vds node in an MD contains the MD's
3341 	 * notion of "instance", or unique identifier, for that node; OBP
3342 	 * stores the value of the "cfg-handle" MD property as the value of
3343 	 * the "reg" property on the node in the device tree it builds from
3344 	 * the MD and passes to Solaris.  Thus, we look up the devinfo node's
3345 	 * "reg" property value to uniquely identify this device instance when
3346 	 * registering with the MD event-generation framework.  If the "reg"
3347 	 * property cannot be found, the device tree state is presumably so
3348 	 * broken that there is no point in continuing.
3349 	 */
3350 	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
3351 		VD_REG_PROP)) {
3352 		PRN("vds \"%s\" property does not exist", VD_REG_PROP);
3353 		return (DDI_FAILURE);
3354 	}
3355 
3356 	/* Get the MD instance for later MDEG registration */
3357 	cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
3358 	    VD_REG_PROP, -1);
3359 
3360 	if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) {
3361 		PRN("Could not allocate state for instance %u", instance);
3362 		return (DDI_FAILURE);
3363 	}
3364 
3365 	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
3366 		PRN("Could not get state for instance %u", instance);
3367 		ddi_soft_state_free(vds_state, instance);
3368 		return (DDI_FAILURE);
3369 	}
3370 
3371 
3372 	vds->dip	= dip;
3373 	vds->vd_table	= mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS,
3374 							vds_destroy_vd,
3375 							sizeof (void *));
3376 	ASSERT(vds->vd_table != NULL);
3377 
3378 	if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) {
3379 		PRN("ldi_ident_from_dip() returned errno %d", status);
3380 		return (DDI_FAILURE);
3381 	}
3382 	vds->initialized |= VDS_LDI;
3383 
3384 	/* Register for MD updates */
3385 	sz = sizeof (vds_prop_template);
3386 	pspecp = kmem_alloc(sz, KM_SLEEP);
3387 	bcopy(vds_prop_template, pspecp, sz);
3388 
3389 	VDS_SET_MDEG_PROP_INST(pspecp, cfg_handle);
3390 
3391 	/* initialize the complete prop spec structure */
3392 	ispecp = kmem_zalloc(sizeof (mdeg_node_spec_t), KM_SLEEP);
3393 	ispecp->namep = "virtual-device";
3394 	ispecp->specp = pspecp;
3395 
3396 	if (mdeg_register(ispecp, &vd_match, vds_process_md, vds,
3397 		&vds->mdeg) != MDEG_SUCCESS) {
3398 		PRN("Unable to register for MD updates");
3399 		kmem_free(ispecp, sizeof (mdeg_node_spec_t));
3400 		kmem_free(pspecp, sz);
3401 		return (DDI_FAILURE);
3402 	}
3403 
3404 	vds->ispecp = ispecp;
3405 	vds->initialized |= VDS_MDEG;
3406 
3407 	/* Prevent auto-detaching so driver is available whenever MD changes */
3408 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
3409 	    DDI_PROP_SUCCESS) {
3410 		PRN("failed to set \"%s\" property for instance %u",
3411 		    DDI_NO_AUTODETACH, instance);
3412 	}
3413 
3414 	ddi_report_dev(dip);
3415 	return (DDI_SUCCESS);
3416 }
3417 
3418 static int
3419 vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
3420 {
3421 	int	status;
3422 
3423 	switch (cmd) {
3424 	case DDI_ATTACH:
3425 		PR0("Attaching");
3426 		if ((status = vds_do_attach(dip)) != DDI_SUCCESS)
3427 			(void) vds_detach(dip, DDI_DETACH);
3428 		return (status);
3429 	case DDI_RESUME:
3430 		PR0("No action required for DDI_RESUME");
3431 		return (DDI_SUCCESS);
3432 	default:
3433 		return (DDI_FAILURE);
3434 	}
3435 }
3436 
3437 static struct dev_ops vds_ops = {
3438 	DEVO_REV,	/* devo_rev */
3439 	0,		/* devo_refcnt */
3440 	ddi_no_info,	/* devo_getinfo */
3441 	nulldev,	/* devo_identify */
3442 	nulldev,	/* devo_probe */
3443 	vds_attach,	/* devo_attach */
3444 	vds_detach,	/* devo_detach */
3445 	nodev,		/* devo_reset */
3446 	NULL,		/* devo_cb_ops */
3447 	NULL,		/* devo_bus_ops */
3448 	nulldev		/* devo_power */
3449 };
3450 
3451 static struct modldrv modldrv = {
3452 	&mod_driverops,
3453 	"virtual disk server v%I%",
3454 	&vds_ops,
3455 };
3456 
3457 static struct modlinkage modlinkage = {
3458 	MODREV_1,
3459 	&modldrv,
3460 	NULL
3461 };
3462 
3463 
3464 int
3465 _init(void)
3466 {
3467 	int		i, status;
3468 
3469 
3470 	if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0)
3471 		return (status);
3472 	if ((status = mod_install(&modlinkage)) != 0) {
3473 		ddi_soft_state_fini(&vds_state);
3474 		return (status);
3475 	}
3476 
3477 	/* Fill in the bit-mask of server-supported operations */
3478 	for (i = 0; i < vds_noperations; i++)
3479 		vds_operations |= 1 << (vds_operation[i].operation - 1);
3480 
3481 	return (0);
3482 }
3483 
3484 int
3485 _info(struct modinfo *modinfop)
3486 {
3487 	return (mod_info(&modlinkage, modinfop));
3488 }
3489 
3490 int
3491 _fini(void)
3492 {
3493 	int	status;
3494 
3495 
3496 	if ((status = mod_remove(&modlinkage)) != 0)
3497 		return (status);
3498 	ddi_soft_state_fini(&vds_state);
3499 	return (0);
3500 }
3501