xref: /titanic_50/usr/src/uts/sun4v/io/vds.c (revision b94bb0f0e78c11b6013e1a33c11fd73901947bfc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Virtual disk server
31  */
32 
33 
34 #include <sys/types.h>
35 #include <sys/conf.h>
36 #include <sys/ddi.h>
37 #include <sys/dkio.h>
38 #include <sys/file.h>
39 #include <sys/mdeg.h>
40 #include <sys/modhash.h>
41 #include <sys/note.h>
42 #include <sys/pathname.h>
43 #include <sys/sunddi.h>
44 #include <sys/sunldi.h>
45 #include <sys/sysmacros.h>
46 #include <sys/vio_common.h>
47 #include <sys/vdsk_mailbox.h>
48 #include <sys/vdsk_common.h>
49 #include <sys/vtoc.h>
50 #include <sys/scsi/impl/uscsi.h>
51 
52 
53 /* Virtual disk server initialization flags */
54 #define	VDS_LOCKING		0x01
55 #define	VDS_LDI			0x02
56 #define	VDS_MDEG		0x04
57 
58 /* Virtual disk server tunable parameters */
59 #define	VDS_LDC_RETRIES		3
60 #define	VDS_NCHAINS		32
61 
62 /* Identification parameters for MD, synthetic dkio(7i) structures, etc. */
63 #define	VDS_NAME		"virtual-disk-server"
64 
65 #define	VD_NAME			"vd"
66 #define	VD_VOLUME_NAME		"vdisk"
67 #define	VD_ASCIILABEL		"Virtual Disk"
68 
69 #define	VD_CHANNEL_ENDPOINT	"channel-endpoint"
70 #define	VD_ID_PROP		"id"
71 #define	VD_BLOCK_DEVICE_PROP	"vds-block-device"
72 
73 /* Virtual disk initialization flags */
74 #define	VD_LOCKING		0x01
75 #define	VD_TASKQ		0x02
76 #define	VD_LDC			0x04
77 #define	VD_DRING		0x08
78 #define	VD_SID			0x10
79 #define	VD_SEQ_NUM		0x20
80 
81 /* Flags for opening/closing backing devices via LDI */
82 #define	VD_OPEN_FLAGS		(FEXCL | FREAD | FWRITE)
83 
84 /*
85  * By Solaris convention, slice/partition 2 represents the entire disk;
86  * unfortunately, this convention does not appear to be codified.
87  */
88 #define	VD_ENTIRE_DISK_SLICE	2
89 
90 /* Return a cpp token as a string */
91 #define	STRINGIZE(token)	#token
92 
93 /*
94  * Print a message prefixed with the current function name to the message log
95  * (and optionally to the console for verbose boots); these macros use cpp's
96  * concatenation of string literals and C99 variable-length-argument-list
97  * macros
98  */
99 #define	PRN(...)	_PRN("?%s():  "__VA_ARGS__, "")
100 #define	_PRN(format, ...)					\
101 	cmn_err(CE_CONT, format"%s", __func__, __VA_ARGS__)
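
/*
 * For example (illustrative only), PRN("slice %u", slice) expands to
 *
 *	cmn_err(CE_CONT, "?%s():  " "slice %u" "%s", __func__, slice, "");
 *
 * so each message is prefixed with the name of the calling function, and
 * the trailing "" keeps the expansion valid even when PRN() is passed only
 * a format string
 */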
102 
103 /* Return a pointer to the "i"th vdisk dring element */
104 #define	VD_DRING_ELEM(i)	((vd_dring_entry_t *)(void *)	\
105 	    (vd->dring + (i)*vd->descriptor_size))
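
/*
 * Note that this macro computes a byte offset using the client-negotiated
 * "descriptor_size" rather than C array indexing, and that it expects a
 * local variable named "vd" to be in scope at the call site
 */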
106 
107 /* Return the virtual disk client's type as a string (for use in messages) */
108 #define	VD_CLIENT(vd)							\
109 	(((vd)->xfer_mode == VIO_DESC_MODE) ? "in-band client" :	\
110 	    (((vd)->xfer_mode == VIO_DRING_MODE) ? "dring client" :	\
111 		(((vd)->xfer_mode == 0) ? "null client" :		\
112 		    "unsupported client")))
113 
114 /* Debugging macros */
115 #ifdef DEBUG
116 #define	PR0 if (vd_msglevel > 0)	PRN
117 #define	PR1 if (vd_msglevel > 1)	PRN
118 #define	PR2 if (vd_msglevel > 2)	PRN
119 
120 #define	VD_DUMP_DRING_ELEM(elem)					\
121 	PRN("dst:%x op:%x st:%u nb:%lx addr:%lx ncook:%u\n",		\
122 	    elem->hdr.dstate,						\
123 	    elem->payload.operation,					\
124 	    elem->payload.status,					\
125 	    elem->payload.nbytes,					\
126 	    elem->payload.addr,						\
127 	    elem->payload.ncookies)
128 
129 #else	/* !DEBUG */
130 #define	PR0(...)
131 #define	PR1(...)
132 #define	PR2(...)
133 
134 #define	VD_DUMP_DRING_ELEM(elem)
135 
136 #endif	/* DEBUG */
137 
138 
139 typedef struct vds {
140 	uint_t		initialized;	/* driver inst initialization flags */
141 	dev_info_t	*dip;		/* driver inst devinfo pointer */
142 	kmutex_t	lock;		/* lock for this structure */
143 	ldi_ident_t	ldi_ident;	/* driver's identifier for LDI */
144 	mod_hash_t	*vd_table;	/* table of virtual disks served */
145 	mdeg_handle_t	mdeg;		/* handle for MDEG operations  */
146 } vds_t;
147 
148 typedef struct vd {
149 	uint_t			initialized;	/* vdisk initialization flags */
150 	kmutex_t		lock;		/* lock for this structure */
151 	vds_t			*vds;		/* server for this vdisk */
152 	ddi_taskq_t		*taskq;		/* taskq for this vdisk */
153 	ldi_handle_t		ldi_handle[V_NUMPAR];	/* LDI slice handles */
154 	dev_t			dev[V_NUMPAR];	/* dev numbers for slices */
155 	uint_t			nslices;	/* number of slices */
156 	size_t			vdisk_size;	/* size of vdisk in bytes */
157 	vd_disk_type_t		vdisk_type;	/* slice or entire disk */
158 	boolean_t		pseudo;		/* underlying pseudo dev */
159 	struct dk_geom		dk_geom;	/* synthetic for slice type */
160 	struct vtoc		vtoc;		/* synthetic for slice type */
161 	ldc_status_t		ldc_state;	/* LDC connection state */
162 	ldc_handle_t		ldc_handle;	/* handle for LDC comm */
163 	size_t			max_msglen;	/* largest LDC message len */
164 	boolean_t		enabled;	/* whether vdisk is enabled */
165 	vd_state_t		state;		/* client handshake state */
166 	uint8_t			xfer_mode;	/* transfer mode with client */
167 	uint32_t		sid;		/* client's session ID */
168 	uint64_t		seq_num;	/* message sequence number */
169 	uint64_t		dring_ident;	/* identifier of dring */
170 	ldc_dring_handle_t	dring_handle;	/* handle for dring ops */
171 	uint32_t		descriptor_size;	/* num bytes in desc */
172 	uint32_t		dring_len;	/* number of dring elements */
173 	caddr_t			dring;		/* address of dring */
174 } vd_t;
175 
176 typedef struct vds_operation {
177 	uint8_t	operation;
178 	int	(*function)(vd_t *vd, vd_dring_payload_t *request);
179 } vds_operation_t;
180 
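/*
 * Describes how a client operation maps to a native ioctl():  "copy" is a
 * bitmask of VD_COPYIN and/or VD_COPYOUT giving the direction(s) in which
 * "arg" data is copied between client and server, and "nbytes" bounds the
 * size of the client's data for the operation
 */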
181 typedef struct ioctl {
182 	uint8_t		operation;
183 	const char	*operation_name;
184 	int		cmd;
185 	const char	*cmd_name;
186 	uint_t		copy;
187 	size_t		nbytes;
188 } ioctl_t;
189 
190 
191 static int	vds_ldc_retries = VDS_LDC_RETRIES;
192 static void	*vds_state;
193 static uint64_t	vds_operations;	/* see vds_operation[] definition below */
194 
195 static int	vd_open_flags = VD_OPEN_FLAGS;
196 
197 #ifdef DEBUG
198 static int	vd_msglevel;
199 #endif /* DEBUG */
200 
201 
202 static int
203 vd_bread(vd_t *vd, vd_dring_payload_t *request)
204 {
205 	int		status;
206 	struct buf	buf;
207 
208 	PR1("Read %lu bytes at block %lu", request->nbytes, request->addr);
209 	if (request->nbytes == 0)
210 		return (EINVAL);	/* no service for trivial requests */
211 	ASSERT(mutex_owned(&vd->lock));
212 	ASSERT(request->slice < vd->nslices);
213 
214 	bioinit(&buf);
215 	buf.b_flags	= B_BUSY | B_READ;
216 	buf.b_bcount	= request->nbytes;
217 	buf.b_un.b_addr = kmem_alloc(buf.b_bcount, KM_SLEEP);
218 	buf.b_lblkno	= request->addr;
219 	buf.b_edev	= vd->dev[request->slice];
220 
221 	if ((status = ldi_strategy(vd->ldi_handle[request->slice], &buf)) == 0)
222 		status = biowait(&buf);
223 	biofini(&buf);
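	/* On success, copy the data just read out to the client's memory */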
224 	if ((status == 0) &&
225 	    ((status = ldc_mem_copy(vd->ldc_handle, buf.b_un.b_addr, 0,
226 		    &request->nbytes, request->cookie, request->ncookies,
227 		    LDC_COPY_OUT)) != 0)) {
228 		PRN("ldc_mem_copy() returned errno %d copying to client",
229 		    status);
230 	}
231 	kmem_free(buf.b_un.b_addr, buf.b_bcount);	/* nbytes can change */
232 	return (status);
233 }
234 
235 static int
236 vd_do_bwrite(vd_t *vd, uint_t slice, diskaddr_t block, size_t nbytes,
237     ldc_mem_cookie_t *cookie, uint64_t ncookies, caddr_t data)
238 {
239 	int		status;
240 	struct buf	buf;
241 
242 	ASSERT(mutex_owned(&vd->lock));
243 	ASSERT(slice < vd->nslices);
244 	ASSERT(nbytes != 0);
245 	ASSERT(data != NULL);
246 
247 	/* Get data from client */
248 	if ((status = ldc_mem_copy(vd->ldc_handle, data, 0, &nbytes,
249 		    cookie, ncookies, LDC_COPY_IN)) != 0) {
250 		PRN("ldc_mem_copy() returned errno %d copying from client",
251 		    status);
252 		return (status);
253 	}
254 
255 	bioinit(&buf);
256 	buf.b_flags	= B_BUSY | B_WRITE;
257 	buf.b_bcount	= nbytes;
258 	buf.b_un.b_addr	= data;
259 	buf.b_lblkno	= block;
260 	buf.b_edev	= vd->dev[slice];
261 
262 	if ((status = ldi_strategy(vd->ldi_handle[slice], &buf)) == 0)
263 		status = biowait(&buf);
264 	biofini(&buf);
265 	return (status);
266 }
267 
268 static int
269 vd_bwrite(vd_t *vd, vd_dring_payload_t *request)
270 {
271 	int	status;
272 	caddr_t	data;
273 
274 
275 	PR1("Write %ld bytes at block %lu", request->nbytes, request->addr);
276 	if (request->nbytes == 0)
277 		return (EINVAL);	/* no service for trivial requests */
278 	data = kmem_alloc(request->nbytes, KM_SLEEP);
279 	status = vd_do_bwrite(vd, request->slice, request->addr,
280 	    request->nbytes, request->cookie, request->ncookies, data);
281 	kmem_free(data, request->nbytes);
282 	return (status);
283 }
284 
285 static int
286 vd_do_slice_ioctl(vd_t *vd, int cmd, void *buf)
287 {
288 	switch (cmd) {
289 	case DKIOCGGEOM:
290 		ASSERT(buf != NULL);
291 		bcopy(&vd->dk_geom, buf, sizeof (vd->dk_geom));
292 		return (0);
293 	case DKIOCGVTOC:
294 		ASSERT(buf != NULL);
295 		bcopy(&vd->vtoc, buf, sizeof (vd->vtoc));
296 		return (0);
297 	default:
298 		return (ENOTSUP);
299 	}
300 }
301 
302 static int
303 vd_do_ioctl(vd_t *vd, vd_dring_payload_t *request, void* buf, ioctl_t *ioctl)
304 {
305 	int	rval = 0, status;
306 	size_t	nbytes = request->nbytes;	/* modifiable copy */
307 
308 
309 	ASSERT(mutex_owned(&vd->lock));
310 	ASSERT(request->slice < vd->nslices);
311 	PR0("Performing %s", ioctl->operation_name);
312 
313 	/* Get data from client, if necessary */
314 	if (ioctl->copy & VD_COPYIN)  {
315 		ASSERT(nbytes != 0 && buf != NULL);
316 		PR1("Getting \"arg\" data from client");
317 		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
318 			    request->cookie, request->ncookies,
319 			    LDC_COPY_IN)) != 0) {
320 			PRN("ldc_mem_copy() returned errno %d "
321 			    "copying from client", status);
322 			return (status);
323 		}
324 	}
325 
326 	/*
327 	 * Handle single-slice block devices internally; otherwise, have the
328 	 * real driver perform the ioctl()
329 	 */
330 	if (vd->vdisk_type == VD_DISK_TYPE_SLICE && !vd->pseudo) {
331 		if ((status = vd_do_slice_ioctl(vd, ioctl->cmd, buf)) != 0)
332 			return (status);
333 	} else if ((status = ldi_ioctl(vd->ldi_handle[request->slice],
334 		    ioctl->cmd, (intptr_t)buf, FKIOCTL, kcred, &rval)) != 0) {
335 		PR0("ldi_ioctl(%s) = errno %d", ioctl->cmd_name, status);
336 		return (status);
337 	}
338 #ifdef DEBUG
339 	if (rval != 0) {
340 		PRN("%s set rval = %d, which is not being returned to client",
341 		    ioctl->cmd_name, rval);
342 	}
343 #endif /* DEBUG */
344 
345 	/* Send data to client, if necessary */
346 	if (ioctl->copy & VD_COPYOUT)  {
347 		ASSERT(nbytes != 0 && buf != NULL);
348 		PR1("Sending \"arg\" data to client");
349 		if ((status = ldc_mem_copy(vd->ldc_handle, buf, 0, &nbytes,
350 			    request->cookie, request->ncookies,
351 			    LDC_COPY_OUT)) != 0) {
352 			PRN("ldc_mem_copy() returned errno %d "
353 			    "copying to client", status);
354 			return (status);
355 		}
356 	}
357 
358 	return (status);
359 }
360 
361 #define	RNDSIZE(expr) P2ROUNDUP(sizeof (expr), sizeof (uint64_t))
362 static int
363 vd_ioctl(vd_t *vd, vd_dring_payload_t *request)
364 {
365 	static ioctl_t	ioctl[] = {
366 		/* Command (no-copy) operations */
367 		{VD_OP_FLUSH, STRINGIZE(VD_OP_FLUSH), DKIOCFLUSHWRITECACHE,
368 		    STRINGIZE(DKIOCFLUSHWRITECACHE), 0, 0},
369 
370 		/* "Get" (copy-out) operations */
371 		{VD_OP_GET_WCE, STRINGIZE(VD_OP_GET_WCE), DKIOCGETWCE,
372 		    STRINGIZE(DKIOCGETWCE), VD_COPYOUT, RNDSIZE(int)},
373 		{VD_OP_GET_DISKGEOM, STRINGIZE(VD_OP_GET_DISKGEOM), DKIOCGGEOM,
374 		    STRINGIZE(DKIOCGGEOM), VD_COPYOUT, RNDSIZE(struct dk_geom)},
375 		{VD_OP_GET_VTOC, STRINGIZE(VD_OP_GET_VTOC), DKIOCGVTOC,
376 		    STRINGIZE(DKIOCGVTOC), VD_COPYOUT, RNDSIZE(struct vtoc)},
377 
378 		/* "Set" (copy-in) operations */
379 		{VD_OP_SET_WCE, STRINGIZE(VD_OP_SET_WCE), DKIOCSETWCE,
380 		    STRINGIZE(DKIOCSETWCE), VD_COPYIN, RNDSIZE(int)},
381 		{VD_OP_SET_DISKGEOM, STRINGIZE(VD_OP_SET_DISKGEOM), DKIOCSGEOM,
382 		    STRINGIZE(DKIOCSGEOM), VD_COPYIN, RNDSIZE(struct dk_geom)},
383 		{VD_OP_SET_VTOC, STRINGIZE(VD_OP_SET_VTOC), DKIOCSVTOC,
384 		    STRINGIZE(DKIOCSVTOC), VD_COPYIN, RNDSIZE(struct vtoc)},
385 
386 		/* "Get/set" (copy-in/copy-out) operations */
387 		{VD_OP_SCSICMD, STRINGIZE(VD_OP_SCSICMD), USCSICMD,
388 		    STRINGIZE(USCSICMD), VD_COPYIN|VD_COPYOUT,
389 		    RNDSIZE(struct uscsi_cmd)}
390 
391 	};
392 	int		i, status;
393 	void		*buf = NULL;
394 	size_t		nioctls = (sizeof (ioctl))/(sizeof (ioctl[0]));
395 
396 
397 	ASSERT(mutex_owned(&vd->lock));
398 	ASSERT(request->slice < vd->nslices);
399 
400 	/*
401 	 * Determine ioctl corresponding to caller's "operation" and
402 	 * validate caller's "nbytes"
403 	 */
404 	for (i = 0; i < nioctls; i++) {
405 		if (request->operation == ioctl[i].operation) {
406 			if (request->nbytes > ioctl[i].nbytes) {
407 				PRN("%s:  Expected <= %lu \"nbytes\", "
408 				    "got %lu", ioctl[i].operation_name,
409 				    ioctl[i].nbytes, request->nbytes);
410 				return (EINVAL);
411 			} else if ((request->nbytes % sizeof (uint64_t)) != 0) {
412 				PRN("%s:  nbytes = %lu not a multiple of %lu",
413 				    ioctl[i].operation_name, request->nbytes,
414 				    sizeof (uint64_t));
415 				return (EINVAL);
416 			}
417 
418 			break;
419 		}
420 	}
421 	ASSERT(i < nioctls);	/* because "operation" already validated */
422 
423 	if (request->nbytes)
424 		buf = kmem_zalloc(request->nbytes, KM_SLEEP);
425 	status = vd_do_ioctl(vd, request, buf, &ioctl[i]);
426 	if (request->nbytes)
427 		kmem_free(buf, request->nbytes);
428 	return (status);
429 }
430 
431 /*
432  * Define the supported operations once the functions for performing them have
433  * been defined
434  */
435 static const vds_operation_t	vds_operation[] = {
436 	{VD_OP_BREAD,		vd_bread},
437 	{VD_OP_BWRITE,		vd_bwrite},
438 	{VD_OP_FLUSH,		vd_ioctl},
439 	{VD_OP_GET_WCE,		vd_ioctl},
440 	{VD_OP_SET_WCE,		vd_ioctl},
441 	{VD_OP_GET_VTOC,	vd_ioctl},
442 	{VD_OP_SET_VTOC,	vd_ioctl},
443 	{VD_OP_GET_DISKGEOM,	vd_ioctl},
444 	{VD_OP_SET_DISKGEOM,	vd_ioctl},
445 	{VD_OP_SCSICMD,		vd_ioctl}
446 };
447 
448 static const size_t	vds_noperations =
449 	(sizeof (vds_operation))/(sizeof (vds_operation[0]));
450 
451 /*
452  * Process a request using a defined operation
453  */
454 static int
455 vd_process_request(vd_t *vd, vd_dring_payload_t *request)
456 {
457 	int	i;
458 
459 
460 	PR1("Entered");
461 	ASSERT(mutex_owned(&vd->lock));
462 
463 	/* Range-check slice */
464 	if (request->slice >= vd->nslices) {
465 		PRN("Invalid \"slice\" %u (max %u) for virtual disk",
466 		    request->slice, (vd->nslices - 1));
467 		return (EINVAL);
468 	}
469 
470 	/* Perform the requested operation */
471 	for (i = 0; i < vds_noperations; i++)
472 		if (request->operation == vds_operation[i].operation)
473 			return (vds_operation[i].function(vd, request));
474 
475 	/* No matching operation found */
476 	PRN("Unsupported operation %u", request->operation);
477 	return (ENOTSUP);
478 }
479 
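/*
 * Send a message over the LDC channel, retrying up to "vds_ldc_retries"
 * times while the channel's transmit queue remains full (EWOULDBLOCK); a
 * short write is treated as an error (EIO)
 */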
480 static int
481 send_msg(ldc_handle_t ldc_handle, void *msg, size_t msglen)
482 {
483 	int	retry, status;
484 	size_t	nbytes;
485 
486 
487 	for (retry = 0, status = EWOULDBLOCK;
488 	    retry < vds_ldc_retries && status == EWOULDBLOCK;
489 	    retry++) {
490 		PR1("ldc_write() attempt %d", (retry + 1));
491 		nbytes = msglen;
492 		status = ldc_write(ldc_handle, msg, &nbytes);
493 	}
494 
495 	if (status != 0) {
496 		PRN("ldc_write() returned errno %d", status);
497 		return (status);
498 	} else if (nbytes != msglen) {
499 		PRN("ldc_write() performed only partial write");
500 		return (EIO);
501 	}
502 
503 	PR1("SENT %lu bytes", msglen);
504 	return (0);
505 }
506 
507 /*
508  * Return 1 if the "type", "subtype", and "env" fields of the first argument
509  * "tag" match the corresponding remaining arguments; otherwise, return 0
510  */
511 int
512 vd_msgtype(vio_msg_tag_t *tag, int type, int subtype, int env)
513 {
514 	return (((tag->vio_msgtype == type) &&
515 	    (tag->vio_subtype == subtype) &&
516 	    (tag->vio_subtype_env == env)) ? 1 : 0);
517 }
518 
519 static int
520 process_ver_msg(vio_msg_t *msg, size_t msglen)
521 {
522 	vio_ver_msg_t	*ver_msg = (vio_ver_msg_t *)msg;
523 
524 
525 	ASSERT(msglen >= sizeof (msg->tag));
526 
527 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
528 		VIO_VER_INFO)) {
529 		return (ENOMSG);	/* not a version message */
530 	}
531 
532 	if (msglen != sizeof (*ver_msg)) {
533 		PRN("Expected %lu-byte version message; "
534 		    "received %lu bytes", sizeof (*ver_msg), msglen);
535 		return (EBADMSG);
536 	}
537 
538 	if (ver_msg->dev_class != VDEV_DISK) {
539 		PRN("Expected device class %u (disk); received %u",
540 		    VDEV_DISK, ver_msg->dev_class);
541 		return (EBADMSG);
542 	}
543 
544 	if ((ver_msg->ver_major != VD_VER_MAJOR) ||
545 	    (ver_msg->ver_minor != VD_VER_MINOR)) {
546 		/* Unsupported version; send back supported version */
547 		ver_msg->ver_major = VD_VER_MAJOR;
548 		ver_msg->ver_minor = VD_VER_MINOR;
549 		return (EBADMSG);
550 	}
551 
552 	/* Valid message, version accepted */
553 	ver_msg->dev_class = VDEV_DISK_SERVER;
554 	return (0);
555 }
556 
557 static int
558 vd_process_attr_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
559 {
560 	vd_attr_msg_t	*attr_msg = (vd_attr_msg_t *)msg;
561 
562 
563 	PR0("Entered");
564 	ASSERT(mutex_owned(&vd->lock));
565 	ASSERT(msglen >= sizeof (msg->tag));
566 
567 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
568 		VIO_ATTR_INFO)) {
569 		return (ENOMSG);	/* not an attribute message */
570 	}
571 
572 	if (msglen != sizeof (*attr_msg)) {
573 		PRN("Expected %lu-byte attribute message; "
574 		    "received %lu bytes", sizeof (*attr_msg), msglen);
575 		return (EBADMSG);
576 	}
577 
578 	if (attr_msg->max_xfer_sz == 0) {
579 		PRN("Received maximum transfer size of 0 from client");
580 		return (EBADMSG);
581 	}
582 
583 	if ((attr_msg->xfer_mode != VIO_DESC_MODE) &&
584 	    (attr_msg->xfer_mode != VIO_DRING_MODE)) {
585 		PRN("Client requested unsupported transfer mode");
586 		return (EBADMSG);
587 	}
588 
589 
590 	/* Success:  valid message and transfer mode */
591 	vd->xfer_mode = attr_msg->xfer_mode;
592 	if (vd->xfer_mode == VIO_DESC_MODE) {
593 		/*
594 		 * The vd_dring_inband_msg_t contains one cookie; need room
595 		 * for up to n-1 more cookies, where "n" is the number of full
596 		 * pages plus possibly one partial page required to cover
597 		 * "max_xfer_sz".  Add room for one more cookie if
598 		 * "max_xfer_sz" isn't an integral multiple of the page size.
599 		 * Must first get the maximum transfer size in bytes.
600 		 */
601 #if 1	/* NEWOBP */
602 		size_t	max_xfer_bytes = attr_msg->vdisk_block_size ?
603 		    attr_msg->vdisk_block_size*attr_msg->max_xfer_sz :
604 		    attr_msg->max_xfer_sz;
605 		size_t	max_inband_msglen =
606 		    sizeof (vd_dring_inband_msg_t) +
607 		    ((max_xfer_bytes/PAGESIZE +
608 			((max_xfer_bytes % PAGESIZE) ? 1 : 0))*
609 			(sizeof (ldc_mem_cookie_t)));
610 #else	/* NEWOBP */
611 		size_t	max_inband_msglen =
612 		    sizeof (vd_dring_inband_msg_t) +
613 		    ((attr_msg->max_xfer_sz/PAGESIZE
614 			+ (attr_msg->max_xfer_sz % PAGESIZE ? 1 : 0))*
615 			(sizeof (ldc_mem_cookie_t)));
616 #endif	/* NEWOBP */
617 
618 		/*
619 		 * Set the maximum expected message length to
620 		 * accommodate in-band-descriptor messages with all
621 		 * their cookies
622 		 */
623 		vd->max_msglen = MAX(vd->max_msglen, max_inband_msglen);
624 	}
625 
626 	attr_msg->vdisk_size = vd->vdisk_size;
627 	attr_msg->vdisk_type = vd->vdisk_type;
628 	attr_msg->operations = vds_operations;
629 	PR0("%s", VD_CLIENT(vd));
630 	return (0);
631 }
632 
633 static int
634 vd_process_dring_reg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
635 {
636 	int			status;
637 	size_t			expected;
638 	ldc_mem_info_t		dring_minfo;
639 	vio_dring_reg_msg_t	*reg_msg = (vio_dring_reg_msg_t *)msg;
640 
641 
642 	PR0("Entered");
643 	ASSERT(mutex_owned(&vd->lock));
644 	ASSERT(msglen >= sizeof (msg->tag));
645 
646 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
647 		VIO_DRING_REG)) {
648 		return (ENOMSG);	/* not a register-dring message */
649 	}
650 
651 	if (msglen < sizeof (*reg_msg)) {
652 		PRN("Expected at least %lu-byte register-dring message; "
653 		    "received %lu bytes", sizeof (*reg_msg), msglen);
654 		return (EBADMSG);
655 	}
656 
657 	expected = sizeof (*reg_msg) +
658 	    (reg_msg->ncookies - 1)*(sizeof (reg_msg->cookie[0]));
659 	if (msglen != expected) {
660 		PRN("Expected %lu-byte register-dring message; "
661 		    "received %lu bytes", expected, msglen);
662 		return (EBADMSG);
663 	}
664 
665 	if (vd->initialized & VD_DRING) {
666 		PRN("A dring was previously registered; only support one");
667 		return (EBADMSG);
668 	}
669 
670 	if (reg_msg->ncookies != 1) {
671 		/*
672 		 * In addition to fixing the assertion in the success case
673 		 * below, supporting drings which require more than one
674 		 * "cookie" requires increasing the value of vd->max_msglen
675 		 * somewhere in the code path prior to receiving the message
676 		 * which results in calling this function.  Note that without
677 		 * making this change, the larger message size required to
678 		 * accommodate multiple cookies cannot be successfully
679 		 * received, so this function will not even get called.
680 		 * Gracefully accommodating more dring cookies might
681 		 * reasonably demand exchanging an additional attribute or
682 		 * making a minor protocol adjustment
683 		 */
684 		PRN("reg_msg->ncookies = %u != 1", reg_msg->ncookies);
685 		return (EBADMSG);
686 	}
687 
688 	status = ldc_mem_dring_map(vd->ldc_handle, reg_msg->cookie,
689 	    reg_msg->ncookies, reg_msg->num_descriptors,
690 	    reg_msg->descriptor_size, LDC_SHADOW_MAP, &vd->dring_handle);
691 	if (status != 0) {
692 		PRN("ldc_mem_dring_map() returned errno %d", status);
693 		return (status);
694 	}
695 
696 	/*
697 	 * To remove the need for this assertion, must call
698 	 * ldc_mem_dring_nextcookie() successfully ncookies-1 times after a
699 	 * successful call to ldc_mem_dring_map()
700 	 */
701 	ASSERT(reg_msg->ncookies == 1);
702 
703 	if ((status =
704 		ldc_mem_dring_info(vd->dring_handle, &dring_minfo)) != 0) {
705 		PRN("ldc_mem_dring_info() returned errno %d", status);
706 		if ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0)
707 			PRN("ldc_mem_dring_unmap() returned errno %d", status);
708 		return (status);
709 	}
710 
711 	if (dring_minfo.vaddr == NULL) {
712 		PRN("Descriptor ring virtual address is NULL");
713 		(void) ldc_mem_dring_unmap(vd->dring_handle);
714 		return (EBADMSG);	/* FIXME appropriate status? */
715 	}
716 
717 	/* Valid message and dring mapped */
718 	PR1("descriptor size = %u, dring length = %u",
719 	    vd->descriptor_size, vd->dring_len);
720 	vd->initialized |= VD_DRING;
721 	vd->dring_ident = 1;	/* "There Can Be Only One" */
722 	vd->dring = dring_minfo.vaddr;
723 	vd->descriptor_size = reg_msg->descriptor_size;
724 	vd->dring_len = reg_msg->num_descriptors;
725 	reg_msg->dring_ident = vd->dring_ident;
726 	return (0);
727 }
728 
729 static int
730 vd_process_dring_unreg_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
731 {
732 	vio_dring_unreg_msg_t	*unreg_msg = (vio_dring_unreg_msg_t *)msg;
733 
734 
735 	PR0("Entered");
736 	ASSERT(mutex_owned(&vd->lock));
737 	ASSERT(msglen >= sizeof (msg->tag));
738 
739 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO,
740 		VIO_DRING_UNREG)) {
741 		return (ENOMSG);	/* not an unregister-dring message */
742 	}
743 
744 	if (msglen != sizeof (*unreg_msg)) {
745 		PRN("Expected %lu-byte unregister-dring message; "
746 		    "received %lu bytes", sizeof (*unreg_msg), msglen);
747 		return (EBADMSG);
748 	}
749 
750 	if (unreg_msg->dring_ident != vd->dring_ident) {
751 		PRN("Expected dring ident %lu; received %lu",
752 		    vd->dring_ident, unreg_msg->dring_ident);
753 		return (EBADMSG);
754 	}
755 
756 	/* FIXME set ack in unreg_msg? */
757 	return (0);
758 }
759 
760 static int
761 process_rdx_msg(vio_msg_t *msg, size_t msglen)
762 {
763 	PR0("Entered");
764 	ASSERT(msglen >= sizeof (msg->tag));
765 
766 	if (!vd_msgtype(&msg->tag, VIO_TYPE_CTRL, VIO_SUBTYPE_INFO, VIO_RDX))
767 		return (ENOMSG);	/* not an RDX message */
768 
769 	if (msglen != sizeof (vio_rdx_msg_t)) {
770 		PRN("Expected %lu-byte RDX message; received %lu bytes",
771 		    sizeof (vio_rdx_msg_t), msglen);
772 		return (EBADMSG);
773 	}
774 
775 	return (0);
776 }
777 
778 static void
779 vd_reset_connection(vd_t *vd, boolean_t reset_ldc)
780 {
781 	int	status = 0;
782 
783 
784 	ASSERT(mutex_owned(&vd->lock));
785 	PR0("Resetting connection with %s", VD_CLIENT(vd));
786 	if ((vd->initialized & VD_DRING) &&
787 	    ((status = ldc_mem_dring_unmap(vd->dring_handle)) != 0))
788 		PRN("ldc_mem_dring_unmap() returned errno %d", status);
789 	if ((reset_ldc == B_TRUE) &&
790 	    ((status = ldc_reset(vd->ldc_handle)) != 0))
791 		PRN("ldc_reset() returned errno %d", status);
792 	vd->initialized &= ~(VD_SID | VD_SEQ_NUM | VD_DRING);
793 	vd->state = VD_STATE_INIT;
794 	vd->max_msglen = sizeof (vio_msg_t);	/* baseline vio message size */
795 }
796 
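/*
 * Check that the client's message sequence number increased by exactly one
 * over the last message received; if not, reset the (VIO) connection state
 * and return nonzero.  Otherwise, record the sequence number and return 0
 */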
797 static int
798 vd_check_seq_num(vd_t *vd, uint64_t seq_num)
799 {
800 	ASSERT(mutex_owned(&vd->lock));
801 	if ((vd->initialized & VD_SEQ_NUM) && (seq_num != vd->seq_num + 1)) {
802 		PRN("Received seq_num %lu; expected %lu",
803 		    seq_num, (vd->seq_num + 1));
804 		vd_reset_connection(vd, B_FALSE);
805 		return (1);
806 	}
807 
808 	vd->seq_num = seq_num;
809 	vd->initialized |= VD_SEQ_NUM;	/* superfluous after first time... */
810 	return (0);
811 }
812 
813 /*
814  * Return the expected size of an inband-descriptor message with all the
815  * cookies it claims to include
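 *
 * For example, a message whose payload claims 3 cookies is expected to
 * comprise sizeof (*msg) plus the size of 2 additional cookies, since the
 * structure type already contains one cookie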
816  */
817 static size_t
818 expected_inband_size(vd_dring_inband_msg_t *msg)
819 {
820 	return ((sizeof (*msg)) +
821 	    (msg->payload.ncookies - 1)*(sizeof (msg->payload.cookie[0])));
822 }
823 
824 /*
825  * Process an in-band descriptor message:  used with clients like OBP, with
826  * which vds exchanges descriptors within VIO message payloads, rather than
827  * operating on them within a descriptor ring
828  */
829 static int
830 vd_process_desc_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
831 {
832 	size_t			expected;
833 	vd_dring_inband_msg_t	*desc_msg = (vd_dring_inband_msg_t *)msg;
834 
835 
836 	PR1("Entered");
837 	ASSERT(mutex_owned(&vd->lock));
838 	ASSERT(msglen >= sizeof (msg->tag));
839 
840 	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
841 		VIO_DESC_DATA))
842 		return (ENOMSG);	/* not an in-band-descriptor message */
843 
844 	if (msglen < sizeof (*desc_msg)) {
845 		PRN("Expected at least %lu-byte descriptor message; "
846 		    "received %lu bytes", sizeof (*desc_msg), msglen);
847 		return (EBADMSG);
848 	}
849 
850 	if (msglen != (expected = expected_inband_size(desc_msg))) {
851 		PRN("Expected %lu-byte descriptor message; "
852 		    "received %lu bytes", expected, msglen);
853 		return (EBADMSG);
854 	}
855 
856 	if (vd_check_seq_num(vd, desc_msg->hdr.seq_num) != 0) {
857 		return (EBADMSG);
858 	}
859 
860 	/* Valid message; process the request */
861 	desc_msg->payload.status = vd_process_request(vd, &desc_msg->payload);
862 	return (0);
863 }
864 
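/*
 * Check that every descriptor in the given range is VIO_DESC_READY, and
 * only then mark the whole range VIO_DESC_ACCEPTED; returns B_FALSE
 * (accepting nothing) if any descriptor was not ready
 */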
865 static boolean_t
866 vd_accept_dring_elems(vd_t *vd, uint32_t start, uint32_t ndesc)
867 {
868 	uint32_t	i, n;
869 
870 
871 	/* Check descriptor states */
872 	for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len) {
873 		if (VD_DRING_ELEM(i)->hdr.dstate != VIO_DESC_READY) {
874 			PRN("descriptor %u not ready", i);
875 			VD_DUMP_DRING_ELEM(VD_DRING_ELEM(i));
876 			return (B_FALSE);
877 		}
878 	}
879 
880 	/* Descriptors are valid; accept them */
881 	for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len)
882 		VD_DRING_ELEM(i)->hdr.dstate = VIO_DESC_ACCEPTED;
883 
884 	return (B_TRUE);
885 }
886 
887 static int
888 vd_process_dring(vd_t *vd, uint32_t start, uint32_t end)
889 {
890 	int		status;
891 	boolean_t	accepted;
892 	uint32_t	i, io_status, n, ndesc;
893 
894 
895 	ASSERT(mutex_owned(&vd->lock));
896 	PR1("start = %u, end = %u", start, end);
897 
898 	/* Validate descriptor range */
899 	if ((start >= vd->dring_len) || (end >= vd->dring_len)) {
900 		PRN("\"start\" = %u, \"end\" = %u; both must be less than %u",
901 		    start, end, vd->dring_len);
902 		return (EINVAL);
903 	}
904 
905 	/* Acquire updated dring elements */
906 	if ((status = ldc_mem_dring_acquire(vd->dring_handle,
907 		    start, end)) != 0) {
908 		PRN("ldc_mem_dring_acquire() returned errno %d", status);
909 		return (status);
910 	}
911 	/* Accept updated dring elements */
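	/*
	 * The range may wrap around the end of the ring; for example
	 * (illustrative), with dring_len = 32, start = 30, and end = 1,
	 * ndesc = (1 + 32) - 30 + 1 = 4
	 */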
912 	ndesc = ((end < start) ? end + vd->dring_len : end) - start + 1;
913 	PR1("ndesc = %u", ndesc);
914 	accepted = vd_accept_dring_elems(vd, start, ndesc);
915 	/* Release dring elements */
916 	if ((status = ldc_mem_dring_release(vd->dring_handle,
917 		    start, end)) != 0) {
918 		PRN("ldc_mem_dring_release() returned errno %d", status);
919 		return (status);
920 	}
921 	/* If a descriptor was in the wrong state, return an error */
922 	if (!accepted)
923 		return (EINVAL);
924 
925 
926 	/* Process accepted dring elements */
927 	for (n = ndesc, i = start; n > 0; n--, i = (i + 1) % vd->dring_len) {
928 		vd_dring_entry_t	*elem = VD_DRING_ELEM(i);
929 
930 		/* Process descriptor outside acquire/release bracket */
931 		PR1("Processing dring element %u", i);
932 		io_status = vd_process_request(vd, &elem->payload);
933 
934 		/* Re-acquire client's dring element */
935 		if ((status = ldc_mem_dring_acquire(vd->dring_handle,
936 			    i, i)) != 0) {
937 			PRN("ldc_mem_dring_acquire() returned errno %d",
938 			    status);
939 			return (status);
940 		}
941 		/* Update processed element */
942 		if (elem->hdr.dstate == VIO_DESC_ACCEPTED) {
943 			elem->payload.status	= io_status;
944 			elem->hdr.dstate	= VIO_DESC_DONE;
945 		} else {
946 			/* Perhaps client timed out waiting for I/O... */
947 			accepted = B_FALSE;
948 			PRN("element %u no longer \"accepted\"", i);
949 			VD_DUMP_DRING_ELEM(elem);
950 		}
951 		/* Release updated processed element */
952 		if ((status = ldc_mem_dring_release(vd->dring_handle,
953 			    i, i)) != 0) {
954 			PRN("ldc_mem_dring_release() returned errno %d",
955 			    status);
956 			return (status);
957 		}
958 		/* If the descriptor was in the wrong state, return an error */
959 		if (!accepted)
960 			return (EINVAL);
961 	}
962 
963 	return (0);
964 }
965 
966 static int
967 vd_process_dring_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
968 {
969 	vio_dring_msg_t	*dring_msg = (vio_dring_msg_t *)msg;
970 
971 
972 	PR1("Entered");
973 	ASSERT(mutex_owned(&vd->lock));
974 	ASSERT(msglen >= sizeof (msg->tag));
975 
976 	if (!vd_msgtype(&msg->tag, VIO_TYPE_DATA, VIO_SUBTYPE_INFO,
977 		VIO_DRING_DATA)) {
978 		return (ENOMSG);	/* not a dring-data message */
979 	}
980 
981 	if (msglen != sizeof (*dring_msg)) {
982 		PRN("Expected %lu-byte dring message; received %lu bytes",
983 		    sizeof (*dring_msg), msglen);
984 		return (EBADMSG);
985 	}
986 
987 	if (vd_check_seq_num(vd, dring_msg->seq_num) != 0) {
988 		return (EBADMSG);
989 	}
990 
991 	if (dring_msg->dring_ident != vd->dring_ident) {
992 		PRN("Expected dring ident %lu; received ident %lu",
993 		    vd->dring_ident, dring_msg->dring_ident);
994 		return (EBADMSG);
995 	}
996 
997 
998 	/* Valid message; process dring */
999 	dring_msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
1000 	return (vd_process_dring(vd, dring_msg->start_idx, dring_msg->end_idx));
1001 }
1002 
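/*
 * Receive one message from the LDC channel, retrying a timed-out read up
 * to "vds_ldc_retries" times; returns ENOMSG if the receive queue is empty
 * or if no message was read
 */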
1003 static int
1004 recv_msg(ldc_handle_t ldc_handle, void *msg, size_t *nbytes)
1005 {
1006 	int	retry, status;
1007 	size_t	size = *nbytes;
1008 	boolean_t	isempty = B_FALSE;
1009 
1010 
1011 	/* FIXME work around interrupt problem */
1012 	if ((ldc_chkq(ldc_handle, &isempty) != 0) || isempty)
1013 		return (ENOMSG);
1014 
1015 	for (retry = 0, status = ETIMEDOUT;
1016 	    retry < vds_ldc_retries && status == ETIMEDOUT;
1017 	    retry++) {
1018 		PR1("ldc_read() attempt %d", (retry + 1));
1019 		*nbytes = size;
1020 		status = ldc_read(ldc_handle, msg, nbytes);
1021 	}
1022 
1023 	if (status != 0) {
1024 		PRN("ldc_read() returned errno %d", status);
1025 		return (status);
1026 	} else if (*nbytes == 0) {
1027 		PR1("ldc_read() returned 0 and no message read");
1028 		return (ENOMSG);
1029 	}
1030 
1031 	PR1("RCVD %lu-byte message", *nbytes);
1032 	return (0);
1033 }
1034 
1035 static int
1036 vd_do_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1037 {
1038 	int		status;
1039 
1040 
1041 	PR1("Processing (%x/%x/%x) message", msg->tag.vio_msgtype,
1042 	    msg->tag.vio_subtype, msg->tag.vio_subtype_env);
1043 	ASSERT(mutex_owned(&vd->lock));
1044 
1045 	/*
1046 	 * Validate session ID up front, since it applies to all messages
1047 	 * once set
1048 	 */
1049 	if ((msg->tag.vio_sid != vd->sid) && (vd->initialized & VD_SID)) {
1050 		PRN("Expected SID %u, received %u", vd->sid,
1051 		    msg->tag.vio_sid);
1052 		return (EBADMSG);
1053 	}
1054 
1055 
1056 	/*
1057 	 * Process the received message based on connection state
1058 	 */
1059 	switch (vd->state) {
1060 	case VD_STATE_INIT:	/* expect version message */
1061 		if ((status = process_ver_msg(msg, msglen)) != 0)
1062 			return (status);
1063 
1064 		/* The first version message sets the SID */
1065 		ASSERT(!(vd->initialized & VD_SID));
1066 		vd->sid = msg->tag.vio_sid;
1067 		vd->initialized |= VD_SID;
1068 
1069 		/* Version negotiated, move to that state */
1070 		vd->state = VD_STATE_VER;
1071 		return (0);
1072 
1073 	case VD_STATE_VER:	/* expect attribute message */
1074 		if ((status = vd_process_attr_msg(vd, msg, msglen)) != 0)
1075 			return (status);
1076 
1077 		/* Attributes exchanged, move to that state */
1078 		vd->state = VD_STATE_ATTR;
1079 		return (0);
1080 
1081 	case VD_STATE_ATTR:
1082 		switch (vd->xfer_mode) {
1083 		case VIO_DESC_MODE:	/* expect RDX message */
1084 			if ((status = process_rdx_msg(msg, msglen)) != 0)
1085 				return (status);
1086 
1087 			/* Ready to receive in-band descriptors */
1088 			vd->state = VD_STATE_DATA;
1089 			return (0);
1090 
1091 		case VIO_DRING_MODE:	/* expect register-dring message */
1092 			if ((status =
1093 				vd_process_dring_reg_msg(vd, msg, msglen)) != 0)
1094 				return (status);
1095 
1096 			/* One dring negotiated, move to that state */
1097 			vd->state = VD_STATE_DRING;
1098 			return (0);
1099 
1100 		default:
1101 			ASSERT("Unsupported transfer mode");
1102 			PRN("Unsupported transfer mode");
1103 			return (ENOTSUP);
1104 		}
1105 
1106 	case VD_STATE_DRING:	/* expect RDX, register-dring, or unreg-dring */
1107 		if ((status = process_rdx_msg(msg, msglen)) == 0) {
1108 			/* Ready to receive data */
1109 			vd->state = VD_STATE_DATA;
1110 			return (0);
1111 		} else if (status != ENOMSG) {
1112 			return (status);
1113 		}
1114 
1115 
1116 		/*
1117 		 * If another register-dring message is received, stay in
1118 		 * dring state in case the client sends RDX; although the
1119 		 * protocol allows multiple drings, this server does not
1120 		 * support using more than one
1121 		 */
1122 		if ((status =
1123 			vd_process_dring_reg_msg(vd, msg, msglen)) != ENOMSG)
1124 			return (status);
1125 
1126 		/*
1127 		 * Acknowledge an unregister-dring message, but reset the
1128 		 * connection anyway:  Although the protocol allows
1129 		 * unregistering drings, this server cannot serve a vdisk
1130 		 * without its only dring
1131 		 */
1132 		status = vd_process_dring_unreg_msg(vd, msg, msglen);
1133 		return ((status == 0) ? ENOTSUP : status);
1134 
1135 	case VD_STATE_DATA:
1136 		switch (vd->xfer_mode) {
1137 		case VIO_DESC_MODE:	/* expect in-band-descriptor message */
1138 			return (vd_process_desc_msg(vd, msg, msglen));
1139 
1140 		case VIO_DRING_MODE:	/* expect dring-data or unreg-dring */
1141 			/*
1142 			 * Typically expect dring-data messages, so handle
1143 			 * them first
1144 			 */
1145 			if ((status = vd_process_dring_msg(vd, msg,
1146 				    msglen)) != ENOMSG)
1147 				return (status);
1148 
1149 			/*
1150 			 * Acknowledge an unregister-dring message, but reset
1151 			 * the connection anyway:  Although the protocol
1152 			 * allows unregistering drings, this server cannot
1153 			 * serve a vdisk without its only dring
1154 			 */
1155 			status = vd_process_dring_unreg_msg(vd, msg, msglen);
1156 			return ((status == 0) ? ENOTSUP : status);
1157 
1158 		default:
1159 			ASSERT("Unsupported transfer mode");
1160 			PRN("Unsupported transfer mode");
1161 			return (ENOTSUP);
1162 		}
1163 
1164 	default:
1165 		ASSERT("Invalid client connection state");
1166 		PRN("Invalid client connection state");
1167 		return (ENOTSUP);
1168 	}
1169 }
1170 
1171 static void
1172 vd_process_msg(vd_t *vd, vio_msg_t *msg, size_t msglen)
1173 {
1174 	int		status;
1175 	boolean_t	reset_ldc = B_FALSE;
1176 
1177 
1178 	ASSERT(mutex_owned(&vd->lock));
1179 
1180 	/*
1181 	 * Check that the message is at least big enough for a "tag", so that
1182 	 * message processing can proceed based on tag-specified message type
1183 	 */
1184 	if (msglen < sizeof (vio_msg_tag_t)) {
1185 		PRN("Received short (%lu-byte) message", msglen);
1186 		/* Can't "nack" short message, so drop the big hammer */
1187 		vd_reset_connection(vd, B_TRUE);
1188 		return;
1189 	}
1190 
1191 	/*
1192 	 * Process the message
1193 	 */
1194 	switch (status = vd_do_process_msg(vd, msg, msglen)) {
1195 	case 0:
1196 		/* "ack" valid, successfully-processed messages */
1197 		msg->tag.vio_subtype = VIO_SUBTYPE_ACK;
1198 		break;
1199 
1200 	case ENOMSG:
1201 		PRN("Received unexpected message");
1202 		_NOTE(FALLTHROUGH);
1203 	case EBADMSG:
1204 	case ENOTSUP:
1205 		/* "nack" invalid messages */
1206 		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
1207 		break;
1208 
1209 	default:
1210 		/* "nack" failed messages */
1211 		msg->tag.vio_subtype = VIO_SUBTYPE_NACK;
1212 		/* An LDC error probably occurred, so try resetting it */
1213 		reset_ldc = B_TRUE;
1214 		break;
1215 	}
1216 
1217 	/* "ack" or "nack" the message */
1218 	PR1("Sending %s",
1219 	    (msg->tag.vio_subtype == VIO_SUBTYPE_ACK) ? "ACK" : "NACK");
1220 	if (send_msg(vd->ldc_handle, msg, msglen) != 0)
1221 		reset_ldc = B_TRUE;
1222 
1223 	/* Reset the connection for nack'ed or failed messages */
1224 	if ((status != 0) || reset_ldc)
1225 		vd_reset_connection(vd, reset_ldc);
1226 }
1227 
1228 static void
1229 vd_process_queue(void *arg)
1230 {
1231 	vd_t		*vd = (vd_t *)arg;
1232 	size_t		max_msglen, nbytes;
1233 	vio_msg_t	*vio_msg;
1234 
1235 
1236 	PR2("Entered");
1237 	ASSERT(vd != NULL);
1238 	mutex_enter(&vd->lock);
1239 	max_msglen = vd->max_msglen;	/* vd->max_msglen can change */
1240 	vio_msg = kmem_alloc(max_msglen, KM_SLEEP);
1241 	for (nbytes = vd->max_msglen;
1242 		vd->enabled && recv_msg(vd->ldc_handle, vio_msg, &nbytes) == 0;
1243 		nbytes = vd->max_msglen)
1244 		vd_process_msg(vd, vio_msg, nbytes);
1245 	kmem_free(vio_msg, max_msglen);
1246 	mutex_exit(&vd->lock);
1247 	PR2("Returning");
1248 }
1249 
1250 static uint_t
1251 vd_handle_ldc_events(uint64_t event, caddr_t arg)
1252 {
1253 	uint_t	status;
1254 	vd_t	*vd = (vd_t *)(void *)arg;
1255 
1256 
1257 	ASSERT(vd != NULL);
1258 	mutex_enter(&vd->lock);
1259 	if (event & LDC_EVT_READ) {
1260 		PR1("New packet(s) available");
1261 		/* Queue a task to process the new data */
1262 		if (ddi_taskq_dispatch(vd->taskq, vd_process_queue, vd, 0) !=
1263 		    DDI_SUCCESS)
1264 			PRN("Unable to dispatch vd_process_queue()");
1265 	} else if (event & LDC_EVT_RESET) {
1266 		PR0("Attempting to bring up reset channel");
1267 		if (((status = ldc_up(vd->ldc_handle)) != 0) &&
1268 		    (status != ECONNREFUSED)) {
1269 			PRN("ldc_up() returned errno %d", status);
1270 		}
1271 	} else if (event & LDC_EVT_UP) {
1272 		/* Reset the connection state when channel comes (back) up */
1273 		vd_reset_connection(vd, B_FALSE);
1274 	}
1275 	mutex_exit(&vd->lock);
1276 	return (LDC_SUCCESS);
1277 }
1278 
1279 static uint_t
1280 vds_check_for_vd(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
1281 {
1282 	_NOTE(ARGUNUSED(key, val))
1283 	(*((uint_t *)arg))++;
1284 	return (MH_WALK_TERMINATE);
1285 }
1286 
1287 
1288 static int
1289 vds_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
1290 {
1291 	uint_t	vd_present = 0;
1292 	minor_t	instance;
1293 	vds_t	*vds;
1294 
1295 
1296 	PR0("Entered");
1297 	switch (cmd) {
1298 	case DDI_DETACH:
1299 		/* the real work happens below */
1300 		break;
1301 	case DDI_SUSPEND:
1302 		/* nothing to do for this non-device */
1303 		return (DDI_SUCCESS);
1304 	default:
1305 		return (DDI_FAILURE);
1306 	}
1307 
1308 	ASSERT(cmd == DDI_DETACH);
1309 	instance = ddi_get_instance(dip);
1310 	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
1311 		PRN("Could not get state for instance %u", instance);
1312 		ddi_soft_state_free(vds_state, instance);
1313 		return (DDI_FAILURE);
1314 	}
1315 
1316 	/* Do not detach when serving any vdisks */
1317 	mod_hash_walk(vds->vd_table, vds_check_for_vd, &vd_present);
1318 	if (vd_present) {
1319 		PR0("Not detaching because serving vdisks");
1320 		return (DDI_FAILURE);
1321 	}
1322 
1323 	PR0("Detaching");
1324 	if (vds->initialized & VDS_MDEG)
1325 		(void) mdeg_unregister(vds->mdeg);
1326 	if (vds->initialized & VDS_LDI)
1327 		(void) ldi_ident_release(vds->ldi_ident);
1328 	mod_hash_destroy_hash(vds->vd_table);
1329 	if (vds->initialized & VDS_LOCKING)
1330 		mutex_destroy(&vds->lock);
1331 	ddi_soft_state_free(vds_state, instance);
1332 	return (DDI_SUCCESS);
1333 }
1334 
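/*
 * Determine whether a devinfo node is a descendant of the pseudo nexus,
 * i.e. whether the corresponding device is a pseudo device
 */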
1335 static boolean_t
1336 is_pseudo_device(dev_info_t *dip)
1337 {
1338 	dev_info_t	*parent, *root = ddi_root_node();
1339 
1340 
1341 	for (parent = ddi_get_parent(dip); (parent != NULL) && (parent != root);
1342 	    parent = ddi_get_parent(parent)) {
1343 		if (strcmp(ddi_get_name(parent), DEVI_PSEUDO_NEXNAME) == 0)
1344 			return (B_TRUE);
1345 	}
1346 
1347 	return (B_FALSE);
1348 }
1349 
1350 static int
1351 vd_get_params(ldi_handle_t lh, char *block_device, vd_t *vd)
1352 {
1353 	int		otyp, rval, status;
1354 	dev_info_t	*dip;
1355 	struct dk_cinfo	dk_cinfo;
1356 
1357 
1358 	/* Get block device's device number, otyp, and size */
1359 	if ((status = ldi_get_dev(lh, &vd->dev[0])) != 0) {
1360 		PRN("ldi_get_dev() returned errno %d for %s",
1361 		    status, block_device);
1362 		return (status);
1363 	}
1364 	if ((status = ldi_get_otyp(lh, &otyp)) != 0) {
1365 		PRN("ldi_get_otyp() returned errno %d for %s",
1366 		    status, block_device);
1367 		return (status);
1368 	}
1369 	if (otyp != OTYP_BLK) {
1370 		PRN("Cannot serve non-block device %s", block_device);
1371 		return (ENOTBLK);
1372 	}
1373 	if (ldi_get_size(lh, &vd->vdisk_size) != DDI_SUCCESS) {
1374 		PRN("ldi_get_size() failed for %s", block_device);
1375 		return (EIO);
1376 	}
1377 
1378 	/* Determine if backing block device is a pseudo device */
1379 	if ((dip = ddi_hold_devi_by_instance(getmajor(vd->dev[0]),
1380 		    dev_to_instance(vd->dev[0]), 0))  == NULL) {
1381 		PRN("%s is no longer accessible", block_device);
1382 		return (EIO);
1383 	}
1384 	vd->pseudo = is_pseudo_device(dip);
1385 	ddi_release_devi(dip);
1386 	if (vd->pseudo) {
1387 		vd->vdisk_type	= VD_DISK_TYPE_SLICE;
1388 		vd->nslices	= 1;
1389 		return (0);	/* ...and we're done */
1390 	}
1391 
1392 	/* Get dk_cinfo to determine slice of backing block device */
1393 	if ((status = ldi_ioctl(lh, DKIOCINFO, (intptr_t)&dk_cinfo,
1394 		    FKIOCTL, kcred, &rval)) != 0) {
1395 		PRN("ldi_ioctl(DKIOCINFO) returned errno %d for %s",
1396 		    status, block_device);
1397 		return (status);
1398 	}
1399 
1400 	if (dk_cinfo.dki_partition >= V_NUMPAR) {
1401 		PRN("slice %u >= maximum slice %u for %s",
1402 		    dk_cinfo.dki_partition, V_NUMPAR, block_device);
1403 		return (EIO);
1404 	}
1405 
1406 	/* If block device slice is entire disk, fill in all slice devices */
1407 	if (dk_cinfo.dki_partition == VD_ENTIRE_DISK_SLICE) {
1408 		uint_t	slice;
1409 		major_t	major = getmajor(vd->dev[0]);
1410 		minor_t	minor = getminor(vd->dev[0]) - VD_ENTIRE_DISK_SLICE;
1411 
1412 		vd->vdisk_type	= VD_DISK_TYPE_DISK;
1413 		vd->nslices	= V_NUMPAR;
1414 		for (slice = 0; slice < vd->nslices; slice++)
1415 			vd->dev[slice] = makedevice(major, (minor + slice));
1416 		return (0);	/* ...and we're done */
1417 	}
1418 
1419 	/* Otherwise, we have a (partial) slice of a block device */
1420 	vd->vdisk_type	= VD_DISK_TYPE_SLICE;
1421 	vd->nslices	= 1;
1422 
1423 
1424 	/* Initialize dk_geom structure for single-slice block device */
1425 	if ((status = ldi_ioctl(lh, DKIOCGGEOM, (intptr_t)&vd->dk_geom,
1426 		    FKIOCTL, kcred, &rval)) != 0) {
1427 		PRN("ldi_ioctl(DKIOCGEOM) returned errno %d for %s",
1428 		    status, block_device);
1429 		return (status);
1430 	}
1431 	if (vd->dk_geom.dkg_nsect == 0) {
1432 		PRN("%s geometry claims 0 sectors per track", block_device);
1433 		return (EIO);
1434 	}
1435 	if (vd->dk_geom.dkg_nhead == 0) {
1436 		PRN("%s geometry claims 0 heads", block_device);
1437 		return (EIO);
1438 	}
1439 	vd->dk_geom.dkg_ncyl =
1440 	    lbtodb(vd->vdisk_size)/vd->dk_geom.dkg_nsect/vd->dk_geom.dkg_nhead;
1441 	vd->dk_geom.dkg_acyl = 0;
1442 	vd->dk_geom.dkg_pcyl = vd->dk_geom.dkg_ncyl + vd->dk_geom.dkg_acyl;
1443 
1444 
1445 	/* Initialize vtoc structure for single-slice block device */
1446 	if ((status = ldi_ioctl(lh, DKIOCGVTOC, (intptr_t)&vd->vtoc,
1447 		    FKIOCTL, kcred, &rval)) != 0) {
1448 		PRN("ldi_ioctl(DKIOCGVTOC) returned errno %d for %s",
1449 		    status, block_device);
1450 		return (status);
1451 	}
1452 	bcopy(VD_VOLUME_NAME, vd->vtoc.v_volume,
1453 	    MIN(sizeof (VD_VOLUME_NAME), sizeof (vd->vtoc.v_volume)));
1454 	bzero(vd->vtoc.v_part, sizeof (vd->vtoc.v_part));
1455 	vd->vtoc.v_nparts = 1;
1456 	vd->vtoc.v_part[0].p_tag = V_UNASSIGNED;
1457 	vd->vtoc.v_part[0].p_flag = 0;
1458 	vd->vtoc.v_part[0].p_start = 0;
1459 	vd->vtoc.v_part[0].p_size = lbtodb(vd->vdisk_size);
1460 	bcopy(VD_ASCIILABEL, vd->vtoc.v_asciilabel,
1461 	    MIN(sizeof (VD_ASCIILABEL), sizeof (vd->vtoc.v_asciilabel)));
1462 
1463 
1464 	return (0);
1465 }
1466 
1467 static int
1468 vds_do_init_vd(vds_t *vds, uint64_t id, char *block_device, uint64_t ldc_id,
1469     vd_t **vdp)
1470 {
1471 	char			tq_name[TASKQ_NAMELEN];
1472 	int			param_status, status;
1473 	uint_t			slice;
1474 	ddi_iblock_cookie_t	iblock = NULL;
1475 	ldc_attr_t		ldc_attr;
1476 	ldi_handle_t		lh = NULL;
1477 	vd_t			*vd;
1478 
1479 
1480 	ASSERT(vds != NULL);
1481 	ASSERT(block_device != NULL);
1482 	ASSERT(vdp != NULL);
1483 	PR0("Adding vdisk for %s", block_device);
1484 
1485 	if ((vd = kmem_zalloc(sizeof (*vd), KM_NOSLEEP)) == NULL) {
1486 		PRN("No memory for virtual disk");
1487 		return (EAGAIN);
1488 	}
1489 	*vdp = vd;	/* assign here so vds_destroy_vd() can cleanup later */
1490 	vd->vds = vds;
1491 
1492 
1493 	/* Get device parameters */
1494 	if ((status = ldi_open_by_name(block_device, FREAD, kcred, &lh,
1495 		    vds->ldi_ident)) != 0) {
1496 		PRN("ldi_open_by_name(%s) = errno %d", block_device, status);
1497 		return (status);
1498 	}
1499 	param_status = vd_get_params(lh, block_device, vd);
1500 	if ((status = ldi_close(lh, FREAD, kcred)) != 0) {
1501 		PRN("ldi_close(%s) = errno %d", block_device, status);
1502 		return (status);
1503 	}
1504 	if (param_status != 0)
1505 		return (param_status);
1506 	ASSERT(vd->nslices > 0 && vd->nslices <= V_NUMPAR);
1507 	PR0("vdisk_type = %s, pseudo = %s, nslices = %u",
1508 	    ((vd->vdisk_type == VD_DISK_TYPE_DISK) ? "disk" : "slice"),
1509 	    (vd->pseudo ? "yes" : "no"), vd->nslices);
1510 
1511 
1512 	/* Initialize locking */
1513 	if (ddi_get_soft_iblock_cookie(vds->dip, DDI_SOFTINT_MED,
1514 		&iblock) != DDI_SUCCESS) {
1515 		PRN("Could not get iblock cookie.");
1516 		return (EIO);
1517 	}
1518 
1519 	mutex_init(&vd->lock, NULL, MUTEX_DRIVER, iblock);
1520 	vd->initialized |= VD_LOCKING;
1521 
1522 
1523 	/* Open the backing-device slices */
1524 	for (slice = 0; slice < vd->nslices; slice++) {
1525 		ASSERT(vd->ldi_handle[slice] == NULL);
1526 		PR0("Opening device %u, minor %u = slice %u",
1527 		    getmajor(vd->dev[slice]), getminor(vd->dev[slice]), slice);
1528 		if ((status = ldi_open_by_dev(&vd->dev[slice], OTYP_BLK,
1529 			    vd_open_flags, kcred, &vd->ldi_handle[slice],
1530 			    vds->ldi_ident)) != 0) {
1531 			PRN("ldi_open_by_dev() returned errno %d for slice %u",
1532 			    status, slice);
1533 			/* vds_destroy_vd() will close any open slices */
1534 #if 0	/* FIXME */
1535 			return (status);
1536 #endif
1537 		}
1538 	}
1539 
1540 
1541 	/* Create the task queue for the vdisk */
1542 	(void) snprintf(tq_name, sizeof (tq_name), "vd%lu", id);
1543 	PR1("tq_name = %s", tq_name);
1544 	if ((vd->taskq = ddi_taskq_create(vds->dip, tq_name, 1,
1545 		    TASKQ_DEFAULTPRI, 0)) == NULL) {
1546 		PRN("Could not create task queue");
1547 		return (EIO);
1548 	}
1549 	vd->initialized |= VD_TASKQ;
1550 	vd->enabled = 1;	/* before callback can dispatch to taskq */
1551 
1552 
1553 	/* Bring up LDC */
1554 	ldc_attr.devclass	= LDC_DEV_BLK_SVC;
1555 	ldc_attr.instance	= ddi_get_instance(vds->dip);
1556 	ldc_attr.mode		= LDC_MODE_UNRELIABLE;
1557 	ldc_attr.qlen		= VD_LDC_QLEN;
1558 	if ((status = ldc_init(ldc_id, &ldc_attr, &vd->ldc_handle)) != 0) {
1559 		PRN("ldc_init(%lu) = errno %d", ldc_id, status);
1560 		return (status);
1561 	}
1562 	vd->initialized |= VD_LDC;
1563 
1564 	if ((status = ldc_reg_callback(vd->ldc_handle, vd_handle_ldc_events,
1565 		(caddr_t)vd)) != 0) {
1566 		PRN("ldc_reg_callback() returned errno %d", status);
1567 		return (status);
1568 	}
1569 
1570 	if ((status = ldc_open(vd->ldc_handle)) != 0) {
1571 		PRN("ldc_open() returned errno %d", status);
1572 		return (status);
1573 	}
1574 
1575 	if (((status = ldc_up(vd->ldc_handle)) != 0) &&
1576 	    (status != ECONNREFUSED)) {
1577 		PRN("ldc_up() returned errno %d", status);
1578 		return (status);
1579 	}
1580 
1581 
1582 	/* Add the successfully-initialized vdisk to the server's table */
1583 	if (mod_hash_insert(vds->vd_table, (mod_hash_key_t)id, vd) != 0) {
1584 		PRN("Error adding vdisk ID %lu to table", id);
1585 		return (EIO);
1586 	}
1587 
1588 	return (0);
1589 }
1590 
1591 /*
1592  * Destroy the state associated with a virtual disk
1593  */
1594 static void
1595 vds_destroy_vd(void *arg)
1596 {
1597 	vd_t	*vd = (vd_t *)arg;
1598 
1599 
1600 	PR0("Entered");
1601 	if (vd == NULL)
1602 		return;
1603 
1604 	/* Disable queuing requests for the vdisk */
1605 	if (vd->initialized & VD_LOCKING) {
1606 		mutex_enter(&vd->lock);
1607 		vd->enabled = 0;
1608 		mutex_exit(&vd->lock);
1609 	}
1610 
1611 	/* Drain and destroy the task queue (*before* shutting down LDC) */
1612 	if (vd->initialized & VD_TASKQ)
1613 		ddi_taskq_destroy(vd->taskq);	/* waits for queued tasks */
1614 
1615 	/* Shut down LDC */
1616 	if (vd->initialized & VD_LDC) {
1617 		if (vd->initialized & VD_DRING)
1618 			(void) ldc_mem_dring_unmap(vd->dring_handle);
1619 		(void) ldc_unreg_callback(vd->ldc_handle);
1620 		(void) ldc_close(vd->ldc_handle);
1621 		(void) ldc_fini(vd->ldc_handle);
1622 	}
1623 
1624 	/* Close any open backing-device slices */
1625 	for (uint_t slice = 0; slice < vd->nslices; slice++) {
1626 		if (vd->ldi_handle[slice] != NULL) {
1627 			PR0("Closing slice %u", slice);
1628 			(void) ldi_close(vd->ldi_handle[slice],
1629 			    vd_open_flags, kcred);
1630 		}
1631 	}
1632 
1633 	/* Free lock */
1634 	if (vd->initialized & VD_LOCKING)
1635 		mutex_destroy(&vd->lock);
1636 
1637 	/* Finally, free the vdisk structure itself */
1638 	kmem_free(vd, sizeof (*vd));
1639 }
1640 
1641 static int
1642 vds_init_vd(vds_t *vds, uint64_t id, char *block_device, uint64_t ldc_id)
1643 {
1644 	int	status;
1645 	vd_t	*vd = NULL;
1646 
1647 
1648 #ifdef lint
1649 	(void) vd;
1650 #endif	/* lint */
1651 
1652 	if ((status = vds_do_init_vd(vds, id, block_device, ldc_id, &vd)) != 0)
1653 		vds_destroy_vd(vd);
1654 
1655 	return (status);
1656 }
1657 
1658 static int
1659 vds_do_get_ldc_id(md_t *md, mde_cookie_t vd_node, mde_cookie_t *channel,
1660     uint64_t *ldc_id)
1661 {
1662 	int	num_channels;
1663 
1664 
1665 	/* Look for channel endpoint child(ren) of the vdisk MD node */
1666 	if ((num_channels = md_scan_dag(md, vd_node,
1667 		    md_find_name(md, VD_CHANNEL_ENDPOINT),
1668 		    md_find_name(md, "fwd"), channel)) <= 0) {
1669 		PRN("No \"%s\" found for virtual disk", VD_CHANNEL_ENDPOINT);
1670 		return (-1);
1671 	}
1672 
1673 	/* Get the "id" value for the first channel endpoint node */
1674 	if (md_get_prop_val(md, channel[0], VD_ID_PROP, ldc_id) != 0) {
1675 		PRN("No \"%s\" property found for \"%s\" of vdisk",
1676 		    VD_ID_PROP, VD_CHANNEL_ENDPOINT);
1677 		return (-1);
1678 	}
1679 
1680 	if (num_channels > 1) {
1681 		PRN("Using ID of first of multiple channels for this vdisk");
1682 	}
1683 
1684 	return (0);
1685 }
1686 
1687 static int
1688 vds_get_ldc_id(md_t *md, mde_cookie_t vd_node, uint64_t *ldc_id)
1689 {
1690 	int		num_nodes, status;
1691 	size_t		size;
1692 	mde_cookie_t	*channel;
1693 
1694 
1695 	if ((num_nodes = md_node_count(md)) <= 0) {
1696 		PRN("Invalid node count in Machine Description subtree");
1697 		return (-1);
1698 	}
1699 	size = num_nodes*(sizeof (*channel));
1700 	channel = kmem_zalloc(size, KM_SLEEP);
1701 	status = vds_do_get_ldc_id(md, vd_node, channel, ldc_id);
1702 	kmem_free(channel, size);
1703 
1704 	return (status);
1705 }
1706 
1707 static void
1708 vds_add_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
1709 {
1710 	char		*block_device = NULL;
1711 	uint64_t	id = 0, ldc_id = 0;
1712 
1713 
1714 	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
1715 		PRN("Error getting vdisk \"%s\"", VD_ID_PROP);
1716 		return;
1717 	}
1718 	PR0("Adding vdisk ID %lu", id);
1719 	if (md_get_prop_str(md, vd_node, VD_BLOCK_DEVICE_PROP,
1720 		&block_device) != 0) {
1721 		PRN("Error getting vdisk \"%s\"", VD_BLOCK_DEVICE_PROP);
1722 		return;
1723 	}
1724 
1725 	if (vds_get_ldc_id(md, vd_node, &ldc_id) != 0) {
1726 		PRN("Error getting LDC ID for vdisk %lu", id);
1727 		return;
1728 	}
1729 
1730 	if (vds_init_vd(vds, id, block_device, ldc_id) != 0) {
1731 		PRN("Failed to add vdisk ID %lu", id);
1732 		return;
1733 	}
1734 }
1735 
1736 static void
1737 vds_remove_vd(vds_t *vds, md_t *md, mde_cookie_t vd_node)
1738 {
1739 	uint64_t	id = 0;
1740 
1741 
1742 	if (md_get_prop_val(md, vd_node, VD_ID_PROP, &id) != 0) {
1743 		PRN("Unable to get \"%s\" property from vdisk's MD node",
1744 		    VD_ID_PROP);
1745 		return;
1746 	}
1747 	PR0("Removing vdisk ID %lu", id);
1748 	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)id) != 0)
1749 		PRN("No vdisk entry found for vdisk ID %lu", id);
1750 }
1751 
1752 static void
1753 vds_change_vd(vds_t *vds, md_t *prev_md, mde_cookie_t prev_vd_node,
1754     md_t *curr_md, mde_cookie_t curr_vd_node)
1755 {
1756 	char		*curr_dev, *prev_dev;
1757 	uint64_t	curr_id = 0, curr_ldc_id = 0;
1758 	uint64_t	prev_id = 0, prev_ldc_id = 0;
1759 	size_t		len;
1760 
1761 
1762 	/* Validate that vdisk ID has not changed */
1763 	if (md_get_prop_val(prev_md, prev_vd_node, VD_ID_PROP, &prev_id) != 0) {
1764 		PRN("Error getting previous vdisk \"%s\" property",
1765 		    VD_ID_PROP);
1766 		return;
1767 	}
1768 	if (md_get_prop_val(curr_md, curr_vd_node, VD_ID_PROP, &curr_id) != 0) {
1769 		PRN("Error getting current vdisk \"%s\" property", VD_ID_PROP);
1770 		return;
1771 	}
1772 	if (curr_id != prev_id) {
1773 		PRN("Not changing vdisk:  ID changed from %lu to %lu",
1774 		    prev_id, curr_id);
1775 		return;
1776 	}
1777 
1778 	/* Validate that LDC ID has not changed */
1779 	if (vds_get_ldc_id(prev_md, prev_vd_node, &prev_ldc_id) != 0) {
1780 		PRN("Error getting LDC ID for vdisk %lu", prev_id);
1781 		return;
1782 	}
1783 
1784 	if (vds_get_ldc_id(curr_md, curr_vd_node, &curr_ldc_id) != 0) {
1785 		PRN("Error getting LDC ID for vdisk %lu", curr_id);
1786 		return;
1787 	}
1788 	if (curr_ldc_id != prev_ldc_id) {
1789 		_NOTE(NOTREACHED);	/* lint is confused */
1790 		PRN("Not changing vdisk:  "
1791 		    "LDC ID changed from %lu to %lu", prev_ldc_id, curr_ldc_id);
1792 		return;
1793 	}
1794 
1795 	/* Determine whether device path has changed */
1796 	if (md_get_prop_str(prev_md, prev_vd_node, VD_BLOCK_DEVICE_PROP,
1797 		&prev_dev) != 0) {
1798 		PRN("Error getting previous vdisk \"%s\" property",
1799 		    VD_BLOCK_DEVICE_PROP);
1800 		return;
1801 	}
1802 	if (md_get_prop_str(curr_md, curr_vd_node, VD_BLOCK_DEVICE_PROP,
1803 		&curr_dev) != 0) {
1804 		PRN("Error getting current vdisk \"%s\" property",
		    VD_BLOCK_DEVICE_PROP);
1805 		return;
1806 	}
1807 	if (strcmp(curr_dev, prev_dev) == 0)
1808 		return;	/* no relevant (supported) change */
1810 
1811 	PR0("Changing vdisk ID %lu", prev_id);
1812 	/* Remove old state, which will close vdisk and reset */
1813 	if (mod_hash_destroy(vds->vd_table, (mod_hash_key_t)prev_id) != 0)
1814 		PRN("No entry found for vdisk ID %lu", prev_id);
1815 	/* Re-initialize vdisk with new state */
1816 	if (vds_init_vd(vds, curr_id, curr_dev, curr_ldc_id) != 0) {
1817 		PRN("Failed to change vdisk ID %lu", curr_id);
1818 		return;
1819 	}
1820 }
1821 
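/*
 * MDEG callback invoked on MD updates:  process removed vdisk nodes
 * first, then matched nodes whose properties may have changed (the
 * match_prev and match_curr lists pair each previous node with its
 * current counterpart), and finally newly added nodes.
 */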
1822 static int
1823 vds_process_md(void *arg, mdeg_result_t *md)
1824 {
1825 	int	i;
1826 	vds_t	*vds = arg;
1827 
1828 
1829 	if (md == NULL)
1830 		return (MDEG_FAILURE);
1831 	ASSERT(vds != NULL);
1832 
1833 	for (i = 0; i < md->removed.nelem; i++)
1834 		vds_remove_vd(vds, md->removed.mdp, md->removed.mdep[i]);
1835 	for (i = 0; i < md->match_curr.nelem; i++)
1836 		vds_change_vd(vds, md->match_prev.mdp, md->match_prev.mdep[i],
1837 		    md->match_curr.mdp, md->match_curr.mdep[i]);
1838 	for (i = 0; i < md->added.nelem; i++)
1839 		vds_add_vd(vds, md->added.mdp, md->added.mdep[i]);
1840 
1841 	return (MDEG_SUCCESS);
1842 }
1843 
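/*
 * Perform the work of DDI_ATTACH:  allocate per-instance soft state,
 * create the vdisk table, initialize the lock and LDI identity, and
 * register with the MDEG framework so that vds_process_md() is invoked
 * whenever this server's MD nodes change.  The "initialized" flags record
 * progress so that vds_detach() can unwind a partially completed attach.
 */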
1844 static int
1845 vds_do_attach(dev_info_t *dip)
1846 {
1847 	static char	reg_prop[] = "reg";	/* devinfo ID prop */
1848 
1849 	/* MDEG specification for a (particular) vds node */
1850 	static mdeg_prop_spec_t	vds_prop_spec[] = {
1851 		{MDET_PROP_STR, "name", {VDS_NAME}},
1852 		{MDET_PROP_VAL, "cfg-handle", {0}},
1853 		{MDET_LIST_END, NULL, {0}}};
1854 	static mdeg_node_spec_t	vds_spec = {"virtual-device", vds_prop_spec};
1855 
1856 	/* MDEG specification for matching a vd node */
1857 	static md_prop_match_t	vd_prop_spec[] = {
1858 		{MDET_PROP_VAL, VD_ID_PROP},
1859 		{MDET_LIST_END, NULL}};
1860 	static mdeg_node_match_t vd_spec = {"virtual-device-port",
1861 					    vd_prop_spec};
1862 
1863 	int			status;
1864 	uint64_t		cfg_handle;
1865 	minor_t			instance = ddi_get_instance(dip);
1866 	vds_t			*vds;
1867 
1868 
1869 	/*
1870 	 * The "cfg-handle" property of a vds node in an MD contains the MD's
1871 	 * notion of "instance", or unique identifier, for that node; OBP
1872 	 * stores the value of the "cfg-handle" MD property as the value of
1873 	 * the "reg" property on the node in the device tree it builds from
1874 	 * the MD and passes to Solaris.  Thus, we look up the devinfo node's
1875 	 * "reg" property value to uniquely identify this device instance when
1876 	 * registering with the MD event-generation framework.  If the "reg"
1877 	 * property cannot be found, the device tree state is presumably so
1878 	 * broken that there is no point in continuing.
1879 	 */
1880 	if (!ddi_prop_exists(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, reg_prop)) {
1881 		PRN("vds \"%s\" property does not exist", reg_prop);
1882 		return (DDI_FAILURE);
1883 	}
1884 
1885 	/* Get the MD instance for later MDEG registration */
1886 	cfg_handle = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
1887 	    reg_prop, -1);
1888 
1889 	if (ddi_soft_state_zalloc(vds_state, instance) != DDI_SUCCESS) {
1890 		PRN("Could not allocate state for instance %u", instance);
1891 		return (DDI_FAILURE);
1892 	}
1893 
1894 	if ((vds = ddi_get_soft_state(vds_state, instance)) == NULL) {
1895 		PRN("Could not get state for instance %u", instance);
1896 		ddi_soft_state_free(vds_state, instance);
1897 		return (DDI_FAILURE);
1898 	}
1899 
1900 
1901 	vds->dip	= dip;
1902 	vds->vd_table	= mod_hash_create_ptrhash("vds_vd_table", VDS_NCHAINS,
1903 							vds_destroy_vd,
1904 							sizeof (void *));
1905 	ASSERT(vds->vd_table != NULL);
1906 
1907 	mutex_init(&vds->lock, NULL, MUTEX_DRIVER, NULL);
1908 	vds->initialized |= VDS_LOCKING;
1909 
1910 	if ((status = ldi_ident_from_dip(dip, &vds->ldi_ident)) != 0) {
1911 		PRN("ldi_ident_from_dip() returned errno %d", status);
1912 		return (DDI_FAILURE);
1913 	}
1914 	vds->initialized |= VDS_LDI;
1915 
1916 	/* Register for MD updates */
1917 	vds_prop_spec[1].ps_val = cfg_handle;
1918 	if (mdeg_register(&vds_spec, &vd_spec, vds_process_md, vds,
1919 		&vds->mdeg) != MDEG_SUCCESS) {
1920 		PRN("Unable to register for MD updates");
1921 		return (DDI_FAILURE);
1922 	}
1923 	vds->initialized |= VDS_MDEG;
1924 
1925 	ddi_report_dev(dip);
1926 	return (DDI_SUCCESS);
1927 }
1928 
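/*
 * DDI attach entry point:  DDI_ATTACH delegates to vds_do_attach(),
 * unwinding via vds_detach() on failure; DDI_RESUME is a no-op for this
 * pseudo-device.
 */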
1929 static int
1930 vds_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
1931 {
1932 	int	status;
1933 
1934 	PR0("Entered");
1935 	switch (cmd) {
1936 	case DDI_ATTACH:
1937 		if ((status = vds_do_attach(dip)) != DDI_SUCCESS)
1938 			(void) vds_detach(dip, DDI_DETACH);
1939 		return (status);
1940 	case DDI_RESUME:
1941 		/* nothing to do for this non-device */
1942 		return (DDI_SUCCESS);
1943 	default:
1944 		return (DDI_FAILURE);
1945 	}
1946 }
1947 
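/* Driver and loadable-module linkage structures */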
1948 static struct dev_ops vds_ops = {
1949 	DEVO_REV,	/* devo_rev */
1950 	0,		/* devo_refcnt */
1951 	ddi_no_info,	/* devo_getinfo */
1952 	nulldev,	/* devo_identify */
1953 	nulldev,	/* devo_probe */
1954 	vds_attach,	/* devo_attach */
1955 	vds_detach,	/* devo_detach */
1956 	nodev,		/* devo_reset */
1957 	NULL,		/* devo_cb_ops */
1958 	NULL,		/* devo_bus_ops */
1959 	nulldev		/* devo_power */
1960 };
1961 
1962 static struct modldrv modldrv = {
1963 	&mod_driverops,
1964 	"virtual disk server v%I%",
1965 	&vds_ops,
1966 };
1967 
1968 static struct modlinkage modlinkage = {
1969 	MODREV_1,
1970 	&modldrv,
1971 	NULL
1972 };
1973 
1974 
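/*
 * Loadable-module entry points:  _init() sets up soft state, installs the
 * module, and precomputes the bit-mask of supported vdisk operations;
 * _info() and _fini() are the standard module-reporting and teardown
 * entry points.
 */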
1975 int
1976 _init(void)
1977 {
1978 	int		i, status;
1979 
1980 
1981 	PR0("Built %s %s", __DATE__, __TIME__);
1982 	if ((status = ddi_soft_state_init(&vds_state, sizeof (vds_t), 1)) != 0)
1983 		return (status);
1984 	if ((status = mod_install(&modlinkage)) != 0) {
1985 		ddi_soft_state_fini(&vds_state);
1986 		return (status);
1987 	}
1988 
1989 	/* Fill in the bit-mask of server-supported operations */
1990 	for (i = 0; i < vds_noperations; i++)
1991 		vds_operations |= 1 << (vds_operation[i].operation - 1);
1992 
1993 	return (0);
1994 }
1995 
1996 int
1997 _info(struct modinfo *modinfop)
1998 {
1999 	return (mod_info(&modlinkage, modinfop));
2000 }
2001 
2002 int
2003 _fini(void)
2004 {
2005 	int	status;
2006 
2007 
2008 	PR0("Entered");
2009 	if ((status = mod_remove(&modlinkage)) != 0)
2010 		return (status);
2011 	ddi_soft_state_fini(&vds_state);
2012 	return (0);
2013 }
2014