xref: /titanic_51/usr/src/uts/common/xen/io/xdf.c (revision aba6a64cda8fc853e4d61f08c163d0b9be0815b2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * xdf.c - Xen Virtual Block Device Driver
29  * TODO:
30  *	- support alternate block size (currently only DEV_BSIZE supported)
31  *	- revalidate geometry for removable devices
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/conf.h>
39 #include <sys/cmlb.h>
40 #include <sys/dkio.h>
41 #include <sys/promif.h>
42 #include <sys/sysmacros.h>
43 #include <sys/kstat.h>
44 #include <sys/mach_mmu.h>
45 #ifdef XPV_HVM_DRIVER
46 #include <sys/xpv_support.h>
47 #include <sys/sunndi.h>
48 #endif /* XPV_HVM_DRIVER */
49 #include <public/io/xenbus.h>
50 #include <xen/sys/xenbus_impl.h>
51 #include <xen/sys/xendev.h>
52 #include <sys/gnttab.h>
53 #include <sys/scsi/generic/inquiry.h>
54 #include <xen/io/blkif_impl.h>
55 #include <io/xdf.h>
56 
57 #define	FLUSH_DISKCACHE	0x1
58 #define	WRITE_BARRIER	0x2
59 #define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */
60 #define	USE_WRITE_BARRIER(vdp)				\
61 	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
62 #define	USE_FLUSH_DISKCACHE(vdp)			\
63 	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
64 #define	IS_WRITE_BARRIER(vdp, bp)			\
65 	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&	\
66 	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
67 #define	IS_FLUSH_DISKCACHE(bp)				\
68 	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
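/*
 * Write-cache flushing strategy (see xdf_post_connect() and the
 * DKIOCFLUSHWRITECACHE ioctl):  if the backend advertises feature-barrier
 * and accepts the cache-flush operation, flushes are issued as zero-length
 * flush requests (USE_FLUSH_DISKCACHE).  If the backend only supports
 * barrier writes, a flush is emulated by re-writing a cached copy of block
 * xdf_flush_block as a barrier write (USE_WRITE_BARRIER);
 * IS_WRITE_BARRIER() recognizes that special buf because its b_addr is
 * xdf_cache_flush_block.
 */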
69 
70 static void *vbd_ss;
71 static kmem_cache_t *xdf_vreq_cache;
72 static kmem_cache_t *xdf_gs_cache;
73 static int xdf_maxphys = XB_MAXPHYS;
74 int xdfdebug = 0;
75 extern int do_polled_io;
76 diskaddr_t xdf_flush_block = DEFAULT_FLUSH_BLOCK;
77 int	xdf_barrier_flush_disable = 0;
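/*
 * Tunables:  xdfdebug enables debug output (xdf_attach() also initializes
 * it from the "xdfdebug" property); xdf_flush_block selects the block that
 * is cached and re-written to emulate a cache flush when only barrier
 * writes are available; setting xdf_barrier_flush_disable non-zero turns
 * that emulation off, in which case DKIOCFLUSHWRITECACHE fails with ENOTTY
 * on backends without the cache-flush op.
 */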
78 
79 /*
80  * dev_ops and cb_ops entrypoints
81  */
82 static int xdf_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
83 static int xdf_attach(dev_info_t *, ddi_attach_cmd_t);
84 static int xdf_detach(dev_info_t *, ddi_detach_cmd_t);
85 static int xdf_reset(dev_info_t *, ddi_reset_cmd_t);
86 static int xdf_open(dev_t *, int, int, cred_t *);
87 static int xdf_close(dev_t, int, int, struct cred *);
88 static int xdf_strategy(struct buf *);
89 static int xdf_read(dev_t, struct uio *, cred_t *);
90 static int xdf_aread(dev_t, struct aio_req *, cred_t *);
91 static int xdf_write(dev_t, struct uio *, cred_t *);
92 static int xdf_awrite(dev_t, struct aio_req *, cred_t *);
93 static int xdf_dump(dev_t, caddr_t, daddr_t, int);
94 static int xdf_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
95 static uint_t xdf_intr(caddr_t);
96 static int xdf_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
97     caddr_t, int *);
98 
99 /*
100  * misc private functions
101  */
102 static int xdf_suspend(dev_info_t *);
103 static int xdf_resume(dev_info_t *);
104 static int xdf_start_connect(xdf_t *);
105 static int xdf_start_disconnect(xdf_t *);
106 static int xdf_post_connect(xdf_t *);
107 static void xdf_post_disconnect(xdf_t *);
108 static void xdf_oe_change(dev_info_t *, ddi_eventcookie_t, void *, void *);
109 static void xdf_iostart(xdf_t *);
110 static void xdf_iofini(xdf_t *, uint64_t, int);
111 static int xdf_prepare_rreq(xdf_t *, struct buf *, blkif_request_t *);
112 static int xdf_drain_io(xdf_t *);
113 static boolean_t xdf_isopen(xdf_t *, int);
114 static int xdf_check_state_transition(xdf_t *, XenbusState);
115 static int xdf_connect(xdf_t *, boolean_t);
116 static int xdf_dmacallback(caddr_t);
117 static void xdf_timeout_handler(void *);
118 static uint_t xdf_iorestart(caddr_t);
119 static v_req_t *vreq_get(xdf_t *, buf_t *);
120 static void vreq_free(xdf_t *, v_req_t *);
121 static int vreq_setup(xdf_t *, v_req_t *);
122 static ge_slot_t *gs_get(xdf_t *, int);
123 static void gs_free(xdf_t *, ge_slot_t *);
124 static grant_ref_t gs_grant(ge_slot_t *, mfn_t);
125 static void unexpectedie(xdf_t *);
126 static void xdfmin(struct buf *);
127 static void xdf_synthetic_pgeom(dev_info_t *, cmlb_geom_t *);
128 extern int xdf_kstat_create(dev_info_t *, char *, int);
129 extern void xdf_kstat_delete(dev_info_t *);
130 
131 #if defined(XPV_HVM_DRIVER)
132 static void xdf_hvm_add(dev_info_t *);
133 static void xdf_hvm_rm(dev_info_t *);
134 static void xdf_hvm_init(void);
135 static void xdf_hvm_fini(void);
136 #endif /* XPV_HVM_DRIVER */
137 
138 static struct cb_ops xdf_cbops = {
139 	xdf_open,
140 	xdf_close,
141 	xdf_strategy,
142 	nodev,
143 	xdf_dump,
144 	xdf_read,
145 	xdf_write,
146 	xdf_ioctl,
147 	nodev,
148 	nodev,
149 	nodev,
150 	nochpoll,
151 	xdf_prop_op,
152 	NULL,
153 	D_MP | D_NEW | D_64BIT,
154 	CB_REV,
155 	xdf_aread,
156 	xdf_awrite
157 };
158 
159 struct dev_ops xdf_devops = {
160 	DEVO_REV,		/* devo_rev */
161 	0,			/* devo_refcnt */
162 	xdf_getinfo,		/* devo_getinfo */
163 	nulldev,		/* devo_identify */
164 	nulldev,		/* devo_probe */
165 	xdf_attach,		/* devo_attach */
166 	xdf_detach,		/* devo_detach */
167 	xdf_reset,		/* devo_reset */
168 	&xdf_cbops,		/* devo_cb_ops */
169 	(struct bus_ops *)NULL	/* devo_bus_ops */
170 };
171 
172 static struct modldrv modldrv = {
173 	&mod_driverops,		/* Type of module.  This one is a driver */
174 	"virtual block driver %I%",	/* short description */
175 	&xdf_devops		/* driver specific ops */
176 };
177 
178 static struct modlinkage xdf_modlinkage = {
179 	MODREV_1, (void *)&modldrv, NULL
180 };
181 
182 /*
183  * I/O buffer DMA attributes
184  * Make sure one DMA window contains at most BLKIF_MAX_SEGMENTS_PER_REQUEST segments
185  */
186 static ddi_dma_attr_t xb_dma_attr = {
187 	DMA_ATTR_V0,
188 	(uint64_t)0,			/* lowest address */
189 	(uint64_t)0xffffffffffffffff,	/* highest usable address */
190 	(uint64_t)0xffffff,		/* DMA counter limit max */
191 	(uint64_t)XB_BSIZE,		/* alignment in bytes */
192 	XB_BSIZE - 1,			/* bitmap of burst sizes */
193 	XB_BSIZE,			/* min transfer */
194 	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
195 	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
196 	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
197 	XB_BSIZE,			/* granularity */
198 	0,				/* flags (reserved) */
199 };
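/*
 * These attributes mirror the blkif request format:  each DMA cookie is
 * limited to a single page (PAGEOFFSET segment length, XB_BSIZE aligned)
 * and one DMA window may contain at most BLKIF_MAX_SEGMENTS_PER_REQUEST
 * cookies, so every window maps onto the segments of a single ring
 * request in xdf_prepare_rreq().
 */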
200 
201 static ddi_device_acc_attr_t xc_acc_attr = {
202 	DDI_DEVICE_ATTR_V0,
203 	DDI_NEVERSWAP_ACC,
204 	DDI_STRICTORDER_ACC
205 };
206 
207 /* callbacks from common label */
208 
209 int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
210 int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
211 
212 static cmlb_tg_ops_t xdf_lb_ops = {
213 	TG_DK_OPS_VERSION_1,
214 	xdf_lb_rdwr,
215 	xdf_lb_getinfo
216 };
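/*
 * cmlb (the common disk label module) calls back through these target
 * disk ops:  xdf_lb_rdwr() for label block I/O and xdf_lb_getinfo() for
 * capacity, geometry, block size, and attribute queries.
 */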
217 
218 int
219 _init(void)
220 {
221 	int rc;
222 
223 	if ((rc = ddi_soft_state_init(&vbd_ss, sizeof (xdf_t), 0)) != 0)
224 		return (rc);
225 
226 	xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
227 	    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
228 	xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
229 	    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
230 
231 #if defined(XPV_HVM_DRIVER)
232 	xdf_hvm_init();
233 #endif /* XPV_HVM_DRIVER */
234 
235 	if ((rc = mod_install(&xdf_modlinkage)) != 0) {
236 #if defined(XPV_HVM_DRIVER)
237 		xdf_hvm_fini();
238 #endif /* XPV_HVM_DRIVER */
239 		kmem_cache_destroy(xdf_vreq_cache);
240 		kmem_cache_destroy(xdf_gs_cache);
241 		ddi_soft_state_fini(&vbd_ss);
242 		return (rc);
243 	}
244 
245 	return (rc);
246 }
247 
248 int
249 _fini(void)
250 {
251 
252 	int err;
253 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
254 		return (err);
255 
256 #if defined(XPV_HVM_DRIVER)
257 	xdf_hvm_fini();
258 #endif /* XPV_HVM_DRIVER */
259 
260 	kmem_cache_destroy(xdf_vreq_cache);
261 	kmem_cache_destroy(xdf_gs_cache);
262 	ddi_soft_state_fini(&vbd_ss);
263 
264 	return (0);
265 }
266 
267 int
268 _info(struct modinfo *modinfop)
269 {
270 	return (mod_info(&xdf_modlinkage, modinfop));
271 }
272 
273 /*ARGSUSED*/
274 static int
275 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
276 {
277 	int instance;
278 	xdf_t *vbdp;
279 
280 	instance = XDF_INST(getminor((dev_t)arg));
281 
282 	switch (cmd) {
283 	case DDI_INFO_DEVT2DEVINFO:
284 		if ((vbdp = ddi_get_soft_state(vbd_ss, instance)) == NULL) {
285 			*rp = NULL;
286 			return (DDI_FAILURE);
287 		}
288 		*rp = vbdp->xdf_dip;
289 		return (DDI_SUCCESS);
290 
291 	case DDI_INFO_DEVT2INSTANCE:
292 		*rp = (void *)(uintptr_t)instance;
293 		return (DDI_SUCCESS);
294 
295 	default:
296 		return (DDI_FAILURE);
297 	}
298 }
299 
300 static int
301 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
302 	char *name, caddr_t valuep, int *lengthp)
303 {
304 	int instance = ddi_get_instance(dip);
305 	xdf_t *vdp;
306 	diskaddr_t p_blkcnt;
307 
308 	/*
309 	 * xdf dynamic properties are device specific and size oriented.
310 	 * Requests issued under conditions where the size is valid are
311 	 * passed to ddi_prop_op_nblocks() with the size information;
312 	 * otherwise the request is passed to ddi_prop_op().
313 	 */
314 	vdp = ddi_get_soft_state(vbd_ss, instance);
315 
316 	if ((dev == DDI_DEV_T_ANY) || (vdp == NULL))
317 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
318 		    name, valuep, lengthp));
319 
320 	/* do cv_wait until connected or failed */
321 	mutex_enter(&vdp->xdf_dev_lk);
322 	if (xdf_connect(vdp, B_TRUE) != XD_READY) {
323 		mutex_exit(&vdp->xdf_dev_lk);
324 		goto out;
325 	}
326 	mutex_exit(&vdp->xdf_dev_lk);
327 
328 	if (cmlb_partinfo(vdp->xdf_vd_lbl, XDF_PART(getminor(dev)), &p_blkcnt,
329 	    NULL, NULL, NULL, NULL) == 0)
330 		return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
331 		    name, valuep, lengthp, (uint64_t)p_blkcnt));
332 
333 out:
334 	return (ddi_prop_op(dev, dip, prop_op, mod_flags, name, valuep,
335 	    lengthp));
336 }
337 
338 static int
339 xdf_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
340 {
341 	xdf_t *vdp;
342 	ddi_iblock_cookie_t softibc;
343 	int instance;
344 
345 	xdfdebug = ddi_prop_get_int(DDI_DEV_T_ANY, devi, DDI_PROP_NOTPROM,
346 	    "xdfdebug", 0);
347 
348 	switch (cmd) {
349 		case DDI_ATTACH:
350 			break;
351 
352 		case DDI_RESUME:
353 			return (xdf_resume(devi));
354 
355 		default:
356 			return (DDI_FAILURE);
357 	}
358 
359 	instance = ddi_get_instance(devi);
360 	if (ddi_soft_state_zalloc(vbd_ss, instance) != DDI_SUCCESS)
361 		return (DDI_FAILURE);
362 
363 	DPRINTF(DDI_DBG, ("xdf%d: attaching\n", instance));
364 	vdp = ddi_get_soft_state(vbd_ss, instance);
365 	ddi_set_driver_private(devi, vdp);
366 	vdp->xdf_dip = devi;
367 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
368 
369 	if (ddi_get_iblock_cookie(devi, 0, &vdp->xdf_ibc) != DDI_SUCCESS) {
370 		cmn_err(CE_WARN, "xdf@%s: failed to get iblock cookie",
371 		    ddi_get_name_addr(devi));
372 		goto errout0;
373 	}
374 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)vdp->xdf_ibc);
375 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)vdp->xdf_ibc);
376 	mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER,
377 	    (void *)vdp->xdf_ibc);
378 
379 	if (ddi_get_soft_iblock_cookie(devi, DDI_SOFTINT_LOW, &softibc)
380 	    != DDI_SUCCESS) {
381 		cmn_err(CE_WARN, "xdf@%s: failed to get softintr iblock cookie",
382 		    ddi_get_name_addr(devi));
383 		goto errout0;
384 	}
385 	if (ddi_add_softintr(devi, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
386 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
387 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
388 		    ddi_get_name_addr(devi));
389 		goto errout0;
390 	}
391 
392 #if !defined(XPV_HVM_DRIVER)
393 	/* create kstat for iostat(1M) */
394 	if (xdf_kstat_create(devi, "xdf", instance) != 0) {
395 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
396 		    ddi_get_name_addr(devi));
397 		goto errout0;
398 	}
399 #endif /* !XPV_HVM_DRIVER */
400 
401 	/* driver handles kernel-issued IOCTLs */
402 	if (ddi_prop_create(DDI_DEV_T_NONE, devi, DDI_PROP_CANSLEEP,
403 	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
404 		cmn_err(CE_WARN, "xdf@%s: cannot create DDI_KERNEL_IOCTL prop",
405 		    ddi_get_name_addr(devi));
406 		goto errout0;
407 	}
408 
409 	/*
410 	 * Initialize the physical geometry structure.  Note that currently
411 	 * we don't know the size of the backend device so the number
412 	 * of blocks on the device will be initialized to zero.  Once
413 	 * we connect to the backend device we'll update the physical
414 	 * geometry to reflect the real size of the device.
415 	 */
416 	xdf_synthetic_pgeom(devi, &vdp->xdf_pgeom);
417 
418 	/*
419 	 * Create the default device minor nodes for a non-removable disk.
420 	 * We will adjust the minor nodes after we are connected to the backend.
421 	 */
422 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
423 	if (cmlb_attach(devi, &xdf_lb_ops, DTYPE_DIRECT, 0, 1, DDI_NT_BLOCK,
424 #if defined(XPV_HVM_DRIVER)
425 	    CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT |
426 	    CMLB_INTERNAL_MINOR_NODES,
427 #else /* !XPV_HVM_DRIVER */
428 	    CMLB_FAKE_LABEL_ONE_PARTITION,
429 #endif /* !XPV_HVM_DRIVER */
430 	    vdp->xdf_vd_lbl, NULL) != 0) {
431 		cmn_err(CE_WARN, "xdf@%s: default cmlb attach failed",
432 		    ddi_get_name_addr(devi));
433 		goto errout0;
434 	}
435 
436 	/*
437 	 * We ship with cache-enabled disks
438 	 */
439 	vdp->xdf_wce = 1;
440 
441 	mutex_enter(&vdp->xdf_cb_lk);
442 
443 	/* Watch backend XenbusState change */
444 	if (xvdi_add_event_handler(devi, XS_OE_STATE,
445 	    xdf_oe_change) != DDI_SUCCESS) {
446 		mutex_exit(&vdp->xdf_cb_lk);
447 		goto errout0;
448 	}
449 
450 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
451 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
452 		    ddi_get_name_addr(devi));
453 		(void) xdf_start_disconnect(vdp);
454 		mutex_exit(&vdp->xdf_cb_lk);
455 		goto errout1;
456 	}
457 
458 	mutex_exit(&vdp->xdf_cb_lk);
459 
460 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
461 	    offsetof(v_req_t, v_link));
462 	list_create(&vdp->xdf_gs_act, sizeof (ge_slot_t),
463 	    offsetof(ge_slot_t, link));
464 
465 #if defined(XPV_HVM_DRIVER)
466 	xdf_hvm_add(devi);
467 
468 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, devi, DDI_NO_AUTODETACH, 1);
469 
470 	/*
471 	 * Report our version to dom0.
472 	 */
473 	if (xenbus_printf(XBT_NULL, "hvmpv/xdf", "version", "%d",
474 	    HVMPV_XDF_VERS))
475 		cmn_err(CE_WARN, "xdf: couldn't write version\n");
476 #endif /* XPV_HVM_DRIVER */
477 
478 	ddi_report_dev(devi);
479 
480 	DPRINTF(DDI_DBG, ("xdf%d: attached\n", instance));
481 
482 	return (DDI_SUCCESS);
483 
484 errout1:
485 	xvdi_remove_event_handler(devi, XS_OE_STATE);
486 errout0:
487 	if (vdp->xdf_vd_lbl != NULL) {
488 		cmlb_detach(vdp->xdf_vd_lbl, NULL);
489 		cmlb_free_handle(&vdp->xdf_vd_lbl);
490 	}
491 #if !defined(XPV_HVM_DRIVER)
492 	xdf_kstat_delete(devi);
493 #endif /* !XPV_HVM_DRIVER */
494 	if (vdp->xdf_softintr_id != NULL)
495 		ddi_remove_softintr(vdp->xdf_softintr_id);
496 	if (vdp->xdf_ibc != NULL) {
497 		mutex_destroy(&vdp->xdf_cb_lk);
498 		mutex_destroy(&vdp->xdf_dev_lk);
499 	}
500 	cv_destroy(&vdp->xdf_dev_cv);
501 	ddi_soft_state_free(vbd_ss, instance);
502 	ddi_set_driver_private(devi, NULL);
503 	ddi_prop_remove_all(devi);
504 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(devi));
505 	return (DDI_FAILURE);
506 }
507 
508 static int
509 xdf_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
510 {
511 	xdf_t *vdp;
512 	int instance;
513 
514 	switch (cmd) {
515 
516 	case DDI_PM_SUSPEND:
517 		break;
518 
519 	case DDI_SUSPEND:
520 		return (xdf_suspend(devi));
521 
522 	case DDI_DETACH:
523 		break;
524 
525 	default:
526 		return (DDI_FAILURE);
527 	}
528 
529 	instance = ddi_get_instance(devi);
530 	DPRINTF(DDI_DBG, ("xdf%d: detaching\n", instance));
531 	vdp = ddi_get_soft_state(vbd_ss, instance);
532 
533 	if (vdp == NULL)
534 		return (DDI_FAILURE);
535 
536 	mutex_enter(&vdp->xdf_dev_lk);
537 	if (xdf_isopen(vdp, -1)) {
538 		mutex_exit(&vdp->xdf_dev_lk);
539 		return (DDI_FAILURE);
540 	}
541 
542 	if (vdp->xdf_status != XD_CLOSED) {
543 		mutex_exit(&vdp->xdf_dev_lk);
544 		return (DDI_FAILURE);
545 	}
546 
547 #if defined(XPV_HVM_DRIVER)
548 	xdf_hvm_rm(devi);
549 #endif /* XPV_HVM_DRIVER */
550 
551 	ASSERT(!ISDMACBON(vdp));
552 	mutex_exit(&vdp->xdf_dev_lk);
553 
554 	if (vdp->xdf_timeout_id != 0)
555 		(void) untimeout(vdp->xdf_timeout_id);
556 
557 	xvdi_remove_event_handler(devi, XS_OE_STATE);
558 
559 	/* we'll support backend running in domU later */
560 #ifdef	DOMU_BACKEND
561 	(void) xvdi_post_event(devi, XEN_HP_REMOVE);
562 #endif
563 
564 	list_destroy(&vdp->xdf_vreq_act);
565 	list_destroy(&vdp->xdf_gs_act);
566 	ddi_prop_remove_all(devi);
567 	xdf_kstat_delete(devi);
568 	ddi_remove_softintr(vdp->xdf_softintr_id);
569 	ddi_set_driver_private(devi, NULL);
570 	cv_destroy(&vdp->xdf_dev_cv);
571 	mutex_destroy(&vdp->xdf_cb_lk);
572 	mutex_destroy(&vdp->xdf_dev_lk);
573 	if (vdp->xdf_cache_flush_block != NULL)
574 		kmem_free(vdp->xdf_flush_mem, 2 * DEV_BSIZE);
575 	ddi_soft_state_free(vbd_ss, instance);
576 	return (DDI_SUCCESS);
577 }
578 
579 static int
580 xdf_suspend(dev_info_t *devi)
581 {
582 	xdf_t *vdp;
583 	int instance;
584 	enum xdf_state st;
585 
586 	instance = ddi_get_instance(devi);
587 
588 	if (xdfdebug & SUSRES_DBG)
589 		xen_printf("xdf_suspend: xdf#%d\n", instance);
590 
591 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
592 		return (DDI_FAILURE);
593 
594 	xvdi_suspend(devi);
595 
596 	mutex_enter(&vdp->xdf_cb_lk);
597 	mutex_enter(&vdp->xdf_dev_lk);
598 	st = vdp->xdf_status;
599 	/* change status to stop further I/O requests */
600 	if (st == XD_READY)
601 		vdp->xdf_status = XD_SUSPEND;
602 	mutex_exit(&vdp->xdf_dev_lk);
603 	mutex_exit(&vdp->xdf_cb_lk);
604 
605 	/* make sure no more I/O responses left in the ring buffer */
606 	if ((st == XD_INIT) || (st == XD_READY)) {
607 #ifdef XPV_HVM_DRIVER
608 		ec_unbind_evtchn(vdp->xdf_evtchn);
609 #else /* !XPV_HVM_DRIVER */
610 		(void) ddi_remove_intr(devi, 0, NULL);
611 #endif /* !XPV_HVM_DRIVER */
612 		(void) xdf_drain_io(vdp);
613 		/*
614 		 * No need to tear down the ring buffer here; it will simply
615 		 * be re-initialized during resume when we call
616 		 * xvdi_alloc_ring.
617 		 */
618 	}
619 
620 	if (xdfdebug & SUSRES_DBG)
621 		xen_printf("xdf_suspend: SUCCESS\n");
622 
623 	return (DDI_SUCCESS);
624 }
625 
626 /*ARGSUSED*/
627 static int
628 xdf_resume(dev_info_t *devi)
629 {
630 	xdf_t *vdp;
631 	int instance;
632 
633 	instance = ddi_get_instance(devi);
634 	if (xdfdebug & SUSRES_DBG)
635 		xen_printf("xdf_resume: xdf%d\n", instance);
636 
637 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
638 		return (DDI_FAILURE);
639 
640 	mutex_enter(&vdp->xdf_cb_lk);
641 
642 	if (xvdi_resume(devi) != DDI_SUCCESS) {
643 		mutex_exit(&vdp->xdf_cb_lk);
644 		return (DDI_FAILURE);
645 	}
646 
647 	mutex_enter(&vdp->xdf_dev_lk);
648 	ASSERT(vdp->xdf_status != XD_READY);
649 	vdp->xdf_status = XD_UNKNOWN;
650 	mutex_exit(&vdp->xdf_dev_lk);
651 
652 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
653 		mutex_exit(&vdp->xdf_cb_lk);
654 		return (DDI_FAILURE);
655 	}
656 
657 	mutex_exit(&vdp->xdf_cb_lk);
658 
659 	if (xdfdebug & SUSRES_DBG)
660 		xen_printf("xdf_resume: done\n");
661 	return (DDI_SUCCESS);
662 }
663 
664 /*ARGSUSED*/
665 static int
666 xdf_reset(dev_info_t *devi, ddi_reset_cmd_t cmd)
667 {
668 	xdf_t *vdp;
669 	int instance;
670 
671 	instance = ddi_get_instance(devi);
672 	DPRINTF(DDI_DBG, ("xdf%d: resetting\n", instance));
673 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
674 		return (DDI_FAILURE);
675 
676 	/*
677 	 * wait for any outstanding I/O to complete
678 	 */
679 	(void) xdf_drain_io(vdp);
680 
681 	DPRINTF(DDI_DBG, ("xdf%d: reset complete\n", instance));
682 	return (DDI_SUCCESS);
683 }
684 
685 static int
686 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
687 {
688 	minor_t	minor;
689 	xdf_t	*vdp;
690 	int part;
691 	ulong_t parbit;
692 	diskaddr_t p_blkct = 0;
693 	boolean_t firstopen;
694 	boolean_t nodelay;
695 
696 	minor = getminor(*devp);
697 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
698 		return (ENXIO);
699 
700 	nodelay = (flag & (FNDELAY | FNONBLOCK));
701 
702 	DPRINTF(DDI_DBG, ("xdf%d: opening\n", XDF_INST(minor)));
703 
704 	/* do cv_wait until connected or failed */
705 	mutex_enter(&vdp->xdf_dev_lk);
706 	if (!nodelay && (xdf_connect(vdp, B_TRUE) != XD_READY)) {
707 		mutex_exit(&vdp->xdf_dev_lk);
708 		return (ENXIO);
709 	}
710 
711 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
712 		mutex_exit(&vdp->xdf_dev_lk);
713 		return (EROFS);
714 	}
715 
716 	part = XDF_PART(minor);
717 	parbit = 1 << part;
718 	if ((vdp->xdf_vd_exclopen & parbit) ||
719 	    ((flag & FEXCL) && xdf_isopen(vdp, part))) {
720 		mutex_exit(&vdp->xdf_dev_lk);
721 		return (EBUSY);
722 	}
723 
724 	/* are we the first one to open this node? */
725 	firstopen = !xdf_isopen(vdp, -1);
726 
727 	if (otyp == OTYP_LYR)
728 		vdp->xdf_vd_lyropen[part]++;
729 
730 	vdp->xdf_vd_open[otyp] |= parbit;
731 
732 	if (flag & FEXCL)
733 		vdp->xdf_vd_exclopen |= parbit;
734 
735 	mutex_exit(&vdp->xdf_dev_lk);
736 
737 	/* force a re-validation */
738 	if (firstopen)
739 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
740 
741 	/*
742 	 * Check the partition size.  Ignore CD/DVD devices, which
743 	 * contain a zero-sized s0.
744 	 */
745 	if (!nodelay && !XD_IS_CD(vdp) &&
746 	    ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
747 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0))) {
748 		(void) xdf_close(*devp, flag, otyp, credp);
749 		return (ENXIO);
750 	}
751 
752 	return (0);
753 }
754 
755 /*ARGSUSED*/
756 static int
757 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
758 {
759 	minor_t	minor;
760 	xdf_t	*vdp;
761 	int part;
762 	ulong_t parbit;
763 
764 	minor = getminor(dev);
765 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
766 		return (ENXIO);
767 
768 	mutex_enter(&vdp->xdf_dev_lk);
769 	part = XDF_PART(minor);
770 	if (!xdf_isopen(vdp, part)) {
771 		mutex_exit(&vdp->xdf_dev_lk);
772 		return (ENXIO);
773 	}
774 	parbit = 1 << part;
775 
776 	ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
777 	if (otyp == OTYP_LYR) {
778 		ASSERT(vdp->xdf_vd_lyropen[part] > 0);
779 		if (--vdp->xdf_vd_lyropen[part] == 0)
780 			vdp->xdf_vd_open[otyp] &= ~parbit;
781 	} else {
782 		vdp->xdf_vd_open[otyp] &= ~parbit;
783 	}
784 	vdp->xdf_vd_exclopen &= ~parbit;
785 
786 	mutex_exit(&vdp->xdf_dev_lk);
787 	return (0);
788 }
789 
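/*
 * xdf_strategy() validates the request against the partition (or whole
 * disk) limits, appends the buf to the driver's active list
 * (xdf_f_act/xdf_l_act), and calls xdf_iostart() to turn queued bufs into
 * blkif ring requests.  Completion is normally asynchronous: xdf_intr()
 * collects the backend's responses and xdf_iofini() finishes the buf with
 * biodone(); when do_polled_io is set the request is drained synchronously
 * via xdf_drain_io().
 */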
790 static int
791 xdf_strategy(struct buf *bp)
792 {
793 	xdf_t	*vdp;
794 	minor_t minor;
795 	diskaddr_t p_blkct, p_blkst;
796 	ulong_t nblks;
797 	int part;
798 
799 	minor = getminor(bp->b_edev);
800 	part = XDF_PART(minor);
801 
802 	vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor));
803 	if ((vdp == NULL) || !xdf_isopen(vdp, part)) {
804 		bioerror(bp, ENXIO);
805 		bp->b_resid = bp->b_bcount;
806 		biodone(bp);
807 		return (0);
808 	}
809 
810 	/* Check for writes to a read only device */
811 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
812 		bioerror(bp, EROFS);
813 		bp->b_resid = bp->b_bcount;
814 		biodone(bp);
815 		return (0);
816 	}
817 
818 	/* Check if this I/O is accessing a partition or the entire disk */
819 	if ((long)bp->b_private == XB_SLICE_NONE) {
820 		/* This I/O is using an absolute offset */
821 		p_blkct = vdp->xdf_xdev_nblocks;
822 		p_blkst = 0;
823 	} else {
824 		/* This I/O is using a partition relative offset */
825 		if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
826 		    &p_blkst, NULL, NULL, NULL)) {
827 			bioerror(bp, ENXIO);
828 			bp->b_resid = bp->b_bcount;
829 			biodone(bp);
830 			return (0);
831 		}
832 	}
833 
834 	/* check for a starting block beyond the disk or partition limit */
835 	if (bp->b_blkno > p_blkct) {
836 		DPRINTF(IO_DBG, ("xdf: block %lld exceeds VBD size %"PRIu64,
837 		    (longlong_t)bp->b_blkno, (uint64_t)p_blkct));
838 		bioerror(bp, EINVAL);
839 		bp->b_resid = bp->b_bcount;
840 		biodone(bp);
841 		return (0);
842 	}
843 
844 	/* Legacy: don't set the error flag in this case */
845 	if (bp->b_blkno == p_blkct) {
846 		bp->b_resid = bp->b_bcount;
847 		biodone(bp);
848 		return (0);
849 	}
850 
851 	/* Adjust for partial transfer */
852 	nblks = bp->b_bcount >> XB_BSHIFT;
853 	if ((bp->b_blkno + nblks) > p_blkct) {
854 		bp->b_resid = ((bp->b_blkno + nblks) - p_blkct) << XB_BSHIFT;
855 		bp->b_bcount -= bp->b_resid;
856 	}
857 
858 	DPRINTF(IO_DBG, ("xdf: strategy blk %lld len %lu\n",
859 	    (longlong_t)bp->b_blkno, (ulong_t)bp->b_bcount));
860 
861 	/* Fix up the buf struct */
862 	bp->b_flags |= B_BUSY;
863 	bp->av_forw = bp->av_back = NULL; /* not tagged with a v_req */
864 	bp->b_private = (void *)(uintptr_t)p_blkst;
865 
866 	mutex_enter(&vdp->xdf_dev_lk);
867 	if (vdp->xdf_xdev_iostat != NULL)
868 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
869 	if (vdp->xdf_f_act == NULL) {
870 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
871 	} else {
872 		vdp->xdf_l_act->av_forw = bp;
873 		vdp->xdf_l_act = bp;
874 	}
875 	mutex_exit(&vdp->xdf_dev_lk);
876 
877 	xdf_iostart(vdp);
878 	if (do_polled_io)
879 		(void) xdf_drain_io(vdp);
880 	return (0);
881 }
882 
883 /*ARGSUSED*/
884 static int
885 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
886 {
887 
888 	xdf_t	*vdp;
889 	minor_t minor;
890 	diskaddr_t p_blkcnt;
891 	int part;
892 
893 	minor = getminor(dev);
894 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
895 		return (ENXIO);
896 
897 	DPRINTF(IO_DBG, ("xdf: read offset 0x%"PRIx64"\n",
898 	    (int64_t)uiop->uio_offset));
899 
900 	part = XDF_PART(minor);
901 	if (!xdf_isopen(vdp, part))
902 		return (ENXIO);
903 
904 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
905 	    NULL, NULL, NULL, NULL))
906 		return (ENXIO);
907 
908 	if (U_INVAL(uiop))
909 		return (EINVAL);
910 
911 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
912 }
913 
914 /*ARGSUSED*/
915 static int
916 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
917 {
918 	xdf_t *vdp;
919 	minor_t minor;
920 	diskaddr_t p_blkcnt;
921 	int part;
922 
923 	minor = getminor(dev);
924 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
925 		return (ENXIO);
926 
927 	DPRINTF(IO_DBG, ("xdf: write offset 0x%"PRIx64"\n",
928 	    (int64_t)uiop->uio_offset));
929 
930 	part = XDF_PART(minor);
931 	if (!xdf_isopen(vdp, part))
932 		return (ENXIO);
933 
934 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
935 	    NULL, NULL, NULL, NULL))
936 		return (ENXIO);
937 
938 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
939 		return (ENOSPC);
940 
941 	if (U_INVAL(uiop))
942 		return (EINVAL);
943 
944 	return (physio(xdf_strategy, NULL, dev, B_WRITE, minphys, uiop));
945 }
946 
947 /*ARGSUSED*/
948 static int
949 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
950 {
951 	xdf_t	*vdp;
952 	minor_t minor;
953 	struct uio *uiop = aiop->aio_uio;
954 	diskaddr_t p_blkcnt;
955 	int part;
956 
957 	minor = getminor(dev);
958 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
959 		return (ENXIO);
960 
961 	part = XDF_PART(minor);
962 	if (!xdf_isopen(vdp, part))
963 		return (ENXIO);
964 
965 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
966 	    NULL, NULL, NULL, NULL))
967 		return (ENXIO);
968 
969 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
970 		return (ENOSPC);
971 
972 	if (U_INVAL(uiop))
973 		return (EINVAL);
974 
975 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, minphys, aiop));
976 }
977 
978 /*ARGSUSED*/
979 static int
980 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
981 {
982 	xdf_t *vdp;
983 	minor_t minor;
984 	struct uio *uiop = aiop->aio_uio;
985 	diskaddr_t p_blkcnt;
986 	int part;
987 
988 	minor = getminor(dev);
989 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
990 		return (ENXIO);
991 
992 	part = XDF_PART(minor);
993 	if (!xdf_isopen(vdp, part))
994 		return (ENXIO);
995 
996 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
997 	    NULL, NULL, NULL, NULL))
998 		return (ENXIO);
999 
1000 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
1001 		return (ENOSPC);
1002 
1003 	if (U_INVAL(uiop))
1004 		return (EINVAL);
1005 
1006 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, minphys, aiop));
1007 }
1008 
1009 static int
1010 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1011 {
1012 	struct buf dumpbuf, *dbp;
1013 	xdf_t	*vdp;
1014 	minor_t minor;
1015 	int err = 0;
1016 	int part;
1017 	diskaddr_t p_blkcnt, p_blkst;
1018 
1019 	minor = getminor(dev);
1020 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
1021 		return (ENXIO);
1022 
1023 	DPRINTF(IO_DBG, ("xdf: dump addr (0x%p) blk (%ld) nblks (%d)\n",
1024 	    addr, blkno, nblk));
1025 
1026 	part = XDF_PART(minor);
1027 	if (!xdf_isopen(vdp, part))
1028 		return (ENXIO);
1029 
1030 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
1031 	    NULL, NULL, NULL))
1032 		return (ENXIO);
1033 
1034 	if ((blkno + nblk) > p_blkcnt) {
1035 		cmn_err(CE_WARN, "xdf: block %ld exceeds VBD size %"PRIu64,
1036 		    blkno + nblk, (uint64_t)p_blkcnt);
1037 		return (EINVAL);
1038 	}
1039 
1040 	dbp = &dumpbuf;
1041 	bioinit(dbp);
1042 	dbp->b_flags = B_BUSY;
1043 	dbp->b_un.b_addr = addr;
1044 	dbp->b_bcount = nblk << DEV_BSHIFT;
1045 	dbp->b_blkno = blkno;
1046 	dbp->b_edev = dev;
1047 	dbp->b_private = (void *)(uintptr_t)p_blkst;
1048 
1049 	mutex_enter(&vdp->xdf_dev_lk);
1050 	if (vdp->xdf_xdev_iostat != NULL)
1051 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1052 	if (vdp->xdf_f_act == NULL) {
1053 		vdp->xdf_f_act = vdp->xdf_l_act = dbp;
1054 	} else {
1055 		vdp->xdf_l_act->av_forw = dbp;
1056 		vdp->xdf_l_act = dbp;
1057 	}
1058 	dbp->av_forw = NULL;
1059 	dbp->av_back = NULL;
1060 	mutex_exit(&vdp->xdf_dev_lk);
1061 	xdf_iostart(vdp);
1062 	err = xdf_drain_io(vdp);
1063 	biofini(dbp);
1064 	return (err);
1065 }
1066 
1067 /*ARGSUSED*/
1068 static int
1069 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1070     int *rvalp)
1071 {
1072 	int instance;
1073 	xdf_t	*vdp;
1074 	minor_t minor;
1075 	int part;
1076 
1077 	minor = getminor(dev);
1078 	instance = XDF_INST(minor);
1079 
1080 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
1081 		return (ENXIO);
1082 
1083 	DPRINTF(IOCTL_DBG, ("xdf%d:ioctl: cmd %d (0x%x)\n",
1084 	    instance, cmd, cmd));
1085 
1086 	part = XDF_PART(minor);
1087 	if (!xdf_isopen(vdp, part))
1088 		return (ENXIO);
1089 
1090 	switch (cmd) {
1091 	case DKIOCGMEDIAINFO: {
1092 		struct dk_minfo	media_info;
1093 
1094 		media_info.dki_lbsize = DEV_BSIZE;
1095 		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
1096 		media_info.dki_media_type = DK_FIXED_DISK;
1097 
1098 		if (ddi_copyout(&media_info, (void *)arg,
1099 		    sizeof (struct dk_minfo), mode)) {
1100 			return (EFAULT);
1101 		} else {
1102 			return (0);
1103 		}
1104 	}
1105 
1106 	case DKIOCINFO: {
1107 		struct dk_cinfo info;
1108 
1109 		/* controller information */
1110 		if (XD_IS_CD(vdp))
1111 			info.dki_ctype = DKC_CDROM;
1112 		else
1113 			info.dki_ctype = DKC_VBD;
1114 
1115 		info.dki_cnum = 0;
1116 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
1117 
1118 		/* unit information */
1119 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
1120 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
1121 		info.dki_flags = DKI_FMTVOL;
1122 		info.dki_partition = part;
1123 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
1124 		info.dki_addr = 0;
1125 		info.dki_space = 0;
1126 		info.dki_prio = 0;
1127 		info.dki_vec = 0;
1128 
1129 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
1130 			return (EFAULT);
1131 		else
1132 			return (0);
1133 	}
1134 
1135 	case DKIOCSTATE: {
1136 		enum dkio_state	dkstate = DKIO_INSERTED;
1137 		if (ddi_copyout(&dkstate, (void *)arg, sizeof (dkstate),
1138 		    mode) != 0)
1139 			return (EFAULT);
1140 		return (0);
1141 	}
1142 
1143 	/*
1144 	 * is media removable?
1145 	 */
1146 	case DKIOCREMOVABLE: {
1147 		int i = XD_IS_RM(vdp) ? 1 : 0;
1148 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), mode))
1149 			return (EFAULT);
1150 		return (0);
1151 	}
1152 
1153 	case DKIOCG_PHYGEOM:
1154 	case DKIOCG_VIRTGEOM:
1155 	case DKIOCGGEOM:
1156 	case DKIOCSGEOM:
1157 	case DKIOCGAPART:
1158 	case DKIOCSAPART:
1159 	case DKIOCGVTOC:
1160 	case DKIOCSVTOC:
1161 	case DKIOCPARTINFO:
1162 	case DKIOCGMBOOT:
1163 	case DKIOCSMBOOT:
1164 	case DKIOCGETEFI:
1165 	case DKIOCSETEFI:
1166 	case DKIOCPARTITION: {
1167 		int rc;
1168 
1169 		rc = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
1170 		    rvalp, NULL);
1171 		return (rc);
1172 	}
1173 
1174 	case DKIOCGETWCE:
1175 		if (ddi_copyout(&vdp->xdf_wce, (void *)arg,
1176 		    sizeof (vdp->xdf_wce), mode))
1177 			return (EFAULT);
1178 		return (0);
1179 	case DKIOCSETWCE:
1180 		if (ddi_copyin((void *)arg, &vdp->xdf_wce,
1181 		    sizeof (vdp->xdf_wce), mode))
1182 			return (EFAULT);
1183 		return (0);
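	/*
	 * DKIOCFLUSHWRITECACHE:  if the backend supports the cache-flush op,
	 * issue a zero-length TG_WRITE, which the request setup path treats
	 * as a flush (see IS_FLUSH_DISKCACHE()); otherwise fall back to a
	 * barrier re-write of the cached flush block, or fail with ENOTTY
	 * if barrier flushes are disabled or unsupported.
	 */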
1184 	case DKIOCFLUSHWRITECACHE: {
1185 		int rc;
1186 		struct dk_callback *dkc = (struct dk_callback *)arg;
1187 
1188 		if (vdp->xdf_flush_supported) {
1189 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1190 			    NULL, 0, 0, (void *)dev);
1191 		} else if (vdp->xdf_feature_barrier &&
1192 		    !xdf_barrier_flush_disable) {
1193 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1194 			    vdp->xdf_cache_flush_block, xdf_flush_block,
1195 			    DEV_BSIZE, (void *)dev);
1196 		} else {
1197 			return (ENOTTY);
1198 		}
1199 		if ((mode & FKIOCTL) && (dkc != NULL) &&
1200 		    (dkc->dkc_callback != NULL)) {
1201 			(*dkc->dkc_callback)(dkc->dkc_cookie, rc);
1202 			/* need to return 0 after calling callback */
1203 			rc = 0;
1204 		}
1205 		return (rc);
1206 	}
1207 
1208 	default:
1209 		return (ENOTTY);
1210 	}
1211 }
1212 
1213 /*
1214  * xdf interrupt handler
1215  */
1216 static uint_t
1217 xdf_intr(caddr_t arg)
1218 {
1219 	xdf_t *vdp = (xdf_t *)arg;
1220 	xendev_ring_t *xbr;
1221 	blkif_response_t *resp;
1222 	int bioerr;
1223 	uint64_t id;
1224 	extern int do_polled_io;
1225 	uint8_t op;
1226 	uint16_t status;
1227 	ddi_acc_handle_t acchdl;
1228 
1229 	mutex_enter(&vdp->xdf_dev_lk);
1230 
1231 	if ((xbr = vdp->xdf_xb_ring) == NULL) {
1232 		mutex_exit(&vdp->xdf_dev_lk);
1233 		return (DDI_INTR_UNCLAIMED);
1234 	}
1235 
1236 	acchdl = vdp->xdf_xb_ring_hdl;
1237 
1238 	/*
1239 	 * complete all requests which have a response
1240 	 */
1241 	while (resp = xvdi_ring_get_response(xbr)) {
1242 		id = ddi_get64(acchdl, &resp->id);
1243 		op = ddi_get8(acchdl, &resp->operation);
1244 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
1245 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
1246 		    op, id, status));
1247 
1248 		/*
1249 		 * XXPV - close connection to the backend and restart
1250 		 */
1251 		if (status != BLKIF_RSP_OKAY) {
1252 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
1253 			    ddi_get_name_addr(vdp->xdf_dip),
1254 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
1255 			bioerr = EIO;
1256 		} else {
1257 			bioerr = 0;
1258 		}
1259 
1260 		xdf_iofini(vdp, id, bioerr);
1261 	}
1262 
1263 	mutex_exit(&vdp->xdf_dev_lk);
1264 
1265 	if (!do_polled_io)
1266 		xdf_iostart(vdp);
1267 
1268 	return (DDI_INTR_CLAIMED);
1269 }
1270 
1271 int xdf_fbrewrites;	/* how many times was our flush block rewritten */
1272 
1273 /*
1274  * Snarf new data if our flush block was re-written
1275  */
1276 static void
1277 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
1278 {
1279 	int nblks;
1280 	boolean_t mapin;
1281 
1282 	if (IS_WRITE_BARRIER(vdp, bp))
1283 		return; /* write was a flush write */
1284 
1285 	mapin = B_FALSE;
1286 	nblks = bp->b_bcount >> DEV_BSHIFT;
1287 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
1288 		xdf_fbrewrites++;
1289 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
1290 			mapin = B_TRUE;
1291 			bp_mapin(bp);
1292 		}
1293 		bcopy(bp->b_un.b_addr +
1294 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
1295 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
1296 		if (mapin)
1297 			bp_mapout(bp);
1298 	}
1299 }
1300 
1301 static void
1302 xdf_iofini(xdf_t *vdp, uint64_t id, int bioerr)
1303 {
1304 	ge_slot_t *gs = (ge_slot_t *)(uintptr_t)id;
1305 	v_req_t *vreq = gs->vreq;
1306 	buf_t *bp = vreq->v_buf;
1307 
1308 	gs_free(vdp, gs);
1309 	if (bioerr)
1310 		bioerror(bp, bioerr);
1311 	vreq->v_nslots--;
1312 	if (vreq->v_nslots != 0)
1313 		return;
1314 
1315 	XDF_UPDATE_IO_STAT(vdp, bp);
1316 	if (vdp->xdf_xdev_iostat != NULL)
1317 		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1318 
1319 	if (IS_ERROR(bp))
1320 		bp->b_resid = bp->b_bcount;
1321 
1322 	vreq_free(vdp, vreq);
1323 	biodone(bp);
1324 }
1325 
1326 /*
1327  * return value of xdf_prepare_rreq()
1328  * used in xdf_iostart()
1329  */
1330 #define	XF_PARTIAL	0 /* rreq is full, not all I/O in buf transferred */
1331 #define	XF_COMP		1 /* no more I/O left in buf */
1332 
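/*
 * xdf_iostart() is the core of the I/O path:  it walks the active buf
 * list, allocates a v_req and DMA/grant-table resources for each buf,
 * fills in ring requests via xdf_prepare_rreq(), and then pushes the
 * requests and notifies the backend.  It stops early if the device is not
 * XD_READY, if resources are exhausted, or if the ring is full; xdf_intr()
 * restarts it once responses free up ring slots.
 */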
1333 static void
1334 xdf_iostart(xdf_t *vdp)
1335 {
1336 	xendev_ring_t *xbr;
1337 	struct buf *bp;
1338 	blkif_request_t *rreq;
1339 	int retval;
1340 	int rreqready = 0;
1341 
1342 	xbr = vdp->xdf_xb_ring;
1343 
1344 	/*
1345 	 * populate the ring request(s)
1346 	 *
1347 	 * loop until there is no buf to transfer or no free slot
1348 	 * available in I/O ring
1349 	 */
1350 	mutex_enter(&vdp->xdf_dev_lk);
1351 
1352 	for (;;) {
1353 		if (vdp->xdf_status != XD_READY)
1354 			break;
1355 
1356 		/* active buf queue empty? */
1357 		if ((bp = vdp->xdf_f_act) == NULL)
1358 			break;
1359 
1360 		/* try to grab a vreq for this bp */
1361 		if ((BP2VREQ(bp) == NULL) && (vreq_get(vdp, bp) == NULL))
1362 			break;
1363 		/* alloc DMA/GTE resources */
1364 		if (vreq_setup(vdp, BP2VREQ(bp)) != DDI_SUCCESS)
1365 			break;
1366 
1367 		/* get next blkif_request in the ring */
1368 		if ((rreq = xvdi_ring_get_request(xbr)) == NULL)
1369 			break;
1370 		bzero(rreq, sizeof (blkif_request_t));
1371 
1372 		/* populate blkif_request with this buf */
1373 		rreqready++;
1374 		retval = xdf_prepare_rreq(vdp, bp, rreq);
1375 		if (retval == XF_COMP) {
1376 			/* finish this bp, switch to next one */
1377 			if (vdp->xdf_xdev_iostat != NULL)
1378 				kstat_waitq_to_runq(
1379 				    KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1380 			vdp->xdf_f_act = bp->av_forw;
1381 			bp->av_forw = NULL;
1382 		}
1383 	}
1384 
1385 	/*
1386 	 * Send the request(s) to the backend
1387 	 */
1388 	if (rreqready) {
1389 		if (xvdi_ring_push_request(xbr)) {
1390 			DPRINTF(IO_DBG, ("xdf_iostart: "
1391 			    "sent request(s) to backend\n"));
1392 			xvdi_notify_oe(vdp->xdf_dip);
1393 		}
1394 	}
1395 
1396 	mutex_exit(&vdp->xdf_dev_lk);
1397 }
1398 
1399 /*
1400  * populate a single blkif_request_t w/ a buf
1401  */
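/*
 * Each ring request carries up to BLKIF_MAX_SEGMENTS_PER_REQUEST segments;
 * a segment is one granted page plus first_sect/last_sect, the 512-byte
 * sector range within that page being transferred.  The request id holds
 * the ge_slot_t pointer so that xdf_iofini() can locate the grant slots
 * and the originating v_req/buf when the response arrives.
 */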
1402 static int
1403 xdf_prepare_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1404 {
1405 	int		rval;
1406 	grant_ref_t	gr;
1407 	uint8_t		fsect, lsect;
1408 	size_t		bcnt;
1409 	paddr_t		dma_addr;
1410 	off_t		blk_off;
1411 	dev_info_t	*dip = vdp->xdf_dip;
1412 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1413 	v_req_t		*vreq = BP2VREQ(bp);
1414 	uint64_t	blkno = vreq->v_blkno;
1415 	uint_t		ndmacs = vreq->v_ndmacs;
1416 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1417 	int		seg = 0;
1418 	int		isread = IS_READ(bp);
1419 
1420 	if (isread)
1421 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1422 	else {
1423 		switch (vreq->v_flush_diskcache) {
1424 		case FLUSH_DISKCACHE:
1425 			ddi_put8(acchdl, &rreq->operation,
1426 			    BLKIF_OP_FLUSH_DISKCACHE);
1427 			ddi_put16(acchdl, &rreq->handle, vdev);
1428 			ddi_put64(acchdl, &rreq->id,
1429 			    (uint64_t)(uintptr_t)(vreq->v_gs));
1430 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1431 			return (XF_COMP);
1432 		case WRITE_BARRIER:
1433 			ddi_put8(acchdl, &rreq->operation,
1434 			    BLKIF_OP_WRITE_BARRIER);
1435 			break;
1436 		default:
1437 			if (!vdp->xdf_wce)
1438 				ddi_put8(acchdl, &rreq->operation,
1439 				    BLKIF_OP_WRITE_BARRIER);
1440 			else
1441 				ddi_put8(acchdl, &rreq->operation,
1442 				    BLKIF_OP_WRITE);
1443 			break;
1444 		}
1445 	}
1446 
1447 	ddi_put16(acchdl, &rreq->handle, vdev);
1448 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1449 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(vreq->v_gs));
1450 
1451 	/*
1452 	 * loop until all segments are populated or the buf has no more DMA cookies
1453 	 */
1454 	for (;;) {
1455 	/*
1456 	 * Each segment of a blkif request can transfer up to
1457 	 * one 4K page of data.
1458 	 */
1459 		bcnt = vreq->v_dmac.dmac_size;
1460 		ASSERT(bcnt <= PAGESIZE);
1461 		ASSERT((bcnt % XB_BSIZE) == 0);
1462 		dma_addr = vreq->v_dmac.dmac_laddress;
1463 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1464 		ASSERT((blk_off & XB_BMASK) == 0);
1465 		fsect = blk_off >> XB_BSHIFT;
1466 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1467 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1468 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1469 		DPRINTF(IO_DBG, ("  ""seg%d: dmacS %lu blk_off %ld\n",
1470 		    seg, vreq->v_dmac.dmac_size, blk_off));
1471 		gr = gs_grant(vreq->v_gs, PATOMA(dma_addr) >> PAGESHIFT);
1472 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1473 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1474 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1475 		DPRINTF(IO_DBG, ("  ""seg%d: fs %d ls %d gr %d dma 0x%"PRIx64
1476 		    "\n", seg, fsect, lsect, gr, dma_addr));
1477 
1478 		blkno += (bcnt >> XB_BSHIFT);
1479 		seg++;
1480 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1481 		if (--ndmacs) {
1482 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1483 			continue;
1484 		}
1485 
1486 		vreq->v_status = VREQ_DMAWIN_DONE;
1487 		vreq->v_blkno = blkno;
1488 		if (vreq->v_dmaw + 1 == vreq->v_ndmaws)
1489 			/* last win */
1490 			rval = XF_COMP;
1491 		else
1492 			rval = XF_PARTIAL;
1493 		break;
1494 	}
1495 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1496 	DPRINTF(IO_DBG, ("xdf_prepare_rreq: request id=%"PRIx64" ready\n",
1497 	    rreq->id));
1498 
1499 	return (rval);
1500 }
1501 
1502 #define	XDF_QSEC	50000	/* .05 second */
1503 #define	XDF_POLLCNT	12	/* poll 12 times before timing out */
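/*
 * The polling loop in xdf_drain_io() doubles its wait on each iteration,
 * i.e. it waits XDF_QSEC << pollc microseconds for pollc = 0..11, for a
 * total of roughly 50000 * (2^12 - 1) usec, or about 205 seconds, before
 * declaring a timeout.
 */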
1504 
1505 static int
1506 xdf_drain_io(xdf_t *vdp)
1507 {
1508 	int pollc, rval;
1509 	xendev_ring_t *xbr;
1510 
1511 	if (xdfdebug & SUSRES_DBG)
1512 		xen_printf("xdf_drain_io: start\n");
1513 
1514 	mutex_enter(&vdp->xdf_dev_lk);
1515 
1516 	if ((vdp->xdf_status != XD_READY) && (vdp->xdf_status != XD_SUSPEND))
1517 		goto out;
1518 
1519 	rval = 0;
1520 	xbr = vdp->xdf_xb_ring;
1521 	ASSERT(xbr != NULL);
1522 
1523 	for (pollc = 0; pollc < XDF_POLLCNT; pollc++) {
1524 		if (xvdi_ring_has_unconsumed_responses(xbr)) {
1525 			mutex_exit(&vdp->xdf_dev_lk);
1526 			(void) xdf_intr((caddr_t)vdp);
1527 			mutex_enter(&vdp->xdf_dev_lk);
1528 		}
1529 		if (!xvdi_ring_has_incomp_request(xbr))
1530 			goto out;
1531 
1532 #ifndef	XPV_HVM_DRIVER
1533 		(void) HYPERVISOR_yield();
1534 #endif /* XPV_HVM_DRIVER */
1535 		/*
1536 		 * file-backed devices can be slow
1537 		 */
1538 		drv_usecwait(XDF_QSEC << pollc);
1539 	}
1540 	cmn_err(CE_WARN, "xdf_drain_io: timeout");
1541 	rval = EIO;
1542 out:
1543 	mutex_exit(&vdp->xdf_dev_lk);
1544 	if (xdfdebug & SUSRES_DBG)
1545 		xen_printf("xdf_drain_io: end, err=%d\n", rval);
1546 	return (rval);
1547 }
1548 
1549 /* ARGSUSED5 */
1550 int
1551 xdf_lb_rdwr(dev_info_t *devi, uchar_t cmd, void *bufp,
1552     diskaddr_t start, size_t reqlen, void *tg_cookie)
1553 {
1554 	xdf_t *vdp;
1555 	struct buf *bp;
1556 	int err = 0;
1557 
1558 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1559 	if (vdp == NULL)
1560 		return (ENXIO);
1561 
1562 	if ((start + (reqlen >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
1563 		return (EINVAL);
1564 
1565 	bp = getrbuf(KM_SLEEP);
1566 	if (cmd == TG_READ)
1567 		bp->b_flags = B_BUSY | B_READ;
1568 	else
1569 		bp->b_flags = B_BUSY | B_WRITE;
1570 	bp->b_un.b_addr = bufp;
1571 	bp->b_bcount = reqlen;
1572 	bp->b_blkno = start;
1573 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
1574 
1575 	mutex_enter(&vdp->xdf_dev_lk);
1576 	if (vdp->xdf_xdev_iostat != NULL)
1577 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1578 	if (vdp->xdf_f_act == NULL) {
1579 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
1580 	} else {
1581 		vdp->xdf_l_act->av_forw = bp;
1582 		vdp->xdf_l_act = bp;
1583 	}
1584 	mutex_exit(&vdp->xdf_dev_lk);
1585 	xdf_iostart(vdp);
1586 	err = biowait(bp);
1587 
1588 	ASSERT(bp->b_flags & B_DONE);
1589 
1590 	freerbuf(bp);
1591 	return (err);
1592 }
1593 
1594 /*
1595  * synthetic geometry
1596  */
1597 #define	XDF_NSECTS	256
1598 #define	XDF_NHEADS	16
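/*
 * xdf_synthetic_pgeom() fabricates a geometry of 16 heads and 256
 * sectors/track, i.e. 4096 blocks per cylinder.  As an illustration, a
 * 16777216-block (8 GB) vbd would get g_ncyl = 16777216 / 4096 = 4096
 * cylinders; g_capacity is simply the backend's block count.
 */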
1599 
1600 static void
1601 xdf_synthetic_pgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1602 {
1603 	xdf_t *vdp;
1604 	uint_t ncyl;
1605 
1606 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1607 
1608 	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1609 
1610 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1611 	geomp->g_acyl = 0;
1612 	geomp->g_nhead = XDF_NHEADS;
1613 	geomp->g_secsize = XB_BSIZE;
1614 	geomp->g_nsect = XDF_NSECTS;
1615 	geomp->g_intrlv = 0;
1616 	geomp->g_rpm = 7200;
1617 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1618 }
1619 
1620 static int
1621 xdf_lb_getcap(dev_info_t *devi, diskaddr_t *capp)
1622 {
1623 	xdf_t *vdp;
1624 
1625 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1626 
1627 	if (vdp == NULL)
1628 		return (ENXIO);
1629 
1630 	mutex_enter(&vdp->xdf_dev_lk);
1631 	*capp = vdp->xdf_pgeom.g_capacity;
1632 	DPRINTF(LBL_DBG, ("capacity %llu\n", *capp));
1633 	mutex_exit(&vdp->xdf_dev_lk);
1634 	return (0);
1635 }
1636 
1637 static int
1638 xdf_lb_getpgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1639 {
1640 	xdf_t *vdp;
1641 
1642 	if ((vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))) == NULL)
1643 		return (ENXIO);
1644 	*geomp = vdp->xdf_pgeom;
1645 	return (0);
1646 }
1647 
1648 /*
1649  * No real HBA, no geometry available from it
1650  */
1651 /*ARGSUSED*/
1652 static int
1653 xdf_lb_getvgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1654 {
1655 	return (EINVAL);
1656 }
1657 
1658 static int
1659 xdf_lb_getattribute(dev_info_t *devi, tg_attribute_t *tgattributep)
1660 {
1661 	xdf_t *vdp;
1662 
1663 	if (!(vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))))
1664 		return (ENXIO);
1665 
1666 	if (XD_IS_RO(vdp))
1667 		tgattributep->media_is_writable = 0;
1668 	else
1669 		tgattributep->media_is_writable = 1;
1670 	return (0);
1671 }
1672 
1673 /* ARGSUSED3 */
1674 int
1675 xdf_lb_getinfo(dev_info_t *devi, int cmd, void *arg, void *tg_cookie)
1676 {
1677 	switch (cmd) {
1678 	case TG_GETPHYGEOM:
1679 		return (xdf_lb_getpgeom(devi, (cmlb_geom_t *)arg));
1680 	case TG_GETVIRTGEOM:
1681 		return (xdf_lb_getvgeom(devi, (cmlb_geom_t *)arg));
1682 	case TG_GETCAPACITY:
1683 		return (xdf_lb_getcap(devi, (diskaddr_t *)arg));
1684 	case TG_GETBLOCKSIZE:
1685 		*(uint32_t *)arg = XB_BSIZE;
1686 		return (0);
1687 	case TG_GETATTR:
1688 		return (xdf_lb_getattribute(devi, (tg_attribute_t *)arg));
1689 	default:
1690 		return (ENOTTY);
1691 	}
1692 }
1693 
1694 /*
1695  * Kick-off connect process
1696  * Status should be XD_UNKNOWN or XD_CLOSED
1697  * On success, status will be changed to XD_INIT
1698  * On error, status won't be changed
1699  */
1700 static int
1701 xdf_start_connect(xdf_t *vdp)
1702 {
1703 	char *xsnode;
1704 	grant_ref_t gref;
1705 	xenbus_transaction_t xbt;
1706 	int rv;
1707 	dev_info_t *dip = vdp->xdf_dip;
1708 
1709 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == (domid_t)-1)
1710 		goto errout;
1711 
1712 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS) {
1713 		cmn_err(CE_WARN, "xdf@%s: failed to alloc event channel",
1714 		    ddi_get_name_addr(dip));
1715 		goto errout;
1716 	}
1717 	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1718 #ifdef XPV_HVM_DRIVER
1719 	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1720 #else /* !XPV_HVM_DRIVER */
1721 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1722 	    DDI_SUCCESS) {
1723 		cmn_err(CE_WARN, "xdf_start_connect: xdf@%s: "
1724 		    "failed to add intr handler", ddi_get_name_addr(dip));
1725 		goto errout1;
1726 	}
1727 #endif /* !XPV_HVM_DRIVER */
1728 
1729 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1730 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1731 	    DDI_SUCCESS) {
1732 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1733 		    ddi_get_name_addr(dip));
1734 		goto errout2;
1735 	}
1736 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1737 
1738 	/*
1739 	 * Write into xenstore the info needed by backend
1740 	 */
1741 	if ((xsnode = xvdi_get_xsname(dip)) == NULL) {
1742 		cmn_err(CE_WARN, "xdf@%s: "
1743 		    "failed to get xenstore node path",
1744 		    ddi_get_name_addr(dip));
1745 		goto fail_trans;
1746 	}
1747 trans_retry:
1748 	if (xenbus_transaction_start(&xbt)) {
1749 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1750 		    ddi_get_name_addr(dip));
1751 		xvdi_fatal_error(dip, EIO, "transaction start");
1752 		goto fail_trans;
1753 	}
1754 
1755 	if (rv = xenbus_printf(xbt, xsnode, "ring-ref", "%u", gref)) {
1756 		cmn_err(CE_WARN, "xdf@%s: failed to write ring-ref",
1757 		    ddi_get_name_addr(dip));
1758 		xvdi_fatal_error(dip, rv, "writing ring-ref");
1759 		goto abort_trans;
1760 	}
1761 
1762 	if (rv = xenbus_printf(xbt, xsnode, "event-channel", "%u",
1763 	    vdp->xdf_evtchn)) {
1764 		cmn_err(CE_WARN, "xdf@%s: failed to write event-channel",
1765 		    ddi_get_name_addr(dip));
1766 		xvdi_fatal_error(dip, rv, "writing event-channel");
1767 		goto abort_trans;
1768 	}
1769 
1770 	/*
1771 	 * "protocol" is written by the domain builder in the case of PV
1772 	 * domains. However, it is not written for HVM domains, so let's
1773 	 * write it here.
1774 	 */
1775 	if (rv = xenbus_printf(xbt, xsnode, "protocol", "%s",
1776 	    XEN_IO_PROTO_ABI_NATIVE)) {
1777 		cmn_err(CE_WARN, "xdf@%s: failed to write protocol",
1778 		    ddi_get_name_addr(dip));
1779 		xvdi_fatal_error(dip, rv, "writing protocol");
1780 		goto abort_trans;
1781 	}
1782 
1783 	if ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0) {
1784 		cmn_err(CE_WARN, "xdf@%s: "
1785 		    "failed to switch state to XenbusStateInitialised",
1786 		    ddi_get_name_addr(dip));
1787 		xvdi_fatal_error(dip, rv, "writing state");
1788 		goto abort_trans;
1789 	}
1790 
1791 	/* kick-off connect process */
1792 	if (rv = xenbus_transaction_end(xbt, 0)) {
1793 		if (rv == EAGAIN)
1794 			goto trans_retry;
1795 		cmn_err(CE_WARN, "xdf@%s: failed to end transaction",
1796 		    ddi_get_name_addr(dip));
1797 		xvdi_fatal_error(dip, rv, "completing transaction");
1798 		goto fail_trans;
1799 	}
1800 
1801 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1802 	mutex_enter(&vdp->xdf_dev_lk);
1803 	vdp->xdf_status = XD_INIT;
1804 	mutex_exit(&vdp->xdf_dev_lk);
1805 
1806 	return (DDI_SUCCESS);
1807 
1808 abort_trans:
1809 	(void) xenbus_transaction_end(xbt, 1);
1810 fail_trans:
1811 	xvdi_free_ring(vdp->xdf_xb_ring);
1812 errout2:
1813 #ifdef XPV_HVM_DRIVER
1814 	ec_unbind_evtchn(vdp->xdf_evtchn);
1815 #else /* !XPV_HVM_DRIVER */
1816 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1817 #endif /* !XPV_HVM_DRIVER */
1818 errout1:
1819 	xvdi_free_evtchn(dip);
1820 errout:
1821 	cmn_err(CE_WARN, "xdf@%s: fail to kick-off connecting",
1822 	    ddi_get_name_addr(dip));
1823 	return (DDI_FAILURE);
1824 }
1825 
1826 /*
1827  * Kick-off disconnect process
1828  * Status won't be changed
1829  */
1830 static int
1831 xdf_start_disconnect(xdf_t *vdp)
1832 {
1833 	if (xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed) > 0) {
1834 		cmn_err(CE_WARN, "xdf@%s: fail to kick-off disconnecting",
1835 		    ddi_get_name_addr(vdp->xdf_dip));
1836 		return (DDI_FAILURE);
1837 	}
1838 
1839 	return (DDI_SUCCESS);
1840 }
1841 
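/*
 * Read and cache a copy of block xdf_flush_block so that, when the backend
 * offers only barrier writes, a cache flush can be emulated by re-writing
 * this block as a barrier write.  check_fbwrite() keeps the cached copy
 * current if a normal write happens to overwrite that block.
 */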
1842 int
1843 xdf_get_flush_block(xdf_t *vdp)
1844 {
1845 	/*
1846 	 * Get a DEV_BSIZE-aligned buffer
1847 	 */
1848 	vdp->xdf_flush_mem = kmem_alloc(DEV_BSIZE * 2, KM_SLEEP);
1849 	vdp->xdf_cache_flush_block =
1850 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem), DEV_BSIZE);
1851 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1852 	    xdf_flush_block, DEV_BSIZE, NULL) != 0)
1853 		return (DDI_FAILURE);
1854 	return (DDI_SUCCESS);
1855 }
1856 
1857 /*
1858  * Finish other initialization after we've connected to the backend
1859  * Status should be XD_INIT before calling this routine
1860  * On success, status should be changed to XD_READY
1861  * On error, status should stay XD_INIT
1862  */
1863 static int
1864 xdf_post_connect(xdf_t *vdp)
1865 {
1866 	int rv;
1867 	uint_t len;
1868 	char *type;
1869 	char *barrier;
1870 	dev_info_t *devi = vdp->xdf_dip;
1871 
1872 	/*
1873 	 * Determine if feature barrier is supported by backend
1874 	 */
1875 	if (xenbus_read(XBT_NULL, xvdi_get_oename(devi),
1876 	    "feature-barrier", (void **)&barrier, &len) == 0) {
1877 		vdp->xdf_feature_barrier = 1;
1878 		kmem_free(barrier, len);
1879 	} else {
1880 		cmn_err(CE_NOTE, "xdf@%s: failed to read feature-barrier",
1881 		    ddi_get_name_addr(vdp->xdf_dip));
1882 		vdp->xdf_feature_barrier = 0;
1883 	}
1884 
1885 	/* probe backend */
1886 	if (rv = xenbus_gather(XBT_NULL, xvdi_get_oename(devi),
1887 	    "sectors", "%"SCNu64, &vdp->xdf_xdev_nblocks,
1888 	    "info", "%u", &vdp->xdf_xdev_info, NULL)) {
1889 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1890 		    "cannot read backend info", ddi_get_name_addr(devi));
1891 		xvdi_fatal_error(devi, rv, "reading backend info");
1892 		return (DDI_FAILURE);
1893 	}
1894 
1895 	/*
1896 	 * Make sure that the device we're connecting to isn't smaller
1897 	 * than the previously connected device.
1898 	 */
1899 	if (vdp->xdf_xdev_nblocks < vdp->xdf_pgeom.g_capacity) {
1900 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1901 		    "backend disk device shrank", ddi_get_name_addr(devi));
1902 		/* XXX:  call xvdi_fatal_error() here? */
1903 		xvdi_fatal_error(devi, rv, "reading backend info");
1904 		return (DDI_FAILURE);
1905 	}
1906 
1907 	/*
1908 	 * Only update the physical geometry to reflect the new device
1909 	 * size if this is the first time we're connecting to the backend
1910 	 * device.  Once we assign a physical geometry to a device it stays
1911 	 * fixed until:
1912 	 *	- we get detached and re-attached (at which point we
1913 	 *	  automatically assign a new physical geometry).
1914 	 *	- someone calls TG_SETPHYGEOM to explicitly set the
1915 	 *	  physical geometry.
1916 	 */
1917 	if (vdp->xdf_pgeom.g_capacity == 0)
1918 		xdf_synthetic_pgeom(devi, &vdp->xdf_pgeom);
1919 
1920 	/* fix disk type */
1921 	if (xenbus_read(XBT_NULL, xvdi_get_xsname(devi), "device-type",
1922 	    (void **)&type, &len) != 0) {
1923 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1924 		    "cannot read device-type", ddi_get_name_addr(devi));
1925 		xvdi_fatal_error(devi, rv, "reading device-type");
1926 		return (DDI_FAILURE);
1927 	}
1928 	if (strcmp(type, "cdrom") == 0)
1929 		vdp->xdf_xdev_info |= VDISK_CDROM;
1930 	kmem_free(type, len);
1931 
1932 	/*
1933 	 * We've created all the minor nodes via cmlb_attach() using default
1934 	 * values in xdf_attach() so that it is possible to block in
1935 	 * xdf_open() if anyone (say, the booting thread) tries to open the
1936 	 * device before we are connected to the backend.  Now that we are
1937 	 * almost connected, refresh those minor nodes with the latest info.
1938 	 *
1939 	 * Don't do this when xdf is already open (which can happen during
1940 	 * resume), since cmlb_attach() would invalidate the label info and
1941 	 * confuse anyone who has already opened the node.
1942 	 */
1943 	if (!xdf_isopen(vdp, -1) && (XD_IS_CD(vdp) || XD_IS_RM(vdp))) {
1944 		/* re-init cmlb w/ latest info we got from backend */
1945 		if (cmlb_attach(devi, &xdf_lb_ops,
1946 		    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
1947 		    XD_IS_RM(vdp), 1, DDI_NT_BLOCK,
1948 #if defined(XPV_HVM_DRIVER)
1949 		    CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT |
1950 		    CMLB_INTERNAL_MINOR_NODES,
1951 #else /* !XPV_HVM_DRIVER */
1952 		    CMLB_FAKE_LABEL_ONE_PARTITION,
1953 #endif /* !XPV_HVM_DRIVER */
1954 		    vdp->xdf_vd_lbl, NULL) != 0) {
1955 			cmn_err(CE_WARN, "xdf@%s: cmlb attach failed",
1956 			    ddi_get_name_addr(devi));
1957 			return (DDI_FAILURE);
1958 		}
1959 	}
1960 
1961 	/* mark the vbd as ready for I/O */
1962 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1963 	mutex_enter(&vdp->xdf_dev_lk);
1964 	vdp->xdf_status = XD_READY;
1965 	mutex_exit(&vdp->xdf_dev_lk);
1966 	/*
1967 	 * If backend has feature-barrier, see if it supports disk
1968 	 * cache flush op.
1969 	 */
1970 	vdp->xdf_flush_supported = 0;
1971 	if (vdp->xdf_feature_barrier) {
1972 		/*
1973 		 * Pretend we already know flush is supported so probe
1974 		 * will attempt the correct op.
1975 		 */
1976 		vdp->xdf_flush_supported = 1;
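		/*
		 * Probe with a zero-length TG_WRITE; the I/O path treats
		 * a zero-length write as a disk-cache-flush request (see
		 * IS_FLUSH_DISKCACHE()), so this tells us whether the
		 * backend honors the flush op.
		 */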
1977 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1978 			vdp->xdf_flush_supported = 1;
1979 		} else {
1980 			vdp->xdf_flush_supported = 0;
1981 			/*
1982 			 * If the other end does not support the cache flush op
1983 			 * then we must use a barrier-write to force disk
1984 			 * cache flushing.  Barrier writes require that a data
1985 			 * block actually be written.
1986 			 * Cache a block to barrier-write when we are
1987 			 * asked to perform a flush.
1988 			 * XXX - would it be better to just copy 1 block
1989 			 * (512 bytes) from whatever write we did last
1990 			 * and rewrite that block?
1991 			 */
1992 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS)
1993 				return (DDI_FAILURE);
1994 		}
1995 	}
1996 
1997 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", ddi_get_name_addr(devi),
1998 	    (uint64_t)vdp->xdf_xdev_nblocks);
1999 
2000 	return (DDI_SUCCESS);
2001 }
2002 
2003 /*
2004  * Finish the remaining teardown after we've disconnected from the backend.
2005  * Status should be XD_CLOSING or XD_INIT on entry; it is XD_CLOSED on return.
2006  */
2007 static void
2008 xdf_post_disconnect(xdf_t *vdp)
2009 {
2010 #ifdef XPV_HVM_DRIVER
2011 	ec_unbind_evtchn(vdp->xdf_evtchn);
2012 #else /* !XPV_HVM_DRIVER */
2013 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
2014 #endif /* !XPV_HVM_DRIVER */
2015 	xvdi_free_evtchn(vdp->xdf_dip);
2016 	xvdi_free_ring(vdp->xdf_xb_ring);
2017 	vdp->xdf_xb_ring = NULL;
2018 	vdp->xdf_xb_ring_hdl = NULL;
2019 	vdp->xdf_peer = (domid_t)-1;
2020 
2021 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
2022 	mutex_enter(&vdp->xdf_dev_lk);
2023 	vdp->xdf_status = XD_CLOSED;
2024 	mutex_exit(&vdp->xdf_dev_lk);
2025 }
2026 
2027 /*ARGSUSED*/
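/*
 * Other-end (backend) XenBus state-change callback.  It validates the
 * reported transition against our current status and then drives the
 * frontend accordingly: restarting the connect handshake, completing
 * the connection, or tearing it down.
 */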
2028 static void
2029 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
2030 {
2031 	XenbusState new_state = *(XenbusState *)impl_data;
2032 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
2033 	boolean_t unexpect_die = B_FALSE;
2034 	int status;
2035 
2036 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
2037 	    ddi_get_name_addr(dip), new_state));
2038 
2039 	mutex_enter(&vdp->xdf_cb_lk);
2040 
2041 	if (xdf_check_state_transition(vdp, new_state) == DDI_FAILURE) {
2042 		mutex_exit(&vdp->xdf_cb_lk);
2043 		return;
2044 	}
2045 
2046 	switch (new_state) {
2047 	case XenbusStateInitialising:
2048 		ASSERT(vdp->xdf_status == XD_CLOSED);
2049 		/*
2050 		 * backend recovered from a previous failure,
2051 		 * kick-off connect process again
2052 		 */
2053 		if (xdf_start_connect(vdp) != DDI_SUCCESS) {
2054 			cmn_err(CE_WARN, "xdf@%s:"
2055 			    " failed to start reconnecting to backend",
2056 			    ddi_get_name_addr(dip));
2057 		}
2058 		break;
2059 	case XenbusStateConnected:
2060 		ASSERT(vdp->xdf_status == XD_INIT);
2061 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
2062 		/* finish final init after connect */
2063 		if (xdf_post_connect(vdp) != DDI_SUCCESS)
2064 			(void) xdf_start_disconnect(vdp);
2065 		break;
2066 	case XenbusStateClosing:
2067 		if (vdp->xdf_status == XD_READY) {
2068 			mutex_enter(&vdp->xdf_dev_lk);
2069 			if (xdf_isopen(vdp, -1)) {
2070 				cmn_err(CE_NOTE, "xdf@%s: hot-unplug failed, "
2071 				    "still in use", ddi_get_name_addr(dip));
2072 				mutex_exit(&vdp->xdf_dev_lk);
2073 				break;
2074 			} else {
2075 				vdp->xdf_status = XD_CLOSING;
2076 			}
2077 			mutex_exit(&vdp->xdf_dev_lk);
2078 		}
2079 		(void) xdf_start_disconnect(vdp);
2080 		break;
2081 	case XenbusStateClosed:
2082 		/* first check if BE closed unexpectedly */
2083 		mutex_enter(&vdp->xdf_dev_lk);
2084 		if (xdf_isopen(vdp, -1)) {
2085 			unexpect_die = B_TRUE;
2086 			unexpectedie(vdp);
2087 			cmn_err(CE_WARN, "xdf@%s: backend closed, "
2088 			    "reconnecting...", ddi_get_name_addr(dip));
2089 		}
2090 		mutex_exit(&vdp->xdf_dev_lk);
2091 
2092 		if (vdp->xdf_status == XD_READY) {
2093 			mutex_enter(&vdp->xdf_dev_lk);
2094 			vdp->xdf_status = XD_CLOSING;
2095 			mutex_exit(&vdp->xdf_dev_lk);
2096 
2097 #ifdef	DOMU_BACKEND
2098 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
2099 #endif
2100 
2101 			xdf_post_disconnect(vdp);
2102 			(void) xvdi_switch_state(dip, XBT_NULL,
2103 			    XenbusStateClosed);
2104 		} else if ((vdp->xdf_status == XD_INIT) ||
2105 		    (vdp->xdf_status == XD_CLOSING)) {
2106 			xdf_post_disconnect(vdp);
2107 		} else {
2108 			mutex_enter(&vdp->xdf_dev_lk);
2109 			vdp->xdf_status = XD_CLOSED;
2110 			mutex_exit(&vdp->xdf_dev_lk);
2111 		}
2112 	}
2113 
2114 	/* notify anybody waiting for oe state change */
2115 	mutex_enter(&vdp->xdf_dev_lk);
2116 	cv_broadcast(&vdp->xdf_dev_cv);
2117 	mutex_exit(&vdp->xdf_dev_lk);
2118 
2119 	status = vdp->xdf_status;
2120 	mutex_exit(&vdp->xdf_cb_lk);
2121 
2122 	if (status == XD_READY) {
2123 		xdf_iostart(vdp);
2124 	} else if ((status == XD_CLOSED) && !unexpect_die) {
2125 		/* interface is closed successfully, remove all minor nodes */
2126 		cmlb_detach(vdp->xdf_vd_lbl, NULL);
2127 		cmlb_free_handle(&vdp->xdf_vd_lbl);
2128 	}
2129 }
2130 
2131 /* check whether a partition is open; partition == -1 checks the whole disk */
2132 static boolean_t
2133 xdf_isopen(xdf_t *vdp, int partition)
2134 {
2135 	int i;
2136 	ulong_t parbit;
2137 	boolean_t rval = B_FALSE;
2138 
2139 	ASSERT((partition == -1) ||
2140 	    ((partition >= 0) && (partition < XDF_PEXT)));
2141 
2142 	if (partition == -1)
2143 		parbit = (ulong_t)-1;
2144 	else
2145 		parbit = 1 << partition;
2146 
2147 	for (i = 0; i < OTYPCNT; i++) {
2148 		if (vdp->xdf_vd_open[i] & parbit)
2149 			rval = B_TRUE;
2150 	}
2151 
2152 	return (rval);
2153 }
2154 
2155 /*
2156  * xdf_check_state_transition() checks whether a XenbusState change
2157  * reported by the other end is a valid transition given our current
2158  * status.  The new state is normally written by the backend domain,
2159  * but it can also be changed manually from dom0 with xenstore-write.
2160  */
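/*
 * Returns DDI_SUCCESS when the reported state change warrants further
 * processing by the caller, and DDI_FAILURE when it should be ignored,
 * either because no action is needed or because it is an unexpected
 * transition (which is also logged below).
 */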
2161 static int
2162 xdf_check_state_transition(xdf_t *vdp, XenbusState oestate)
2163 {
2164 	int status;
2165 	int stcheck;
2166 #define	STOK	0 /* needs further processing */
2167 #define	STNOP	1 /* no action needs to be taken */
2168 #define	STBUG	2 /* unexpected state change, could be a bug */
2169 
2170 	status = vdp->xdf_status;
2171 	stcheck = STOK;
2172 
2173 	switch (status) {
2174 	case XD_UNKNOWN:
2175 		if ((oestate == XenbusStateUnknown)		||
2176 		    (oestate == XenbusStateConnected))
2177 			stcheck = STBUG;
2178 		else if ((oestate == XenbusStateInitialising)	||
2179 		    (oestate == XenbusStateInitWait)		||
2180 		    (oestate == XenbusStateInitialised))
2181 			stcheck = STNOP;
2182 		break;
2183 	case XD_INIT:
2184 		if (oestate == XenbusStateUnknown)
2185 			stcheck = STBUG;
2186 		else if ((oestate == XenbusStateInitialising)	||
2187 		    (oestate == XenbusStateInitWait)		||
2188 		    (oestate == XenbusStateInitialised))
2189 			stcheck = STNOP;
2190 		break;
2191 	case XD_READY:
2192 		if ((oestate == XenbusStateUnknown)		||
2193 		    (oestate == XenbusStateInitialising)	||
2194 		    (oestate == XenbusStateInitWait)		||
2195 		    (oestate == XenbusStateInitialised))
2196 			stcheck = STBUG;
2197 		else if (oestate == XenbusStateConnected)
2198 			stcheck = STNOP;
2199 		break;
2200 	case XD_CLOSING:
2201 		if ((oestate == XenbusStateUnknown)		||
2202 		    (oestate == XenbusStateInitialising)	||
2203 		    (oestate == XenbusStateInitWait)		||
2204 		    (oestate == XenbusStateInitialised)		||
2205 		    (oestate == XenbusStateConnected))
2206 			stcheck = STBUG;
2207 		else if (oestate == XenbusStateClosing)
2208 			stcheck = STNOP;
2209 		break;
2210 	case XD_CLOSED:
2211 		if ((oestate == XenbusStateUnknown)		||
2212 		    (oestate == XenbusStateConnected))
2213 			stcheck = STBUG;
2214 		else if ((oestate == XenbusStateInitWait)	||
2215 		    (oestate == XenbusStateInitialised)		||
2216 		    (oestate == XenbusStateClosing)		||
2217 		    (oestate == XenbusStateClosed))
2218 			stcheck = STNOP;
2219 		break;
2220 	case XD_SUSPEND:
2221 	default:
2222 		stcheck = STBUG;
2223 	}
2224 
2225 	if (stcheck == STOK)
2226 		return (DDI_SUCCESS);
2227 
2228 	if (stcheck == STBUG)
2229 		cmn_err(CE_NOTE, "xdf@%s: unexpected otherend "
2230 		    "state change to %d!, when status is %d",
2231 		    ddi_get_name_addr(vdp->xdf_dip), oestate, status);
2232 
2233 	return (DDI_FAILURE);
2234 }
2235 
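/*
 * Wait (when 'wait' is set) for the device to reach XD_READY.  The
 * caller must hold xdf_dev_lk; we sleep on xdf_dev_cv, which is
 * broadcast from xdf_oe_change() on every otherend state change.
 * Returns the current xdf_status, which may differ from XD_READY if
 * the wait was interrupted by a signal or the status has already
 * advanced past XD_READY.
 */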
2236 static int
2237 xdf_connect(xdf_t *vdp, boolean_t wait)
2238 {
2239 	ASSERT(mutex_owned(&vdp->xdf_dev_lk));
2240 	while (vdp->xdf_status != XD_READY) {
2241 		if (!wait || (vdp->xdf_status > XD_READY))
2242 			break;
2243 
2244 		if (cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk) == 0)
2245 			break;
2246 	}
2247 
2248 	return (vdp->xdf_status);
2249 }
2250 
2251 /*
2252  * Callback invoked when DMA or grant table entry (GTE) resources
2253  * become available again.
2254  * Note: we only register one callback function with the grant table
2255  * subsystem since we only have one 'struct gnttab_free_callback' in xdf_t.
2256  */
2257 static int
2258 xdf_dmacallback(caddr_t arg)
2259 {
2260 	xdf_t *vdp = (xdf_t *)arg;
2261 	ASSERT(vdp != NULL);
2262 
2263 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
2264 	    ddi_get_name_addr(vdp->xdf_dip)));
2265 
2266 	ddi_trigger_softintr(vdp->xdf_softintr_id);
2267 	return (DDI_DMA_CALLBACK_DONE);
2268 }
2269 
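/*
 * Soft interrupt handler that clears the pending DMA-callback flag and
 * restarts queued I/O; it is triggered from xdf_dmacallback() once DMA
 * or grant table resources become available again.
 */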
2270 static uint_t
2271 xdf_iorestart(caddr_t arg)
2272 {
2273 	xdf_t *vdp = (xdf_t *)arg;
2274 
2275 	ASSERT(vdp != NULL);
2276 
2277 	mutex_enter(&vdp->xdf_dev_lk);
2278 	ASSERT(ISDMACBON(vdp));
2279 	SETDMACBOFF(vdp);
2280 	mutex_exit(&vdp->xdf_dev_lk);
2281 
2282 	xdf_iostart(vdp);
2283 
2284 	return (DDI_INTR_CLAIMED);
2285 }
2286 
2287 static void
2288 xdf_timeout_handler(void *arg)
2289 {
2290 	xdf_t *vdp = arg;
2291 
2292 	mutex_enter(&vdp->xdf_dev_lk);
2293 	vdp->xdf_timeout_id = 0;
2294 	mutex_exit(&vdp->xdf_dev_lk);
2295 
2296 	/* xdf_iostart() may schedule a new timeout if resources are still short */
2297 	xdf_iostart(vdp);
2298 }
2299 
2300 /*
2301  * Allocate a vreq for this bp.
2302  * On return, bp->av_back points to the vreq.
2303  */
2304 static v_req_t *
2305 vreq_get(xdf_t *vdp, buf_t *bp)
2306 {
2307 	v_req_t *vreq = NULL;
2308 
2309 	ASSERT(BP2VREQ(bp) == NULL);
2310 
2311 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
2312 	if (vreq == NULL) {
2313 		if (vdp->xdf_timeout_id == 0)
2314 			/* restart I/O after one second */
2315 			vdp->xdf_timeout_id =
2316 			    timeout(xdf_timeout_handler, vdp, hz);
2317 		return (NULL);
2318 	}
2319 	bzero(vreq, sizeof (v_req_t));
2320 
2321 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
2322 	bp->av_back = (buf_t *)vreq;
2323 	vreq->v_buf = bp;
2324 	vreq->v_status = VREQ_INIT;
2325 	/* init of other fields in vreq is up to the caller */
2326 
2327 	return (vreq);
2328 }
2329 
2330 static void
2331 vreq_free(xdf_t *vdp, v_req_t *vreq)
2332 {
2333 	buf_t *bp = vreq->v_buf;
2334 
2335 	list_remove(&vdp->xdf_vreq_act, (void *)vreq);
2336 
2337 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
2338 		goto done;
2339 
2340 	switch (vreq->v_status) {
2341 	case VREQ_DMAWIN_DONE:
2342 	case VREQ_GS_ALLOCED:
2343 	case VREQ_DMABUF_BOUND:
2344 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
2345 		/*FALLTHRU*/
2346 	case VREQ_DMAMEM_ALLOCED:
2347 		if (!ALIGNED_XFER(bp)) {
2348 			ASSERT(vreq->v_abuf != NULL);
2349 			if (!IS_ERROR(bp) && IS_READ(bp))
2350 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
2351 				    bp->b_bcount);
2352 			ddi_dma_mem_free(&vreq->v_align);
2353 		}
2354 		/*FALLTHRU*/
2355 	case VREQ_MEMDMAHDL_ALLOCED:
2356 		if (!ALIGNED_XFER(bp))
2357 			ddi_dma_free_handle(&vreq->v_memdmahdl);
2358 		/*FALLTHRU*/
2359 	case VREQ_DMAHDL_ALLOCED:
2360 		ddi_dma_free_handle(&vreq->v_dmahdl);
2361 		break;
2362 	default:
2363 		break;
2364 	}
2365 done:
2366 	vreq->v_buf->av_back = NULL;
2367 	kmem_cache_free(xdf_vreq_cache, vreq);
2368 }
2369 
2370 /*
2371  * Initialize the DMA and grant table resources for the buf
2372  */
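/*
 * vreq_setup() is structured as a resumable state machine: each call
 * advances vreq->v_status through the VREQ_* states by falling through
 * the switch below, and a resource shortage returns DDI_FAILURE with a
 * callback or timeout typically already armed, so a later call resumes
 * where the previous one left off.
 */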
2373 static int
2374 vreq_setup(xdf_t *vdp, v_req_t *vreq)
2375 {
2376 	int rc;
2377 	ddi_dma_attr_t dmaattr;
2378 	uint_t ndcs, ndws;
2379 	ddi_dma_handle_t dh;
2380 	ddi_dma_handle_t mdh;
2381 	ddi_dma_cookie_t dc;
2382 	ddi_acc_handle_t abh;
2383 	caddr_t	aba;
2384 	ge_slot_t *gs;
2385 	size_t bufsz;
2386 	off_t off;
2387 	size_t sz;
2388 	buf_t *bp = vreq->v_buf;
2389 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
2390 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
2391 
2392 	switch (vreq->v_status) {
2393 	case VREQ_INIT:
2394 		if (IS_FLUSH_DISKCACHE(bp)) {
2395 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2396 				DPRINTF(DMA_DBG, (
2397 				    "xdf@%s: get ge_slot failed\n",
2398 				    ddi_get_name_addr(vdp->xdf_dip)));
2399 				return (DDI_FAILURE);
2400 			}
2401 			vreq->v_blkno = 0;
2402 			vreq->v_nslots = 1;
2403 			vreq->v_gs = gs;
2404 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
2405 			vreq->v_status = VREQ_GS_ALLOCED;
2406 			gs->vreq = vreq;
2407 			return (DDI_SUCCESS);
2408 		}
2409 
2410 		if (IS_WRITE_BARRIER(vdp, bp))
2411 			vreq->v_flush_diskcache = WRITE_BARRIER;
2412 		vreq->v_blkno = bp->b_blkno +
2413 		    (diskaddr_t)(uintptr_t)bp->b_private;
2414 		bp->b_private = NULL;
2415 		/* See if we wrote new data to our flush block */
2416 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
2417 			check_fbwrite(vdp, bp, vreq->v_blkno);
2418 		vreq->v_status = VREQ_INIT_DONE;
2419 		/*FALLTHRU*/
2420 
2421 	case VREQ_INIT_DONE:
2422 		/*
2423 		 * alloc DMA handle
2424 		 */
2425 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
2426 		    xdf_dmacallback, (caddr_t)vdp, &dh);
2427 		if (rc != DDI_SUCCESS) {
2428 			SETDMACBON(vdp);
2429 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
2430 			    ddi_get_name_addr(vdp->xdf_dip)));
2431 			return (DDI_FAILURE);
2432 		}
2433 
2434 		vreq->v_dmahdl = dh;
2435 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
2436 		/*FALLTHRU*/
2437 
2438 	case VREQ_DMAHDL_ALLOCED:
2439 		/*
2440 		 * alloc dma handle for 512-byte aligned buf
2441 		 */
2442 		if (!ALIGNED_XFER(bp)) {
2443 			/*
2444 			 * XXPV: we need to temporarily enlarge the seg
2445 			 * boundary and s/g length to work round CR6381968
2446 			 */
2447 			dmaattr = xb_dma_attr;
2448 			dmaattr.dma_attr_seg = (uint64_t)-1;
2449 			dmaattr.dma_attr_sgllen = INT_MAX;
2450 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
2451 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
2452 			if (rc != DDI_SUCCESS) {
2453 				SETDMACBON(vdp);
2454 				DPRINTF(DMA_DBG, ("xdf@%s: unaligned buf DMA "
2455 				    "handle alloc failed\n",
2456 				    ddi_get_name_addr(vdp->xdf_dip)));
2457 				return (DDI_FAILURE);
2458 			}
2459 			vreq->v_memdmahdl = mdh;
2460 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
2461 		}
2462 		/*FALLTHRU*/
2463 
2464 	case VREQ_MEMDMAHDL_ALLOCED:
2465 		/*
2466 		 * alloc 512-byte aligned buf
2467 		 */
2468 		if (!ALIGNED_XFER(bp)) {
2469 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
2470 				bp_mapin(bp);
2471 
2472 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
2473 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
2474 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
2475 			    &aba, &bufsz, &abh);
2476 			if (rc != DDI_SUCCESS) {
2477 				SETDMACBON(vdp);
2478 				DPRINTF(DMA_DBG, (
2479 				    "xdf@%s: DMA mem allocation failed\n",
2480 				    ddi_get_name_addr(vdp->xdf_dip)));
2481 				return (DDI_FAILURE);
2482 			}
2483 
2484 			vreq->v_abuf = aba;
2485 			vreq->v_align = abh;
2486 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
2487 
2488 			ASSERT(bufsz >= bp->b_bcount);
2489 			if (!IS_READ(bp))
2490 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
2491 				    bp->b_bcount);
2492 		}
2493 		/*FALLTHRU*/
2494 
2495 	case VREQ_DMAMEM_ALLOCED:
2496 		/*
2497 		 * dma bind
2498 		 */
2499 		if (ALIGNED_XFER(bp)) {
2500 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
2501 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
2502 			    &dc, &ndcs);
2503 		} else {
2504 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
2505 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
2506 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
2507 		}
2508 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
2509 			/* get num of dma windows */
2510 			if (rc == DDI_DMA_PARTIAL_MAP) {
2511 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
2512 				ASSERT(rc == DDI_SUCCESS);
2513 			} else {
2514 				ndws = 1;
2515 			}
2516 		} else {
2517 			SETDMACBON(vdp);
2518 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
2519 			    ddi_get_name_addr(vdp->xdf_dip)));
2520 			return (DDI_FAILURE);
2521 		}
2522 
2523 		vreq->v_dmac = dc;
2524 		vreq->v_dmaw = 0;
2525 		vreq->v_ndmacs = ndcs;
2526 		vreq->v_ndmaws = ndws;
2527 		vreq->v_nslots = ndws;
2528 		vreq->v_status = VREQ_DMABUF_BOUND;
2529 		/*FALLTHRU*/
2530 
2531 	case VREQ_DMABUF_BOUND:
2532 		/*
2533 		 * Get a ge_slot; on failure, gs_get() arranges for a callback
2534 		 * (unless one is already registered) so we can retry later.
2535 		 */
2536 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2537 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2538 			    ddi_get_name_addr(vdp->xdf_dip)));
2539 			return (DDI_FAILURE);
2540 		}
2541 
2542 		vreq->v_gs = gs;
2543 		gs->vreq = vreq;
2544 		vreq->v_status = VREQ_GS_ALLOCED;
2545 		break;
2546 
2547 	case VREQ_GS_ALLOCED:
2548 		/* nothing needs to be done */
2549 		break;
2550 
2551 	case VREQ_DMAWIN_DONE:
2552 		/*
2553 		 * move to the next dma window
2554 		 */
2555 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
2556 
2557 		/* get a ge_slot for this DMA window */
2558 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2559 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2560 			    ddi_get_name_addr(vdp->xdf_dip)));
2561 			return (DDI_FAILURE);
2562 		}
2563 
2564 		vreq->v_gs = gs;
2565 		gs->vreq = vreq;
2566 		vreq->v_dmaw++;
2567 		rc = ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
2568 		    &vreq->v_dmac, &vreq->v_ndmacs);
2569 		ASSERT(rc == DDI_SUCCESS);
2570 		vreq->v_status = VREQ_GS_ALLOCED;
2571 		break;
2572 
2573 	default:
2574 		return (DDI_FAILURE);
2575 	}
2576 
2577 	return (DDI_SUCCESS);
2578 }
2579 
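/*
 * Allocate a ge_slot: a batch of BLKIF_MAX_SEGMENTS_PER_REQUEST grant
 * references plus the state needed to use and later release them.  On
 * a resource shortage we arrange for a grant table free callback or a
 * one-second timeout (so the I/O is retried later) and return NULL.
 */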
2580 static ge_slot_t *
2581 gs_get(xdf_t *vdp, int isread)
2582 {
2583 	grant_ref_t gh;
2584 	ge_slot_t *gs;
2585 
2586 	/* first, try to allocate the GTEs needed by this slot */
2587 	if (gnttab_alloc_grant_references(
2588 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
2589 		if (vdp->xdf_gnt_callback.next == NULL) {
2590 			SETDMACBON(vdp);
2591 			gnttab_request_free_callback(
2592 			    &vdp->xdf_gnt_callback,
2593 			    (void (*)(void *))xdf_dmacallback,
2594 			    (void *)vdp,
2595 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
2596 		}
2597 		return (NULL);
2598 	}
2599 
2600 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
2601 	if (gs == NULL) {
2602 		gnttab_free_grant_references(gh);
2603 		if (vdp->xdf_timeout_id == 0)
2604 			/* restart I/O after one second */
2605 			vdp->xdf_timeout_id =
2606 			    timeout(xdf_timeout_handler, vdp, hz);
2607 		return (NULL);
2608 	}
2609 
2610 	/* init the ge_slot */
2611 	list_insert_head(&vdp->xdf_gs_act, (void *)gs);
2612 	gs->oeid = vdp->xdf_peer;
2613 	gs->isread = isread;
2614 	gs->ghead = gh;
2615 	gs->ngrefs = 0;
2616 
2617 	return (gs);
2618 }
2619 
2620 static void
2621 gs_free(xdf_t *vdp, ge_slot_t *gs)
2622 {
2623 	int i;
2624 	grant_ref_t *gp = gs->ge;
2625 	int ngrefs = gs->ngrefs;
2626 	boolean_t isread = gs->isread;
2627 
2628 	list_remove(&vdp->xdf_gs_act, (void *)gs);
2629 
2630 	/* release all grant table entry resources used in this slot */
2631 	for (i = 0; i < ngrefs; i++, gp++)
2632 		gnttab_end_foreign_access(*gp, !isread, 0);
2633 	gnttab_free_grant_references(gs->ghead);
2634 
2635 	kmem_cache_free(xdf_gs_cache, (void *)gs);
2636 }
2637 
2638 static grant_ref_t
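/*
 * Claim one grant reference from the slot's pre-allocated batch and
 * grant the backend (gs->oeid) access to the page frame 'mfn'.  The
 * grant is read-only for writes (the backend only reads our data) and
 * writable for reads (the backend fills the page in).
 */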
2639 gs_grant(ge_slot_t *gs, mfn_t mfn)
2640 {
2641 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->ghead);
2642 
2643 	ASSERT(gr != -1);
2644 	ASSERT(gs->ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
2645 	gs->ge[gs->ngrefs++] = gr;
2646 	gnttab_grant_foreign_access_ref(gr, gs->oeid, mfn, !gs->isread);
2647 
2648 	return (gr);
2649 }
2650 
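/*
 * Recover from an unexpected backend close while I/O may still be
 * outstanding: drain any responses already on the ring, release all
 * grant table slots, and move the affected bufs back to the head of
 * the waiting queue so they are reissued after we reconnect.  Called
 * with xdf_dev_lk held.
 */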
2651 static void
2652 unexpectedie(xdf_t *vdp)
2653 {
2654 	/* clean up I/Os in ring that have responses */
2655 	if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
2656 		mutex_exit(&vdp->xdf_dev_lk);
2657 		(void) xdf_intr((caddr_t)vdp);
2658 		mutex_enter(&vdp->xdf_dev_lk);
2659 	}
2660 
2661 	/* free up all grant table entries */
2662 	while (!list_is_empty(&vdp->xdf_gs_act))
2663 		gs_free(vdp, list_head(&vdp->xdf_gs_act));
2664 
2665 	/*
2666 	 * Move the bufs back to the head of the waiting queue in order;
2667 	 * vreq_busy is updated in vreq_free().
2668 	 */
2669 	while (!list_is_empty(&vdp->xdf_vreq_act)) {
2670 		v_req_t *vreq = list_head(&vdp->xdf_vreq_act);
2671 		buf_t *bp = vreq->v_buf;
2672 
2673 		bp->av_back = NULL;
2674 		bp->b_resid = bp->b_bcount;
2675 		if (vdp->xdf_f_act == NULL) {
2676 			vdp->xdf_f_act = vdp->xdf_l_act = bp;
2677 		} else {
2678 			/* move to the head of list */
2679 			bp->av_forw = vdp->xdf_f_act;
2680 			vdp->xdf_f_act = bp;
2681 		}
2682 		if (vdp->xdf_xdev_iostat != NULL)
2683 			kstat_runq_back_to_waitq(
2684 			    KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
2685 		vreq_free(vdp, vreq);
2686 	}
2687 }
2688 
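/* limit a single transfer to xdf_maxphys bytes (minphys-style routine) */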
2689 static void
2690 xdfmin(struct buf *bp)
2691 {
2692 	if (bp->b_bcount > xdf_maxphys)
2693 		bp->b_bcount = xdf_maxphys;
2694 }
2695 
2696 void
2697 xdf_kstat_delete(dev_info_t *dip)
2698 {
2699 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2700 	kstat_t	*kstat;
2701 
2702 	/*
2703 	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
2704 	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
2705 	 * and the contents of our kstat.  xdf_iostat_lk is used
2706 	 * to protect the allocation and freeing of the actual kstat.
2707 	 * xdf_dev_lk can't be used for this purpose because kstat
2708 	 * readers use it to access the contents of the kstat and
2709 	 * hence it can't be held when calling kstat_delete().
2710 	 */
2711 	mutex_enter(&vdp->xdf_iostat_lk);
2712 	mutex_enter(&vdp->xdf_dev_lk);
2713 
2714 	if (vdp->xdf_xdev_iostat == NULL) {
2715 		mutex_exit(&vdp->xdf_dev_lk);
2716 		mutex_exit(&vdp->xdf_iostat_lk);
2717 		return;
2718 	}
2719 
2720 	kstat = vdp->xdf_xdev_iostat;
2721 	vdp->xdf_xdev_iostat = NULL;
2722 	mutex_exit(&vdp->xdf_dev_lk);
2723 
2724 	kstat_delete(kstat);
2725 	mutex_exit(&vdp->xdf_iostat_lk);
2726 }
2727 
2728 int
2729 xdf_kstat_create(dev_info_t *dip, char *ks_module, int ks_instance)
2730 {
2731 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2732 
2733 	/* See comment about locking in xdf_kstat_delete(). */
2734 	mutex_enter(&vdp->xdf_iostat_lk);
2735 	mutex_enter(&vdp->xdf_dev_lk);
2736 
2737 	if (vdp->xdf_xdev_iostat != NULL) {
2738 		mutex_exit(&vdp->xdf_dev_lk);
2739 		mutex_exit(&vdp->xdf_iostat_lk);
2740 		return (-1);
2741 	}
2742 
2743 	if ((vdp->xdf_xdev_iostat = kstat_create(
2744 	    ks_module, ks_instance, NULL, "disk",
2745 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
2746 		mutex_exit(&vdp->xdf_dev_lk);
2747 		mutex_exit(&vdp->xdf_iostat_lk);
2748 		return (-1);
2749 	}
2750 
2751 	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
2752 	kstat_install(vdp->xdf_xdev_iostat);
2753 	mutex_exit(&vdp->xdf_dev_lk);
2754 	mutex_exit(&vdp->xdf_iostat_lk);
2755 
2756 	return (0);
2757 }
2758 
2759 #if defined(XPV_HVM_DRIVER)
2760 
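/*
 * In HVM mode each attached xdf instance is registered on xdf_hvm_list,
 * keyed by its devinfo pathname, so that other kernel code (presumably
 * the emulated-to-PV disk layering) can look up and hold the matching
 * xdf dip via xdf_hvm_hold().
 */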
2761 typedef struct xdf_hvm_entry {
2762 	list_node_t	xdf_he_list;
2763 	char		*xdf_he_path;
2764 	dev_info_t	*xdf_he_dip;
2765 } xdf_hvm_entry_t;
2766 
2767 static list_t xdf_hvm_list;
2768 static kmutex_t xdf_hvm_list_lock;
2769 
2770 static xdf_hvm_entry_t *
2771 i_xdf_hvm_find(char *path, dev_info_t *dip)
2772 {
2773 	xdf_hvm_entry_t	*i;
2774 
2775 	ASSERT((path != NULL) || (dip != NULL));
2776 	ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2777 
2778 	i = list_head(&xdf_hvm_list);
2779 	while (i != NULL) {
2780 		if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2781 			i = list_next(&xdf_hvm_list, i);
2782 			continue;
2783 		}
2784 		if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2785 			i = list_next(&xdf_hvm_list, i);
2786 			continue;
2787 		}
2788 		break;
2789 	}
2790 	return (i);
2791 }
2792 
2793 dev_info_t *
2794 xdf_hvm_hold(char *path)
2795 {
2796 	xdf_hvm_entry_t	*i;
2797 	dev_info_t	*dip;
2798 
2799 	mutex_enter(&xdf_hvm_list_lock);
2800 	i = i_xdf_hvm_find(path, NULL);
2801 	if (i == NULL) {
2802 		mutex_exit(&xdf_hvm_list_lock);
2803 		return (NULL);
2804 	}
2805 	ndi_hold_devi(dip = i->xdf_he_dip);
2806 	mutex_exit(&xdf_hvm_list_lock);
2807 	return (dip);
2808 }
2809 
2810 static void
2811 xdf_hvm_add(dev_info_t *dip)
2812 {
2813 	xdf_hvm_entry_t	*i;
2814 	char		*path;
2815 
2816 	/* figure out the path for the dip */
2817 	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2818 	(void) ddi_pathname(dip, path);
2819 
2820 	i = kmem_alloc(sizeof (*i), KM_SLEEP);
2821 	i->xdf_he_dip = dip;
2822 	i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2823 
2824 	mutex_enter(&xdf_hvm_list_lock);
2825 	ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2826 	ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2827 	list_insert_head(&xdf_hvm_list, i);
2828 	mutex_exit(&xdf_hvm_list_lock);
2829 
2830 	kmem_free(path, MAXPATHLEN);
2831 }
2832 
2833 static void
2834 xdf_hvm_rm(dev_info_t *dip)
2835 {
2836 	xdf_hvm_entry_t	*i;
2837 
2838 	mutex_enter(&xdf_hvm_list_lock);
2839 	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2840 	list_remove(&xdf_hvm_list, i);
2841 	mutex_exit(&xdf_hvm_list_lock);
2842 
2843 	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2844 	kmem_free(i, sizeof (*i));
2845 }
2846 
2847 static void
2848 xdf_hvm_init(void)
2849 {
2850 	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2851 	    offsetof(xdf_hvm_entry_t, xdf_he_list));
2852 	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2853 }
2854 
2855 static void
2856 xdf_hvm_fini(void)
2857 {
2858 	ASSERT(list_head(&xdf_hvm_list) == NULL);
2859 	list_destroy(&xdf_hvm_list);
2860 	mutex_destroy(&xdf_hvm_list_lock);
2861 }
2862 
2863 int
2864 xdf_hvm_connect(dev_info_t *dip)
2865 {
2866 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2867 	int	rv;
2868 
2869 	/* do cv_wait until connected or failed */
2870 	mutex_enter(&vdp->xdf_dev_lk);
2871 	rv = xdf_connect(vdp, B_TRUE);
2872 	mutex_exit(&vdp->xdf_dev_lk);
2873 	return ((rv == XD_READY) ? 0 : -1);
2874 }
2875 
2876 int
2877 xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2878 {
2879 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2880 
2881 	/* sanity check the requested physical geometry */
2882 	mutex_enter(&vdp->xdf_dev_lk);
2883 	if ((geomp->g_secsize != XB_BSIZE) ||
2884 	    (geomp->g_capacity == 0)) {
2885 		mutex_exit(&vdp->xdf_dev_lk);
2886 		return (EINVAL);
2887 	}
2888 
2889 	/*
2890 	 * If we've already connected to the backend device then make sure
2891 	 * we're not defining a physical geometry larger than our backend
2892 	 * device.
2893 	 */
2894 	if ((vdp->xdf_xdev_nblocks != 0) &&
2895 	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2896 		mutex_exit(&vdp->xdf_dev_lk);
2897 		return (EINVAL);
2898 	}
2899 
2900 	vdp->xdf_pgeom = *geomp;
2901 	mutex_exit(&vdp->xdf_dev_lk);
2902 
2903 	/* force a re-validation */
2904 	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2905 
2906 	return (0);
2907 }
2908 
2909 #endif /* XPV_HVM_DRIVER */
2910