xref: /titanic_51/usr/src/uts/common/xen/io/xdf.c (revision fc3af78a71855c71878866a294572d00e6720533)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * xdf.c - Xen Virtual Block Device Driver
29  * TODO:
30  *	- support alternate block size (currently only DEV_BSIZE supported)
31  *	- revalidate geometry for removable devices
32  */
33 
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/conf.h>
37 #include <sys/cmlb.h>
38 #include <sys/dkio.h>
39 #include <sys/promif.h>
40 #include <sys/sysmacros.h>
41 #include <sys/kstat.h>
42 #include <sys/mach_mmu.h>
43 #ifdef XPV_HVM_DRIVER
44 #include <sys/xpv_support.h>
45 #include <sys/sunndi.h>
46 #endif /* XPV_HVM_DRIVER */
47 #include <public/io/xenbus.h>
48 #include <xen/sys/xenbus_impl.h>
49 #include <xen/sys/xendev.h>
50 #include <sys/gnttab.h>
51 #include <sys/scsi/generic/inquiry.h>
52 #include <xen/io/blkif_impl.h>
53 #include <io/xdf.h>
54 
/*
 * Cache flush support.
 *
 * USE_WRITE_BARRIER() and USE_FLUSH_DISKCACHE() both require the device's
 * xdf_feature_barrier flag; xdf_flush_supported then selects exactly one of
 * the two, so at most one of them is true for a given device.
 */
#define	FLUSH_DISKCACHE	0x1
#define	WRITE_BARRIER	0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */

#define	USE_WRITE_BARRIER(vdp)				\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)

#define	USE_FLUSH_DISKCACHE(vdp)			\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)

/*
 * True for a non-read buf on a write-barrier device whose data address is
 * the device's xdf_cache_flush_block buffer.
 */
#define	IS_WRITE_BARRIER(vdp, bp)			\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&	\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))

/*
 * True for a non-read, zero-length buf on a flush-diskcache device.
 *
 * NOTE: the expansion refers to a variable named `vdp' that is NOT a macro
 * parameter; every call site must have an xdf_t pointer of that exact name
 * in scope.
 */
#define	IS_FLUSH_DISKCACHE(bp)				\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
68 static void *vbd_ss;
69 static kmem_cache_t *xdf_vreq_cache;
70 static kmem_cache_t *xdf_gs_cache;
71 static int xdf_maxphys = XB_MAXPHYS;
72 int xdfdebug = 0;
73 extern int do_polled_io;
74 diskaddr_t xdf_flush_block = DEFAULT_FLUSH_BLOCK;
75 int	xdf_barrier_flush_disable = 0;
76 
77 /*
78  * dev_ops and cb_ops entrypoints
79  */
80 static int xdf_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
81 static int xdf_attach(dev_info_t *, ddi_attach_cmd_t);
82 static int xdf_detach(dev_info_t *, ddi_detach_cmd_t);
83 static int xdf_reset(dev_info_t *, ddi_reset_cmd_t);
84 static int xdf_open(dev_t *, int, int, cred_t *);
85 static int xdf_close(dev_t, int, int, struct cred *);
86 static int xdf_strategy(struct buf *);
87 static int xdf_read(dev_t, struct uio *, cred_t *);
88 static int xdf_aread(dev_t, struct aio_req *, cred_t *);
89 static int xdf_write(dev_t, struct uio *, cred_t *);
90 static int xdf_awrite(dev_t, struct aio_req *, cred_t *);
91 static int xdf_dump(dev_t, caddr_t, daddr_t, int);
92 static int xdf_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
93 static uint_t xdf_intr(caddr_t);
94 static int xdf_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
95     caddr_t, int *);
96 
97 /*
98  * misc private functions
99  */
100 static int xdf_suspend(dev_info_t *);
101 static int xdf_resume(dev_info_t *);
102 static int xdf_start_connect(xdf_t *);
103 static int xdf_start_disconnect(xdf_t *);
104 static int xdf_post_connect(xdf_t *);
105 static void xdf_post_disconnect(xdf_t *);
106 static void xdf_oe_change(dev_info_t *, ddi_eventcookie_t, void *, void *);
107 static void xdf_iostart(xdf_t *);
108 static void xdf_iofini(xdf_t *, uint64_t, int);
109 static int xdf_prepare_rreq(xdf_t *, struct buf *, blkif_request_t *);
110 static int xdf_drain_io(xdf_t *);
111 static boolean_t xdf_isopen(xdf_t *, int);
112 static int xdf_check_state_transition(xdf_t *, XenbusState);
113 static int xdf_connect(xdf_t *, boolean_t);
114 static int xdf_dmacallback(caddr_t);
115 static void xdf_timeout_handler(void *);
116 static uint_t xdf_iorestart(caddr_t);
117 static v_req_t *vreq_get(xdf_t *, buf_t *);
118 static void vreq_free(xdf_t *, v_req_t *);
119 static int vreq_setup(xdf_t *, v_req_t *);
120 static ge_slot_t *gs_get(xdf_t *, int);
121 static void gs_free(xdf_t *, ge_slot_t *);
122 static grant_ref_t gs_grant(ge_slot_t *, mfn_t);
123 static void unexpectedie(xdf_t *);
124 static void xdfmin(struct buf *);
125 static void xdf_synthetic_pgeom(dev_info_t *, cmlb_geom_t *);
126 extern int xdf_kstat_create(dev_info_t *, char *, int);
127 extern void xdf_kstat_delete(dev_info_t *);
128 
#if defined(XPV_HVM_DRIVER)
/* Helpers that only exist when the driver is built as an HVM PV driver. */
static void xdf_hvm_init(void);
static void xdf_hvm_fini(void);
static void xdf_hvm_add(dev_info_t *);
static void xdf_hvm_rm(dev_info_t *);
#endif /* XPV_HVM_DRIVER */
135 
136 static 	struct cb_ops xdf_cbops = {
137 	xdf_open,
138 	xdf_close,
139 	xdf_strategy,
140 	nodev,
141 	xdf_dump,
142 	xdf_read,
143 	xdf_write,
144 	xdf_ioctl,
145 	nodev,
146 	nodev,
147 	nodev,
148 	nochpoll,
149 	xdf_prop_op,
150 	NULL,
151 	D_MP | D_NEW | D_64BIT,
152 	CB_REV,
153 	xdf_aread,
154 	xdf_awrite
155 };
156 
157 struct dev_ops xdf_devops = {
158 	DEVO_REV,		/* devo_rev */
159 	0,			/* devo_refcnt */
160 	xdf_getinfo,		/* devo_getinfo */
161 	nulldev,		/* devo_identify */
162 	nulldev,		/* devo_probe */
163 	xdf_attach,		/* devo_attach */
164 	xdf_detach,		/* devo_detach */
165 	xdf_reset,		/* devo_reset */
166 	&xdf_cbops,		/* devo_cb_ops */
167 	(struct bus_ops *)NULL,	/* devo_bus_ops */
168 	NULL,			/* devo_power */
169 	ddi_quiesce_not_supported,	/* devo_quiesce */
170 };
171 
172 static struct modldrv modldrv = {
173 	&mod_driverops,		/* Type of module.  This one is a driver */
174 	"virtual block driver",	/* short description */
175 	&xdf_devops		/* driver specific ops */
176 };
177 
178 static struct modlinkage xdf_modlinkage = {
179 	MODREV_1, (void *)&modldrv, NULL
180 };
181 
182 /*
183  * I/O buffer DMA attributes
184  * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
185  */
186 static ddi_dma_attr_t xb_dma_attr = {
187 	DMA_ATTR_V0,
188 	(uint64_t)0,			/* lowest address */
189 	(uint64_t)0xffffffffffffffff,	/* highest usable address */
190 	(uint64_t)0xffffff,		/* DMA counter limit max */
191 	(uint64_t)XB_BSIZE,		/* alignment in bytes */
192 	XB_BSIZE - 1,			/* bitmap of burst sizes */
193 	XB_BSIZE,			/* min transfer */
194 	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
195 	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
196 	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
197 	XB_BSIZE,			/* granularity */
198 	0,				/* flags (reserved) */
199 };
200 
201 static ddi_device_acc_attr_t xc_acc_attr = {
202 	DDI_DEVICE_ATTR_V0,
203 	DDI_NEVERSWAP_ACC,
204 	DDI_STRICTORDER_ACC
205 };
206 
207 /* callbacks from commmon label */
208 
209 int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
210 int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
211 
212 static cmlb_tg_ops_t xdf_lb_ops = {
213 	TG_DK_OPS_VERSION_1,
214 	xdf_lb_rdwr,
215 	xdf_lb_getinfo
216 };
217 
218 int
219 _init(void)
220 {
221 	int rc;
222 
223 	if ((rc = ddi_soft_state_init(&vbd_ss, sizeof (xdf_t), 0)) != 0)
224 		return (rc);
225 
226 	xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
227 	    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
228 	xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
229 	    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
230 
231 #if defined(XPV_HVM_DRIVER)
232 	xdf_hvm_init();
233 #endif /* XPV_HVM_DRIVER */
234 
235 	if ((rc = mod_install(&xdf_modlinkage)) != 0) {
236 #if defined(XPV_HVM_DRIVER)
237 		xdf_hvm_fini();
238 #endif /* XPV_HVM_DRIVER */
239 		kmem_cache_destroy(xdf_vreq_cache);
240 		kmem_cache_destroy(xdf_gs_cache);
241 		ddi_soft_state_fini(&vbd_ss);
242 		return (rc);
243 	}
244 
245 	return (rc);
246 }
247 
248 int
249 _fini(void)
250 {
251 
252 	int err;
253 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
254 		return (err);
255 
256 #if defined(XPV_HVM_DRIVER)
257 	xdf_hvm_fini();
258 #endif /* XPV_HVM_DRIVER */
259 
260 	kmem_cache_destroy(xdf_vreq_cache);
261 	kmem_cache_destroy(xdf_gs_cache);
262 	ddi_soft_state_fini(&vbd_ss);
263 
264 	return (0);
265 }
266 
267 int
268 _info(struct modinfo *modinfop)
269 {
270 	return (mod_info(&xdf_modlinkage, modinfop));
271 }
272 
273 /*ARGSUSED*/
274 static int
275 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
276 {
277 	int instance;
278 	xdf_t *vbdp;
279 
280 	instance = XDF_INST(getminor((dev_t)arg));
281 
282 	switch (cmd) {
283 	case DDI_INFO_DEVT2DEVINFO:
284 		if ((vbdp = ddi_get_soft_state(vbd_ss, instance)) == NULL) {
285 			*rp = NULL;
286 			return (DDI_FAILURE);
287 		}
288 		*rp = vbdp->xdf_dip;
289 		return (DDI_SUCCESS);
290 
291 	case DDI_INFO_DEVT2INSTANCE:
292 		*rp = (void *)(uintptr_t)instance;
293 		return (DDI_SUCCESS);
294 
295 	default:
296 		return (DDI_FAILURE);
297 	}
298 }
299 
300 static int
301 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
302 	char *name, caddr_t valuep, int *lengthp)
303 {
304 	xdf_t	*vdp;
305 
306 	if ((vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(dip))) == NULL)
307 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
308 		    name, valuep, lengthp));
309 
310 	return (cmlb_prop_op(vdp->xdf_vd_lbl,
311 	    dev, dip, prop_op, mod_flags, name, valuep, lengthp,
312 	    XDF_PART(getminor(dev)), NULL));
313 }
314 
315 static int
316 xdf_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
317 {
318 	xdf_t *vdp;
319 	ddi_iblock_cookie_t softibc;
320 	int instance;
321 
322 	xdfdebug = ddi_prop_get_int(DDI_DEV_T_ANY, devi, DDI_PROP_NOTPROM,
323 	    "xdfdebug", 0);
324 
325 	switch (cmd) {
326 		case DDI_ATTACH:
327 			break;
328 
329 		case DDI_RESUME:
330 			return (xdf_resume(devi));
331 
332 		default:
333 			return (DDI_FAILURE);
334 	}
335 
336 	instance = ddi_get_instance(devi);
337 	if (ddi_soft_state_zalloc(vbd_ss, instance) != DDI_SUCCESS)
338 		return (DDI_FAILURE);
339 
340 	DPRINTF(DDI_DBG, ("xdf%d: attaching\n", instance));
341 	vdp = ddi_get_soft_state(vbd_ss, instance);
342 	ddi_set_driver_private(devi, vdp);
343 	vdp->xdf_dip = devi;
344 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
345 
346 	if (ddi_get_iblock_cookie(devi, 0, &vdp->xdf_ibc) != DDI_SUCCESS) {
347 		cmn_err(CE_WARN, "xdf@%s: failed to get iblock cookie",
348 		    ddi_get_name_addr(devi));
349 		goto errout0;
350 	}
351 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)vdp->xdf_ibc);
352 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)vdp->xdf_ibc);
353 	mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER,
354 	    (void *)vdp->xdf_ibc);
355 
356 	if (ddi_get_soft_iblock_cookie(devi, DDI_SOFTINT_LOW, &softibc)
357 	    != DDI_SUCCESS) {
358 		cmn_err(CE_WARN, "xdf@%s: failed to get softintr iblock cookie",
359 		    ddi_get_name_addr(devi));
360 		goto errout0;
361 	}
362 	if (ddi_add_softintr(devi, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
363 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
364 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
365 		    ddi_get_name_addr(devi));
366 		goto errout0;
367 	}
368 
369 #if !defined(XPV_HVM_DRIVER)
370 	/* create kstat for iostat(1M) */
371 	if (xdf_kstat_create(devi, "xdf", instance) != 0) {
372 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
373 		    ddi_get_name_addr(devi));
374 		goto errout0;
375 	}
376 #endif /* !XPV_HVM_DRIVER */
377 
378 	/* driver handles kernel-issued IOCTLs */
379 	if (ddi_prop_create(DDI_DEV_T_NONE, devi, DDI_PROP_CANSLEEP,
380 	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
381 		cmn_err(CE_WARN, "xdf@%s: cannot create DDI_KERNEL_IOCTL prop",
382 		    ddi_get_name_addr(devi));
383 		goto errout0;
384 	}
385 
386 	/*
387 	 * Initialize the physical geometry stucture.  Note that currently
388 	 * we don't know the size of the backend device so the number
389 	 * of blocks on the device will be initialized to zero.  Once
390 	 * we connect to the backend device we'll update the physical
391 	 * geometry to reflect the real size of the device.
392 	 */
393 	xdf_synthetic_pgeom(devi, &vdp->xdf_pgeom);
394 
395 	/*
396 	 * create default device minor nodes: non-removable disk
397 	 * we will adjust minor nodes after we are connected w/ backend
398 	 */
399 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
400 	if (cmlb_attach(devi, &xdf_lb_ops, DTYPE_DIRECT, 0, 1,
401 	    DDI_NT_BLOCK_XVMD,
402 #if defined(XPV_HVM_DRIVER)
403 	    CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT |
404 	    CMLB_INTERNAL_MINOR_NODES,
405 #else /* !XPV_HVM_DRIVER */
406 	    CMLB_FAKE_LABEL_ONE_PARTITION,
407 #endif /* !XPV_HVM_DRIVER */
408 	    vdp->xdf_vd_lbl, NULL) != 0) {
409 		cmn_err(CE_WARN, "xdf@%s: default cmlb attach failed",
410 		    ddi_get_name_addr(devi));
411 		goto errout0;
412 	}
413 
414 	/*
415 	 * We ship with cache-enabled disks
416 	 */
417 	vdp->xdf_wce = 1;
418 
419 	mutex_enter(&vdp->xdf_cb_lk);
420 
421 	/* Watch backend XenbusState change */
422 	if (xvdi_add_event_handler(devi, XS_OE_STATE, xdf_oe_change,
423 	    NULL) != DDI_SUCCESS) {
424 		mutex_exit(&vdp->xdf_cb_lk);
425 		goto errout0;
426 	}
427 
428 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
429 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
430 		    ddi_get_name_addr(devi));
431 		(void) xdf_start_disconnect(vdp);
432 		mutex_exit(&vdp->xdf_cb_lk);
433 		goto errout1;
434 	}
435 
436 	mutex_exit(&vdp->xdf_cb_lk);
437 
438 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
439 	    offsetof(v_req_t, v_link));
440 	list_create(&vdp->xdf_gs_act, sizeof (ge_slot_t),
441 	    offsetof(ge_slot_t, link));
442 
443 #if defined(XPV_HVM_DRIVER)
444 	xdf_hvm_add(devi);
445 
446 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, devi, DDI_NO_AUTODETACH, 1);
447 
448 	/*
449 	 * Report our version to dom0.
450 	 */
451 	if (xenbus_printf(XBT_NULL, "hvmpv/xdf", "version", "%d",
452 	    HVMPV_XDF_VERS))
453 		cmn_err(CE_WARN, "xdf: couldn't write version\n");
454 #endif /* XPV_HVM_DRIVER */
455 
456 	ddi_report_dev(devi);
457 
458 	DPRINTF(DDI_DBG, ("xdf%d: attached\n", instance));
459 
460 	return (DDI_SUCCESS);
461 
462 errout1:
463 	xvdi_remove_event_handler(devi, XS_OE_STATE);
464 errout0:
465 	if (vdp->xdf_vd_lbl != NULL) {
466 		cmlb_detach(vdp->xdf_vd_lbl, NULL);
467 		cmlb_free_handle(&vdp->xdf_vd_lbl);
468 		vdp->xdf_vd_lbl = NULL;
469 	}
470 #if !defined(XPV_HVM_DRIVER)
471 	xdf_kstat_delete(devi);
472 #endif /* !XPV_HVM_DRIVER */
473 	if (vdp->xdf_softintr_id != NULL)
474 		ddi_remove_softintr(vdp->xdf_softintr_id);
475 	if (vdp->xdf_ibc != NULL) {
476 		mutex_destroy(&vdp->xdf_cb_lk);
477 		mutex_destroy(&vdp->xdf_dev_lk);
478 	}
479 	cv_destroy(&vdp->xdf_dev_cv);
480 	ddi_soft_state_free(vbd_ss, instance);
481 	ddi_set_driver_private(devi, NULL);
482 	ddi_prop_remove_all(devi);
483 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(devi));
484 	return (DDI_FAILURE);
485 }
486 
487 static int
488 xdf_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
489 {
490 	xdf_t *vdp;
491 	int instance;
492 
493 	switch (cmd) {
494 
495 	case DDI_PM_SUSPEND:
496 		break;
497 
498 	case DDI_SUSPEND:
499 		return (xdf_suspend(devi));
500 
501 	case DDI_DETACH:
502 		break;
503 
504 	default:
505 		return (DDI_FAILURE);
506 	}
507 
508 	instance = ddi_get_instance(devi);
509 	DPRINTF(DDI_DBG, ("xdf%d: detaching\n", instance));
510 	vdp = ddi_get_soft_state(vbd_ss, instance);
511 
512 	if (vdp == NULL)
513 		return (DDI_FAILURE);
514 
515 	mutex_enter(&vdp->xdf_dev_lk);
516 	if (xdf_isopen(vdp, -1)) {
517 		mutex_exit(&vdp->xdf_dev_lk);
518 		return (DDI_FAILURE);
519 	}
520 
521 	if (vdp->xdf_status != XD_CLOSED) {
522 		mutex_exit(&vdp->xdf_dev_lk);
523 		return (DDI_FAILURE);
524 	}
525 
526 #if defined(XPV_HVM_DRIVER)
527 	xdf_hvm_rm(devi);
528 #endif /* XPV_HVM_DRIVER */
529 
530 	ASSERT(!ISDMACBON(vdp));
531 	mutex_exit(&vdp->xdf_dev_lk);
532 
533 	if (vdp->xdf_timeout_id != 0)
534 		(void) untimeout(vdp->xdf_timeout_id);
535 
536 	xvdi_remove_event_handler(devi, XS_OE_STATE);
537 
538 	/* we'll support backend running in domU later */
539 #ifdef	DOMU_BACKEND
540 	(void) xvdi_post_event(devi, XEN_HP_REMOVE);
541 #endif
542 
543 	list_destroy(&vdp->xdf_vreq_act);
544 	list_destroy(&vdp->xdf_gs_act);
545 	ddi_prop_remove_all(devi);
546 	xdf_kstat_delete(devi);
547 	ddi_remove_softintr(vdp->xdf_softintr_id);
548 	ddi_set_driver_private(devi, NULL);
549 	cv_destroy(&vdp->xdf_dev_cv);
550 	mutex_destroy(&vdp->xdf_cb_lk);
551 	mutex_destroy(&vdp->xdf_dev_lk);
552 	if (vdp->xdf_cache_flush_block != NULL)
553 		kmem_free(vdp->xdf_flush_mem, 2 * DEV_BSIZE);
554 	ddi_soft_state_free(vbd_ss, instance);
555 	return (DDI_SUCCESS);
556 }
557 
558 static int
559 xdf_suspend(dev_info_t *devi)
560 {
561 	xdf_t *vdp;
562 	int instance;
563 	enum xdf_state st;
564 
565 	instance = ddi_get_instance(devi);
566 
567 	if (xdfdebug & SUSRES_DBG)
568 		xen_printf("xdf_suspend: xdf#%d\n", instance);
569 
570 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
571 		return (DDI_FAILURE);
572 
573 	xvdi_suspend(devi);
574 
575 	mutex_enter(&vdp->xdf_cb_lk);
576 	mutex_enter(&vdp->xdf_dev_lk);
577 	st = vdp->xdf_status;
578 	/* change status to stop further I/O requests */
579 	if (st == XD_READY)
580 		vdp->xdf_status = XD_SUSPEND;
581 	mutex_exit(&vdp->xdf_dev_lk);
582 	mutex_exit(&vdp->xdf_cb_lk);
583 
584 	/* make sure no more I/O responses left in the ring buffer */
585 	if ((st == XD_INIT) || (st == XD_READY)) {
586 #ifdef XPV_HVM_DRIVER
587 		ec_unbind_evtchn(vdp->xdf_evtchn);
588 		xvdi_free_evtchn(devi);
589 #else /* !XPV_HVM_DRIVER */
590 		(void) ddi_remove_intr(devi, 0, NULL);
591 #endif /* !XPV_HVM_DRIVER */
592 		(void) xdf_drain_io(vdp);
593 		/*
594 		 * no need to teardown the ring buffer here
595 		 * it will be simply re-init'ed during resume when
596 		 * we call xvdi_alloc_ring
597 		 */
598 	}
599 
600 	if (xdfdebug & SUSRES_DBG)
601 		xen_printf("xdf_suspend: SUCCESS\n");
602 
603 	return (DDI_SUCCESS);
604 }
605 
606 /*ARGSUSED*/
607 static int
608 xdf_resume(dev_info_t *devi)
609 {
610 	xdf_t *vdp;
611 	int instance;
612 
613 	instance = ddi_get_instance(devi);
614 	if (xdfdebug & SUSRES_DBG)
615 		xen_printf("xdf_resume: xdf%d\n", instance);
616 
617 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
618 		return (DDI_FAILURE);
619 
620 	mutex_enter(&vdp->xdf_cb_lk);
621 
622 	if (xvdi_resume(devi) != DDI_SUCCESS) {
623 		mutex_exit(&vdp->xdf_cb_lk);
624 		return (DDI_FAILURE);
625 	}
626 
627 	mutex_enter(&vdp->xdf_dev_lk);
628 	ASSERT(vdp->xdf_status != XD_READY);
629 	vdp->xdf_status = XD_UNKNOWN;
630 	mutex_exit(&vdp->xdf_dev_lk);
631 
632 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
633 		mutex_exit(&vdp->xdf_cb_lk);
634 		return (DDI_FAILURE);
635 	}
636 
637 	mutex_exit(&vdp->xdf_cb_lk);
638 
639 	if (xdfdebug & SUSRES_DBG)
640 		xen_printf("xdf_resume: done\n");
641 	return (DDI_SUCCESS);
642 }
643 
644 /*ARGSUSED*/
645 static int
646 xdf_reset(dev_info_t *devi, ddi_reset_cmd_t cmd)
647 {
648 	xdf_t *vdp;
649 	int instance;
650 
651 	instance = ddi_get_instance(devi);
652 	DPRINTF(DDI_DBG, ("xdf%d: resetting\n", instance));
653 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
654 		return (DDI_FAILURE);
655 
656 	/*
657 	 * wait for any outstanding I/O to complete
658 	 */
659 	(void) xdf_drain_io(vdp);
660 
661 	DPRINTF(DDI_DBG, ("xdf%d: reset complete\n", instance));
662 	return (DDI_SUCCESS);
663 }
664 
665 static int
666 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
667 {
668 	minor_t	minor;
669 	xdf_t	*vdp;
670 	int part;
671 	ulong_t parbit;
672 	diskaddr_t p_blkct = 0;
673 	boolean_t firstopen;
674 	boolean_t nodelay;
675 
676 	minor = getminor(*devp);
677 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
678 		return (ENXIO);
679 
680 	nodelay = (flag & (FNDELAY | FNONBLOCK));
681 
682 	DPRINTF(DDI_DBG, ("xdf%d: opening\n", XDF_INST(minor)));
683 
684 	/* do cv_wait until connected or failed */
685 	mutex_enter(&vdp->xdf_dev_lk);
686 	if (!nodelay && (xdf_connect(vdp, B_TRUE) != XD_READY)) {
687 		mutex_exit(&vdp->xdf_dev_lk);
688 		return (ENXIO);
689 	}
690 
691 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
692 		mutex_exit(&vdp->xdf_dev_lk);
693 		return (EROFS);
694 	}
695 
696 	part = XDF_PART(minor);
697 	parbit = 1 << part;
698 	if ((vdp->xdf_vd_exclopen & parbit) ||
699 	    ((flag & FEXCL) && xdf_isopen(vdp, part))) {
700 		mutex_exit(&vdp->xdf_dev_lk);
701 		return (EBUSY);
702 	}
703 
704 	/* are we the first one to open this node? */
705 	firstopen = !xdf_isopen(vdp, -1);
706 
707 	if (otyp == OTYP_LYR)
708 		vdp->xdf_vd_lyropen[part]++;
709 
710 	vdp->xdf_vd_open[otyp] |= parbit;
711 
712 	if (flag & FEXCL)
713 		vdp->xdf_vd_exclopen |= parbit;
714 
715 	mutex_exit(&vdp->xdf_dev_lk);
716 
717 	/* force a re-validation */
718 	if (firstopen)
719 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
720 
721 	/*
722 	 * check size
723 	 * ignore CD/DVD which contains a zero-sized s0
724 	 */
725 	if (!nodelay && !XD_IS_CD(vdp) &&
726 	    ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
727 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0))) {
728 		(void) xdf_close(*devp, flag, otyp, credp);
729 		return (ENXIO);
730 	}
731 
732 	return (0);
733 }
734 
735 /*ARGSUSED*/
736 static int
737 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
738 {
739 	minor_t	minor;
740 	xdf_t	*vdp;
741 	int part;
742 	ulong_t parbit;
743 
744 	minor = getminor(dev);
745 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
746 		return (ENXIO);
747 
748 	mutex_enter(&vdp->xdf_dev_lk);
749 	part = XDF_PART(minor);
750 	if (!xdf_isopen(vdp, part)) {
751 		mutex_exit(&vdp->xdf_dev_lk);
752 		return (ENXIO);
753 	}
754 	parbit = 1 << part;
755 
756 	ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
757 	if (otyp == OTYP_LYR) {
758 		ASSERT(vdp->xdf_vd_lyropen[part] > 0);
759 		if (--vdp->xdf_vd_lyropen[part] == 0)
760 			vdp->xdf_vd_open[otyp] &= ~parbit;
761 	} else {
762 		vdp->xdf_vd_open[otyp] &= ~parbit;
763 	}
764 	vdp->xdf_vd_exclopen &= ~parbit;
765 
766 	mutex_exit(&vdp->xdf_dev_lk);
767 	return (0);
768 }
769 
770 static int
771 xdf_strategy(struct buf *bp)
772 {
773 	xdf_t	*vdp;
774 	minor_t minor;
775 	diskaddr_t p_blkct, p_blkst;
776 	ulong_t nblks;
777 	int part;
778 
779 	minor = getminor(bp->b_edev);
780 	part = XDF_PART(minor);
781 
782 	vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor));
783 	if ((vdp == NULL) || !xdf_isopen(vdp, part)) {
784 		bioerror(bp, ENXIO);
785 		bp->b_resid = bp->b_bcount;
786 		biodone(bp);
787 		return (0);
788 	}
789 
790 	/* Check for writes to a read only device */
791 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
792 		bioerror(bp, EROFS);
793 		bp->b_resid = bp->b_bcount;
794 		biodone(bp);
795 		return (0);
796 	}
797 
798 	/* Check if this I/O is accessing a partition or the entire disk */
799 	if ((long)bp->b_private == XB_SLICE_NONE) {
800 		/* This I/O is using an absolute offset */
801 		p_blkct = vdp->xdf_xdev_nblocks;
802 		p_blkst = 0;
803 	} else {
804 		/* This I/O is using a partition relative offset */
805 		if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
806 		    &p_blkst, NULL, NULL, NULL)) {
807 			bioerror(bp, ENXIO);
808 			bp->b_resid = bp->b_bcount;
809 			biodone(bp);
810 			return (0);
811 		}
812 	}
813 
814 	/* check for a starting block beyond the disk or partition limit */
815 	if (bp->b_blkno > p_blkct) {
816 		DPRINTF(IO_DBG, ("xdf: block %lld exceeds VBD size %"PRIu64,
817 		    (longlong_t)bp->b_blkno, (uint64_t)p_blkct));
818 		bioerror(bp, EINVAL);
819 		bp->b_resid = bp->b_bcount;
820 		biodone(bp);
821 		return (0);
822 	}
823 
824 	/* Legacy: don't set error flag at this case */
825 	if (bp->b_blkno == p_blkct) {
826 		bp->b_resid = bp->b_bcount;
827 		biodone(bp);
828 		return (0);
829 	}
830 
831 	/* Adjust for partial transfer */
832 	nblks = bp->b_bcount >> XB_BSHIFT;
833 	if ((bp->b_blkno + nblks) > p_blkct) {
834 		bp->b_resid = ((bp->b_blkno + nblks) - p_blkct) << XB_BSHIFT;
835 		bp->b_bcount -= bp->b_resid;
836 	}
837 
838 	DPRINTF(IO_DBG, ("xdf: strategy blk %lld len %lu\n",
839 	    (longlong_t)bp->b_blkno, (ulong_t)bp->b_bcount));
840 
841 	/* Fix up the buf struct */
842 	bp->b_flags |= B_BUSY;
843 	bp->av_forw = bp->av_back = NULL; /* not tagged with a v_req */
844 	bp->b_private = (void *)(uintptr_t)p_blkst;
845 
846 	mutex_enter(&vdp->xdf_dev_lk);
847 	if (vdp->xdf_xdev_iostat != NULL)
848 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
849 	if (vdp->xdf_f_act == NULL) {
850 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
851 	} else {
852 		vdp->xdf_l_act->av_forw = bp;
853 		vdp->xdf_l_act = bp;
854 	}
855 	mutex_exit(&vdp->xdf_dev_lk);
856 
857 	xdf_iostart(vdp);
858 	if (do_polled_io)
859 		(void) xdf_drain_io(vdp);
860 	return (0);
861 }
862 
863 /*ARGSUSED*/
864 static int
865 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
866 {
867 
868 	xdf_t	*vdp;
869 	minor_t minor;
870 	diskaddr_t p_blkcnt;
871 	int part;
872 
873 	minor = getminor(dev);
874 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
875 		return (ENXIO);
876 
877 	DPRINTF(IO_DBG, ("xdf: read offset 0x%"PRIx64"\n",
878 	    (int64_t)uiop->uio_offset));
879 
880 	part = XDF_PART(minor);
881 	if (!xdf_isopen(vdp, part))
882 		return (ENXIO);
883 
884 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
885 	    NULL, NULL, NULL, NULL))
886 		return (ENXIO);
887 
888 	if (U_INVAL(uiop))
889 		return (EINVAL);
890 
891 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
892 }
893 
894 /*ARGSUSED*/
895 static int
896 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
897 {
898 	xdf_t *vdp;
899 	minor_t minor;
900 	diskaddr_t p_blkcnt;
901 	int part;
902 
903 	minor = getminor(dev);
904 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
905 		return (ENXIO);
906 
907 	DPRINTF(IO_DBG, ("xdf: write offset 0x%"PRIx64"\n",
908 	    (int64_t)uiop->uio_offset));
909 
910 	part = XDF_PART(minor);
911 	if (!xdf_isopen(vdp, part))
912 		return (ENXIO);
913 
914 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
915 	    NULL, NULL, NULL, NULL))
916 		return (ENXIO);
917 
918 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
919 		return (ENOSPC);
920 
921 	if (U_INVAL(uiop))
922 		return (EINVAL);
923 
924 	return (physio(xdf_strategy, NULL, dev, B_WRITE, minphys, uiop));
925 }
926 
927 /*ARGSUSED*/
928 static int
929 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
930 {
931 	xdf_t	*vdp;
932 	minor_t minor;
933 	struct uio *uiop = aiop->aio_uio;
934 	diskaddr_t p_blkcnt;
935 	int part;
936 
937 	minor = getminor(dev);
938 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
939 		return (ENXIO);
940 
941 	part = XDF_PART(minor);
942 	if (!xdf_isopen(vdp, part))
943 		return (ENXIO);
944 
945 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
946 	    NULL, NULL, NULL, NULL))
947 		return (ENXIO);
948 
949 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
950 		return (ENOSPC);
951 
952 	if (U_INVAL(uiop))
953 		return (EINVAL);
954 
955 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, minphys, aiop));
956 }
957 
958 /*ARGSUSED*/
959 static int
960 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
961 {
962 	xdf_t *vdp;
963 	minor_t minor;
964 	struct uio *uiop = aiop->aio_uio;
965 	diskaddr_t p_blkcnt;
966 	int part;
967 
968 	minor = getminor(dev);
969 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
970 		return (ENXIO);
971 
972 	part = XDF_PART(minor);
973 	if (!xdf_isopen(vdp, part))
974 		return (ENXIO);
975 
976 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
977 	    NULL, NULL, NULL, NULL))
978 		return (ENXIO);
979 
980 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
981 		return (ENOSPC);
982 
983 	if (U_INVAL(uiop))
984 		return (EINVAL);
985 
986 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, minphys, aiop));
987 }
988 
989 static int
990 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
991 {
992 	struct buf dumpbuf, *dbp;
993 	xdf_t	*vdp;
994 	minor_t minor;
995 	int err = 0;
996 	int part;
997 	diskaddr_t p_blkcnt, p_blkst;
998 
999 	minor = getminor(dev);
1000 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
1001 		return (ENXIO);
1002 
1003 	DPRINTF(IO_DBG, ("xdf: dump addr (0x%p) blk (%ld) nblks (%d)\n",
1004 	    (void *)addr, blkno, nblk));
1005 
1006 	part = XDF_PART(minor);
1007 	if (!xdf_isopen(vdp, part))
1008 		return (ENXIO);
1009 
1010 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
1011 	    NULL, NULL, NULL))
1012 		return (ENXIO);
1013 
1014 	if ((blkno + nblk) > p_blkcnt) {
1015 		cmn_err(CE_WARN, "xdf: block %ld exceeds VBD size %"PRIu64,
1016 		    blkno + nblk, (uint64_t)p_blkcnt);
1017 		return (EINVAL);
1018 	}
1019 
1020 	dbp = &dumpbuf;
1021 	bioinit(dbp);
1022 	dbp->b_flags = B_BUSY;
1023 	dbp->b_un.b_addr = addr;
1024 	dbp->b_bcount = nblk << DEV_BSHIFT;
1025 	dbp->b_blkno = blkno;
1026 	dbp->b_edev = dev;
1027 	dbp->b_private = (void *)(uintptr_t)p_blkst;
1028 
1029 	mutex_enter(&vdp->xdf_dev_lk);
1030 	if (vdp->xdf_xdev_iostat != NULL)
1031 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1032 	if (vdp->xdf_f_act == NULL) {
1033 		vdp->xdf_f_act = vdp->xdf_l_act = dbp;
1034 	} else {
1035 		vdp->xdf_l_act->av_forw = dbp;
1036 		vdp->xdf_l_act = dbp;
1037 	}
1038 	dbp->av_forw = NULL;
1039 	dbp->av_back = NULL;
1040 	mutex_exit(&vdp->xdf_dev_lk);
1041 	xdf_iostart(vdp);
1042 	err = xdf_drain_io(vdp);
1043 	biofini(dbp);
1044 	return (err);
1045 }
1046 
1047 /*ARGSUSED*/
1048 static int
1049 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1050     int *rvalp)
1051 {
1052 	int instance;
1053 	xdf_t	*vdp;
1054 	minor_t minor;
1055 	int part;
1056 
1057 	minor = getminor(dev);
1058 	instance = XDF_INST(minor);
1059 
1060 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
1061 		return (ENXIO);
1062 
1063 	DPRINTF(IOCTL_DBG, ("xdf%d:ioctl: cmd %d (0x%x)\n",
1064 	    instance, cmd, cmd));
1065 
1066 	part = XDF_PART(minor);
1067 	if (!xdf_isopen(vdp, part))
1068 		return (ENXIO);
1069 
1070 	switch (cmd) {
1071 	case DKIOCGMEDIAINFO: {
1072 		struct dk_minfo	media_info;
1073 
1074 		media_info.dki_lbsize = DEV_BSIZE;
1075 		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
1076 		media_info.dki_media_type = DK_FIXED_DISK;
1077 
1078 		if (ddi_copyout(&media_info, (void *)arg,
1079 		    sizeof (struct dk_minfo), mode)) {
1080 			return (EFAULT);
1081 		} else {
1082 			return (0);
1083 		}
1084 	}
1085 
1086 	case DKIOCINFO: {
1087 		struct dk_cinfo info;
1088 
1089 		/* controller information */
1090 		if (XD_IS_CD(vdp))
1091 			info.dki_ctype = DKC_CDROM;
1092 		else
1093 			info.dki_ctype = DKC_VBD;
1094 
1095 		info.dki_cnum = 0;
1096 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
1097 
1098 		/* unit information */
1099 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
1100 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
1101 		info.dki_flags = DKI_FMTVOL;
1102 		info.dki_partition = part;
1103 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
1104 		info.dki_addr = 0;
1105 		info.dki_space = 0;
1106 		info.dki_prio = 0;
1107 		info.dki_vec = 0;
1108 
1109 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
1110 			return (EFAULT);
1111 		else
1112 			return (0);
1113 	}
1114 
1115 	case DKIOCSTATE: {
1116 		enum dkio_state	dkstate = DKIO_INSERTED;
1117 		if (ddi_copyout(&dkstate, (void *)arg, sizeof (dkstate),
1118 		    mode) != 0)
1119 			return (EFAULT);
1120 		return (0);
1121 	}
1122 
1123 	/*
1124 	 * is media removable?
1125 	 */
1126 	case DKIOCREMOVABLE: {
1127 		int i = XD_IS_RM(vdp) ? 1 : 0;
1128 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), mode))
1129 			return (EFAULT);
1130 		return (0);
1131 	}
1132 
1133 	case DKIOCG_PHYGEOM:
1134 	case DKIOCG_VIRTGEOM:
1135 	case DKIOCGGEOM:
1136 	case DKIOCSGEOM:
1137 	case DKIOCGAPART:
1138 	case DKIOCSAPART:
1139 	case DKIOCGVTOC:
1140 	case DKIOCSVTOC:
1141 	case DKIOCPARTINFO:
1142 	case DKIOCGEXTVTOC:
1143 	case DKIOCSEXTVTOC:
1144 	case DKIOCEXTPARTINFO:
1145 	case DKIOCGMBOOT:
1146 	case DKIOCSMBOOT:
1147 	case DKIOCGETEFI:
1148 	case DKIOCSETEFI:
1149 	case DKIOCSETEXTPART:
1150 	case DKIOCPARTITION: {
1151 		int rc;
1152 
1153 		rc = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
1154 		    rvalp, NULL);
1155 		return (rc);
1156 	}
1157 
1158 	case DKIOCGETWCE:
1159 		if (ddi_copyout(&vdp->xdf_wce, (void *)arg,
1160 		    sizeof (vdp->xdf_wce), mode))
1161 			return (EFAULT);
1162 		return (0);
1163 	case DKIOCSETWCE:
1164 		if (ddi_copyin((void *)arg, &vdp->xdf_wce,
1165 		    sizeof (vdp->xdf_wce), mode))
1166 			return (EFAULT);
1167 		return (0);
1168 	case DKIOCFLUSHWRITECACHE: {
1169 		int rc;
1170 		struct dk_callback *dkc = (struct dk_callback *)arg;
1171 
1172 		if (vdp->xdf_flush_supported) {
1173 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1174 			    NULL, 0, 0, (void *)dev);
1175 		} else if (vdp->xdf_feature_barrier &&
1176 		    !xdf_barrier_flush_disable) {
1177 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1178 			    vdp->xdf_cache_flush_block, xdf_flush_block,
1179 			    DEV_BSIZE, (void *)dev);
1180 		} else {
1181 			return (ENOTTY);
1182 		}
1183 		if ((mode & FKIOCTL) && (dkc != NULL) &&
1184 		    (dkc->dkc_callback != NULL)) {
1185 			(*dkc->dkc_callback)(dkc->dkc_cookie, rc);
1186 			/* need to return 0 after calling callback */
1187 			rc = 0;
1188 		}
1189 		return (rc);
1190 	}
1191 
1192 	default:
1193 		return (ENOTTY);
1194 	}
1195 }
1196 
1197 /*
1198  * xdf interrupt handler
1199  */
1200 static uint_t
1201 xdf_intr(caddr_t arg)
1202 {
1203 	xdf_t *vdp = (xdf_t *)arg;
1204 	xendev_ring_t *xbr;
1205 	blkif_response_t *resp;
1206 	int bioerr;
1207 	uint64_t id;
1208 	extern int do_polled_io;
1209 	uint8_t op;
1210 	uint16_t status;
1211 	ddi_acc_handle_t acchdl;
1212 
1213 	mutex_enter(&vdp->xdf_dev_lk);
1214 
1215 	if ((xbr = vdp->xdf_xb_ring) == NULL) {
1216 		mutex_exit(&vdp->xdf_dev_lk);
1217 		return (DDI_INTR_UNCLAIMED);
1218 	}
1219 
1220 	acchdl = vdp->xdf_xb_ring_hdl;
1221 
1222 	/*
1223 	 * complete all requests which have a response
1224 	 */
1225 	while (resp = xvdi_ring_get_response(xbr)) {
1226 		id = ddi_get64(acchdl, &resp->id);
1227 		op = ddi_get8(acchdl, &resp->operation);
1228 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
1229 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
1230 		    op, id, status));
1231 
1232 		/*
1233 		 * XXPV - close connection to the backend and restart
1234 		 */
1235 		if (status != BLKIF_RSP_OKAY) {
1236 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
1237 			    ddi_get_name_addr(vdp->xdf_dip),
1238 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
1239 			bioerr = EIO;
1240 		} else {
1241 			bioerr = 0;
1242 		}
1243 
1244 		xdf_iofini(vdp, id, bioerr);
1245 	}
1246 
1247 	mutex_exit(&vdp->xdf_dev_lk);
1248 
1249 	if (!do_polled_io)
1250 		xdf_iostart(vdp);
1251 
1252 	return (DDI_INTR_CLAIMED);
1253 }
1254 
1255 int xdf_fbrewrites;	/* how many times was our flush block rewritten */
1256 
1257 /*
1258  * Snarf new data if our flush block was re-written
1259  */
1260 static void
1261 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
1262 {
1263 	int nblks;
1264 	boolean_t mapin;
1265 
1266 	if (IS_WRITE_BARRIER(vdp, bp))
1267 		return; /* write was a flush write */
1268 
1269 	mapin = B_FALSE;
1270 	nblks = bp->b_bcount >> DEV_BSHIFT;
1271 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
1272 		xdf_fbrewrites++;
1273 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
1274 			mapin = B_TRUE;
1275 			bp_mapin(bp);
1276 		}
1277 		bcopy(bp->b_un.b_addr +
1278 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
1279 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
1280 		if (mapin)
1281 			bp_mapout(bp);
1282 	}
1283 }
1284 
1285 static void
1286 xdf_iofini(xdf_t *vdp, uint64_t id, int bioerr)
1287 {
1288 	ge_slot_t *gs = (ge_slot_t *)(uintptr_t)id;
1289 	v_req_t *vreq = gs->vreq;
1290 	buf_t *bp = vreq->v_buf;
1291 
1292 	gs_free(vdp, gs);
1293 	if (bioerr)
1294 		bioerror(bp, bioerr);
1295 	vreq->v_nslots--;
1296 	if (vreq->v_nslots != 0)
1297 		return;
1298 
1299 	XDF_UPDATE_IO_STAT(vdp, bp);
1300 	if (vdp->xdf_xdev_iostat != NULL)
1301 		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1302 
1303 	if (IS_ERROR(bp))
1304 		bp->b_resid = bp->b_bcount;
1305 
1306 	vreq_free(vdp, vreq);
1307 	biodone(bp);
1308 }
1309 
1310 /*
1311  * return value of xdf_prepare_rreq()
1312  * used in xdf_iostart()
1313  */
1314 #define	XF_PARTIAL	0 /* rreq is full, not all I/O in buf transferred */
1315 #define	XF_COMP		1 /* no more I/O left in buf */
1316 
1317 static void
1318 xdf_iostart(xdf_t *vdp)
1319 {
1320 	xendev_ring_t *xbr;
1321 	struct buf *bp;
1322 	blkif_request_t *rreq;
1323 	int retval;
1324 	int rreqready = 0;
1325 
1326 	xbr = vdp->xdf_xb_ring;
1327 
1328 	/*
1329 	 * populate the ring request(s)
1330 	 *
1331 	 * loop until there is no buf to transfer or no free slot
1332 	 * available in I/O ring
1333 	 */
1334 	mutex_enter(&vdp->xdf_dev_lk);
1335 
1336 	for (;;) {
1337 		if (vdp->xdf_status != XD_READY)
1338 			break;
1339 
1340 		/* active buf queue empty? */
1341 		if ((bp = vdp->xdf_f_act) == NULL)
1342 			break;
1343 
1344 		/* try to grab a vreq for this bp */
1345 		if ((BP2VREQ(bp) == NULL) && (vreq_get(vdp, bp) == NULL))
1346 				break;
1347 		/* alloc DMA/GTE resources */
1348 		if (vreq_setup(vdp, BP2VREQ(bp)) != DDI_SUCCESS)
1349 			break;
1350 
1351 		/* get next blkif_request in the ring */
1352 		if ((rreq = xvdi_ring_get_request(xbr)) == NULL)
1353 			break;
1354 		bzero(rreq, sizeof (blkif_request_t));
1355 
1356 		/* populate blkif_request with this buf */
1357 		rreqready++;
1358 		retval = xdf_prepare_rreq(vdp, bp, rreq);
1359 		if (retval == XF_COMP) {
1360 			/* finish this bp, switch to next one */
1361 			if (vdp->xdf_xdev_iostat != NULL)
1362 				kstat_waitq_to_runq(
1363 				    KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1364 			vdp->xdf_f_act = bp->av_forw;
1365 			bp->av_forw = NULL;
1366 		}
1367 	}
1368 
1369 	/*
1370 	 * Send the request(s) to the backend
1371 	 */
1372 	if (rreqready) {
1373 		if (xvdi_ring_push_request(xbr)) {
1374 			DPRINTF(IO_DBG, ("xdf_iostart: "
1375 			    "sent request(s) to backend\n"));
1376 			xvdi_notify_oe(vdp->xdf_dip);
1377 		}
1378 	}
1379 
1380 	mutex_exit(&vdp->xdf_dev_lk);
1381 }
1382 
1383 /*
1384  * populate a single blkif_request_t w/ a buf
1385  */
1386 static int
1387 xdf_prepare_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1388 {
1389 	int		rval;
1390 	grant_ref_t	gr;
1391 	uint8_t		fsect, lsect;
1392 	size_t		bcnt;
1393 	paddr_t		dma_addr;
1394 	off_t		blk_off;
1395 	dev_info_t	*dip = vdp->xdf_dip;
1396 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1397 	v_req_t		*vreq = BP2VREQ(bp);
1398 	uint64_t	blkno = vreq->v_blkno;
1399 	uint_t		ndmacs = vreq->v_ndmacs;
1400 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1401 	int		seg = 0;
1402 	int		isread = IS_READ(bp);
1403 
1404 	if (isread)
1405 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1406 	else {
1407 		switch (vreq->v_flush_diskcache) {
1408 		case FLUSH_DISKCACHE:
1409 			ddi_put8(acchdl, &rreq->operation,
1410 			    BLKIF_OP_FLUSH_DISKCACHE);
1411 			ddi_put16(acchdl, &rreq->handle, vdev);
1412 			ddi_put64(acchdl, &rreq->id,
1413 			    (uint64_t)(uintptr_t)(vreq->v_gs));
1414 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1415 			return (XF_COMP);
1416 		case WRITE_BARRIER:
1417 			ddi_put8(acchdl, &rreq->operation,
1418 			    BLKIF_OP_WRITE_BARRIER);
1419 			break;
1420 		default:
1421 			if (!vdp->xdf_wce)
1422 				ddi_put8(acchdl, &rreq->operation,
1423 				    BLKIF_OP_WRITE_BARRIER);
1424 			else
1425 				ddi_put8(acchdl, &rreq->operation,
1426 				    BLKIF_OP_WRITE);
1427 			break;
1428 		}
1429 	}
1430 
1431 	ddi_put16(acchdl, &rreq->handle, vdev);
1432 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1433 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(vreq->v_gs));
1434 
1435 	/*
1436 	 * loop until all segments are populated or no more dma cookie in buf
1437 	 */
1438 	for (;;) {
1439 	/*
1440 	 * Each segment of a blkif request can transfer up to
1441 	 * one 4K page of data.
1442 	 */
1443 		bcnt = vreq->v_dmac.dmac_size;
1444 		ASSERT(bcnt <= PAGESIZE);
1445 		ASSERT((bcnt % XB_BSIZE) == 0);
1446 		dma_addr = vreq->v_dmac.dmac_laddress;
1447 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1448 		ASSERT((blk_off & XB_BMASK) == 0);
1449 		fsect = blk_off >> XB_BSHIFT;
1450 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1451 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1452 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1453 		DPRINTF(IO_DBG, ("  ""seg%d: dmacS %lu blk_off %ld\n",
1454 		    seg, vreq->v_dmac.dmac_size, blk_off));
1455 		gr = gs_grant(vreq->v_gs, PATOMA(dma_addr) >> PAGESHIFT);
1456 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1457 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1458 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1459 		DPRINTF(IO_DBG, ("  ""seg%d: fs %d ls %d gr %d dma 0x%"PRIx64
1460 		    "\n", seg, fsect, lsect, gr, dma_addr));
1461 
1462 		blkno += (bcnt >> XB_BSHIFT);
1463 		seg++;
1464 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1465 		if (--ndmacs) {
1466 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1467 			continue;
1468 		}
1469 
1470 		vreq->v_status = VREQ_DMAWIN_DONE;
1471 		vreq->v_blkno = blkno;
1472 		if (vreq->v_dmaw + 1 == vreq->v_ndmaws)
1473 			/* last win */
1474 			rval = XF_COMP;
1475 		else
1476 			rval = XF_PARTIAL;
1477 		break;
1478 	}
1479 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1480 	DPRINTF(IO_DBG, ("xdf_prepare_rreq: request id=%"PRIx64" ready\n",
1481 	    rreq->id));
1482 
1483 	return (rval);
1484 }
1485 
1486 #define	XDF_QSEC	50000	/* .005 second */
1487 #define	XDF_POLLCNT	12	/* loop for 12 times before time out */
1488 
1489 static int
1490 xdf_drain_io(xdf_t *vdp)
1491 {
1492 	int pollc, rval;
1493 	xendev_ring_t *xbr;
1494 
1495 	if (xdfdebug & SUSRES_DBG)
1496 		xen_printf("xdf_drain_io: start\n");
1497 
1498 	mutex_enter(&vdp->xdf_dev_lk);
1499 
1500 	if ((vdp->xdf_status != XD_READY) && (vdp->xdf_status != XD_SUSPEND))
1501 		goto out;
1502 
1503 	rval = 0;
1504 	xbr = vdp->xdf_xb_ring;
1505 	ASSERT(xbr != NULL);
1506 
1507 	for (pollc = 0; pollc < XDF_POLLCNT; pollc++) {
1508 		if (xvdi_ring_has_unconsumed_responses(xbr)) {
1509 			mutex_exit(&vdp->xdf_dev_lk);
1510 			(void) xdf_intr((caddr_t)vdp);
1511 			mutex_enter(&vdp->xdf_dev_lk);
1512 		}
1513 		if (!xvdi_ring_has_incomp_request(xbr))
1514 			goto out;
1515 
1516 #ifndef	XPV_HVM_DRIVER
1517 		(void) HYPERVISOR_yield();
1518 #endif /* XPV_HVM_DRIVER */
1519 		/*
1520 		 * file-backed devices can be slow
1521 		 */
1522 		drv_usecwait(XDF_QSEC << pollc);
1523 	}
1524 	cmn_err(CE_WARN, "xdf_polled_io: timeout");
1525 	rval = EIO;
1526 out:
1527 	mutex_exit(&vdp->xdf_dev_lk);
1528 	if (xdfdebug & SUSRES_DBG)
1529 		xen_printf("xdf_drain_io: end, err=%d\n", rval);
1530 	return (rval);
1531 }
1532 
1533 /* ARGSUSED5 */
1534 int
1535 xdf_lb_rdwr(dev_info_t *devi, uchar_t cmd, void *bufp,
1536     diskaddr_t start, size_t reqlen, void *tg_cookie)
1537 {
1538 	xdf_t *vdp;
1539 	struct buf *bp;
1540 	int err = 0;
1541 
1542 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1543 	if (vdp == NULL)
1544 		return (ENXIO);
1545 
1546 	if ((start + (reqlen >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
1547 		return (EINVAL);
1548 
1549 	bp = getrbuf(KM_SLEEP);
1550 	if (cmd == TG_READ)
1551 		bp->b_flags = B_BUSY | B_READ;
1552 	else
1553 		bp->b_flags = B_BUSY | B_WRITE;
1554 	bp->b_un.b_addr = bufp;
1555 	bp->b_bcount = reqlen;
1556 	bp->b_blkno = start;
1557 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
1558 
1559 	mutex_enter(&vdp->xdf_dev_lk);
1560 	if (vdp->xdf_xdev_iostat != NULL)
1561 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1562 	if (vdp->xdf_f_act == NULL) {
1563 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
1564 	} else {
1565 		vdp->xdf_l_act->av_forw = bp;
1566 		vdp->xdf_l_act = bp;
1567 	}
1568 	mutex_exit(&vdp->xdf_dev_lk);
1569 	xdf_iostart(vdp);
1570 	err = biowait(bp);
1571 
1572 	ASSERT(bp->b_flags & B_DONE);
1573 
1574 	freerbuf(bp);
1575 	return (err);
1576 }
1577 
1578 /*
1579  * synthetic geometry
1580  */
1581 #define	XDF_NSECTS	256
1582 #define	XDF_NHEADS	16
1583 
1584 static void
1585 xdf_synthetic_pgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1586 {
1587 	xdf_t *vdp;
1588 	uint_t ncyl;
1589 
1590 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1591 
1592 	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1593 
1594 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1595 	geomp->g_acyl = 0;
1596 	geomp->g_nhead = XDF_NHEADS;
1597 	geomp->g_secsize = XB_BSIZE;
1598 	geomp->g_nsect = XDF_NSECTS;
1599 	geomp->g_intrlv = 0;
1600 	geomp->g_rpm = 7200;
1601 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1602 }
1603 
1604 static int
1605 xdf_lb_getcap(dev_info_t *devi, diskaddr_t *capp)
1606 {
1607 	xdf_t *vdp;
1608 
1609 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1610 
1611 	if (vdp == NULL)
1612 		return (ENXIO);
1613 
1614 	mutex_enter(&vdp->xdf_dev_lk);
1615 	*capp = vdp->xdf_pgeom.g_capacity;
1616 	DPRINTF(LBL_DBG, ("capacity %llu\n", *capp));
1617 	mutex_exit(&vdp->xdf_dev_lk);
1618 	return (0);
1619 }
1620 
1621 static int
1622 xdf_lb_getpgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1623 {
1624 	xdf_t *vdp;
1625 
1626 	if ((vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))) == NULL)
1627 		return (ENXIO);
1628 	*geomp = vdp->xdf_pgeom;
1629 	return (0);
1630 }
1631 
1632 /*
1633  * No real HBA, no geometry available from it
1634  */
1635 /*ARGSUSED*/
1636 static int
1637 xdf_lb_getvgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1638 {
1639 	return (EINVAL);
1640 }
1641 
1642 static int
1643 xdf_lb_getattribute(dev_info_t *devi, tg_attribute_t *tgattributep)
1644 {
1645 	xdf_t *vdp;
1646 
1647 	if (!(vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))))
1648 		return (ENXIO);
1649 
1650 	if (XD_IS_RO(vdp))
1651 		tgattributep->media_is_writable = 0;
1652 	else
1653 		tgattributep->media_is_writable = 1;
1654 	return (0);
1655 }
1656 
1657 /* ARGSUSED3 */
1658 int
1659 xdf_lb_getinfo(dev_info_t *devi, int cmd, void *arg, void *tg_cookie)
1660 {
1661 	switch (cmd) {
1662 	case TG_GETPHYGEOM:
1663 		return (xdf_lb_getpgeom(devi, (cmlb_geom_t *)arg));
1664 	case TG_GETVIRTGEOM:
1665 		return (xdf_lb_getvgeom(devi, (cmlb_geom_t *)arg));
1666 	case TG_GETCAPACITY:
1667 		return (xdf_lb_getcap(devi, (diskaddr_t *)arg));
1668 	case TG_GETBLOCKSIZE:
1669 		*(uint32_t *)arg = XB_BSIZE;
1670 		return (0);
1671 	case TG_GETATTR:
1672 		return (xdf_lb_getattribute(devi, (tg_attribute_t *)arg));
1673 	default:
1674 		return (ENOTTY);
1675 	}
1676 }
1677 
1678 /*
1679  * Kick-off connect process
1680  * Status should be XD_UNKNOWN or XD_CLOSED
1681  * On success, status will be changed to XD_INIT
1682  * On error, status won't be changed
1683  */
1684 static int
1685 xdf_start_connect(xdf_t *vdp)
1686 {
1687 	char *xsnode;
1688 	grant_ref_t gref;
1689 	xenbus_transaction_t xbt;
1690 	int rv;
1691 	dev_info_t *dip = vdp->xdf_dip;
1692 
1693 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == (domid_t)-1)
1694 		goto errout;
1695 
1696 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS) {
1697 		cmn_err(CE_WARN, "xdf@%s: failed to alloc event channel",
1698 		    ddi_get_name_addr(dip));
1699 		goto errout;
1700 	}
1701 	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1702 #ifdef XPV_HVM_DRIVER
1703 	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1704 #else /* !XPV_HVM_DRIVER */
1705 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1706 	    DDI_SUCCESS) {
1707 		cmn_err(CE_WARN, "xdf_start_connect: xdf@%s: "
1708 		    "failed to add intr handler", ddi_get_name_addr(dip));
1709 		goto errout1;
1710 	}
1711 #endif /* !XPV_HVM_DRIVER */
1712 
1713 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1714 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1715 	    DDI_SUCCESS) {
1716 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1717 		    ddi_get_name_addr(dip));
1718 		goto errout2;
1719 	}
1720 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1721 
1722 	/*
1723 	 * Write into xenstore the info needed by backend
1724 	 */
1725 	if ((xsnode = xvdi_get_xsname(dip)) == NULL) {
1726 		cmn_err(CE_WARN, "xdf@%s: "
1727 		    "failed to get xenstore node path",
1728 		    ddi_get_name_addr(dip));
1729 		goto fail_trans;
1730 	}
1731 trans_retry:
1732 	if (xenbus_transaction_start(&xbt)) {
1733 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1734 		    ddi_get_name_addr(dip));
1735 		xvdi_fatal_error(dip, EIO, "transaction start");
1736 		goto fail_trans;
1737 	}
1738 
1739 	if (rv = xenbus_printf(xbt, xsnode, "ring-ref", "%u", gref)) {
1740 		cmn_err(CE_WARN, "xdf@%s: failed to write ring-ref",
1741 		    ddi_get_name_addr(dip));
1742 		xvdi_fatal_error(dip, rv, "writing ring-ref");
1743 		goto abort_trans;
1744 	}
1745 
1746 	if (rv = xenbus_printf(xbt, xsnode, "event-channel", "%u",
1747 	    vdp->xdf_evtchn)) {
1748 		cmn_err(CE_WARN, "xdf@%s: failed to write event-channel",
1749 		    ddi_get_name_addr(dip));
1750 		xvdi_fatal_error(dip, rv, "writing event-channel");
1751 		goto abort_trans;
1752 	}
1753 
1754 	/*
1755 	 * "protocol" is written by the domain builder in the case of PV
1756 	 * domains. However, it is not written for HVM domains, so let's
1757 	 * write it here.
1758 	 */
1759 	if (rv = xenbus_printf(xbt, xsnode, "protocol", "%s",
1760 	    XEN_IO_PROTO_ABI_NATIVE)) {
1761 		cmn_err(CE_WARN, "xdf@%s: failed to write protocol",
1762 		    ddi_get_name_addr(dip));
1763 		xvdi_fatal_error(dip, rv, "writing protocol");
1764 		goto abort_trans;
1765 	}
1766 
1767 	if ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0) {
1768 		cmn_err(CE_WARN, "xdf@%s: "
1769 		    "failed to switch state to XenbusStateInitialised",
1770 		    ddi_get_name_addr(dip));
1771 		xvdi_fatal_error(dip, rv, "writing state");
1772 		goto abort_trans;
1773 	}
1774 
1775 	/* kick-off connect process */
1776 	if (rv = xenbus_transaction_end(xbt, 0)) {
1777 		if (rv == EAGAIN)
1778 			goto trans_retry;
1779 		cmn_err(CE_WARN, "xdf@%s: failed to end transaction",
1780 		    ddi_get_name_addr(dip));
1781 		xvdi_fatal_error(dip, rv, "completing transaction");
1782 		goto fail_trans;
1783 	}
1784 
1785 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1786 	mutex_enter(&vdp->xdf_dev_lk);
1787 	vdp->xdf_status = XD_INIT;
1788 	mutex_exit(&vdp->xdf_dev_lk);
1789 
1790 	return (DDI_SUCCESS);
1791 
1792 abort_trans:
1793 	(void) xenbus_transaction_end(xbt, 1);
1794 fail_trans:
1795 	xvdi_free_ring(vdp->xdf_xb_ring);
1796 errout2:
1797 #ifdef XPV_HVM_DRIVER
1798 	ec_unbind_evtchn(vdp->xdf_evtchn);
1799 #else /* !XPV_HVM_DRIVER */
1800 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1801 #endif /* !XPV_HVM_DRIVER */
1802 errout1:
1803 	xvdi_free_evtchn(dip);
1804 errout:
1805 	cmn_err(CE_WARN, "xdf@%s: fail to kick-off connecting",
1806 	    ddi_get_name_addr(dip));
1807 	return (DDI_FAILURE);
1808 }
1809 
1810 /*
1811  * Kick-off disconnect process
1812  * Status won't be changed
1813  */
1814 static int
1815 xdf_start_disconnect(xdf_t *vdp)
1816 {
1817 	if (xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed) > 0) {
1818 		cmn_err(CE_WARN, "xdf@%s: fail to kick-off disconnecting",
1819 		    ddi_get_name_addr(vdp->xdf_dip));
1820 		return (DDI_FAILURE);
1821 	}
1822 
1823 	return (DDI_SUCCESS);
1824 }
1825 
1826 int
1827 xdf_get_flush_block(xdf_t *vdp)
1828 {
1829 	/*
1830 	 * Get a DEV_BSIZE aligned bufer
1831 	 */
1832 	vdp->xdf_flush_mem = kmem_alloc(DEV_BSIZE * 2, KM_SLEEP);
1833 	vdp->xdf_cache_flush_block =
1834 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem), DEV_BSIZE);
1835 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1836 	    xdf_flush_block, DEV_BSIZE, NULL) != 0)
1837 		return (DDI_FAILURE);
1838 	return (DDI_SUCCESS);
1839 }
1840 
1841 /*
1842  * Finish other initialization after we've connected to backend
1843  * Status should be XD_INIT before calling this routine
1844  * On success, status should be changed to XD_READY
1845  * On error, status should stay XD_INIT
1846  */
1847 static int
1848 xdf_post_connect(xdf_t *vdp)
1849 {
1850 	int rv;
1851 	uint_t len;
1852 	char *type;
1853 	char *barrier;
1854 	dev_info_t *devi = vdp->xdf_dip;
1855 
1856 	/*
1857 	 * Determine if feature barrier is supported by backend
1858 	 */
1859 	if (xenbus_read(XBT_NULL, xvdi_get_oename(devi),
1860 	    "feature-barrier", (void **)&barrier, &len) == 0) {
1861 		vdp->xdf_feature_barrier = 1;
1862 		kmem_free(barrier, len);
1863 	} else {
1864 		cmn_err(CE_NOTE, "xdf@%s: failed to read feature-barrier",
1865 		    ddi_get_name_addr(vdp->xdf_dip));
1866 		vdp->xdf_feature_barrier = 0;
1867 	}
1868 
1869 	/* probe backend */
1870 	if (rv = xenbus_gather(XBT_NULL, xvdi_get_oename(devi),
1871 	    "sectors", "%"SCNu64, &vdp->xdf_xdev_nblocks,
1872 	    "info", "%u", &vdp->xdf_xdev_info, NULL)) {
1873 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1874 		    "cannot read backend info", ddi_get_name_addr(devi));
1875 		xvdi_fatal_error(devi, rv, "reading backend info");
1876 		return (DDI_FAILURE);
1877 	}
1878 
1879 	/*
1880 	 * Make sure that the device we're connecting isn't smaller than
1881 	 * the old connected device.
1882 	 */
1883 	if (vdp->xdf_xdev_nblocks < vdp->xdf_pgeom.g_capacity) {
1884 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1885 		    "backend disk device shrank", ddi_get_name_addr(devi));
1886 		/* XXX:  call xvdi_fatal_error() here? */
1887 		xvdi_fatal_error(devi, rv, "reading backend info");
1888 		return (DDI_FAILURE);
1889 	}
1890 
1891 #ifdef _ILP32
1892 	if (vdp->xdf_xdev_nblocks > DK_MAX_BLOCKS) {
1893 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1894 		    "backend disk device too large with %llu blocks for"
1895 		    " 32-bit kernel", ddi_get_name_addr(devi),
1896 		    vdp->xdf_xdev_nblocks);
1897 		xvdi_fatal_error(devi, rv, "reading backend info");
1898 		return (DDI_FAILURE);
1899 	}
1900 #endif
1901 
1902 
1903 	/*
1904 	 * Only update the physical geometry to reflect the new device
1905 	 * size if this is the first time we're connecting to the backend
1906 	 * device.  Once we assign a physical geometry to a device it stays
1907 	 * fixed until:
1908 	 *	- we get detach and re-attached (at which point we
1909 	 *	  automatically assign a new physical geometry).
1910 	 *	- someone calls TG_SETPHYGEOM to explicity set the
1911 	 *	  physical geometry.
1912 	 */
1913 	if (vdp->xdf_pgeom.g_capacity == 0)
1914 		xdf_synthetic_pgeom(devi, &vdp->xdf_pgeom);
1915 
1916 	/* fix disk type */
1917 	if (xenbus_read(XBT_NULL, xvdi_get_xsname(devi), "device-type",
1918 	    (void **)&type, &len) != 0) {
1919 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1920 		    "cannot read device-type", ddi_get_name_addr(devi));
1921 		xvdi_fatal_error(devi, rv, "reading device-type");
1922 		return (DDI_FAILURE);
1923 	}
1924 	if (strcmp(type, "cdrom") == 0)
1925 		vdp->xdf_xdev_info |= VDISK_CDROM;
1926 	kmem_free(type, len);
1927 
1928 	/*
1929 	 * We've created all the minor nodes via cmlb_attach() using default
1930 	 * value in xdf_attach() to make it possible to block in xdf_open(),
1931 	 * in case there's anyone (say, booting thread) ever trying to open
1932 	 * it before connected to backend. We will refresh all those minor
1933 	 * nodes w/ latest info we've got now when we are almost connected.
1934 	 *
1935 	 * Don't do this when xdf is already opened by someone (could happen
1936 	 * during resume), for that cmlb_attach() will invalid the label info
1937 	 * and confuse those who has already opened the node, which is bad.
1938 	 */
1939 	if (!xdf_isopen(vdp, -1) && (XD_IS_CD(vdp) || XD_IS_RM(vdp))) {
1940 		/* re-init cmlb w/ latest info we got from backend */
1941 		if (cmlb_attach(devi, &xdf_lb_ops,
1942 		    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
1943 		    XD_IS_RM(vdp), 1,
1944 		    XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
1945 #if defined(XPV_HVM_DRIVER)
1946 		    CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT |
1947 		    CMLB_INTERNAL_MINOR_NODES,
1948 #else /* !XPV_HVM_DRIVER */
1949 		    CMLB_FAKE_LABEL_ONE_PARTITION,
1950 #endif /* !XPV_HVM_DRIVER */
1951 		    vdp->xdf_vd_lbl, NULL) != 0) {
1952 			cmn_err(CE_WARN, "xdf@%s: cmlb attach failed",
1953 			    ddi_get_name_addr(devi));
1954 			return (DDI_FAILURE);
1955 		}
1956 	}
1957 
1958 	/* mark vbd is ready for I/O */
1959 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1960 	mutex_enter(&vdp->xdf_dev_lk);
1961 	vdp->xdf_status = XD_READY;
1962 	mutex_exit(&vdp->xdf_dev_lk);
1963 	/*
1964 	 * If backend has feature-barrier, see if it supports disk
1965 	 * cache flush op.
1966 	 */
1967 	vdp->xdf_flush_supported = 0;
1968 	if (vdp->xdf_feature_barrier) {
1969 		/*
1970 		 * Pretend we already know flush is supported so probe
1971 		 * will attempt the correct op.
1972 		 */
1973 		vdp->xdf_flush_supported = 1;
1974 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1975 			vdp->xdf_flush_supported = 1;
1976 		} else {
1977 			vdp->xdf_flush_supported = 0;
1978 			/*
1979 			 * If the other end does not support the cache flush op
1980 			 * then we must use a barrier-write to force disk
1981 			 * cache flushing.  Barrier writes require that a data
1982 			 * block actually be written.
1983 			 * Cache a block to barrier-write when we are
1984 			 * asked to perform a flush.
1985 			 * XXX - would it be better to just copy 1 block
1986 			 * (512 bytes) from whatever write we did last
1987 			 * and rewrite that block?
1988 			 */
1989 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS)
1990 				return (DDI_FAILURE);
1991 		}
1992 	}
1993 
1994 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", ddi_get_name_addr(devi),
1995 	    (uint64_t)vdp->xdf_xdev_nblocks);
1996 
1997 	return (DDI_SUCCESS);
1998 }
1999 
2000 /*
2001  * Finish other uninitialization after we've disconnected from backend
2002  * when status is XD_CLOSING or XD_INIT. After returns, status is XD_CLOSED
2003  */
2004 static void
2005 xdf_post_disconnect(xdf_t *vdp)
2006 {
2007 #ifdef XPV_HVM_DRIVER
2008 	ec_unbind_evtchn(vdp->xdf_evtchn);
2009 #else /* !XPV_HVM_DRIVER */
2010 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
2011 #endif /* !XPV_HVM_DRIVER */
2012 	xvdi_free_evtchn(vdp->xdf_dip);
2013 	xvdi_free_ring(vdp->xdf_xb_ring);
2014 	vdp->xdf_xb_ring = NULL;
2015 	vdp->xdf_xb_ring_hdl = NULL;
2016 	vdp->xdf_peer = (domid_t)-1;
2017 
2018 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
2019 	mutex_enter(&vdp->xdf_dev_lk);
2020 	vdp->xdf_status = XD_CLOSED;
2021 	mutex_exit(&vdp->xdf_dev_lk);
2022 }
2023 
2024 /*ARGSUSED*/
2025 static void
2026 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
2027 {
2028 	XenbusState new_state = *(XenbusState *)impl_data;
2029 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
2030 	boolean_t unexpect_die = B_FALSE;
2031 	int status;
2032 
2033 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
2034 	    ddi_get_name_addr(dip), new_state));
2035 
2036 	mutex_enter(&vdp->xdf_cb_lk);
2037 
2038 	if (xdf_check_state_transition(vdp, new_state) == DDI_FAILURE) {
2039 		mutex_exit(&vdp->xdf_cb_lk);
2040 		return;
2041 	}
2042 
2043 	switch (new_state) {
2044 	case XenbusStateInitialising:
2045 		ASSERT(vdp->xdf_status == XD_CLOSED);
2046 		/*
2047 		 * backend recovered from a previous failure,
2048 		 * kick-off connect process again
2049 		 */
2050 		if (xdf_start_connect(vdp) != DDI_SUCCESS) {
2051 			cmn_err(CE_WARN, "xdf@%s:"
2052 			    " failed to start reconnecting to backend",
2053 			    ddi_get_name_addr(dip));
2054 		}
2055 		break;
2056 	case XenbusStateConnected:
2057 		ASSERT(vdp->xdf_status == XD_INIT);
2058 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
2059 		/* finish final init after connect */
2060 		if (xdf_post_connect(vdp) != DDI_SUCCESS)
2061 			(void) xdf_start_disconnect(vdp);
2062 		break;
2063 	case XenbusStateClosing:
2064 		mutex_enter(&vdp->xdf_dev_lk);
2065 		if (xdf_isopen(vdp, -1)) {
2066 			cmn_err(CE_NOTE, "xdf@%s: hot-unplug failed, "
2067 			    "still in use", ddi_get_name_addr(dip));
2068 		} else {
2069 			if ((vdp->xdf_status == XD_READY) ||
2070 			    (vdp->xdf_status == XD_INIT))
2071 				vdp->xdf_status = XD_CLOSING;
2072 			(void) xdf_start_disconnect(vdp);
2073 		}
2074 		mutex_exit(&vdp->xdf_dev_lk);
2075 		break;
2076 	case XenbusStateClosed:
2077 		/* first check if BE closed unexpectedly */
2078 		mutex_enter(&vdp->xdf_dev_lk);
2079 		if (xdf_isopen(vdp, -1)) {
2080 			unexpect_die = B_TRUE;
2081 			unexpectedie(vdp);
2082 			cmn_err(CE_WARN, "xdf@%s: backend closed, "
2083 			    "reconnecting...", ddi_get_name_addr(dip));
2084 		}
2085 		mutex_exit(&vdp->xdf_dev_lk);
2086 
2087 		if (vdp->xdf_status == XD_READY) {
2088 			mutex_enter(&vdp->xdf_dev_lk);
2089 			vdp->xdf_status = XD_CLOSING;
2090 			mutex_exit(&vdp->xdf_dev_lk);
2091 
2092 #ifdef	DOMU_BACKEND
2093 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
2094 #endif
2095 
2096 			xdf_post_disconnect(vdp);
2097 			(void) xvdi_switch_state(dip, XBT_NULL,
2098 			    XenbusStateClosed);
2099 		} else if ((vdp->xdf_status == XD_INIT) ||
2100 		    (vdp->xdf_status == XD_CLOSING)) {
2101 			xdf_post_disconnect(vdp);
2102 		} else {
2103 			mutex_enter(&vdp->xdf_dev_lk);
2104 			vdp->xdf_status = XD_CLOSED;
2105 			mutex_exit(&vdp->xdf_dev_lk);
2106 		}
2107 	}
2108 
2109 	/* notify anybody waiting for oe state change */
2110 	mutex_enter(&vdp->xdf_dev_lk);
2111 	cv_broadcast(&vdp->xdf_dev_cv);
2112 	mutex_exit(&vdp->xdf_dev_lk);
2113 
2114 	status = vdp->xdf_status;
2115 	mutex_exit(&vdp->xdf_cb_lk);
2116 
2117 	if (status == XD_READY) {
2118 		xdf_iostart(vdp);
2119 	} else if ((status == XD_CLOSED) && !unexpect_die) {
2120 		/* interface is closed successfully, remove all minor nodes */
2121 		if (vdp->xdf_vd_lbl != NULL) {
2122 			cmlb_detach(vdp->xdf_vd_lbl, NULL);
2123 			cmlb_free_handle(&vdp->xdf_vd_lbl);
2124 			vdp->xdf_vd_lbl = NULL;
2125 		}
2126 	}
2127 }
2128 
2129 /* check if partition is open, -1 - check all partitions on the disk */
2130 static boolean_t
2131 xdf_isopen(xdf_t *vdp, int partition)
2132 {
2133 	int i;
2134 	ulong_t parbit;
2135 	boolean_t rval = B_FALSE;
2136 
2137 	ASSERT((partition == -1) ||
2138 	    ((partition >= 0) || (partition < XDF_PEXT)));
2139 
2140 	if (partition == -1)
2141 		parbit = (ulong_t)-1;
2142 	else
2143 		parbit = 1 << partition;
2144 
2145 	for (i = 0; i < OTYPCNT; i++) {
2146 		if (vdp->xdf_vd_open[i] & parbit)
2147 			rval = B_TRUE;
2148 	}
2149 
2150 	return (rval);
2151 }
2152 
2153 /*
2154  * Xdf_check_state_transition will check the XenbusState change to see
2155  * if the change is a valid transition or not.
2156  * The new state is written by backend domain, or by running xenstore-write
2157  * to change it manually in dom0
2158  */
2159 static int
2160 xdf_check_state_transition(xdf_t *vdp, XenbusState oestate)
2161 {
2162 	int status;
2163 	int stcheck;
2164 #define	STOK	0 /* need further process */
2165 #define	STNOP	1 /* no action need taking */
2166 #define	STBUG	2 /* unexpected state change, could be a bug */
2167 
2168 	status = vdp->xdf_status;
2169 	stcheck = STOK;
2170 
2171 	switch (status) {
2172 	case XD_UNKNOWN:
2173 		if ((oestate == XenbusStateUnknown)		||
2174 		    (oestate == XenbusStateConnected))
2175 			stcheck = STBUG;
2176 		else if ((oestate == XenbusStateInitialising)	||
2177 		    (oestate == XenbusStateInitWait)		||
2178 		    (oestate == XenbusStateInitialised))
2179 			stcheck = STNOP;
2180 		break;
2181 	case XD_INIT:
2182 		if (oestate == XenbusStateUnknown)
2183 			stcheck = STBUG;
2184 		else if ((oestate == XenbusStateInitialising)	||
2185 		    (oestate == XenbusStateInitWait)		||
2186 		    (oestate == XenbusStateInitialised))
2187 			stcheck = STNOP;
2188 		break;
2189 	case XD_READY:
2190 		if ((oestate == XenbusStateUnknown)		||
2191 		    (oestate == XenbusStateInitialising)	||
2192 		    (oestate == XenbusStateInitWait)		||
2193 		    (oestate == XenbusStateInitialised))
2194 			stcheck = STBUG;
2195 		else if (oestate == XenbusStateConnected)
2196 			stcheck = STNOP;
2197 		break;
2198 	case XD_CLOSING:
2199 		if ((oestate == XenbusStateUnknown)		||
2200 		    (oestate == XenbusStateInitialising)	||
2201 		    (oestate == XenbusStateInitWait)		||
2202 		    (oestate == XenbusStateInitialised)		||
2203 		    (oestate == XenbusStateConnected))
2204 			stcheck = STBUG;
2205 		else if (oestate == XenbusStateClosing)
2206 			stcheck = STNOP;
2207 		break;
2208 	case XD_CLOSED:
2209 		if ((oestate == XenbusStateUnknown)		||
2210 		    (oestate == XenbusStateConnected))
2211 			stcheck = STBUG;
2212 		else if ((oestate == XenbusStateInitWait)	||
2213 		    (oestate == XenbusStateInitialised)		||
2214 		    (oestate == XenbusStateClosing)		||
2215 		    (oestate == XenbusStateClosed))
2216 			stcheck = STNOP;
2217 		break;
2218 	case XD_SUSPEND:
2219 	default:
2220 			stcheck = STBUG;
2221 	}
2222 
2223 	if (stcheck == STOK)
2224 		return (DDI_SUCCESS);
2225 
2226 	if (stcheck == STBUG)
2227 		cmn_err(CE_NOTE, "xdf@%s: unexpected otherend "
2228 		    "state change to %d!, when status is %d",
2229 		    ddi_get_name_addr(vdp->xdf_dip), oestate, status);
2230 
2231 	return (DDI_FAILURE);
2232 }
2233 
2234 static int
2235 xdf_connect(xdf_t *vdp, boolean_t wait)
2236 {
2237 	ASSERT(mutex_owned(&vdp->xdf_dev_lk));
2238 	while (vdp->xdf_status != XD_READY) {
2239 		if (!wait || (vdp->xdf_status > XD_READY))
2240 			break;
2241 
2242 		if (cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk) == 0)
2243 			break;
2244 	}
2245 
2246 	return (vdp->xdf_status);
2247 }
2248 
2249 /*
2250  * callback func when DMA/GTE resources is available
2251  *
2252  * Note: we only register one callback function to grant table subsystem
2253  * since we only have one 'struct gnttab_free_callback' in xdf_t.
2254  */
2255 static int
2256 xdf_dmacallback(caddr_t arg)
2257 {
2258 	xdf_t *vdp = (xdf_t *)arg;
2259 	ASSERT(vdp != NULL);
2260 
2261 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
2262 	    ddi_get_name_addr(vdp->xdf_dip)));
2263 
2264 	ddi_trigger_softintr(vdp->xdf_softintr_id);
2265 	return (DDI_DMA_CALLBACK_DONE);
2266 }
2267 
2268 static uint_t
2269 xdf_iorestart(caddr_t arg)
2270 {
2271 	xdf_t *vdp = (xdf_t *)arg;
2272 
2273 	ASSERT(vdp != NULL);
2274 
2275 	mutex_enter(&vdp->xdf_dev_lk);
2276 	ASSERT(ISDMACBON(vdp));
2277 	SETDMACBOFF(vdp);
2278 	mutex_exit(&vdp->xdf_dev_lk);
2279 
2280 	xdf_iostart(vdp);
2281 
2282 	return (DDI_INTR_CLAIMED);
2283 }
2284 
2285 static void
2286 xdf_timeout_handler(void *arg)
2287 {
2288 	xdf_t *vdp = arg;
2289 
2290 	mutex_enter(&vdp->xdf_dev_lk);
2291 	vdp->xdf_timeout_id = 0;
2292 	mutex_exit(&vdp->xdf_dev_lk);
2293 
2294 	/* new timeout thread could be re-scheduled */
2295 	xdf_iostart(vdp);
2296 }
2297 
2298 /*
2299  * Alloc a vreq for this bp
2300  * bp->av_back contains the pointer to the vreq upon return
2301  */
2302 static v_req_t *
2303 vreq_get(xdf_t *vdp, buf_t *bp)
2304 {
2305 	v_req_t *vreq = NULL;
2306 
2307 	ASSERT(BP2VREQ(bp) == NULL);
2308 
2309 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
2310 	if (vreq == NULL) {
2311 		if (vdp->xdf_timeout_id == 0)
2312 			/* restart I/O after one second */
2313 			vdp->xdf_timeout_id =
2314 			    timeout(xdf_timeout_handler, vdp, hz);
2315 		return (NULL);
2316 	}
2317 	bzero(vreq, sizeof (v_req_t));
2318 
2319 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
2320 	bp->av_back = (buf_t *)vreq;
2321 	vreq->v_buf = bp;
2322 	vreq->v_status = VREQ_INIT;
2323 	/* init of other fields in vreq is up to the caller */
2324 
2325 	return (vreq);
2326 }
2327 
2328 static void
2329 vreq_free(xdf_t *vdp, v_req_t *vreq)
2330 {
2331 	buf_t *bp = vreq->v_buf;
2332 
2333 	list_remove(&vdp->xdf_vreq_act, (void *)vreq);
2334 
2335 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
2336 		goto done;
2337 
2338 	switch (vreq->v_status) {
2339 	case VREQ_DMAWIN_DONE:
2340 	case VREQ_GS_ALLOCED:
2341 	case VREQ_DMABUF_BOUND:
2342 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
2343 		/*FALLTHRU*/
2344 	case VREQ_DMAMEM_ALLOCED:
2345 		if (!ALIGNED_XFER(bp)) {
2346 			ASSERT(vreq->v_abuf != NULL);
2347 			if (!IS_ERROR(bp) && IS_READ(bp))
2348 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
2349 				    bp->b_bcount);
2350 			ddi_dma_mem_free(&vreq->v_align);
2351 		}
2352 		/*FALLTHRU*/
2353 	case VREQ_MEMDMAHDL_ALLOCED:
2354 		if (!ALIGNED_XFER(bp))
2355 			ddi_dma_free_handle(&vreq->v_memdmahdl);
2356 		/*FALLTHRU*/
2357 	case VREQ_DMAHDL_ALLOCED:
2358 		ddi_dma_free_handle(&vreq->v_dmahdl);
2359 		break;
2360 	default:
2361 		break;
2362 	}
2363 done:
2364 	vreq->v_buf->av_back = NULL;
2365 	kmem_cache_free(xdf_vreq_cache, vreq);
2366 }
2367 
2368 /*
2369  * Initalize the DMA and grant table resources for the buf
2370  */
2371 static int
2372 vreq_setup(xdf_t *vdp, v_req_t *vreq)
2373 {
2374 	int rc;
2375 	ddi_dma_attr_t dmaattr;
2376 	uint_t ndcs, ndws;
2377 	ddi_dma_handle_t dh;
2378 	ddi_dma_handle_t mdh;
2379 	ddi_dma_cookie_t dc;
2380 	ddi_acc_handle_t abh;
2381 	caddr_t	aba;
2382 	ge_slot_t *gs;
2383 	size_t bufsz;
2384 	off_t off;
2385 	size_t sz;
2386 	buf_t *bp = vreq->v_buf;
2387 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
2388 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
2389 
2390 	switch (vreq->v_status) {
2391 	case VREQ_INIT:
2392 		if (IS_FLUSH_DISKCACHE(bp)) {
2393 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2394 				DPRINTF(DMA_DBG, (
2395 				    "xdf@%s: get ge_slotfailed\n",
2396 				    ddi_get_name_addr(vdp->xdf_dip)));
2397 				return (DDI_FAILURE);
2398 			}
2399 			vreq->v_blkno = 0;
2400 			vreq->v_nslots = 1;
2401 			vreq->v_gs = gs;
2402 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
2403 			vreq->v_status = VREQ_GS_ALLOCED;
2404 			gs->vreq = vreq;
2405 			return (DDI_SUCCESS);
2406 		}
2407 
2408 		if (IS_WRITE_BARRIER(vdp, bp))
2409 			vreq->v_flush_diskcache = WRITE_BARRIER;
2410 		vreq->v_blkno = bp->b_blkno +
2411 		    (diskaddr_t)(uintptr_t)bp->b_private;
2412 		bp->b_private = NULL;
2413 		/* See if we wrote new data to our flush block */
2414 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
2415 			check_fbwrite(vdp, bp, vreq->v_blkno);
2416 		vreq->v_status = VREQ_INIT_DONE;
2417 		/*FALLTHRU*/
2418 
2419 	case VREQ_INIT_DONE:
2420 		/*
2421 		 * alloc DMA handle
2422 		 */
2423 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
2424 		    xdf_dmacallback, (caddr_t)vdp, &dh);
2425 		if (rc != DDI_SUCCESS) {
2426 			SETDMACBON(vdp);
2427 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
2428 			    ddi_get_name_addr(vdp->xdf_dip)));
2429 			return (DDI_FAILURE);
2430 		}
2431 
2432 		vreq->v_dmahdl = dh;
2433 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
2434 		/*FALLTHRU*/
2435 
2436 	case VREQ_DMAHDL_ALLOCED:
2437 		/*
2438 		 * alloc dma handle for 512-byte aligned buf
2439 		 */
2440 		if (!ALIGNED_XFER(bp)) {
2441 			/*
2442 			 * XXPV: we need to temporarily enlarge the seg
2443 			 * boundary and s/g length to work round CR6381968
2444 			 */
2445 			dmaattr = xb_dma_attr;
2446 			dmaattr.dma_attr_seg = (uint64_t)-1;
2447 			dmaattr.dma_attr_sgllen = INT_MAX;
2448 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
2449 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
2450 			if (rc != DDI_SUCCESS) {
2451 				SETDMACBON(vdp);
2452 				DPRINTF(DMA_DBG, ("xdf@%s: unaligned buf DMA"
2453 				    "handle alloc failed\n",
2454 				    ddi_get_name_addr(vdp->xdf_dip)));
2455 				return (DDI_FAILURE);
2456 			}
2457 			vreq->v_memdmahdl = mdh;
2458 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
2459 		}
2460 		/*FALLTHRU*/
2461 
2462 	case VREQ_MEMDMAHDL_ALLOCED:
2463 		/*
2464 		 * alloc 512-byte aligned buf
2465 		 */
2466 		if (!ALIGNED_XFER(bp)) {
2467 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
2468 				bp_mapin(bp);
2469 
2470 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
2471 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
2472 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
2473 			    &aba, &bufsz, &abh);
2474 			if (rc != DDI_SUCCESS) {
2475 				SETDMACBON(vdp);
2476 				DPRINTF(DMA_DBG, (
2477 				    "xdf@%s: DMA mem allocation failed\n",
2478 				    ddi_get_name_addr(vdp->xdf_dip)));
2479 				return (DDI_FAILURE);
2480 			}
2481 
2482 			vreq->v_abuf = aba;
2483 			vreq->v_align = abh;
2484 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
2485 
2486 			ASSERT(bufsz >= bp->b_bcount);
2487 			if (!IS_READ(bp))
2488 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
2489 				    bp->b_bcount);
2490 		}
2491 		/*FALLTHRU*/
2492 
2493 	case VREQ_DMAMEM_ALLOCED:
2494 		/*
2495 		 * dma bind
2496 		 */
2497 		if (ALIGNED_XFER(bp)) {
2498 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
2499 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
2500 			    &dc, &ndcs);
2501 		} else {
2502 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
2503 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
2504 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
2505 		}
2506 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
2507 			/* get num of dma windows */
2508 			if (rc == DDI_DMA_PARTIAL_MAP) {
2509 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
2510 				ASSERT(rc == DDI_SUCCESS);
2511 			} else {
2512 				ndws = 1;
2513 			}
2514 		} else {
2515 			SETDMACBON(vdp);
2516 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
2517 			    ddi_get_name_addr(vdp->xdf_dip)));
2518 			return (DDI_FAILURE);
2519 		}
2520 
2521 		vreq->v_dmac = dc;
2522 		vreq->v_dmaw = 0;
2523 		vreq->v_ndmacs = ndcs;
2524 		vreq->v_ndmaws = ndws;
2525 		vreq->v_nslots = ndws;
2526 		vreq->v_status = VREQ_DMABUF_BOUND;
2527 		/*FALLTHRU*/
2528 
2529 	case VREQ_DMABUF_BOUND:
2530 		/*
2531 		 * get ge_slot, callback is set upon failure from gs_get(),
2532 		 * if not set previously
2533 		 */
2534 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2535 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2536 			    ddi_get_name_addr(vdp->xdf_dip)));
2537 			return (DDI_FAILURE);
2538 		}
2539 
2540 		vreq->v_gs = gs;
2541 		gs->vreq = vreq;
2542 		vreq->v_status = VREQ_GS_ALLOCED;
2543 		break;
2544 
2545 	case VREQ_GS_ALLOCED:
2546 		/* nothing need to be done */
2547 		break;
2548 
2549 	case VREQ_DMAWIN_DONE:
2550 		/*
2551 		 * move to the next dma window
2552 		 */
2553 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
2554 
2555 		/* get a ge_slot for this DMA window */
2556 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2557 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2558 			    ddi_get_name_addr(vdp->xdf_dip)));
2559 			return (DDI_FAILURE);
2560 		}
2561 
2562 		vreq->v_gs = gs;
2563 		gs->vreq = vreq;
2564 		vreq->v_dmaw++;
2565 		rc = ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
2566 		    &vreq->v_dmac, &vreq->v_ndmacs);
2567 		ASSERT(rc == DDI_SUCCESS);
2568 		vreq->v_status = VREQ_GS_ALLOCED;
2569 		break;
2570 
2571 	default:
2572 		return (DDI_FAILURE);
2573 	}
2574 
2575 	return (DDI_SUCCESS);
2576 }
2577 
2578 static ge_slot_t *
2579 gs_get(xdf_t *vdp, int isread)
2580 {
2581 	grant_ref_t gh;
2582 	ge_slot_t *gs;
2583 
2584 	/* try to alloc GTEs needed in this slot, first */
2585 	if (gnttab_alloc_grant_references(
2586 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
2587 		if (vdp->xdf_gnt_callback.next == NULL) {
2588 			SETDMACBON(vdp);
2589 			gnttab_request_free_callback(
2590 			    &vdp->xdf_gnt_callback,
2591 			    (void (*)(void *))xdf_dmacallback,
2592 			    (void *)vdp,
2593 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
2594 		}
2595 		return (NULL);
2596 	}
2597 
2598 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
2599 	if (gs == NULL) {
2600 		gnttab_free_grant_references(gh);
2601 		if (vdp->xdf_timeout_id == 0)
2602 			/* restart I/O after one second */
2603 			vdp->xdf_timeout_id =
2604 			    timeout(xdf_timeout_handler, vdp, hz);
2605 		return (NULL);
2606 	}
2607 
2608 	/* init gs_slot */
2609 	list_insert_head(&vdp->xdf_gs_act, (void *)gs);
2610 	gs->oeid = vdp->xdf_peer;
2611 	gs->isread = isread;
2612 	gs->ghead = gh;
2613 	gs->ngrefs = 0;
2614 
2615 	return (gs);
2616 }
2617 
2618 static void
2619 gs_free(xdf_t *vdp, ge_slot_t *gs)
2620 {
2621 	int i;
2622 	grant_ref_t *gp = gs->ge;
2623 	int ngrefs = gs->ngrefs;
2624 	boolean_t isread = gs->isread;
2625 
2626 	list_remove(&vdp->xdf_gs_act, (void *)gs);
2627 
2628 	/* release all grant table entry resources used in this slot */
2629 	for (i = 0; i < ngrefs; i++, gp++)
2630 		gnttab_end_foreign_access(*gp, !isread, 0);
2631 	gnttab_free_grant_references(gs->ghead);
2632 
2633 	kmem_cache_free(xdf_gs_cache, (void *)gs);
2634 }
2635 
2636 static grant_ref_t
2637 gs_grant(ge_slot_t *gs, mfn_t mfn)
2638 {
2639 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->ghead);
2640 
2641 	ASSERT(gr != -1);
2642 	ASSERT(gs->ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
2643 	gs->ge[gs->ngrefs++] = gr;
2644 	gnttab_grant_foreign_access_ref(gr, gs->oeid, mfn, !gs->isread);
2645 
2646 	return (gr);
2647 }
2648 
2649 static void
2650 unexpectedie(xdf_t *vdp)
2651 {
2652 	/* clean up I/Os in ring that have responses */
2653 	if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
2654 		mutex_exit(&vdp->xdf_dev_lk);
2655 		(void) xdf_intr((caddr_t)vdp);
2656 		mutex_enter(&vdp->xdf_dev_lk);
2657 	}
2658 
2659 	/* free up all grant table entries */
2660 	while (!list_is_empty(&vdp->xdf_gs_act))
2661 		gs_free(vdp, list_head(&vdp->xdf_gs_act));
2662 
2663 	/*
2664 	 * move bp back to active list orderly
2665 	 * vreq_busy is updated in vreq_free()
2666 	 */
2667 	while (!list_is_empty(&vdp->xdf_vreq_act)) {
2668 		v_req_t *vreq = list_head(&vdp->xdf_vreq_act);
2669 		buf_t *bp = vreq->v_buf;
2670 
2671 		bp->av_back = NULL;
2672 		bp->b_resid = bp->b_bcount;
2673 		if (vdp->xdf_f_act == NULL) {
2674 			vdp->xdf_f_act = vdp->xdf_l_act = bp;
2675 		} else {
2676 			/* move to the head of list */
2677 			bp->av_forw = vdp->xdf_f_act;
2678 			vdp->xdf_f_act = bp;
2679 		}
2680 		if (vdp->xdf_xdev_iostat != NULL)
2681 			kstat_runq_back_to_waitq(
2682 			    KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
2683 		vreq_free(vdp, vreq);
2684 	}
2685 }
2686 
2687 static void
2688 xdfmin(struct buf *bp)
2689 {
2690 	if (bp->b_bcount > xdf_maxphys)
2691 		bp->b_bcount = xdf_maxphys;
2692 }
2693 
2694 void
2695 xdf_kstat_delete(dev_info_t *dip)
2696 {
2697 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2698 	kstat_t	*kstat;
2699 
2700 	/*
2701 	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
2702 	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
2703 	 * and the contents of the our kstat.  xdf_iostat_lk is used
2704 	 * to protect the allocation and freeing of the actual kstat.
2705 	 * xdf_dev_lk can't be used for this purpose because kstat
2706 	 * readers use it to access the contents of the kstat and
2707 	 * hence it can't be held when calling kstat_delete().
2708 	 */
2709 	mutex_enter(&vdp->xdf_iostat_lk);
2710 	mutex_enter(&vdp->xdf_dev_lk);
2711 
2712 	if (vdp->xdf_xdev_iostat == NULL) {
2713 		mutex_exit(&vdp->xdf_dev_lk);
2714 		mutex_exit(&vdp->xdf_iostat_lk);
2715 		return;
2716 	}
2717 
2718 	kstat = vdp->xdf_xdev_iostat;
2719 	vdp->xdf_xdev_iostat = NULL;
2720 	mutex_exit(&vdp->xdf_dev_lk);
2721 
2722 	kstat_delete(kstat);
2723 	mutex_exit(&vdp->xdf_iostat_lk);
2724 }
2725 
2726 int
2727 xdf_kstat_create(dev_info_t *dip, char *ks_module, int ks_instance)
2728 {
2729 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2730 
2731 	/* See comment about locking in xdf_kstat_delete(). */
2732 	mutex_enter(&vdp->xdf_iostat_lk);
2733 	mutex_enter(&vdp->xdf_dev_lk);
2734 
2735 	if (vdp->xdf_xdev_iostat != NULL) {
2736 		mutex_exit(&vdp->xdf_dev_lk);
2737 		mutex_exit(&vdp->xdf_iostat_lk);
2738 		return (-1);
2739 	}
2740 
2741 	if ((vdp->xdf_xdev_iostat = kstat_create(
2742 	    ks_module, ks_instance, NULL, "disk",
2743 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
2744 		mutex_exit(&vdp->xdf_dev_lk);
2745 		mutex_exit(&vdp->xdf_iostat_lk);
2746 		return (-1);
2747 	}
2748 
2749 	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
2750 	kstat_install(vdp->xdf_xdev_iostat);
2751 	mutex_exit(&vdp->xdf_dev_lk);
2752 	mutex_exit(&vdp->xdf_iostat_lk);
2753 
2754 	return (0);
2755 }
2756 
2757 #if defined(XPV_HVM_DRIVER)
2758 
/*
 * One entry per xdf instance registered through xdf_hvm_add(), letting
 * an instance be looked up by device path or by dip.
 */
typedef struct xdf_hvm_entry {
	list_node_t	xdf_he_list;	/* linkage on xdf_hvm_list */
	char		*xdf_he_path;	/* allocated copy of the dip's path */
	dev_info_t	*xdf_he_dip;	/* the registered device node */
} xdf_hvm_entry_t;

/* list of xdf_hvm_entry_t; searched and modified under the lock below */
static list_t xdf_hvm_list;
static kmutex_t xdf_hvm_list_lock;
2767 
2768 static xdf_hvm_entry_t *
2769 i_xdf_hvm_find(char *path, dev_info_t *dip)
2770 {
2771 	xdf_hvm_entry_t	*i;
2772 
2773 	ASSERT((path != NULL) || (dip != NULL));
2774 	ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2775 
2776 	i = list_head(&xdf_hvm_list);
2777 	while (i != NULL) {
2778 		if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2779 			i = list_next(&xdf_hvm_list, i);
2780 			continue;
2781 		}
2782 		if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2783 			i = list_next(&xdf_hvm_list, i);
2784 			continue;
2785 		}
2786 		break;
2787 	}
2788 	return (i);
2789 }
2790 
2791 dev_info_t *
2792 xdf_hvm_hold(char *path)
2793 {
2794 	xdf_hvm_entry_t	*i;
2795 	dev_info_t	*dip;
2796 
2797 	mutex_enter(&xdf_hvm_list_lock);
2798 	i = i_xdf_hvm_find(path, NULL);
2799 	if (i == NULL) {
2800 		mutex_exit(&xdf_hvm_list_lock);
2801 		return (B_FALSE);
2802 	}
2803 	ndi_hold_devi(dip = i->xdf_he_dip);
2804 	mutex_exit(&xdf_hvm_list_lock);
2805 	return (dip);
2806 }
2807 
2808 static void
2809 xdf_hvm_add(dev_info_t *dip)
2810 {
2811 	xdf_hvm_entry_t	*i;
2812 	char		*path;
2813 
2814 	/* figure out the path for the dip */
2815 	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2816 	(void) ddi_pathname(dip, path);
2817 
2818 	i = kmem_alloc(sizeof (*i), KM_SLEEP);
2819 	i->xdf_he_dip = dip;
2820 	i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2821 
2822 	mutex_enter(&xdf_hvm_list_lock);
2823 	ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2824 	ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2825 	list_insert_head(&xdf_hvm_list, i);
2826 	mutex_exit(&xdf_hvm_list_lock);
2827 
2828 	kmem_free(path, MAXPATHLEN);
2829 }
2830 
2831 static void
2832 xdf_hvm_rm(dev_info_t *dip)
2833 {
2834 	xdf_hvm_entry_t	*i;
2835 
2836 	mutex_enter(&xdf_hvm_list_lock);
2837 	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2838 	list_remove(&xdf_hvm_list, i);
2839 	mutex_exit(&xdf_hvm_list_lock);
2840 
2841 	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2842 	kmem_free(i, sizeof (*i));
2843 }
2844 
2845 static void
2846 xdf_hvm_init(void)
2847 {
2848 	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2849 	    offsetof(xdf_hvm_entry_t, xdf_he_list));
2850 	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2851 }
2852 
2853 static void
2854 xdf_hvm_fini(void)
2855 {
2856 	ASSERT(list_head(&xdf_hvm_list) == NULL);
2857 	list_destroy(&xdf_hvm_list);
2858 	mutex_destroy(&xdf_hvm_list_lock);
2859 }
2860 
2861 int
2862 xdf_hvm_connect(dev_info_t *dip)
2863 {
2864 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2865 	int	rv;
2866 
2867 	/* do cv_wait until connected or failed */
2868 	mutex_enter(&vdp->xdf_dev_lk);
2869 	rv = xdf_connect(vdp, B_TRUE);
2870 	mutex_exit(&vdp->xdf_dev_lk);
2871 	return ((rv == XD_READY) ? 0 : -1);
2872 }
2873 
2874 int
2875 xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2876 {
2877 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2878 
2879 	/* sanity check the requested physical geometry */
2880 	mutex_enter(&vdp->xdf_dev_lk);
2881 	if ((geomp->g_secsize != XB_BSIZE) ||
2882 	    (geomp->g_capacity == 0)) {
2883 		mutex_exit(&vdp->xdf_dev_lk);
2884 		return (EINVAL);
2885 	}
2886 
2887 	/*
2888 	 * If we've already connected to the backend device then make sure
2889 	 * we're not defining a physical geometry larger than our backend
2890 	 * device.
2891 	 */
2892 	if ((vdp->xdf_xdev_nblocks != 0) &&
2893 	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2894 		mutex_exit(&vdp->xdf_dev_lk);
2895 		return (EINVAL);
2896 	}
2897 
2898 	vdp->xdf_pgeom = *geomp;
2899 	mutex_exit(&vdp->xdf_dev_lk);
2900 
2901 	/* force a re-validation */
2902 	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2903 
2904 	return (0);
2905 }
2906 
2907 #endif /* XPV_HVM_DRIVER */
2908