xref: /titanic_52/usr/src/uts/common/xen/io/xdf.c (revision d7448364b360ed82582291005bd9831f2a5d18a0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * xdf.c - Xen Virtual Block Device Driver
29  * TODO:
30  *	- support alternate block size (currently only DEV_BSIZE supported)
31  *	- revalidate geometry for removable devices
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/conf.h>
39 #include <sys/cmlb.h>
40 #include <sys/dkio.h>
41 #include <sys/promif.h>
42 #include <sys/sysmacros.h>
43 #include <sys/kstat.h>
44 #include <sys/mach_mmu.h>
45 #ifdef XPV_HVM_DRIVER
46 #include <sys/xpv_support.h>
47 #include <sys/sunndi.h>
48 #endif /* XPV_HVM_DRIVER */
49 #include <public/io/xenbus.h>
50 #include <xen/sys/xenbus_impl.h>
51 #include <xen/sys/xendev.h>
52 #include <sys/gnttab.h>
53 #include <sys/scsi/generic/inquiry.h>
54 #include <xen/io/blkif_impl.h>
55 #include <io/xdf.h>
56 
/*
 * Cache-flush support.
 *
 * FLUSH_DISKCACHE and WRITE_BARRIER name the two flush mechanisms.
 * DEFAULT_FLUSH_BLOCK is the initial value of the xdf_flush_block tunable.
 */
#define	FLUSH_DISKCACHE	0x1
#define	WRITE_BARRIER	0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */

/*
 * Both mechanisms require xdf_feature_barrier; xdf_flush_supported then
 * selects between a write barrier (not set) and a cache flush (set).
 */
#define	USE_WRITE_BARRIER(vdp)				\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
#define	USE_FLUSH_DISKCACHE(vdp)			\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)

/*
 * A buf is treated as a write barrier when it is a write, the device
 * uses write barriers, and its data address is the per-device
 * xdf_cache_flush_block buffer.
 */
#define	IS_WRITE_BARRIER(vdp, bp)			\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&	\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))

/*
 * A buf is treated as a cache flush when it is a zero-length write and
 * the device uses cache flushes.
 *
 * NOTE: this macro takes only `bp' but expands to a reference to `vdp';
 * it can therefore only be used where a variable named `vdp' is in scope.
 */
#define	IS_FLUSH_DISKCACHE(bp)				\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
69 
70 static void *vbd_ss;
71 static kmem_cache_t *xdf_vreq_cache;
72 static kmem_cache_t *xdf_gs_cache;
73 static int xdf_maxphys = XB_MAXPHYS;
74 int xdfdebug = 0;
75 extern int do_polled_io;
76 diskaddr_t xdf_flush_block = DEFAULT_FLUSH_BLOCK;
77 int	xdf_barrier_flush_disable = 0;
78 
79 /*
80  * dev_ops and cb_ops entrypoints
81  */
82 static int xdf_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
83 static int xdf_attach(dev_info_t *, ddi_attach_cmd_t);
84 static int xdf_detach(dev_info_t *, ddi_detach_cmd_t);
85 static int xdf_reset(dev_info_t *, ddi_reset_cmd_t);
86 static int xdf_open(dev_t *, int, int, cred_t *);
87 static int xdf_close(dev_t, int, int, struct cred *);
88 static int xdf_strategy(struct buf *);
89 static int xdf_read(dev_t, struct uio *, cred_t *);
90 static int xdf_aread(dev_t, struct aio_req *, cred_t *);
91 static int xdf_write(dev_t, struct uio *, cred_t *);
92 static int xdf_awrite(dev_t, struct aio_req *, cred_t *);
93 static int xdf_dump(dev_t, caddr_t, daddr_t, int);
94 static int xdf_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
95 static uint_t xdf_intr(caddr_t);
96 static int xdf_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
97     caddr_t, int *);
98 
99 /*
100  * misc private functions
101  */
102 static int xdf_suspend(dev_info_t *);
103 static int xdf_resume(dev_info_t *);
104 static int xdf_start_connect(xdf_t *);
105 static int xdf_start_disconnect(xdf_t *);
106 static int xdf_post_connect(xdf_t *);
107 static void xdf_post_disconnect(xdf_t *);
108 static void xdf_oe_change(dev_info_t *, ddi_eventcookie_t, void *, void *);
109 static void xdf_iostart(xdf_t *);
110 static void xdf_iofini(xdf_t *, uint64_t, int);
111 static int xdf_prepare_rreq(xdf_t *, struct buf *, blkif_request_t *);
112 static int xdf_drain_io(xdf_t *);
113 static boolean_t xdf_isopen(xdf_t *, int);
114 static int xdf_check_state_transition(xdf_t *, XenbusState);
115 static int xdf_connect(xdf_t *, boolean_t);
116 static int xdf_dmacallback(caddr_t);
117 static void xdf_timeout_handler(void *);
118 static uint_t xdf_iorestart(caddr_t);
119 static v_req_t *vreq_get(xdf_t *, buf_t *);
120 static void vreq_free(xdf_t *, v_req_t *);
121 static int vreq_setup(xdf_t *, v_req_t *);
122 static ge_slot_t *gs_get(xdf_t *, int);
123 static void gs_free(xdf_t *, ge_slot_t *);
124 static grant_ref_t gs_grant(ge_slot_t *, mfn_t);
125 static void unexpectedie(xdf_t *);
126 static void xdfmin(struct buf *);
127 static void xdf_synthetic_pgeom(dev_info_t *, cmlb_geom_t *);
128 extern int xdf_kstat_create(dev_info_t *, char *, int);
129 extern void xdf_kstat_delete(dev_info_t *);
130 
131 #if defined(XPV_HVM_DRIVER)
132 static void xdf_hvm_add(dev_info_t *);
133 static void xdf_hvm_rm(dev_info_t *);
134 static void xdf_hvm_init(void);
135 static void xdf_hvm_fini(void);
136 #endif /* XPV_HVM_DRIVER */
137 
138 static 	struct cb_ops xdf_cbops = {
139 	xdf_open,
140 	xdf_close,
141 	xdf_strategy,
142 	nodev,
143 	xdf_dump,
144 	xdf_read,
145 	xdf_write,
146 	xdf_ioctl,
147 	nodev,
148 	nodev,
149 	nodev,
150 	nochpoll,
151 	xdf_prop_op,
152 	NULL,
153 	D_MP | D_NEW | D_64BIT,
154 	CB_REV,
155 	xdf_aread,
156 	xdf_awrite
157 };
158 
159 struct dev_ops xdf_devops = {
160 	DEVO_REV,		/* devo_rev */
161 	0,			/* devo_refcnt */
162 	xdf_getinfo,		/* devo_getinfo */
163 	nulldev,		/* devo_identify */
164 	nulldev,		/* devo_probe */
165 	xdf_attach,		/* devo_attach */
166 	xdf_detach,		/* devo_detach */
167 	xdf_reset,		/* devo_reset */
168 	&xdf_cbops,		/* devo_cb_ops */
169 	(struct bus_ops *)NULL	/* devo_bus_ops */
170 };
171 
172 static struct modldrv modldrv = {
173 	&mod_driverops,		/* Type of module.  This one is a driver */
174 	"virtual block driver %I%",	/* short description */
175 	&xdf_devops		/* driver specific ops */
176 };
177 
178 static struct modlinkage xdf_modlinkage = {
179 	MODREV_1, (void *)&modldrv, NULL
180 };
181 
182 /*
183  * I/O buffer DMA attributes
184  * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
185  */
186 static ddi_dma_attr_t xb_dma_attr = {
187 	DMA_ATTR_V0,
188 	(uint64_t)0,			/* lowest address */
189 	(uint64_t)0xffffffffffffffff,	/* highest usable address */
190 	(uint64_t)0xffffff,		/* DMA counter limit max */
191 	(uint64_t)XB_BSIZE,		/* alignment in bytes */
192 	XB_BSIZE - 1,			/* bitmap of burst sizes */
193 	XB_BSIZE,			/* min transfer */
194 	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
195 	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
196 	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
197 	XB_BSIZE,			/* granularity */
198 	0,				/* flags (reserved) */
199 };
200 
201 static ddi_device_acc_attr_t xc_acc_attr = {
202 	DDI_DEVICE_ATTR_V0,
203 	DDI_NEVERSWAP_ACC,
204 	DDI_STRICTORDER_ACC
205 };
206 
/* callbacks from the common label module (cmlb) */
208 
209 int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
210 int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
211 
212 static cmlb_tg_ops_t xdf_lb_ops = {
213 	TG_DK_OPS_VERSION_1,
214 	xdf_lb_rdwr,
215 	xdf_lb_getinfo
216 };
217 
218 int
219 _init(void)
220 {
221 	int rc;
222 
223 	if ((rc = ddi_soft_state_init(&vbd_ss, sizeof (xdf_t), 0)) != 0)
224 		return (rc);
225 
226 	xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
227 	    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
228 	xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
229 	    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
230 
231 #if defined(XPV_HVM_DRIVER)
232 	xdf_hvm_init();
233 #endif /* XPV_HVM_DRIVER */
234 
235 	if ((rc = mod_install(&xdf_modlinkage)) != 0) {
236 #if defined(XPV_HVM_DRIVER)
237 		xdf_hvm_fini();
238 #endif /* XPV_HVM_DRIVER */
239 		kmem_cache_destroy(xdf_vreq_cache);
240 		kmem_cache_destroy(xdf_gs_cache);
241 		ddi_soft_state_fini(&vbd_ss);
242 		return (rc);
243 	}
244 
245 	return (rc);
246 }
247 
248 int
249 _fini(void)
250 {
251 
252 	int err;
253 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
254 		return (err);
255 
256 #if defined(XPV_HVM_DRIVER)
257 	xdf_hvm_fini();
258 #endif /* XPV_HVM_DRIVER */
259 
260 	kmem_cache_destroy(xdf_vreq_cache);
261 	kmem_cache_destroy(xdf_gs_cache);
262 	ddi_soft_state_fini(&vbd_ss);
263 
264 	return (0);
265 }
266 
267 int
268 _info(struct modinfo *modinfop)
269 {
270 	return (mod_info(&xdf_modlinkage, modinfop));
271 }
272 
273 /*ARGSUSED*/
274 static int
275 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
276 {
277 	int instance;
278 	xdf_t *vbdp;
279 
280 	instance = XDF_INST(getminor((dev_t)arg));
281 
282 	switch (cmd) {
283 	case DDI_INFO_DEVT2DEVINFO:
284 		if ((vbdp = ddi_get_soft_state(vbd_ss, instance)) == NULL) {
285 			*rp = NULL;
286 			return (DDI_FAILURE);
287 		}
288 		*rp = vbdp->xdf_dip;
289 		return (DDI_SUCCESS);
290 
291 	case DDI_INFO_DEVT2INSTANCE:
292 		*rp = (void *)(uintptr_t)instance;
293 		return (DDI_SUCCESS);
294 
295 	default:
296 		return (DDI_FAILURE);
297 	}
298 }
299 
300 static int
301 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
302 	char *name, caddr_t valuep, int *lengthp)
303 {
304 	xdf_t	*vdp;
305 
306 	if ((vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(dip))) == NULL)
307 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
308 		    name, valuep, lengthp));
309 
310 	/* do cv_wait until connected or failed */
311 	mutex_enter(&vdp->xdf_dev_lk);
312 	if (xdf_connect(vdp, B_TRUE) != XD_READY) {
313 		mutex_exit(&vdp->xdf_dev_lk);
314 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
315 		    name, valuep, lengthp));
316 	}
317 	mutex_exit(&vdp->xdf_dev_lk);
318 
319 	return (cmlb_prop_op(vdp->xdf_vd_lbl,
320 	    dev, dip, prop_op, mod_flags, name, valuep, lengthp,
321 	    XDF_PART(getminor(dev)), NULL));
322 }
323 
324 static int
325 xdf_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
326 {
327 	xdf_t *vdp;
328 	ddi_iblock_cookie_t softibc;
329 	int instance;
330 
331 	xdfdebug = ddi_prop_get_int(DDI_DEV_T_ANY, devi, DDI_PROP_NOTPROM,
332 	    "xdfdebug", 0);
333 
334 	switch (cmd) {
335 		case DDI_ATTACH:
336 			break;
337 
338 		case DDI_RESUME:
339 			return (xdf_resume(devi));
340 
341 		default:
342 			return (DDI_FAILURE);
343 	}
344 
345 	instance = ddi_get_instance(devi);
346 	if (ddi_soft_state_zalloc(vbd_ss, instance) != DDI_SUCCESS)
347 		return (DDI_FAILURE);
348 
349 	DPRINTF(DDI_DBG, ("xdf%d: attaching\n", instance));
350 	vdp = ddi_get_soft_state(vbd_ss, instance);
351 	ddi_set_driver_private(devi, vdp);
352 	vdp->xdf_dip = devi;
353 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
354 
355 	if (ddi_get_iblock_cookie(devi, 0, &vdp->xdf_ibc) != DDI_SUCCESS) {
356 		cmn_err(CE_WARN, "xdf@%s: failed to get iblock cookie",
357 		    ddi_get_name_addr(devi));
358 		goto errout0;
359 	}
360 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)vdp->xdf_ibc);
361 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)vdp->xdf_ibc);
362 	mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER,
363 	    (void *)vdp->xdf_ibc);
364 
365 	if (ddi_get_soft_iblock_cookie(devi, DDI_SOFTINT_LOW, &softibc)
366 	    != DDI_SUCCESS) {
367 		cmn_err(CE_WARN, "xdf@%s: failed to get softintr iblock cookie",
368 		    ddi_get_name_addr(devi));
369 		goto errout0;
370 	}
371 	if (ddi_add_softintr(devi, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
372 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
373 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
374 		    ddi_get_name_addr(devi));
375 		goto errout0;
376 	}
377 
378 #if !defined(XPV_HVM_DRIVER)
379 	/* create kstat for iostat(1M) */
380 	if (xdf_kstat_create(devi, "xdf", instance) != 0) {
381 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
382 		    ddi_get_name_addr(devi));
383 		goto errout0;
384 	}
385 #endif /* !XPV_HVM_DRIVER */
386 
387 	/* driver handles kernel-issued IOCTLs */
388 	if (ddi_prop_create(DDI_DEV_T_NONE, devi, DDI_PROP_CANSLEEP,
389 	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
390 		cmn_err(CE_WARN, "xdf@%s: cannot create DDI_KERNEL_IOCTL prop",
391 		    ddi_get_name_addr(devi));
392 		goto errout0;
393 	}
394 
395 	/*
396 	 * Initialize the physical geometry stucture.  Note that currently
397 	 * we don't know the size of the backend device so the number
398 	 * of blocks on the device will be initialized to zero.  Once
399 	 * we connect to the backend device we'll update the physical
400 	 * geometry to reflect the real size of the device.
401 	 */
402 	xdf_synthetic_pgeom(devi, &vdp->xdf_pgeom);
403 
404 	/*
405 	 * create default device minor nodes: non-removable disk
406 	 * we will adjust minor nodes after we are connected w/ backend
407 	 */
408 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
409 	if (cmlb_attach(devi, &xdf_lb_ops, DTYPE_DIRECT, 0, 1,
410 	    DDI_NT_BLOCK_XVMD,
411 #if defined(XPV_HVM_DRIVER)
412 	    CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT |
413 	    CMLB_INTERNAL_MINOR_NODES,
414 #else /* !XPV_HVM_DRIVER */
415 	    CMLB_FAKE_LABEL_ONE_PARTITION,
416 #endif /* !XPV_HVM_DRIVER */
417 	    vdp->xdf_vd_lbl, NULL) != 0) {
418 		cmn_err(CE_WARN, "xdf@%s: default cmlb attach failed",
419 		    ddi_get_name_addr(devi));
420 		goto errout0;
421 	}
422 
423 	/*
424 	 * We ship with cache-enabled disks
425 	 */
426 	vdp->xdf_wce = 1;
427 
428 	mutex_enter(&vdp->xdf_cb_lk);
429 
430 	/* Watch backend XenbusState change */
431 	if (xvdi_add_event_handler(devi, XS_OE_STATE,
432 	    xdf_oe_change) != DDI_SUCCESS) {
433 		mutex_exit(&vdp->xdf_cb_lk);
434 		goto errout0;
435 	}
436 
437 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
438 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
439 		    ddi_get_name_addr(devi));
440 		(void) xdf_start_disconnect(vdp);
441 		mutex_exit(&vdp->xdf_cb_lk);
442 		goto errout1;
443 	}
444 
445 	mutex_exit(&vdp->xdf_cb_lk);
446 
447 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
448 	    offsetof(v_req_t, v_link));
449 	list_create(&vdp->xdf_gs_act, sizeof (ge_slot_t),
450 	    offsetof(ge_slot_t, link));
451 
452 #if defined(XPV_HVM_DRIVER)
453 	xdf_hvm_add(devi);
454 
455 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, devi, DDI_NO_AUTODETACH, 1);
456 
457 	/*
458 	 * Report our version to dom0.
459 	 */
460 	if (xenbus_printf(XBT_NULL, "hvmpv/xdf", "version", "%d",
461 	    HVMPV_XDF_VERS))
462 		cmn_err(CE_WARN, "xdf: couldn't write version\n");
463 #endif /* XPV_HVM_DRIVER */
464 
465 	ddi_report_dev(devi);
466 
467 	DPRINTF(DDI_DBG, ("xdf%d: attached\n", instance));
468 
469 	return (DDI_SUCCESS);
470 
471 errout1:
472 	xvdi_remove_event_handler(devi, XS_OE_STATE);
473 errout0:
474 	if (vdp->xdf_vd_lbl != NULL) {
475 		cmlb_detach(vdp->xdf_vd_lbl, NULL);
476 		cmlb_free_handle(&vdp->xdf_vd_lbl);
477 		vdp->xdf_vd_lbl = NULL;
478 	}
479 #if !defined(XPV_HVM_DRIVER)
480 	xdf_kstat_delete(devi);
481 #endif /* !XPV_HVM_DRIVER */
482 	if (vdp->xdf_softintr_id != NULL)
483 		ddi_remove_softintr(vdp->xdf_softintr_id);
484 	if (vdp->xdf_ibc != NULL) {
485 		mutex_destroy(&vdp->xdf_cb_lk);
486 		mutex_destroy(&vdp->xdf_dev_lk);
487 	}
488 	cv_destroy(&vdp->xdf_dev_cv);
489 	ddi_soft_state_free(vbd_ss, instance);
490 	ddi_set_driver_private(devi, NULL);
491 	ddi_prop_remove_all(devi);
492 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(devi));
493 	return (DDI_FAILURE);
494 }
495 
496 static int
497 xdf_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
498 {
499 	xdf_t *vdp;
500 	int instance;
501 
502 	switch (cmd) {
503 
504 	case DDI_PM_SUSPEND:
505 		break;
506 
507 	case DDI_SUSPEND:
508 		return (xdf_suspend(devi));
509 
510 	case DDI_DETACH:
511 		break;
512 
513 	default:
514 		return (DDI_FAILURE);
515 	}
516 
517 	instance = ddi_get_instance(devi);
518 	DPRINTF(DDI_DBG, ("xdf%d: detaching\n", instance));
519 	vdp = ddi_get_soft_state(vbd_ss, instance);
520 
521 	if (vdp == NULL)
522 		return (DDI_FAILURE);
523 
524 	mutex_enter(&vdp->xdf_dev_lk);
525 	if (xdf_isopen(vdp, -1)) {
526 		mutex_exit(&vdp->xdf_dev_lk);
527 		return (DDI_FAILURE);
528 	}
529 
530 	if (vdp->xdf_status != XD_CLOSED) {
531 		mutex_exit(&vdp->xdf_dev_lk);
532 		return (DDI_FAILURE);
533 	}
534 
535 #if defined(XPV_HVM_DRIVER)
536 	xdf_hvm_rm(devi);
537 #endif /* XPV_HVM_DRIVER */
538 
539 	ASSERT(!ISDMACBON(vdp));
540 	mutex_exit(&vdp->xdf_dev_lk);
541 
542 	if (vdp->xdf_timeout_id != 0)
543 		(void) untimeout(vdp->xdf_timeout_id);
544 
545 	xvdi_remove_event_handler(devi, XS_OE_STATE);
546 
547 	/* we'll support backend running in domU later */
548 #ifdef	DOMU_BACKEND
549 	(void) xvdi_post_event(devi, XEN_HP_REMOVE);
550 #endif
551 
552 	list_destroy(&vdp->xdf_vreq_act);
553 	list_destroy(&vdp->xdf_gs_act);
554 	ddi_prop_remove_all(devi);
555 	xdf_kstat_delete(devi);
556 	ddi_remove_softintr(vdp->xdf_softintr_id);
557 	ddi_set_driver_private(devi, NULL);
558 	cv_destroy(&vdp->xdf_dev_cv);
559 	mutex_destroy(&vdp->xdf_cb_lk);
560 	mutex_destroy(&vdp->xdf_dev_lk);
561 	if (vdp->xdf_cache_flush_block != NULL)
562 		kmem_free(vdp->xdf_flush_mem, 2 * DEV_BSIZE);
563 	ddi_soft_state_free(vbd_ss, instance);
564 	return (DDI_SUCCESS);
565 }
566 
567 static int
568 xdf_suspend(dev_info_t *devi)
569 {
570 	xdf_t *vdp;
571 	int instance;
572 	enum xdf_state st;
573 
574 	instance = ddi_get_instance(devi);
575 
576 	if (xdfdebug & SUSRES_DBG)
577 		xen_printf("xdf_suspend: xdf#%d\n", instance);
578 
579 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
580 		return (DDI_FAILURE);
581 
582 	xvdi_suspend(devi);
583 
584 	mutex_enter(&vdp->xdf_cb_lk);
585 	mutex_enter(&vdp->xdf_dev_lk);
586 	st = vdp->xdf_status;
587 	/* change status to stop further I/O requests */
588 	if (st == XD_READY)
589 		vdp->xdf_status = XD_SUSPEND;
590 	mutex_exit(&vdp->xdf_dev_lk);
591 	mutex_exit(&vdp->xdf_cb_lk);
592 
593 	/* make sure no more I/O responses left in the ring buffer */
594 	if ((st == XD_INIT) || (st == XD_READY)) {
595 #ifdef XPV_HVM_DRIVER
596 		ec_unbind_evtchn(vdp->xdf_evtchn);
597 		xvdi_free_evtchn(devi);
598 #else /* !XPV_HVM_DRIVER */
599 		(void) ddi_remove_intr(devi, 0, NULL);
600 #endif /* !XPV_HVM_DRIVER */
601 		(void) xdf_drain_io(vdp);
602 		/*
603 		 * no need to teardown the ring buffer here
604 		 * it will be simply re-init'ed during resume when
605 		 * we call xvdi_alloc_ring
606 		 */
607 	}
608 
609 	if (xdfdebug & SUSRES_DBG)
610 		xen_printf("xdf_suspend: SUCCESS\n");
611 
612 	return (DDI_SUCCESS);
613 }
614 
615 /*ARGSUSED*/
616 static int
617 xdf_resume(dev_info_t *devi)
618 {
619 	xdf_t *vdp;
620 	int instance;
621 
622 	instance = ddi_get_instance(devi);
623 	if (xdfdebug & SUSRES_DBG)
624 		xen_printf("xdf_resume: xdf%d\n", instance);
625 
626 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
627 		return (DDI_FAILURE);
628 
629 	mutex_enter(&vdp->xdf_cb_lk);
630 
631 	if (xvdi_resume(devi) != DDI_SUCCESS) {
632 		mutex_exit(&vdp->xdf_cb_lk);
633 		return (DDI_FAILURE);
634 	}
635 
636 	mutex_enter(&vdp->xdf_dev_lk);
637 	ASSERT(vdp->xdf_status != XD_READY);
638 	vdp->xdf_status = XD_UNKNOWN;
639 	mutex_exit(&vdp->xdf_dev_lk);
640 
641 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
642 		mutex_exit(&vdp->xdf_cb_lk);
643 		return (DDI_FAILURE);
644 	}
645 
646 	mutex_exit(&vdp->xdf_cb_lk);
647 
648 	if (xdfdebug & SUSRES_DBG)
649 		xen_printf("xdf_resume: done\n");
650 	return (DDI_SUCCESS);
651 }
652 
653 /*ARGSUSED*/
654 static int
655 xdf_reset(dev_info_t *devi, ddi_reset_cmd_t cmd)
656 {
657 	xdf_t *vdp;
658 	int instance;
659 
660 	instance = ddi_get_instance(devi);
661 	DPRINTF(DDI_DBG, ("xdf%d: resetting\n", instance));
662 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
663 		return (DDI_FAILURE);
664 
665 	/*
666 	 * wait for any outstanding I/O to complete
667 	 */
668 	(void) xdf_drain_io(vdp);
669 
670 	DPRINTF(DDI_DBG, ("xdf%d: reset complete\n", instance));
671 	return (DDI_SUCCESS);
672 }
673 
674 static int
675 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
676 {
677 	minor_t	minor;
678 	xdf_t	*vdp;
679 	int part;
680 	ulong_t parbit;
681 	diskaddr_t p_blkct = 0;
682 	boolean_t firstopen;
683 	boolean_t nodelay;
684 
685 	minor = getminor(*devp);
686 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
687 		return (ENXIO);
688 
689 	nodelay = (flag & (FNDELAY | FNONBLOCK));
690 
691 	DPRINTF(DDI_DBG, ("xdf%d: opening\n", XDF_INST(minor)));
692 
693 	/* do cv_wait until connected or failed */
694 	mutex_enter(&vdp->xdf_dev_lk);
695 	if (!nodelay && (xdf_connect(vdp, B_TRUE) != XD_READY)) {
696 		mutex_exit(&vdp->xdf_dev_lk);
697 		return (ENXIO);
698 	}
699 
700 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
701 		mutex_exit(&vdp->xdf_dev_lk);
702 		return (EROFS);
703 	}
704 
705 	part = XDF_PART(minor);
706 	parbit = 1 << part;
707 	if ((vdp->xdf_vd_exclopen & parbit) ||
708 	    ((flag & FEXCL) && xdf_isopen(vdp, part))) {
709 		mutex_exit(&vdp->xdf_dev_lk);
710 		return (EBUSY);
711 	}
712 
713 	/* are we the first one to open this node? */
714 	firstopen = !xdf_isopen(vdp, -1);
715 
716 	if (otyp == OTYP_LYR)
717 		vdp->xdf_vd_lyropen[part]++;
718 
719 	vdp->xdf_vd_open[otyp] |= parbit;
720 
721 	if (flag & FEXCL)
722 		vdp->xdf_vd_exclopen |= parbit;
723 
724 	mutex_exit(&vdp->xdf_dev_lk);
725 
726 	/* force a re-validation */
727 	if (firstopen)
728 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
729 
730 	/*
731 	 * check size
732 	 * ignore CD/DVD which contains a zero-sized s0
733 	 */
734 	if (!nodelay && !XD_IS_CD(vdp) &&
735 	    ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
736 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0))) {
737 		(void) xdf_close(*devp, flag, otyp, credp);
738 		return (ENXIO);
739 	}
740 
741 	return (0);
742 }
743 
744 /*ARGSUSED*/
745 static int
746 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
747 {
748 	minor_t	minor;
749 	xdf_t	*vdp;
750 	int part;
751 	ulong_t parbit;
752 
753 	minor = getminor(dev);
754 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
755 		return (ENXIO);
756 
757 	mutex_enter(&vdp->xdf_dev_lk);
758 	part = XDF_PART(minor);
759 	if (!xdf_isopen(vdp, part)) {
760 		mutex_exit(&vdp->xdf_dev_lk);
761 		return (ENXIO);
762 	}
763 	parbit = 1 << part;
764 
765 	ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
766 	if (otyp == OTYP_LYR) {
767 		ASSERT(vdp->xdf_vd_lyropen[part] > 0);
768 		if (--vdp->xdf_vd_lyropen[part] == 0)
769 			vdp->xdf_vd_open[otyp] &= ~parbit;
770 	} else {
771 		vdp->xdf_vd_open[otyp] &= ~parbit;
772 	}
773 	vdp->xdf_vd_exclopen &= ~parbit;
774 
775 	mutex_exit(&vdp->xdf_dev_lk);
776 	return (0);
777 }
778 
779 static int
780 xdf_strategy(struct buf *bp)
781 {
782 	xdf_t	*vdp;
783 	minor_t minor;
784 	diskaddr_t p_blkct, p_blkst;
785 	ulong_t nblks;
786 	int part;
787 
788 	minor = getminor(bp->b_edev);
789 	part = XDF_PART(minor);
790 
791 	vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor));
792 	if ((vdp == NULL) || !xdf_isopen(vdp, part)) {
793 		bioerror(bp, ENXIO);
794 		bp->b_resid = bp->b_bcount;
795 		biodone(bp);
796 		return (0);
797 	}
798 
799 	/* Check for writes to a read only device */
800 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
801 		bioerror(bp, EROFS);
802 		bp->b_resid = bp->b_bcount;
803 		biodone(bp);
804 		return (0);
805 	}
806 
807 	/* Check if this I/O is accessing a partition or the entire disk */
808 	if ((long)bp->b_private == XB_SLICE_NONE) {
809 		/* This I/O is using an absolute offset */
810 		p_blkct = vdp->xdf_xdev_nblocks;
811 		p_blkst = 0;
812 	} else {
813 		/* This I/O is using a partition relative offset */
814 		if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
815 		    &p_blkst, NULL, NULL, NULL)) {
816 			bioerror(bp, ENXIO);
817 			bp->b_resid = bp->b_bcount;
818 			biodone(bp);
819 			return (0);
820 		}
821 	}
822 
823 	/* check for a starting block beyond the disk or partition limit */
824 	if (bp->b_blkno > p_blkct) {
825 		DPRINTF(IO_DBG, ("xdf: block %lld exceeds VBD size %"PRIu64,
826 		    (longlong_t)bp->b_blkno, (uint64_t)p_blkct));
827 		bioerror(bp, EINVAL);
828 		bp->b_resid = bp->b_bcount;
829 		biodone(bp);
830 		return (0);
831 	}
832 
833 	/* Legacy: don't set error flag at this case */
834 	if (bp->b_blkno == p_blkct) {
835 		bp->b_resid = bp->b_bcount;
836 		biodone(bp);
837 		return (0);
838 	}
839 
840 	/* Adjust for partial transfer */
841 	nblks = bp->b_bcount >> XB_BSHIFT;
842 	if ((bp->b_blkno + nblks) > p_blkct) {
843 		bp->b_resid = ((bp->b_blkno + nblks) - p_blkct) << XB_BSHIFT;
844 		bp->b_bcount -= bp->b_resid;
845 	}
846 
847 	DPRINTF(IO_DBG, ("xdf: strategy blk %lld len %lu\n",
848 	    (longlong_t)bp->b_blkno, (ulong_t)bp->b_bcount));
849 
850 	/* Fix up the buf struct */
851 	bp->b_flags |= B_BUSY;
852 	bp->av_forw = bp->av_back = NULL; /* not tagged with a v_req */
853 	bp->b_private = (void *)(uintptr_t)p_blkst;
854 
855 	mutex_enter(&vdp->xdf_dev_lk);
856 	if (vdp->xdf_xdev_iostat != NULL)
857 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
858 	if (vdp->xdf_f_act == NULL) {
859 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
860 	} else {
861 		vdp->xdf_l_act->av_forw = bp;
862 		vdp->xdf_l_act = bp;
863 	}
864 	mutex_exit(&vdp->xdf_dev_lk);
865 
866 	xdf_iostart(vdp);
867 	if (do_polled_io)
868 		(void) xdf_drain_io(vdp);
869 	return (0);
870 }
871 
872 /*ARGSUSED*/
873 static int
874 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
875 {
876 
877 	xdf_t	*vdp;
878 	minor_t minor;
879 	diskaddr_t p_blkcnt;
880 	int part;
881 
882 	minor = getminor(dev);
883 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
884 		return (ENXIO);
885 
886 	DPRINTF(IO_DBG, ("xdf: read offset 0x%"PRIx64"\n",
887 	    (int64_t)uiop->uio_offset));
888 
889 	part = XDF_PART(minor);
890 	if (!xdf_isopen(vdp, part))
891 		return (ENXIO);
892 
893 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
894 	    NULL, NULL, NULL, NULL))
895 		return (ENXIO);
896 
897 	if (U_INVAL(uiop))
898 		return (EINVAL);
899 
900 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
901 }
902 
903 /*ARGSUSED*/
904 static int
905 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
906 {
907 	xdf_t *vdp;
908 	minor_t minor;
909 	diskaddr_t p_blkcnt;
910 	int part;
911 
912 	minor = getminor(dev);
913 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
914 		return (ENXIO);
915 
916 	DPRINTF(IO_DBG, ("xdf: write offset 0x%"PRIx64"\n",
917 	    (int64_t)uiop->uio_offset));
918 
919 	part = XDF_PART(minor);
920 	if (!xdf_isopen(vdp, part))
921 		return (ENXIO);
922 
923 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
924 	    NULL, NULL, NULL, NULL))
925 		return (ENXIO);
926 
927 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
928 		return (ENOSPC);
929 
930 	if (U_INVAL(uiop))
931 		return (EINVAL);
932 
933 	return (physio(xdf_strategy, NULL, dev, B_WRITE, minphys, uiop));
934 }
935 
936 /*ARGSUSED*/
937 static int
938 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
939 {
940 	xdf_t	*vdp;
941 	minor_t minor;
942 	struct uio *uiop = aiop->aio_uio;
943 	diskaddr_t p_blkcnt;
944 	int part;
945 
946 	minor = getminor(dev);
947 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
948 		return (ENXIO);
949 
950 	part = XDF_PART(minor);
951 	if (!xdf_isopen(vdp, part))
952 		return (ENXIO);
953 
954 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
955 	    NULL, NULL, NULL, NULL))
956 		return (ENXIO);
957 
958 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
959 		return (ENOSPC);
960 
961 	if (U_INVAL(uiop))
962 		return (EINVAL);
963 
964 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, minphys, aiop));
965 }
966 
967 /*ARGSUSED*/
968 static int
969 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
970 {
971 	xdf_t *vdp;
972 	minor_t minor;
973 	struct uio *uiop = aiop->aio_uio;
974 	diskaddr_t p_blkcnt;
975 	int part;
976 
977 	minor = getminor(dev);
978 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
979 		return (ENXIO);
980 
981 	part = XDF_PART(minor);
982 	if (!xdf_isopen(vdp, part))
983 		return (ENXIO);
984 
985 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
986 	    NULL, NULL, NULL, NULL))
987 		return (ENXIO);
988 
989 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
990 		return (ENOSPC);
991 
992 	if (U_INVAL(uiop))
993 		return (EINVAL);
994 
995 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, minphys, aiop));
996 }
997 
998 static int
999 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1000 {
1001 	struct buf dumpbuf, *dbp;
1002 	xdf_t	*vdp;
1003 	minor_t minor;
1004 	int err = 0;
1005 	int part;
1006 	diskaddr_t p_blkcnt, p_blkst;
1007 
1008 	minor = getminor(dev);
1009 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
1010 		return (ENXIO);
1011 
1012 	DPRINTF(IO_DBG, ("xdf: dump addr (0x%p) blk (%ld) nblks (%d)\n",
1013 	    addr, blkno, nblk));
1014 
1015 	part = XDF_PART(minor);
1016 	if (!xdf_isopen(vdp, part))
1017 		return (ENXIO);
1018 
1019 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
1020 	    NULL, NULL, NULL))
1021 		return (ENXIO);
1022 
1023 	if ((blkno + nblk) > p_blkcnt) {
1024 		cmn_err(CE_WARN, "xdf: block %ld exceeds VBD size %"PRIu64,
1025 		    blkno + nblk, (uint64_t)p_blkcnt);
1026 		return (EINVAL);
1027 	}
1028 
1029 	dbp = &dumpbuf;
1030 	bioinit(dbp);
1031 	dbp->b_flags = B_BUSY;
1032 	dbp->b_un.b_addr = addr;
1033 	dbp->b_bcount = nblk << DEV_BSHIFT;
1034 	dbp->b_blkno = blkno;
1035 	dbp->b_edev = dev;
1036 	dbp->b_private = (void *)(uintptr_t)p_blkst;
1037 
1038 	mutex_enter(&vdp->xdf_dev_lk);
1039 	if (vdp->xdf_xdev_iostat != NULL)
1040 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1041 	if (vdp->xdf_f_act == NULL) {
1042 		vdp->xdf_f_act = vdp->xdf_l_act = dbp;
1043 	} else {
1044 		vdp->xdf_l_act->av_forw = dbp;
1045 		vdp->xdf_l_act = dbp;
1046 	}
1047 	dbp->av_forw = NULL;
1048 	dbp->av_back = NULL;
1049 	mutex_exit(&vdp->xdf_dev_lk);
1050 	xdf_iostart(vdp);
1051 	err = xdf_drain_io(vdp);
1052 	biofini(dbp);
1053 	return (err);
1054 }
1055 
1056 /*ARGSUSED*/
1057 static int
1058 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1059     int *rvalp)
1060 {
1061 	int instance;
1062 	xdf_t	*vdp;
1063 	minor_t minor;
1064 	int part;
1065 
1066 	minor = getminor(dev);
1067 	instance = XDF_INST(minor);
1068 
1069 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
1070 		return (ENXIO);
1071 
1072 	DPRINTF(IOCTL_DBG, ("xdf%d:ioctl: cmd %d (0x%x)\n",
1073 	    instance, cmd, cmd));
1074 
1075 	part = XDF_PART(minor);
1076 	if (!xdf_isopen(vdp, part))
1077 		return (ENXIO);
1078 
1079 	switch (cmd) {
1080 	case DKIOCGMEDIAINFO: {
1081 		struct dk_minfo	media_info;
1082 
1083 		media_info.dki_lbsize = DEV_BSIZE;
1084 		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
1085 		media_info.dki_media_type = DK_FIXED_DISK;
1086 
1087 		if (ddi_copyout(&media_info, (void *)arg,
1088 		    sizeof (struct dk_minfo), mode)) {
1089 			return (EFAULT);
1090 		} else {
1091 			return (0);
1092 		}
1093 	}
1094 
1095 	case DKIOCINFO: {
1096 		struct dk_cinfo info;
1097 
1098 		/* controller information */
1099 		if (XD_IS_CD(vdp))
1100 			info.dki_ctype = DKC_CDROM;
1101 		else
1102 			info.dki_ctype = DKC_VBD;
1103 
1104 		info.dki_cnum = 0;
1105 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
1106 
1107 		/* unit information */
1108 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
1109 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
1110 		info.dki_flags = DKI_FMTVOL;
1111 		info.dki_partition = part;
1112 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
1113 		info.dki_addr = 0;
1114 		info.dki_space = 0;
1115 		info.dki_prio = 0;
1116 		info.dki_vec = 0;
1117 
1118 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
1119 			return (EFAULT);
1120 		else
1121 			return (0);
1122 	}
1123 
1124 	case DKIOCSTATE: {
1125 		enum dkio_state	dkstate = DKIO_INSERTED;
1126 		if (ddi_copyout(&dkstate, (void *)arg, sizeof (dkstate),
1127 		    mode) != 0)
1128 			return (EFAULT);
1129 		return (0);
1130 	}
1131 
1132 	/*
1133 	 * is media removable?
1134 	 */
1135 	case DKIOCREMOVABLE: {
1136 		int i = XD_IS_RM(vdp) ? 1 : 0;
1137 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), mode))
1138 			return (EFAULT);
1139 		return (0);
1140 	}
1141 
1142 	case DKIOCG_PHYGEOM:
1143 	case DKIOCG_VIRTGEOM:
1144 	case DKIOCGGEOM:
1145 	case DKIOCSGEOM:
1146 	case DKIOCGAPART:
1147 	case DKIOCSAPART:
1148 	case DKIOCGVTOC:
1149 	case DKIOCSVTOC:
1150 	case DKIOCPARTINFO:
1151 	case DKIOCGMBOOT:
1152 	case DKIOCSMBOOT:
1153 	case DKIOCGETEFI:
1154 	case DKIOCSETEFI:
1155 	case DKIOCPARTITION: {
1156 		int rc;
1157 
1158 		rc = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
1159 		    rvalp, NULL);
1160 		return (rc);
1161 	}
1162 
1163 	case DKIOCGETWCE:
1164 		if (ddi_copyout(&vdp->xdf_wce, (void *)arg,
1165 		    sizeof (vdp->xdf_wce), mode))
1166 			return (EFAULT);
1167 		return (0);
1168 	case DKIOCSETWCE:
1169 		if (ddi_copyin((void *)arg, &vdp->xdf_wce,
1170 		    sizeof (vdp->xdf_wce), mode))
1171 			return (EFAULT);
1172 		return (0);
1173 	case DKIOCFLUSHWRITECACHE: {
1174 		int rc;
1175 		struct dk_callback *dkc = (struct dk_callback *)arg;
1176 
1177 		if (vdp->xdf_flush_supported) {
1178 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1179 			    NULL, 0, 0, (void *)dev);
1180 		} else if (vdp->xdf_feature_barrier &&
1181 		    !xdf_barrier_flush_disable) {
1182 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1183 			    vdp->xdf_cache_flush_block, xdf_flush_block,
1184 			    DEV_BSIZE, (void *)dev);
1185 		} else {
1186 			return (ENOTTY);
1187 		}
1188 		if ((mode & FKIOCTL) && (dkc != NULL) &&
1189 		    (dkc->dkc_callback != NULL)) {
1190 			(*dkc->dkc_callback)(dkc->dkc_cookie, rc);
1191 			/* need to return 0 after calling callback */
1192 			rc = 0;
1193 		}
1194 		return (rc);
1195 	}
1196 
1197 	default:
1198 		return (ENOTTY);
1199 	}
1200 }
1201 
1202 /*
1203  * xdf interrupt handler
1204  */
1205 static uint_t
1206 xdf_intr(caddr_t arg)
1207 {
1208 	xdf_t *vdp = (xdf_t *)arg;
1209 	xendev_ring_t *xbr;
1210 	blkif_response_t *resp;
1211 	int bioerr;
1212 	uint64_t id;
1213 	extern int do_polled_io;
1214 	uint8_t op;
1215 	uint16_t status;
1216 	ddi_acc_handle_t acchdl;
1217 
1218 	mutex_enter(&vdp->xdf_dev_lk);
1219 
1220 	if ((xbr = vdp->xdf_xb_ring) == NULL) {
1221 		mutex_exit(&vdp->xdf_dev_lk);
1222 		return (DDI_INTR_UNCLAIMED);
1223 	}
1224 
1225 	acchdl = vdp->xdf_xb_ring_hdl;
1226 
1227 	/*
1228 	 * complete all requests which have a response
1229 	 */
1230 	while (resp = xvdi_ring_get_response(xbr)) {
1231 		id = ddi_get64(acchdl, &resp->id);
1232 		op = ddi_get8(acchdl, &resp->operation);
1233 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
1234 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
1235 		    op, id, status));
1236 
1237 		/*
1238 		 * XXPV - close connection to the backend and restart
1239 		 */
1240 		if (status != BLKIF_RSP_OKAY) {
1241 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
1242 			    ddi_get_name_addr(vdp->xdf_dip),
1243 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
1244 			bioerr = EIO;
1245 		} else {
1246 			bioerr = 0;
1247 		}
1248 
1249 		xdf_iofini(vdp, id, bioerr);
1250 	}
1251 
1252 	mutex_exit(&vdp->xdf_dev_lk);
1253 
1254 	if (!do_polled_io)
1255 		xdf_iostart(vdp);
1256 
1257 	return (DDI_INTR_CLAIMED);
1258 }
1259 
1260 int xdf_fbrewrites;	/* how many times was our flush block rewritten */
1261 
1262 /*
1263  * Snarf new data if our flush block was re-written
1264  */
1265 static void
1266 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
1267 {
1268 	int nblks;
1269 	boolean_t mapin;
1270 
1271 	if (IS_WRITE_BARRIER(vdp, bp))
1272 		return; /* write was a flush write */
1273 
1274 	mapin = B_FALSE;
1275 	nblks = bp->b_bcount >> DEV_BSHIFT;
1276 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
1277 		xdf_fbrewrites++;
1278 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
1279 			mapin = B_TRUE;
1280 			bp_mapin(bp);
1281 		}
1282 		bcopy(bp->b_un.b_addr +
1283 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
1284 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
1285 		if (mapin)
1286 			bp_mapout(bp);
1287 	}
1288 }
1289 
1290 static void
1291 xdf_iofini(xdf_t *vdp, uint64_t id, int bioerr)
1292 {
1293 	ge_slot_t *gs = (ge_slot_t *)(uintptr_t)id;
1294 	v_req_t *vreq = gs->vreq;
1295 	buf_t *bp = vreq->v_buf;
1296 
1297 	gs_free(vdp, gs);
1298 	if (bioerr)
1299 		bioerror(bp, bioerr);
1300 	vreq->v_nslots--;
1301 	if (vreq->v_nslots != 0)
1302 		return;
1303 
1304 	XDF_UPDATE_IO_STAT(vdp, bp);
1305 	if (vdp->xdf_xdev_iostat != NULL)
1306 		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1307 
1308 	if (IS_ERROR(bp))
1309 		bp->b_resid = bp->b_bcount;
1310 
1311 	vreq_free(vdp, vreq);
1312 	biodone(bp);
1313 }
1314 
1315 /*
1316  * return value of xdf_prepare_rreq()
1317  * used in xdf_iostart()
1318  */
1319 #define	XF_PARTIAL	0 /* rreq is full, not all I/O in buf transferred */
1320 #define	XF_COMP		1 /* no more I/O left in buf */
1321 
1322 static void
1323 xdf_iostart(xdf_t *vdp)
1324 {
1325 	xendev_ring_t *xbr;
1326 	struct buf *bp;
1327 	blkif_request_t *rreq;
1328 	int retval;
1329 	int rreqready = 0;
1330 
1331 	xbr = vdp->xdf_xb_ring;
1332 
1333 	/*
1334 	 * populate the ring request(s)
1335 	 *
1336 	 * loop until there is no buf to transfer or no free slot
1337 	 * available in I/O ring
1338 	 */
1339 	mutex_enter(&vdp->xdf_dev_lk);
1340 
1341 	for (;;) {
1342 		if (vdp->xdf_status != XD_READY)
1343 			break;
1344 
1345 		/* active buf queue empty? */
1346 		if ((bp = vdp->xdf_f_act) == NULL)
1347 			break;
1348 
1349 		/* try to grab a vreq for this bp */
1350 		if ((BP2VREQ(bp) == NULL) && (vreq_get(vdp, bp) == NULL))
1351 				break;
1352 		/* alloc DMA/GTE resources */
1353 		if (vreq_setup(vdp, BP2VREQ(bp)) != DDI_SUCCESS)
1354 			break;
1355 
1356 		/* get next blkif_request in the ring */
1357 		if ((rreq = xvdi_ring_get_request(xbr)) == NULL)
1358 			break;
1359 		bzero(rreq, sizeof (blkif_request_t));
1360 
1361 		/* populate blkif_request with this buf */
1362 		rreqready++;
1363 		retval = xdf_prepare_rreq(vdp, bp, rreq);
1364 		if (retval == XF_COMP) {
1365 			/* finish this bp, switch to next one */
1366 			if (vdp->xdf_xdev_iostat != NULL)
1367 				kstat_waitq_to_runq(
1368 				    KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1369 			vdp->xdf_f_act = bp->av_forw;
1370 			bp->av_forw = NULL;
1371 		}
1372 	}
1373 
1374 	/*
1375 	 * Send the request(s) to the backend
1376 	 */
1377 	if (rreqready) {
1378 		if (xvdi_ring_push_request(xbr)) {
1379 			DPRINTF(IO_DBG, ("xdf_iostart: "
1380 			    "sent request(s) to backend\n"));
1381 			xvdi_notify_oe(vdp->xdf_dip);
1382 		}
1383 	}
1384 
1385 	mutex_exit(&vdp->xdf_dev_lk);
1386 }
1387 
1388 /*
1389  * populate a single blkif_request_t w/ a buf
1390  */
1391 static int
1392 xdf_prepare_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1393 {
1394 	int		rval;
1395 	grant_ref_t	gr;
1396 	uint8_t		fsect, lsect;
1397 	size_t		bcnt;
1398 	paddr_t		dma_addr;
1399 	off_t		blk_off;
1400 	dev_info_t	*dip = vdp->xdf_dip;
1401 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1402 	v_req_t		*vreq = BP2VREQ(bp);
1403 	uint64_t	blkno = vreq->v_blkno;
1404 	uint_t		ndmacs = vreq->v_ndmacs;
1405 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1406 	int		seg = 0;
1407 	int		isread = IS_READ(bp);
1408 
1409 	if (isread)
1410 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1411 	else {
1412 		switch (vreq->v_flush_diskcache) {
1413 		case FLUSH_DISKCACHE:
1414 			ddi_put8(acchdl, &rreq->operation,
1415 			    BLKIF_OP_FLUSH_DISKCACHE);
1416 			ddi_put16(acchdl, &rreq->handle, vdev);
1417 			ddi_put64(acchdl, &rreq->id,
1418 			    (uint64_t)(uintptr_t)(vreq->v_gs));
1419 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1420 			return (XF_COMP);
1421 		case WRITE_BARRIER:
1422 			ddi_put8(acchdl, &rreq->operation,
1423 			    BLKIF_OP_WRITE_BARRIER);
1424 			break;
1425 		default:
1426 			if (!vdp->xdf_wce)
1427 				ddi_put8(acchdl, &rreq->operation,
1428 				    BLKIF_OP_WRITE_BARRIER);
1429 			else
1430 				ddi_put8(acchdl, &rreq->operation,
1431 				    BLKIF_OP_WRITE);
1432 			break;
1433 		}
1434 	}
1435 
1436 	ddi_put16(acchdl, &rreq->handle, vdev);
1437 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1438 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(vreq->v_gs));
1439 
1440 	/*
1441 	 * loop until all segments are populated or no more dma cookie in buf
1442 	 */
1443 	for (;;) {
1444 	/*
1445 	 * Each segment of a blkif request can transfer up to
1446 	 * one 4K page of data.
1447 	 */
1448 		bcnt = vreq->v_dmac.dmac_size;
1449 		ASSERT(bcnt <= PAGESIZE);
1450 		ASSERT((bcnt % XB_BSIZE) == 0);
1451 		dma_addr = vreq->v_dmac.dmac_laddress;
1452 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1453 		ASSERT((blk_off & XB_BMASK) == 0);
1454 		fsect = blk_off >> XB_BSHIFT;
1455 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1456 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1457 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1458 		DPRINTF(IO_DBG, ("  ""seg%d: dmacS %lu blk_off %ld\n",
1459 		    seg, vreq->v_dmac.dmac_size, blk_off));
1460 		gr = gs_grant(vreq->v_gs, PATOMA(dma_addr) >> PAGESHIFT);
1461 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1462 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1463 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1464 		DPRINTF(IO_DBG, ("  ""seg%d: fs %d ls %d gr %d dma 0x%"PRIx64
1465 		    "\n", seg, fsect, lsect, gr, dma_addr));
1466 
1467 		blkno += (bcnt >> XB_BSHIFT);
1468 		seg++;
1469 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1470 		if (--ndmacs) {
1471 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1472 			continue;
1473 		}
1474 
1475 		vreq->v_status = VREQ_DMAWIN_DONE;
1476 		vreq->v_blkno = blkno;
1477 		if (vreq->v_dmaw + 1 == vreq->v_ndmaws)
1478 			/* last win */
1479 			rval = XF_COMP;
1480 		else
1481 			rval = XF_PARTIAL;
1482 		break;
1483 	}
1484 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1485 	DPRINTF(IO_DBG, ("xdf_prepare_rreq: request id=%"PRIx64" ready\n",
1486 	    rreq->id));
1487 
1488 	return (rval);
1489 }
1490 
1491 #define	XDF_QSEC	50000	/* .005 second */
1492 #define	XDF_POLLCNT	12	/* loop for 12 times before time out */
1493 
1494 static int
1495 xdf_drain_io(xdf_t *vdp)
1496 {
1497 	int pollc, rval;
1498 	xendev_ring_t *xbr;
1499 
1500 	if (xdfdebug & SUSRES_DBG)
1501 		xen_printf("xdf_drain_io: start\n");
1502 
1503 	mutex_enter(&vdp->xdf_dev_lk);
1504 
1505 	if ((vdp->xdf_status != XD_READY) && (vdp->xdf_status != XD_SUSPEND))
1506 		goto out;
1507 
1508 	rval = 0;
1509 	xbr = vdp->xdf_xb_ring;
1510 	ASSERT(xbr != NULL);
1511 
1512 	for (pollc = 0; pollc < XDF_POLLCNT; pollc++) {
1513 		if (xvdi_ring_has_unconsumed_responses(xbr)) {
1514 			mutex_exit(&vdp->xdf_dev_lk);
1515 			(void) xdf_intr((caddr_t)vdp);
1516 			mutex_enter(&vdp->xdf_dev_lk);
1517 		}
1518 		if (!xvdi_ring_has_incomp_request(xbr))
1519 			goto out;
1520 
1521 #ifndef	XPV_HVM_DRIVER
1522 		(void) HYPERVISOR_yield();
1523 #endif /* XPV_HVM_DRIVER */
1524 		/*
1525 		 * file-backed devices can be slow
1526 		 */
1527 		drv_usecwait(XDF_QSEC << pollc);
1528 	}
1529 	cmn_err(CE_WARN, "xdf_polled_io: timeout");
1530 	rval = EIO;
1531 out:
1532 	mutex_exit(&vdp->xdf_dev_lk);
1533 	if (xdfdebug & SUSRES_DBG)
1534 		xen_printf("xdf_drain_io: end, err=%d\n", rval);
1535 	return (rval);
1536 }
1537 
1538 /* ARGSUSED5 */
1539 int
1540 xdf_lb_rdwr(dev_info_t *devi, uchar_t cmd, void *bufp,
1541     diskaddr_t start, size_t reqlen, void *tg_cookie)
1542 {
1543 	xdf_t *vdp;
1544 	struct buf *bp;
1545 	int err = 0;
1546 
1547 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1548 	if (vdp == NULL)
1549 		return (ENXIO);
1550 
1551 	if ((start + (reqlen >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
1552 		return (EINVAL);
1553 
1554 	bp = getrbuf(KM_SLEEP);
1555 	if (cmd == TG_READ)
1556 		bp->b_flags = B_BUSY | B_READ;
1557 	else
1558 		bp->b_flags = B_BUSY | B_WRITE;
1559 	bp->b_un.b_addr = bufp;
1560 	bp->b_bcount = reqlen;
1561 	bp->b_blkno = start;
1562 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
1563 
1564 	mutex_enter(&vdp->xdf_dev_lk);
1565 	if (vdp->xdf_xdev_iostat != NULL)
1566 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1567 	if (vdp->xdf_f_act == NULL) {
1568 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
1569 	} else {
1570 		vdp->xdf_l_act->av_forw = bp;
1571 		vdp->xdf_l_act = bp;
1572 	}
1573 	mutex_exit(&vdp->xdf_dev_lk);
1574 	xdf_iostart(vdp);
1575 	err = biowait(bp);
1576 
1577 	ASSERT(bp->b_flags & B_DONE);
1578 
1579 	freerbuf(bp);
1580 	return (err);
1581 }
1582 
1583 /*
1584  * synthetic geometry
1585  */
1586 #define	XDF_NSECTS	256
1587 #define	XDF_NHEADS	16
1588 
1589 static void
1590 xdf_synthetic_pgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1591 {
1592 	xdf_t *vdp;
1593 	uint_t ncyl;
1594 
1595 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1596 
1597 	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1598 
1599 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1600 	geomp->g_acyl = 0;
1601 	geomp->g_nhead = XDF_NHEADS;
1602 	geomp->g_secsize = XB_BSIZE;
1603 	geomp->g_nsect = XDF_NSECTS;
1604 	geomp->g_intrlv = 0;
1605 	geomp->g_rpm = 7200;
1606 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1607 }
1608 
1609 static int
1610 xdf_lb_getcap(dev_info_t *devi, diskaddr_t *capp)
1611 {
1612 	xdf_t *vdp;
1613 
1614 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1615 
1616 	if (vdp == NULL)
1617 		return (ENXIO);
1618 
1619 	mutex_enter(&vdp->xdf_dev_lk);
1620 	*capp = vdp->xdf_pgeom.g_capacity;
1621 	DPRINTF(LBL_DBG, ("capacity %llu\n", *capp));
1622 	mutex_exit(&vdp->xdf_dev_lk);
1623 	return (0);
1624 }
1625 
1626 static int
1627 xdf_lb_getpgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1628 {
1629 	xdf_t *vdp;
1630 
1631 	if ((vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))) == NULL)
1632 		return (ENXIO);
1633 	*geomp = vdp->xdf_pgeom;
1634 	return (0);
1635 }
1636 
1637 /*
1638  * No real HBA, no geometry available from it
1639  */
1640 /*ARGSUSED*/
1641 static int
1642 xdf_lb_getvgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1643 {
1644 	return (EINVAL);
1645 }
1646 
1647 static int
1648 xdf_lb_getattribute(dev_info_t *devi, tg_attribute_t *tgattributep)
1649 {
1650 	xdf_t *vdp;
1651 
1652 	if (!(vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))))
1653 		return (ENXIO);
1654 
1655 	if (XD_IS_RO(vdp))
1656 		tgattributep->media_is_writable = 0;
1657 	else
1658 		tgattributep->media_is_writable = 1;
1659 	return (0);
1660 }
1661 
1662 /* ARGSUSED3 */
1663 int
1664 xdf_lb_getinfo(dev_info_t *devi, int cmd, void *arg, void *tg_cookie)
1665 {
1666 	switch (cmd) {
1667 	case TG_GETPHYGEOM:
1668 		return (xdf_lb_getpgeom(devi, (cmlb_geom_t *)arg));
1669 	case TG_GETVIRTGEOM:
1670 		return (xdf_lb_getvgeom(devi, (cmlb_geom_t *)arg));
1671 	case TG_GETCAPACITY:
1672 		return (xdf_lb_getcap(devi, (diskaddr_t *)arg));
1673 	case TG_GETBLOCKSIZE:
1674 		*(uint32_t *)arg = XB_BSIZE;
1675 		return (0);
1676 	case TG_GETATTR:
1677 		return (xdf_lb_getattribute(devi, (tg_attribute_t *)arg));
1678 	default:
1679 		return (ENOTTY);
1680 	}
1681 }
1682 
1683 /*
1684  * Kick-off connect process
1685  * Status should be XD_UNKNOWN or XD_CLOSED
1686  * On success, status will be changed to XD_INIT
1687  * On error, status won't be changed
1688  */
1689 static int
1690 xdf_start_connect(xdf_t *vdp)
1691 {
1692 	char *xsnode;
1693 	grant_ref_t gref;
1694 	xenbus_transaction_t xbt;
1695 	int rv;
1696 	dev_info_t *dip = vdp->xdf_dip;
1697 
1698 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == (domid_t)-1)
1699 		goto errout;
1700 
1701 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS) {
1702 		cmn_err(CE_WARN, "xdf@%s: failed to alloc event channel",
1703 		    ddi_get_name_addr(dip));
1704 		goto errout;
1705 	}
1706 	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1707 #ifdef XPV_HVM_DRIVER
1708 	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1709 #else /* !XPV_HVM_DRIVER */
1710 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1711 	    DDI_SUCCESS) {
1712 		cmn_err(CE_WARN, "xdf_start_connect: xdf@%s: "
1713 		    "failed to add intr handler", ddi_get_name_addr(dip));
1714 		goto errout1;
1715 	}
1716 #endif /* !XPV_HVM_DRIVER */
1717 
1718 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1719 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1720 	    DDI_SUCCESS) {
1721 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1722 		    ddi_get_name_addr(dip));
1723 		goto errout2;
1724 	}
1725 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1726 
1727 	/*
1728 	 * Write into xenstore the info needed by backend
1729 	 */
1730 	if ((xsnode = xvdi_get_xsname(dip)) == NULL) {
1731 		cmn_err(CE_WARN, "xdf@%s: "
1732 		    "failed to get xenstore node path",
1733 		    ddi_get_name_addr(dip));
1734 		goto fail_trans;
1735 	}
1736 trans_retry:
1737 	if (xenbus_transaction_start(&xbt)) {
1738 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1739 		    ddi_get_name_addr(dip));
1740 		xvdi_fatal_error(dip, EIO, "transaction start");
1741 		goto fail_trans;
1742 	}
1743 
1744 	if (rv = xenbus_printf(xbt, xsnode, "ring-ref", "%u", gref)) {
1745 		cmn_err(CE_WARN, "xdf@%s: failed to write ring-ref",
1746 		    ddi_get_name_addr(dip));
1747 		xvdi_fatal_error(dip, rv, "writing ring-ref");
1748 		goto abort_trans;
1749 	}
1750 
1751 	if (rv = xenbus_printf(xbt, xsnode, "event-channel", "%u",
1752 	    vdp->xdf_evtchn)) {
1753 		cmn_err(CE_WARN, "xdf@%s: failed to write event-channel",
1754 		    ddi_get_name_addr(dip));
1755 		xvdi_fatal_error(dip, rv, "writing event-channel");
1756 		goto abort_trans;
1757 	}
1758 
1759 	/*
1760 	 * "protocol" is written by the domain builder in the case of PV
1761 	 * domains. However, it is not written for HVM domains, so let's
1762 	 * write it here.
1763 	 */
1764 	if (rv = xenbus_printf(xbt, xsnode, "protocol", "%s",
1765 	    XEN_IO_PROTO_ABI_NATIVE)) {
1766 		cmn_err(CE_WARN, "xdf@%s: failed to write protocol",
1767 		    ddi_get_name_addr(dip));
1768 		xvdi_fatal_error(dip, rv, "writing protocol");
1769 		goto abort_trans;
1770 	}
1771 
1772 	if ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0) {
1773 		cmn_err(CE_WARN, "xdf@%s: "
1774 		    "failed to switch state to XenbusStateInitialised",
1775 		    ddi_get_name_addr(dip));
1776 		xvdi_fatal_error(dip, rv, "writing state");
1777 		goto abort_trans;
1778 	}
1779 
1780 	/* kick-off connect process */
1781 	if (rv = xenbus_transaction_end(xbt, 0)) {
1782 		if (rv == EAGAIN)
1783 			goto trans_retry;
1784 		cmn_err(CE_WARN, "xdf@%s: failed to end transaction",
1785 		    ddi_get_name_addr(dip));
1786 		xvdi_fatal_error(dip, rv, "completing transaction");
1787 		goto fail_trans;
1788 	}
1789 
1790 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1791 	mutex_enter(&vdp->xdf_dev_lk);
1792 	vdp->xdf_status = XD_INIT;
1793 	mutex_exit(&vdp->xdf_dev_lk);
1794 
1795 	return (DDI_SUCCESS);
1796 
1797 abort_trans:
1798 	(void) xenbus_transaction_end(xbt, 1);
1799 fail_trans:
1800 	xvdi_free_ring(vdp->xdf_xb_ring);
1801 errout2:
1802 #ifdef XPV_HVM_DRIVER
1803 	ec_unbind_evtchn(vdp->xdf_evtchn);
1804 #else /* !XPV_HVM_DRIVER */
1805 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1806 #endif /* !XPV_HVM_DRIVER */
1807 errout1:
1808 	xvdi_free_evtchn(dip);
1809 errout:
1810 	cmn_err(CE_WARN, "xdf@%s: fail to kick-off connecting",
1811 	    ddi_get_name_addr(dip));
1812 	return (DDI_FAILURE);
1813 }
1814 
1815 /*
1816  * Kick-off disconnect process
1817  * Status won't be changed
1818  */
1819 static int
1820 xdf_start_disconnect(xdf_t *vdp)
1821 {
1822 	if (xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed) > 0) {
1823 		cmn_err(CE_WARN, "xdf@%s: fail to kick-off disconnecting",
1824 		    ddi_get_name_addr(vdp->xdf_dip));
1825 		return (DDI_FAILURE);
1826 	}
1827 
1828 	return (DDI_SUCCESS);
1829 }
1830 
1831 int
1832 xdf_get_flush_block(xdf_t *vdp)
1833 {
1834 	/*
1835 	 * Get a DEV_BSIZE aligned bufer
1836 	 */
1837 	vdp->xdf_flush_mem = kmem_alloc(DEV_BSIZE * 2, KM_SLEEP);
1838 	vdp->xdf_cache_flush_block =
1839 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem), DEV_BSIZE);
1840 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1841 	    xdf_flush_block, DEV_BSIZE, NULL) != 0)
1842 		return (DDI_FAILURE);
1843 	return (DDI_SUCCESS);
1844 }
1845 
1846 /*
1847  * Finish other initialization after we've connected to backend
1848  * Status should be XD_INIT before calling this routine
1849  * On success, status should be changed to XD_READY
1850  * On error, status should stay XD_INIT
1851  */
1852 static int
1853 xdf_post_connect(xdf_t *vdp)
1854 {
1855 	int rv;
1856 	uint_t len;
1857 	char *type;
1858 	char *barrier;
1859 	dev_info_t *devi = vdp->xdf_dip;
1860 
1861 	/*
1862 	 * Determine if feature barrier is supported by backend
1863 	 */
1864 	if (xenbus_read(XBT_NULL, xvdi_get_oename(devi),
1865 	    "feature-barrier", (void **)&barrier, &len) == 0) {
1866 		vdp->xdf_feature_barrier = 1;
1867 		kmem_free(barrier, len);
1868 	} else {
1869 		cmn_err(CE_NOTE, "xdf@%s: failed to read feature-barrier",
1870 		    ddi_get_name_addr(vdp->xdf_dip));
1871 		vdp->xdf_feature_barrier = 0;
1872 	}
1873 
1874 	/* probe backend */
1875 	if (rv = xenbus_gather(XBT_NULL, xvdi_get_oename(devi),
1876 	    "sectors", "%"SCNu64, &vdp->xdf_xdev_nblocks,
1877 	    "info", "%u", &vdp->xdf_xdev_info, NULL)) {
1878 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1879 		    "cannot read backend info", ddi_get_name_addr(devi));
1880 		xvdi_fatal_error(devi, rv, "reading backend info");
1881 		return (DDI_FAILURE);
1882 	}
1883 
1884 	/*
1885 	 * Make sure that the device we're connecting isn't smaller than
1886 	 * the old connected device.
1887 	 */
1888 	if (vdp->xdf_xdev_nblocks < vdp->xdf_pgeom.g_capacity) {
1889 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1890 		    "backend disk device shrank", ddi_get_name_addr(devi));
1891 		/* XXX:  call xvdi_fatal_error() here? */
1892 		xvdi_fatal_error(devi, rv, "reading backend info");
1893 		return (DDI_FAILURE);
1894 	}
1895 
1896 	/*
1897 	 * Only update the physical geometry to reflect the new device
1898 	 * size if this is the first time we're connecting to the backend
1899 	 * device.  Once we assign a physical geometry to a device it stays
1900 	 * fixed until:
1901 	 *	- we get detach and re-attached (at which point we
1902 	 *	  automatically assign a new physical geometry).
1903 	 *	- someone calls TG_SETPHYGEOM to explicity set the
1904 	 *	  physical geometry.
1905 	 */
1906 	if (vdp->xdf_pgeom.g_capacity == 0)
1907 		xdf_synthetic_pgeom(devi, &vdp->xdf_pgeom);
1908 
1909 	/* fix disk type */
1910 	if (xenbus_read(XBT_NULL, xvdi_get_xsname(devi), "device-type",
1911 	    (void **)&type, &len) != 0) {
1912 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1913 		    "cannot read device-type", ddi_get_name_addr(devi));
1914 		xvdi_fatal_error(devi, rv, "reading device-type");
1915 		return (DDI_FAILURE);
1916 	}
1917 	if (strcmp(type, "cdrom") == 0)
1918 		vdp->xdf_xdev_info |= VDISK_CDROM;
1919 	kmem_free(type, len);
1920 
1921 	/*
1922 	 * We've created all the minor nodes via cmlb_attach() using default
1923 	 * value in xdf_attach() to make it possible to block in xdf_open(),
1924 	 * in case there's anyone (say, booting thread) ever trying to open
1925 	 * it before connected to backend. We will refresh all those minor
1926 	 * nodes w/ latest info we've got now when we are almost connected.
1927 	 *
1928 	 * Don't do this when xdf is already opened by someone (could happen
1929 	 * during resume), for that cmlb_attach() will invalid the label info
1930 	 * and confuse those who has already opened the node, which is bad.
1931 	 */
1932 	if (!xdf_isopen(vdp, -1) && (XD_IS_CD(vdp) || XD_IS_RM(vdp))) {
1933 		/* re-init cmlb w/ latest info we got from backend */
1934 		if (cmlb_attach(devi, &xdf_lb_ops,
1935 		    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
1936 		    XD_IS_RM(vdp), 1,
1937 		    XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
1938 #if defined(XPV_HVM_DRIVER)
1939 		    CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT |
1940 		    CMLB_INTERNAL_MINOR_NODES,
1941 #else /* !XPV_HVM_DRIVER */
1942 		    CMLB_FAKE_LABEL_ONE_PARTITION,
1943 #endif /* !XPV_HVM_DRIVER */
1944 		    vdp->xdf_vd_lbl, NULL) != 0) {
1945 			cmn_err(CE_WARN, "xdf@%s: cmlb attach failed",
1946 			    ddi_get_name_addr(devi));
1947 			return (DDI_FAILURE);
1948 		}
1949 	}
1950 
1951 	/* mark vbd is ready for I/O */
1952 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1953 	mutex_enter(&vdp->xdf_dev_lk);
1954 	vdp->xdf_status = XD_READY;
1955 	mutex_exit(&vdp->xdf_dev_lk);
1956 	/*
1957 	 * If backend has feature-barrier, see if it supports disk
1958 	 * cache flush op.
1959 	 */
1960 	vdp->xdf_flush_supported = 0;
1961 	if (vdp->xdf_feature_barrier) {
1962 		/*
1963 		 * Pretend we already know flush is supported so probe
1964 		 * will attempt the correct op.
1965 		 */
1966 		vdp->xdf_flush_supported = 1;
1967 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1968 			vdp->xdf_flush_supported = 1;
1969 		} else {
1970 			vdp->xdf_flush_supported = 0;
1971 			/*
1972 			 * If the other end does not support the cache flush op
1973 			 * then we must use a barrier-write to force disk
1974 			 * cache flushing.  Barrier writes require that a data
1975 			 * block actually be written.
1976 			 * Cache a block to barrier-write when we are
1977 			 * asked to perform a flush.
1978 			 * XXX - would it be better to just copy 1 block
1979 			 * (512 bytes) from whatever write we did last
1980 			 * and rewrite that block?
1981 			 */
1982 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS)
1983 				return (DDI_FAILURE);
1984 		}
1985 	}
1986 
1987 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", ddi_get_name_addr(devi),
1988 	    (uint64_t)vdp->xdf_xdev_nblocks);
1989 
1990 	return (DDI_SUCCESS);
1991 }
1992 
1993 /*
1994  * Finish other uninitialization after we've disconnected from backend
1995  * when status is XD_CLOSING or XD_INIT. After returns, status is XD_CLOSED
1996  */
1997 static void
1998 xdf_post_disconnect(xdf_t *vdp)
1999 {
2000 #ifdef XPV_HVM_DRIVER
2001 	ec_unbind_evtchn(vdp->xdf_evtchn);
2002 #else /* !XPV_HVM_DRIVER */
2003 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
2004 #endif /* !XPV_HVM_DRIVER */
2005 	xvdi_free_evtchn(vdp->xdf_dip);
2006 	xvdi_free_ring(vdp->xdf_xb_ring);
2007 	vdp->xdf_xb_ring = NULL;
2008 	vdp->xdf_xb_ring_hdl = NULL;
2009 	vdp->xdf_peer = (domid_t)-1;
2010 
2011 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
2012 	mutex_enter(&vdp->xdf_dev_lk);
2013 	vdp->xdf_status = XD_CLOSED;
2014 	mutex_exit(&vdp->xdf_dev_lk);
2015 }
2016 
2017 /*ARGSUSED*/
2018 static void
2019 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
2020 {
2021 	XenbusState new_state = *(XenbusState *)impl_data;
2022 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
2023 	boolean_t unexpect_die = B_FALSE;
2024 	int status;
2025 
2026 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
2027 	    ddi_get_name_addr(dip), new_state));
2028 
2029 	mutex_enter(&vdp->xdf_cb_lk);
2030 
2031 	if (xdf_check_state_transition(vdp, new_state) == DDI_FAILURE) {
2032 		mutex_exit(&vdp->xdf_cb_lk);
2033 		return;
2034 	}
2035 
2036 	switch (new_state) {
2037 	case XenbusStateInitialising:
2038 		ASSERT(vdp->xdf_status == XD_CLOSED);
2039 		/*
2040 		 * backend recovered from a previous failure,
2041 		 * kick-off connect process again
2042 		 */
2043 		if (xdf_start_connect(vdp) != DDI_SUCCESS) {
2044 			cmn_err(CE_WARN, "xdf@%s:"
2045 			    " failed to start reconnecting to backend",
2046 			    ddi_get_name_addr(dip));
2047 		}
2048 		break;
2049 	case XenbusStateConnected:
2050 		ASSERT(vdp->xdf_status == XD_INIT);
2051 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
2052 		/* finish final init after connect */
2053 		if (xdf_post_connect(vdp) != DDI_SUCCESS)
2054 			(void) xdf_start_disconnect(vdp);
2055 		break;
2056 	case XenbusStateClosing:
2057 		if (vdp->xdf_status == XD_READY) {
2058 			mutex_enter(&vdp->xdf_dev_lk);
2059 			if (xdf_isopen(vdp, -1)) {
2060 				cmn_err(CE_NOTE, "xdf@%s: hot-unplug failed, "
2061 				    "still in use", ddi_get_name_addr(dip));
2062 				mutex_exit(&vdp->xdf_dev_lk);
2063 				break;
2064 			} else {
2065 				vdp->xdf_status = XD_CLOSING;
2066 			}
2067 			mutex_exit(&vdp->xdf_dev_lk);
2068 		}
2069 		(void) xdf_start_disconnect(vdp);
2070 		break;
2071 	case XenbusStateClosed:
2072 		/* first check if BE closed unexpectedly */
2073 		mutex_enter(&vdp->xdf_dev_lk);
2074 		if (xdf_isopen(vdp, -1)) {
2075 			unexpect_die = B_TRUE;
2076 			unexpectedie(vdp);
2077 			cmn_err(CE_WARN, "xdf@%s: backend closed, "
2078 			    "reconnecting...", ddi_get_name_addr(dip));
2079 		}
2080 		mutex_exit(&vdp->xdf_dev_lk);
2081 
2082 		if (vdp->xdf_status == XD_READY) {
2083 			mutex_enter(&vdp->xdf_dev_lk);
2084 			vdp->xdf_status = XD_CLOSING;
2085 			mutex_exit(&vdp->xdf_dev_lk);
2086 
2087 #ifdef	DOMU_BACKEND
2088 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
2089 #endif
2090 
2091 			xdf_post_disconnect(vdp);
2092 			(void) xvdi_switch_state(dip, XBT_NULL,
2093 			    XenbusStateClosed);
2094 		} else if ((vdp->xdf_status == XD_INIT) ||
2095 		    (vdp->xdf_status == XD_CLOSING)) {
2096 			xdf_post_disconnect(vdp);
2097 		} else {
2098 			mutex_enter(&vdp->xdf_dev_lk);
2099 			vdp->xdf_status = XD_CLOSED;
2100 			mutex_exit(&vdp->xdf_dev_lk);
2101 		}
2102 	}
2103 
2104 	/* notify anybody waiting for oe state change */
2105 	mutex_enter(&vdp->xdf_dev_lk);
2106 	cv_broadcast(&vdp->xdf_dev_cv);
2107 	mutex_exit(&vdp->xdf_dev_lk);
2108 
2109 	status = vdp->xdf_status;
2110 	mutex_exit(&vdp->xdf_cb_lk);
2111 
2112 	if (status == XD_READY) {
2113 		xdf_iostart(vdp);
2114 	} else if ((status == XD_CLOSED) && !unexpect_die) {
2115 		/* interface is closed successfully, remove all minor nodes */
2116 		if (vdp->xdf_vd_lbl != NULL) {
2117 			cmlb_detach(vdp->xdf_vd_lbl, NULL);
2118 			cmlb_free_handle(&vdp->xdf_vd_lbl);
2119 			vdp->xdf_vd_lbl = NULL;
2120 		}
2121 	}
2122 }
2123 
2124 /* check if partition is open, -1 - check all partitions on the disk */
2125 static boolean_t
2126 xdf_isopen(xdf_t *vdp, int partition)
2127 {
2128 	int i;
2129 	ulong_t parbit;
2130 	boolean_t rval = B_FALSE;
2131 
2132 	ASSERT((partition == -1) ||
2133 	    ((partition >= 0) || (partition < XDF_PEXT)));
2134 
2135 	if (partition == -1)
2136 		parbit = (ulong_t)-1;
2137 	else
2138 		parbit = 1 << partition;
2139 
2140 	for (i = 0; i < OTYPCNT; i++) {
2141 		if (vdp->xdf_vd_open[i] & parbit)
2142 			rval = B_TRUE;
2143 	}
2144 
2145 	return (rval);
2146 }
2147 
2148 /*
2149  * Xdf_check_state_transition will check the XenbusState change to see
2150  * if the change is a valid transition or not.
2151  * The new state is written by backend domain, or by running xenstore-write
2152  * to change it manually in dom0
2153  */
2154 static int
2155 xdf_check_state_transition(xdf_t *vdp, XenbusState oestate)
2156 {
2157 	int status;
2158 	int stcheck;
2159 #define	STOK	0 /* need further process */
2160 #define	STNOP	1 /* no action need taking */
2161 #define	STBUG	2 /* unexpected state change, could be a bug */
2162 
2163 	status = vdp->xdf_status;
2164 	stcheck = STOK;
2165 
2166 	switch (status) {
2167 	case XD_UNKNOWN:
2168 		if ((oestate == XenbusStateUnknown)		||
2169 		    (oestate == XenbusStateConnected))
2170 			stcheck = STBUG;
2171 		else if ((oestate == XenbusStateInitialising)	||
2172 		    (oestate == XenbusStateInitWait)		||
2173 		    (oestate == XenbusStateInitialised))
2174 			stcheck = STNOP;
2175 		break;
2176 	case XD_INIT:
2177 		if (oestate == XenbusStateUnknown)
2178 			stcheck = STBUG;
2179 		else if ((oestate == XenbusStateInitialising)	||
2180 		    (oestate == XenbusStateInitWait)		||
2181 		    (oestate == XenbusStateInitialised))
2182 			stcheck = STNOP;
2183 		break;
2184 	case XD_READY:
2185 		if ((oestate == XenbusStateUnknown)		||
2186 		    (oestate == XenbusStateInitialising)	||
2187 		    (oestate == XenbusStateInitWait)		||
2188 		    (oestate == XenbusStateInitialised))
2189 			stcheck = STBUG;
2190 		else if (oestate == XenbusStateConnected)
2191 			stcheck = STNOP;
2192 		break;
2193 	case XD_CLOSING:
2194 		if ((oestate == XenbusStateUnknown)		||
2195 		    (oestate == XenbusStateInitialising)	||
2196 		    (oestate == XenbusStateInitWait)		||
2197 		    (oestate == XenbusStateInitialised)		||
2198 		    (oestate == XenbusStateConnected))
2199 			stcheck = STBUG;
2200 		else if (oestate == XenbusStateClosing)
2201 			stcheck = STNOP;
2202 		break;
2203 	case XD_CLOSED:
2204 		if ((oestate == XenbusStateUnknown)		||
2205 		    (oestate == XenbusStateConnected))
2206 			stcheck = STBUG;
2207 		else if ((oestate == XenbusStateInitWait)	||
2208 		    (oestate == XenbusStateInitialised)		||
2209 		    (oestate == XenbusStateClosing)		||
2210 		    (oestate == XenbusStateClosed))
2211 			stcheck = STNOP;
2212 		break;
2213 	case XD_SUSPEND:
2214 	default:
2215 			stcheck = STBUG;
2216 	}
2217 
2218 	if (stcheck == STOK)
2219 		return (DDI_SUCCESS);
2220 
2221 	if (stcheck == STBUG)
2222 		cmn_err(CE_NOTE, "xdf@%s: unexpected otherend "
2223 		    "state change to %d!, when status is %d",
2224 		    ddi_get_name_addr(vdp->xdf_dip), oestate, status);
2225 
2226 	return (DDI_FAILURE);
2227 }
2228 
2229 static int
2230 xdf_connect(xdf_t *vdp, boolean_t wait)
2231 {
2232 	ASSERT(mutex_owned(&vdp->xdf_dev_lk));
2233 	while (vdp->xdf_status != XD_READY) {
2234 		if (!wait || (vdp->xdf_status > XD_READY))
2235 			break;
2236 
2237 		if (cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk) == 0)
2238 			break;
2239 	}
2240 
2241 	return (vdp->xdf_status);
2242 }
2243 
2244 /*
2245  * callback func when DMA/GTE resources is available
2246  *
2247  * Note: we only register one callback function to grant table subsystem
2248  * since we only have one 'struct gnttab_free_callback' in xdf_t.
2249  */
2250 static int
2251 xdf_dmacallback(caddr_t arg)
2252 {
2253 	xdf_t *vdp = (xdf_t *)arg;
2254 	ASSERT(vdp != NULL);
2255 
2256 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
2257 	    ddi_get_name_addr(vdp->xdf_dip)));
2258 
2259 	ddi_trigger_softintr(vdp->xdf_softintr_id);
2260 	return (DDI_DMA_CALLBACK_DONE);
2261 }
2262 
2263 static uint_t
2264 xdf_iorestart(caddr_t arg)
2265 {
2266 	xdf_t *vdp = (xdf_t *)arg;
2267 
2268 	ASSERT(vdp != NULL);
2269 
2270 	mutex_enter(&vdp->xdf_dev_lk);
2271 	ASSERT(ISDMACBON(vdp));
2272 	SETDMACBOFF(vdp);
2273 	mutex_exit(&vdp->xdf_dev_lk);
2274 
2275 	xdf_iostart(vdp);
2276 
2277 	return (DDI_INTR_CLAIMED);
2278 }
2279 
2280 static void
2281 xdf_timeout_handler(void *arg)
2282 {
2283 	xdf_t *vdp = arg;
2284 
2285 	mutex_enter(&vdp->xdf_dev_lk);
2286 	vdp->xdf_timeout_id = 0;
2287 	mutex_exit(&vdp->xdf_dev_lk);
2288 
2289 	/* new timeout thread could be re-scheduled */
2290 	xdf_iostart(vdp);
2291 }
2292 
2293 /*
2294  * Alloc a vreq for this bp
2295  * bp->av_back contains the pointer to the vreq upon return
2296  */
2297 static v_req_t *
2298 vreq_get(xdf_t *vdp, buf_t *bp)
2299 {
2300 	v_req_t *vreq = NULL;
2301 
2302 	ASSERT(BP2VREQ(bp) == NULL);
2303 
2304 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
2305 	if (vreq == NULL) {
2306 		if (vdp->xdf_timeout_id == 0)
2307 			/* restart I/O after one second */
2308 			vdp->xdf_timeout_id =
2309 			    timeout(xdf_timeout_handler, vdp, hz);
2310 		return (NULL);
2311 	}
2312 	bzero(vreq, sizeof (v_req_t));
2313 
2314 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
2315 	bp->av_back = (buf_t *)vreq;
2316 	vreq->v_buf = bp;
2317 	vreq->v_status = VREQ_INIT;
2318 	/* init of other fields in vreq is up to the caller */
2319 
2320 	return (vreq);
2321 }
2322 
2323 static void
2324 vreq_free(xdf_t *vdp, v_req_t *vreq)
2325 {
2326 	buf_t *bp = vreq->v_buf;
2327 
2328 	list_remove(&vdp->xdf_vreq_act, (void *)vreq);
2329 
2330 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
2331 		goto done;
2332 
2333 	switch (vreq->v_status) {
2334 	case VREQ_DMAWIN_DONE:
2335 	case VREQ_GS_ALLOCED:
2336 	case VREQ_DMABUF_BOUND:
2337 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
2338 		/*FALLTHRU*/
2339 	case VREQ_DMAMEM_ALLOCED:
2340 		if (!ALIGNED_XFER(bp)) {
2341 			ASSERT(vreq->v_abuf != NULL);
2342 			if (!IS_ERROR(bp) && IS_READ(bp))
2343 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
2344 				    bp->b_bcount);
2345 			ddi_dma_mem_free(&vreq->v_align);
2346 		}
2347 		/*FALLTHRU*/
2348 	case VREQ_MEMDMAHDL_ALLOCED:
2349 		if (!ALIGNED_XFER(bp))
2350 			ddi_dma_free_handle(&vreq->v_memdmahdl);
2351 		/*FALLTHRU*/
2352 	case VREQ_DMAHDL_ALLOCED:
2353 		ddi_dma_free_handle(&vreq->v_dmahdl);
2354 		break;
2355 	default:
2356 		break;
2357 	}
2358 done:
2359 	vreq->v_buf->av_back = NULL;
2360 	kmem_cache_free(xdf_vreq_cache, vreq);
2361 }
2362 
2363 /*
2364  * Initalize the DMA and grant table resources for the buf
2365  */
2366 static int
2367 vreq_setup(xdf_t *vdp, v_req_t *vreq)
2368 {
2369 	int rc;
2370 	ddi_dma_attr_t dmaattr;
2371 	uint_t ndcs, ndws;
2372 	ddi_dma_handle_t dh;
2373 	ddi_dma_handle_t mdh;
2374 	ddi_dma_cookie_t dc;
2375 	ddi_acc_handle_t abh;
2376 	caddr_t	aba;
2377 	ge_slot_t *gs;
2378 	size_t bufsz;
2379 	off_t off;
2380 	size_t sz;
2381 	buf_t *bp = vreq->v_buf;
2382 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
2383 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
2384 
2385 	switch (vreq->v_status) {
2386 	case VREQ_INIT:
2387 		if (IS_FLUSH_DISKCACHE(bp)) {
2388 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2389 				DPRINTF(DMA_DBG, (
2390 				    "xdf@%s: get ge_slotfailed\n",
2391 				    ddi_get_name_addr(vdp->xdf_dip)));
2392 				return (DDI_FAILURE);
2393 			}
2394 			vreq->v_blkno = 0;
2395 			vreq->v_nslots = 1;
2396 			vreq->v_gs = gs;
2397 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
2398 			vreq->v_status = VREQ_GS_ALLOCED;
2399 			gs->vreq = vreq;
2400 			return (DDI_SUCCESS);
2401 		}
2402 
2403 		if (IS_WRITE_BARRIER(vdp, bp))
2404 			vreq->v_flush_diskcache = WRITE_BARRIER;
2405 		vreq->v_blkno = bp->b_blkno +
2406 		    (diskaddr_t)(uintptr_t)bp->b_private;
2407 		bp->b_private = NULL;
2408 		/* See if we wrote new data to our flush block */
2409 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
2410 			check_fbwrite(vdp, bp, vreq->v_blkno);
2411 		vreq->v_status = VREQ_INIT_DONE;
2412 		/*FALLTHRU*/
2413 
2414 	case VREQ_INIT_DONE:
2415 		/*
2416 		 * alloc DMA handle
2417 		 */
2418 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
2419 		    xdf_dmacallback, (caddr_t)vdp, &dh);
2420 		if (rc != DDI_SUCCESS) {
2421 			SETDMACBON(vdp);
2422 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
2423 			    ddi_get_name_addr(vdp->xdf_dip)));
2424 			return (DDI_FAILURE);
2425 		}
2426 
2427 		vreq->v_dmahdl = dh;
2428 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
2429 		/*FALLTHRU*/
2430 
2431 	case VREQ_DMAHDL_ALLOCED:
2432 		/*
2433 		 * alloc dma handle for 512-byte aligned buf
2434 		 */
2435 		if (!ALIGNED_XFER(bp)) {
2436 			/*
2437 			 * XXPV: we need to temporarily enlarge the seg
2438 			 * boundary and s/g length to work round CR6381968
2439 			 */
2440 			dmaattr = xb_dma_attr;
2441 			dmaattr.dma_attr_seg = (uint64_t)-1;
2442 			dmaattr.dma_attr_sgllen = INT_MAX;
2443 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
2444 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
2445 			if (rc != DDI_SUCCESS) {
2446 				SETDMACBON(vdp);
2447 				DPRINTF(DMA_DBG, ("xdf@%s: unaligned buf DMA"
2448 				    "handle alloc failed\n",
2449 				    ddi_get_name_addr(vdp->xdf_dip)));
2450 				return (DDI_FAILURE);
2451 			}
2452 			vreq->v_memdmahdl = mdh;
2453 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
2454 		}
2455 		/*FALLTHRU*/
2456 
2457 	case VREQ_MEMDMAHDL_ALLOCED:
2458 		/*
2459 		 * alloc 512-byte aligned buf
2460 		 */
2461 		if (!ALIGNED_XFER(bp)) {
2462 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
2463 				bp_mapin(bp);
2464 
2465 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
2466 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
2467 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
2468 			    &aba, &bufsz, &abh);
2469 			if (rc != DDI_SUCCESS) {
2470 				SETDMACBON(vdp);
2471 				DPRINTF(DMA_DBG, (
2472 				    "xdf@%s: DMA mem allocation failed\n",
2473 				    ddi_get_name_addr(vdp->xdf_dip)));
2474 				return (DDI_FAILURE);
2475 			}
2476 
2477 			vreq->v_abuf = aba;
2478 			vreq->v_align = abh;
2479 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
2480 
2481 			ASSERT(bufsz >= bp->b_bcount);
2482 			if (!IS_READ(bp))
2483 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
2484 				    bp->b_bcount);
2485 		}
2486 		/*FALLTHRU*/
2487 
2488 	case VREQ_DMAMEM_ALLOCED:
2489 		/*
2490 		 * dma bind
2491 		 */
2492 		if (ALIGNED_XFER(bp)) {
2493 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
2494 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
2495 			    &dc, &ndcs);
2496 		} else {
2497 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
2498 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
2499 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
2500 		}
2501 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
2502 			/* get num of dma windows */
2503 			if (rc == DDI_DMA_PARTIAL_MAP) {
2504 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
2505 				ASSERT(rc == DDI_SUCCESS);
2506 			} else {
2507 				ndws = 1;
2508 			}
2509 		} else {
2510 			SETDMACBON(vdp);
2511 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
2512 			    ddi_get_name_addr(vdp->xdf_dip)));
2513 			return (DDI_FAILURE);
2514 		}
2515 
2516 		vreq->v_dmac = dc;
2517 		vreq->v_dmaw = 0;
2518 		vreq->v_ndmacs = ndcs;
2519 		vreq->v_ndmaws = ndws;
2520 		vreq->v_nslots = ndws;
2521 		vreq->v_status = VREQ_DMABUF_BOUND;
2522 		/*FALLTHRU*/
2523 
2524 	case VREQ_DMABUF_BOUND:
2525 		/*
2526 		 * get ge_slot, callback is set upon failure from gs_get(),
2527 		 * if not set previously
2528 		 */
2529 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2530 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2531 			    ddi_get_name_addr(vdp->xdf_dip)));
2532 			return (DDI_FAILURE);
2533 		}
2534 
2535 		vreq->v_gs = gs;
2536 		gs->vreq = vreq;
2537 		vreq->v_status = VREQ_GS_ALLOCED;
2538 		break;
2539 
2540 	case VREQ_GS_ALLOCED:
2541 		/* nothing need to be done */
2542 		break;
2543 
2544 	case VREQ_DMAWIN_DONE:
2545 		/*
2546 		 * move to the next dma window
2547 		 */
2548 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
2549 
2550 		/* get a ge_slot for this DMA window */
2551 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2552 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2553 			    ddi_get_name_addr(vdp->xdf_dip)));
2554 			return (DDI_FAILURE);
2555 		}
2556 
2557 		vreq->v_gs = gs;
2558 		gs->vreq = vreq;
2559 		vreq->v_dmaw++;
2560 		rc = ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
2561 		    &vreq->v_dmac, &vreq->v_ndmacs);
2562 		ASSERT(rc == DDI_SUCCESS);
2563 		vreq->v_status = VREQ_GS_ALLOCED;
2564 		break;
2565 
2566 	default:
2567 		return (DDI_FAILURE);
2568 	}
2569 
2570 	return (DDI_SUCCESS);
2571 }
2572 
2573 static ge_slot_t *
2574 gs_get(xdf_t *vdp, int isread)
2575 {
2576 	grant_ref_t gh;
2577 	ge_slot_t *gs;
2578 
2579 	/* try to alloc GTEs needed in this slot, first */
2580 	if (gnttab_alloc_grant_references(
2581 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
2582 		if (vdp->xdf_gnt_callback.next == NULL) {
2583 			SETDMACBON(vdp);
2584 			gnttab_request_free_callback(
2585 			    &vdp->xdf_gnt_callback,
2586 			    (void (*)(void *))xdf_dmacallback,
2587 			    (void *)vdp,
2588 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
2589 		}
2590 		return (NULL);
2591 	}
2592 
2593 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
2594 	if (gs == NULL) {
2595 		gnttab_free_grant_references(gh);
2596 		if (vdp->xdf_timeout_id == 0)
2597 			/* restart I/O after one second */
2598 			vdp->xdf_timeout_id =
2599 			    timeout(xdf_timeout_handler, vdp, hz);
2600 		return (NULL);
2601 	}
2602 
2603 	/* init gs_slot */
2604 	list_insert_head(&vdp->xdf_gs_act, (void *)gs);
2605 	gs->oeid = vdp->xdf_peer;
2606 	gs->isread = isread;
2607 	gs->ghead = gh;
2608 	gs->ngrefs = 0;
2609 
2610 	return (gs);
2611 }
2612 
2613 static void
2614 gs_free(xdf_t *vdp, ge_slot_t *gs)
2615 {
2616 	int i;
2617 	grant_ref_t *gp = gs->ge;
2618 	int ngrefs = gs->ngrefs;
2619 	boolean_t isread = gs->isread;
2620 
2621 	list_remove(&vdp->xdf_gs_act, (void *)gs);
2622 
2623 	/* release all grant table entry resources used in this slot */
2624 	for (i = 0; i < ngrefs; i++, gp++)
2625 		gnttab_end_foreign_access(*gp, !isread, 0);
2626 	gnttab_free_grant_references(gs->ghead);
2627 
2628 	kmem_cache_free(xdf_gs_cache, (void *)gs);
2629 }
2630 
2631 static grant_ref_t
2632 gs_grant(ge_slot_t *gs, mfn_t mfn)
2633 {
2634 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->ghead);
2635 
2636 	ASSERT(gr != -1);
2637 	ASSERT(gs->ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
2638 	gs->ge[gs->ngrefs++] = gr;
2639 	gnttab_grant_foreign_access_ref(gr, gs->oeid, mfn, !gs->isread);
2640 
2641 	return (gr);
2642 }
2643 
2644 static void
2645 unexpectedie(xdf_t *vdp)
2646 {
2647 	/* clean up I/Os in ring that have responses */
2648 	if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
2649 		mutex_exit(&vdp->xdf_dev_lk);
2650 		(void) xdf_intr((caddr_t)vdp);
2651 		mutex_enter(&vdp->xdf_dev_lk);
2652 	}
2653 
2654 	/* free up all grant table entries */
2655 	while (!list_is_empty(&vdp->xdf_gs_act))
2656 		gs_free(vdp, list_head(&vdp->xdf_gs_act));
2657 
2658 	/*
2659 	 * move bp back to active list orderly
2660 	 * vreq_busy is updated in vreq_free()
2661 	 */
2662 	while (!list_is_empty(&vdp->xdf_vreq_act)) {
2663 		v_req_t *vreq = list_head(&vdp->xdf_vreq_act);
2664 		buf_t *bp = vreq->v_buf;
2665 
2666 		bp->av_back = NULL;
2667 		bp->b_resid = bp->b_bcount;
2668 		if (vdp->xdf_f_act == NULL) {
2669 			vdp->xdf_f_act = vdp->xdf_l_act = bp;
2670 		} else {
2671 			/* move to the head of list */
2672 			bp->av_forw = vdp->xdf_f_act;
2673 			vdp->xdf_f_act = bp;
2674 		}
2675 		if (vdp->xdf_xdev_iostat != NULL)
2676 			kstat_runq_back_to_waitq(
2677 			    KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
2678 		vreq_free(vdp, vreq);
2679 	}
2680 }
2681 
2682 static void
2683 xdfmin(struct buf *bp)
2684 {
2685 	if (bp->b_bcount > xdf_maxphys)
2686 		bp->b_bcount = xdf_maxphys;
2687 }
2688 
2689 void
2690 xdf_kstat_delete(dev_info_t *dip)
2691 {
2692 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2693 	kstat_t	*kstat;
2694 
2695 	/*
2696 	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
2697 	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
2698 	 * and the contents of the our kstat.  xdf_iostat_lk is used
2699 	 * to protect the allocation and freeing of the actual kstat.
2700 	 * xdf_dev_lk can't be used for this purpose because kstat
2701 	 * readers use it to access the contents of the kstat and
2702 	 * hence it can't be held when calling kstat_delete().
2703 	 */
2704 	mutex_enter(&vdp->xdf_iostat_lk);
2705 	mutex_enter(&vdp->xdf_dev_lk);
2706 
2707 	if (vdp->xdf_xdev_iostat == NULL) {
2708 		mutex_exit(&vdp->xdf_dev_lk);
2709 		mutex_exit(&vdp->xdf_iostat_lk);
2710 		return;
2711 	}
2712 
2713 	kstat = vdp->xdf_xdev_iostat;
2714 	vdp->xdf_xdev_iostat = NULL;
2715 	mutex_exit(&vdp->xdf_dev_lk);
2716 
2717 	kstat_delete(kstat);
2718 	mutex_exit(&vdp->xdf_iostat_lk);
2719 }
2720 
2721 int
2722 xdf_kstat_create(dev_info_t *dip, char *ks_module, int ks_instance)
2723 {
2724 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2725 
2726 	/* See comment about locking in xdf_kstat_delete(). */
2727 	mutex_enter(&vdp->xdf_iostat_lk);
2728 	mutex_enter(&vdp->xdf_dev_lk);
2729 
2730 	if (vdp->xdf_xdev_iostat != NULL) {
2731 		mutex_exit(&vdp->xdf_dev_lk);
2732 		mutex_exit(&vdp->xdf_iostat_lk);
2733 		return (-1);
2734 	}
2735 
2736 	if ((vdp->xdf_xdev_iostat = kstat_create(
2737 	    ks_module, ks_instance, NULL, "disk",
2738 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
2739 		mutex_exit(&vdp->xdf_dev_lk);
2740 		mutex_exit(&vdp->xdf_iostat_lk);
2741 		return (-1);
2742 	}
2743 
2744 	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
2745 	kstat_install(vdp->xdf_xdev_iostat);
2746 	mutex_exit(&vdp->xdf_dev_lk);
2747 	mutex_exit(&vdp->xdf_iostat_lk);
2748 
2749 	return (0);
2750 }
2751 
2752 #if defined(XPV_HVM_DRIVER)
2753 
2754 typedef struct xdf_hvm_entry {
2755 	list_node_t	xdf_he_list;
2756 	char		*xdf_he_path;
2757 	dev_info_t	*xdf_he_dip;
2758 } xdf_hvm_entry_t;
2759 
2760 static list_t xdf_hvm_list;
2761 static kmutex_t xdf_hvm_list_lock;
2762 
2763 static xdf_hvm_entry_t *
2764 i_xdf_hvm_find(char *path, dev_info_t *dip)
2765 {
2766 	xdf_hvm_entry_t	*i;
2767 
2768 	ASSERT((path != NULL) || (dip != NULL));
2769 	ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2770 
2771 	i = list_head(&xdf_hvm_list);
2772 	while (i != NULL) {
2773 		if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2774 			i = list_next(&xdf_hvm_list, i);
2775 			continue;
2776 		}
2777 		if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2778 			i = list_next(&xdf_hvm_list, i);
2779 			continue;
2780 		}
2781 		break;
2782 	}
2783 	return (i);
2784 }
2785 
2786 dev_info_t *
2787 xdf_hvm_hold(char *path)
2788 {
2789 	xdf_hvm_entry_t	*i;
2790 	dev_info_t	*dip;
2791 
2792 	mutex_enter(&xdf_hvm_list_lock);
2793 	i = i_xdf_hvm_find(path, NULL);
2794 	if (i == NULL) {
2795 		mutex_exit(&xdf_hvm_list_lock);
2796 		return (B_FALSE);
2797 	}
2798 	ndi_hold_devi(dip = i->xdf_he_dip);
2799 	mutex_exit(&xdf_hvm_list_lock);
2800 	return (dip);
2801 }
2802 
2803 static void
2804 xdf_hvm_add(dev_info_t *dip)
2805 {
2806 	xdf_hvm_entry_t	*i;
2807 	char		*path;
2808 
2809 	/* figure out the path for the dip */
2810 	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2811 	(void) ddi_pathname(dip, path);
2812 
2813 	i = kmem_alloc(sizeof (*i), KM_SLEEP);
2814 	i->xdf_he_dip = dip;
2815 	i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2816 
2817 	mutex_enter(&xdf_hvm_list_lock);
2818 	ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2819 	ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2820 	list_insert_head(&xdf_hvm_list, i);
2821 	mutex_exit(&xdf_hvm_list_lock);
2822 
2823 	kmem_free(path, MAXPATHLEN);
2824 }
2825 
2826 static void
2827 xdf_hvm_rm(dev_info_t *dip)
2828 {
2829 	xdf_hvm_entry_t	*i;
2830 
2831 	mutex_enter(&xdf_hvm_list_lock);
2832 	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2833 	list_remove(&xdf_hvm_list, i);
2834 	mutex_exit(&xdf_hvm_list_lock);
2835 
2836 	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2837 	kmem_free(i, sizeof (*i));
2838 }
2839 
2840 static void
2841 xdf_hvm_init(void)
2842 {
2843 	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2844 	    offsetof(xdf_hvm_entry_t, xdf_he_list));
2845 	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2846 }
2847 
2848 static void
2849 xdf_hvm_fini(void)
2850 {
2851 	ASSERT(list_head(&xdf_hvm_list) == NULL);
2852 	list_destroy(&xdf_hvm_list);
2853 	mutex_destroy(&xdf_hvm_list_lock);
2854 }
2855 
2856 int
2857 xdf_hvm_connect(dev_info_t *dip)
2858 {
2859 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2860 	int	rv;
2861 
2862 	/* do cv_wait until connected or failed */
2863 	mutex_enter(&vdp->xdf_dev_lk);
2864 	rv = xdf_connect(vdp, B_TRUE);
2865 	mutex_exit(&vdp->xdf_dev_lk);
2866 	return ((rv == XD_READY) ? 0 : -1);
2867 }
2868 
2869 int
2870 xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2871 {
2872 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2873 
2874 	/* sanity check the requested physical geometry */
2875 	mutex_enter(&vdp->xdf_dev_lk);
2876 	if ((geomp->g_secsize != XB_BSIZE) ||
2877 	    (geomp->g_capacity == 0)) {
2878 		mutex_exit(&vdp->xdf_dev_lk);
2879 		return (EINVAL);
2880 	}
2881 
2882 	/*
2883 	 * If we've already connected to the backend device then make sure
2884 	 * we're not defining a physical geometry larger than our backend
2885 	 * device.
2886 	 */
2887 	if ((vdp->xdf_xdev_nblocks != 0) &&
2888 	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2889 		mutex_exit(&vdp->xdf_dev_lk);
2890 		return (EINVAL);
2891 	}
2892 
2893 	vdp->xdf_pgeom = *geomp;
2894 	mutex_exit(&vdp->xdf_dev_lk);
2895 
2896 	/* force a re-validation */
2897 	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2898 
2899 	return (0);
2900 }
2901 
2902 #endif /* XPV_HVM_DRIVER */
2903