xref: /illumos-gate/usr/src/uts/common/xen/io/xdf.c (revision 02f22325adceea5762fbc7f49cee82e407e8f3a1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * xdf.c - Xen Virtual Block Device Driver
29  * TODO:
30  *	- support alternate block size (currently only DEV_BSIZE supported)
31  *	- revalidate geometry for removable devices
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 #include <sys/ddi.h>
37 #include <sys/sunddi.h>
38 #include <sys/conf.h>
39 #include <sys/cmlb.h>
40 #include <sys/dkio.h>
41 #include <sys/promif.h>
42 #include <sys/sysmacros.h>
43 #include <sys/kstat.h>
44 #include <sys/mach_mmu.h>
45 #ifdef XPV_HVM_DRIVER
46 #include <sys/xpv_support.h>
47 #include <sys/sunndi.h>
48 #endif /* XPV_HVM_DRIVER */
49 #include <public/io/xenbus.h>
50 #include <xen/sys/xenbus_impl.h>
51 #include <xen/sys/xendev.h>
52 #include <sys/gnttab.h>
53 #include <sys/scsi/generic/inquiry.h>
54 #include <xen/io/blkif_impl.h>
55 #include <io/xdf.h>
56 
/*
 * Cache-flush related constants and predicates.
 * FLUSH_DISKCACHE and WRITE_BARRIER look like selectors for the two flush
 * methods; their consumers are outside this part of the file (TODO confirm).
 */
#define	FLUSH_DISKCACHE	0x1
#define	WRITE_BARRIER	0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */
/* True when the barrier feature is set and flush support is not. */
#define	USE_WRITE_BARRIER(vdp)				\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
/* True when both the barrier feature and flush support are set. */
#define	USE_FLUSH_DISKCACHE(vdp)			\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
/*
 * True for a non-read buf whose data address equals the instance's
 * xdf_cache_flush_block, on an instance where USE_WRITE_BARRIER() holds.
 */
#define	IS_WRITE_BARRIER(vdp, bp)			\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&	\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
/*
 * True for a zero-length non-read buf on an instance where
 * USE_FLUSH_DISKCACHE() holds.
 *
 * NOTE(review): unlike IS_WRITE_BARRIER(), this macro takes only `bp' yet
 * its expansion names `vdp', so it silently binds to whatever variable
 * called `vdp' is in scope at the point of use.
 */
#define	IS_FLUSH_DISKCACHE(bp)				\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
69 
/* soft state anchor; one xdf_t per instance (see _init/xdf_attach) */
static void *vbd_ss;
/* kmem caches for v_req_t and ge_slot_t objects, created in _init() */
static kmem_cache_t *xdf_vreq_cache;
static kmem_cache_t *xdf_gs_cache;
/* starts out as XB_MAXPHYS; consumers are outside this part of the file */
static int xdf_maxphys = XB_MAXPHYS;
/* debug mask; reloaded from the "xdfdebug" property on every xdf_attach() */
int xdfdebug = 0;
/* defined elsewhere; when non-zero xdf_strategy() drains I/O synchronously */
extern int do_polled_io;
/* tunables; presumably consulted by the flush setup code - TODO confirm */
diskaddr_t xdf_flush_block = DEFAULT_FLUSH_BLOCK;
int	xdf_barrier_flush_disable = 0;
78 
/*
 * dev_ops and cb_ops entrypoints
 * (wired into xdf_devops / xdf_cbops below)
 */
static int xdf_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int xdf_attach(dev_info_t *, ddi_attach_cmd_t);
static int xdf_detach(dev_info_t *, ddi_detach_cmd_t);
static int xdf_reset(dev_info_t *, ddi_reset_cmd_t);
static int xdf_open(dev_t *, int, int, cred_t *);
static int xdf_close(dev_t, int, int, struct cred *);
static int xdf_strategy(struct buf *);
static int xdf_read(dev_t, struct uio *, cred_t *);
static int xdf_aread(dev_t, struct aio_req *, cred_t *);
static int xdf_write(dev_t, struct uio *, cred_t *);
static int xdf_awrite(dev_t, struct aio_req *, cred_t *);
static int xdf_dump(dev_t, caddr_t, daddr_t, int);
static int xdf_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static uint_t xdf_intr(caddr_t);
static int xdf_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
    caddr_t, int *);

/*
 * misc private functions
 */
static int xdf_suspend(dev_info_t *);
static int xdf_resume(dev_info_t *);
static int xdf_start_connect(xdf_t *);
static int xdf_start_disconnect(xdf_t *);
static int xdf_post_connect(xdf_t *);
static void xdf_post_disconnect(xdf_t *);
static void xdf_oe_change(dev_info_t *, ddi_eventcookie_t, void *, void *);
static void xdf_iostart(xdf_t *);
static void xdf_iofini(xdf_t *, uint64_t, int);
static int xdf_prepare_rreq(xdf_t *, struct buf *, blkif_request_t *);
static int xdf_drain_io(xdf_t *);
static boolean_t xdf_isopen(xdf_t *, int);
static int xdf_check_state_transition(xdf_t *, XenbusState);
static int xdf_connect(xdf_t *, boolean_t);
static int xdf_dmacallback(caddr_t);
static void xdf_timeout_handler(void *);
static uint_t xdf_iorestart(caddr_t);
static v_req_t *vreq_get(xdf_t *, buf_t *);
static void vreq_free(xdf_t *, v_req_t *);
static int vreq_setup(xdf_t *, v_req_t *);
static ge_slot_t *gs_get(xdf_t *, int);
static void gs_free(xdf_t *, ge_slot_t *);
static grant_ref_t gs_grant(ge_slot_t *, mfn_t);
static void unexpectedie(xdf_t *);
static void xdfmin(struct buf *);
static void xdf_synthetic_pgeom(dev_info_t *, cmlb_geom_t *);
/* kstat helpers are defined in another translation unit */
extern int xdf_kstat_create(dev_info_t *, char *, int);
extern void xdf_kstat_delete(dev_info_t *);

/* helpers that exist only in the HVM (XPV_HVM_DRIVER) build */
#if defined(XPV_HVM_DRIVER)
static void xdf_hvm_add(dev_info_t *);
static void xdf_hvm_rm(dev_info_t *);
static void xdf_hvm_init(void);
static void xdf_hvm_fini(void);
#endif /* XPV_HVM_DRIVER */
137 
/*
 * Character/block entry points.  The initializer is positional; the field
 * names in the comments follow the cb_ops(9S) layout.
 */
static 	struct cb_ops xdf_cbops = {
	xdf_open,		/* cb_open */
	xdf_close,		/* cb_close */
	xdf_strategy,		/* cb_strategy */
	nodev,			/* cb_print */
	xdf_dump,		/* cb_dump */
	xdf_read,		/* cb_read */
	xdf_write,		/* cb_write */
	xdf_ioctl,		/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	nochpoll,		/* cb_chpoll */
	xdf_prop_op,		/* cb_prop_op */
	NULL,			/* cb_str: not a STREAMS driver */
	D_MP | D_NEW | D_64BIT,	/* cb_flag */
	CB_REV,			/* cb_rev */
	xdf_aread,		/* cb_aread */
	xdf_awrite		/* cb_awrite */
};
158 
/*
 * Autoconfiguration entry points.  Not static: the symbol is visible to
 * other translation units.
 */
struct dev_ops xdf_devops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	xdf_getinfo,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	xdf_attach,		/* devo_attach */
	xdf_detach,		/* devo_detach */
	xdf_reset,		/* devo_reset */
	&xdf_cbops,		/* devo_cb_ops */
	(struct bus_ops *)NULL	/* devo_bus_ops */
};
171 
/* Loadable-module description of this driver; linked from xdf_modlinkage. */
static struct modldrv modldrv = {
	&mod_driverops,		/* Type of module.  This one is a driver */
	"virtual block driver %I%",	/* short description */
	&xdf_devops		/* driver specific ops */
};
177 
/* Module linkage handed to mod_install/mod_remove/mod_info below. */
static struct modlinkage xdf_modlinkage = {
	MODREV_1, (void *)&modldrv, NULL
};
181 
/*
 * I/O buffer DMA attributes
 * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
 *
 * The segment limit is PAGEOFFSET and the scatter/gather list length is
 * BLKIF_MAX_SEGMENTS_PER_REQUEST, so no cookie spans a page boundary and a
 * window never needs more segments than one request can carry.
 */
static ddi_dma_attr_t xb_dma_attr = {
	DMA_ATTR_V0,
	(uint64_t)0,			/* lowest address */
	(uint64_t)0xffffffffffffffff,	/* highest usable address */
	(uint64_t)0xffffff,		/* DMA counter limit max */
	(uint64_t)XB_BSIZE,		/* alignment in bytes */
	XB_BSIZE - 1,			/* bitmap of burst sizes */
	XB_BSIZE,			/* min transfer */
	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
	XB_BSIZE,			/* granularity */
	0,				/* flags (reserved) */
};
200 
/*
 * Device access attributes: no byte swapping, strictly ordered accesses.
 * The consumer is outside this part of the file.
 */
static ddi_device_acc_attr_t xc_acc_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STRICTORDER_ACC
};
206 
/* callbacks from common label (cmlb) */

int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
int xdf_lb_getinfo(dev_info_t *, int, void *, void *);

/* target-geometry ops table passed to cmlb_attach() in xdf_attach() */
static cmlb_tg_ops_t xdf_lb_ops = {
	TG_DK_OPS_VERSION_1,
	xdf_lb_rdwr,
	xdf_lb_getinfo
};
217 
218 int
219 _init(void)
220 {
221 	int rc;
222 
223 	if ((rc = ddi_soft_state_init(&vbd_ss, sizeof (xdf_t), 0)) != 0)
224 		return (rc);
225 
226 	xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
227 	    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
228 	xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
229 	    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
230 
231 #if defined(XPV_HVM_DRIVER)
232 	xdf_hvm_init();
233 #endif /* XPV_HVM_DRIVER */
234 
235 	if ((rc = mod_install(&xdf_modlinkage)) != 0) {
236 #if defined(XPV_HVM_DRIVER)
237 		xdf_hvm_fini();
238 #endif /* XPV_HVM_DRIVER */
239 		kmem_cache_destroy(xdf_vreq_cache);
240 		kmem_cache_destroy(xdf_gs_cache);
241 		ddi_soft_state_fini(&vbd_ss);
242 		return (rc);
243 	}
244 
245 	return (rc);
246 }
247 
248 int
249 _fini(void)
250 {
251 
252 	int err;
253 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
254 		return (err);
255 
256 #if defined(XPV_HVM_DRIVER)
257 	xdf_hvm_fini();
258 #endif /* XPV_HVM_DRIVER */
259 
260 	kmem_cache_destroy(xdf_vreq_cache);
261 	kmem_cache_destroy(xdf_gs_cache);
262 	ddi_soft_state_fini(&vbd_ss);
263 
264 	return (0);
265 }
266 
267 int
268 _info(struct modinfo *modinfop)
269 {
270 	return (mod_info(&xdf_modlinkage, modinfop));
271 }
272 
273 /*ARGSUSED*/
274 static int
275 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
276 {
277 	int instance;
278 	xdf_t *vbdp;
279 
280 	instance = XDF_INST(getminor((dev_t)arg));
281 
282 	switch (cmd) {
283 	case DDI_INFO_DEVT2DEVINFO:
284 		if ((vbdp = ddi_get_soft_state(vbd_ss, instance)) == NULL) {
285 			*rp = NULL;
286 			return (DDI_FAILURE);
287 		}
288 		*rp = vbdp->xdf_dip;
289 		return (DDI_SUCCESS);
290 
291 	case DDI_INFO_DEVT2INSTANCE:
292 		*rp = (void *)(uintptr_t)instance;
293 		return (DDI_SUCCESS);
294 
295 	default:
296 		return (DDI_FAILURE);
297 	}
298 }
299 
300 static int
301 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
302 	char *name, caddr_t valuep, int *lengthp)
303 {
304 	int instance = ddi_get_instance(dip);
305 	xdf_t *vdp;
306 	diskaddr_t p_blkcnt;
307 
308 	/*
309 	 * xdf dynamic properties are device specific and size oriented.
310 	 * Requests issued under conditions where size is valid are passed
311 	 * to ddi_prop_op_nblocks with the size information, otherwise the
312 	 * request is passed to ddi_prop_op.
313 	 */
314 	vdp = ddi_get_soft_state(vbd_ss, instance);
315 
316 	if ((dev == DDI_DEV_T_ANY) || (vdp == NULL))
317 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
318 		    name, valuep, lengthp));
319 
320 	/* do cv_wait until connected or failed */
321 	mutex_enter(&vdp->xdf_dev_lk);
322 	if (xdf_connect(vdp, B_TRUE) != XD_READY) {
323 		mutex_exit(&vdp->xdf_dev_lk);
324 		goto out;
325 	}
326 	mutex_exit(&vdp->xdf_dev_lk);
327 
328 	if (cmlb_partinfo(vdp->xdf_vd_lbl, XDF_PART(getminor(dev)), &p_blkcnt,
329 	    NULL, NULL, NULL, NULL) == 0)
330 		return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
331 		    name, valuep, lengthp, (uint64_t)p_blkcnt));
332 
333 out:
334 	return (ddi_prop_op(dev, dip, prop_op, mod_flags, name, valuep,
335 	    lengthp));
336 }
337 
338 static int
339 xdf_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
340 {
341 	xdf_t *vdp;
342 	ddi_iblock_cookie_t softibc;
343 	int instance;
344 
345 	xdfdebug = ddi_prop_get_int(DDI_DEV_T_ANY, devi, DDI_PROP_NOTPROM,
346 	    "xdfdebug", 0);
347 
348 	switch (cmd) {
349 		case DDI_ATTACH:
350 			break;
351 
352 		case DDI_RESUME:
353 			return (xdf_resume(devi));
354 
355 		default:
356 			return (DDI_FAILURE);
357 	}
358 
359 	instance = ddi_get_instance(devi);
360 	if (ddi_soft_state_zalloc(vbd_ss, instance) != DDI_SUCCESS)
361 		return (DDI_FAILURE);
362 
363 	DPRINTF(DDI_DBG, ("xdf%d: attaching\n", instance));
364 	vdp = ddi_get_soft_state(vbd_ss, instance);
365 	ddi_set_driver_private(devi, vdp);
366 	vdp->xdf_dip = devi;
367 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
368 
369 	if (ddi_get_iblock_cookie(devi, 0, &vdp->xdf_ibc) != DDI_SUCCESS) {
370 		cmn_err(CE_WARN, "xdf@%s: failed to get iblock cookie",
371 		    ddi_get_name_addr(devi));
372 		goto errout0;
373 	}
374 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)vdp->xdf_ibc);
375 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)vdp->xdf_ibc);
376 	mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER,
377 	    (void *)vdp->xdf_ibc);
378 
379 	if (ddi_get_soft_iblock_cookie(devi, DDI_SOFTINT_LOW, &softibc)
380 	    != DDI_SUCCESS) {
381 		cmn_err(CE_WARN, "xdf@%s: failed to get softintr iblock cookie",
382 		    ddi_get_name_addr(devi));
383 		goto errout0;
384 	}
385 	if (ddi_add_softintr(devi, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
386 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
387 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
388 		    ddi_get_name_addr(devi));
389 		goto errout0;
390 	}
391 
392 #if !defined(XPV_HVM_DRIVER)
393 	/* create kstat for iostat(1M) */
394 	if (xdf_kstat_create(devi, "xdf", instance) != 0) {
395 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
396 		    ddi_get_name_addr(devi));
397 		goto errout0;
398 	}
399 #endif /* !XPV_HVM_DRIVER */
400 
401 	/* driver handles kernel-issued IOCTLs */
402 	if (ddi_prop_create(DDI_DEV_T_NONE, devi, DDI_PROP_CANSLEEP,
403 	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
404 		cmn_err(CE_WARN, "xdf@%s: cannot create DDI_KERNEL_IOCTL prop",
405 		    ddi_get_name_addr(devi));
406 		goto errout0;
407 	}
408 
409 	/*
410 	 * Initialize the physical geometry stucture.  Note that currently
411 	 * we don't know the size of the backend device so the number
412 	 * of blocks on the device will be initialized to zero.  Once
413 	 * we connect to the backend device we'll update the physical
414 	 * geometry to reflect the real size of the device.
415 	 */
416 	xdf_synthetic_pgeom(devi, &vdp->xdf_pgeom);
417 
418 	/*
419 	 * create default device minor nodes: non-removable disk
420 	 * we will adjust minor nodes after we are connected w/ backend
421 	 */
422 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
423 	if (cmlb_attach(devi, &xdf_lb_ops, DTYPE_DIRECT, 0, 1,
424 	    DDI_NT_BLOCK_XVMD,
425 #if defined(XPV_HVM_DRIVER)
426 	    CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT |
427 	    CMLB_INTERNAL_MINOR_NODES,
428 #else /* !XPV_HVM_DRIVER */
429 	    CMLB_FAKE_LABEL_ONE_PARTITION,
430 #endif /* !XPV_HVM_DRIVER */
431 	    vdp->xdf_vd_lbl, NULL) != 0) {
432 		cmn_err(CE_WARN, "xdf@%s: default cmlb attach failed",
433 		    ddi_get_name_addr(devi));
434 		goto errout0;
435 	}
436 
437 	/*
438 	 * We ship with cache-enabled disks
439 	 */
440 	vdp->xdf_wce = 1;
441 
442 	mutex_enter(&vdp->xdf_cb_lk);
443 
444 	/* Watch backend XenbusState change */
445 	if (xvdi_add_event_handler(devi, XS_OE_STATE,
446 	    xdf_oe_change) != DDI_SUCCESS) {
447 		mutex_exit(&vdp->xdf_cb_lk);
448 		goto errout0;
449 	}
450 
451 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
452 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
453 		    ddi_get_name_addr(devi));
454 		(void) xdf_start_disconnect(vdp);
455 		mutex_exit(&vdp->xdf_cb_lk);
456 		goto errout1;
457 	}
458 
459 	mutex_exit(&vdp->xdf_cb_lk);
460 
461 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
462 	    offsetof(v_req_t, v_link));
463 	list_create(&vdp->xdf_gs_act, sizeof (ge_slot_t),
464 	    offsetof(ge_slot_t, link));
465 
466 #if defined(XPV_HVM_DRIVER)
467 	xdf_hvm_add(devi);
468 
469 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, devi, DDI_NO_AUTODETACH, 1);
470 
471 	/*
472 	 * Report our version to dom0.
473 	 */
474 	if (xenbus_printf(XBT_NULL, "hvmpv/xdf", "version", "%d",
475 	    HVMPV_XDF_VERS))
476 		cmn_err(CE_WARN, "xdf: couldn't write version\n");
477 #endif /* XPV_HVM_DRIVER */
478 
479 	ddi_report_dev(devi);
480 
481 	DPRINTF(DDI_DBG, ("xdf%d: attached\n", instance));
482 
483 	return (DDI_SUCCESS);
484 
485 errout1:
486 	xvdi_remove_event_handler(devi, XS_OE_STATE);
487 errout0:
488 	if (vdp->xdf_vd_lbl != NULL) {
489 		cmlb_detach(vdp->xdf_vd_lbl, NULL);
490 		cmlb_free_handle(&vdp->xdf_vd_lbl);
491 		vdp->xdf_vd_lbl = NULL;
492 	}
493 #if !defined(XPV_HVM_DRIVER)
494 	xdf_kstat_delete(devi);
495 #endif /* !XPV_HVM_DRIVER */
496 	if (vdp->xdf_softintr_id != NULL)
497 		ddi_remove_softintr(vdp->xdf_softintr_id);
498 	if (vdp->xdf_ibc != NULL) {
499 		mutex_destroy(&vdp->xdf_cb_lk);
500 		mutex_destroy(&vdp->xdf_dev_lk);
501 	}
502 	cv_destroy(&vdp->xdf_dev_cv);
503 	ddi_soft_state_free(vbd_ss, instance);
504 	ddi_set_driver_private(devi, NULL);
505 	ddi_prop_remove_all(devi);
506 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(devi));
507 	return (DDI_FAILURE);
508 }
509 
510 static int
511 xdf_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
512 {
513 	xdf_t *vdp;
514 	int instance;
515 
516 	switch (cmd) {
517 
518 	case DDI_PM_SUSPEND:
519 		break;
520 
521 	case DDI_SUSPEND:
522 		return (xdf_suspend(devi));
523 
524 	case DDI_DETACH:
525 		break;
526 
527 	default:
528 		return (DDI_FAILURE);
529 	}
530 
531 	instance = ddi_get_instance(devi);
532 	DPRINTF(DDI_DBG, ("xdf%d: detaching\n", instance));
533 	vdp = ddi_get_soft_state(vbd_ss, instance);
534 
535 	if (vdp == NULL)
536 		return (DDI_FAILURE);
537 
538 	mutex_enter(&vdp->xdf_dev_lk);
539 	if (xdf_isopen(vdp, -1)) {
540 		mutex_exit(&vdp->xdf_dev_lk);
541 		return (DDI_FAILURE);
542 	}
543 
544 	if (vdp->xdf_status != XD_CLOSED) {
545 		mutex_exit(&vdp->xdf_dev_lk);
546 		return (DDI_FAILURE);
547 	}
548 
549 #if defined(XPV_HVM_DRIVER)
550 	xdf_hvm_rm(devi);
551 #endif /* XPV_HVM_DRIVER */
552 
553 	ASSERT(!ISDMACBON(vdp));
554 	mutex_exit(&vdp->xdf_dev_lk);
555 
556 	if (vdp->xdf_timeout_id != 0)
557 		(void) untimeout(vdp->xdf_timeout_id);
558 
559 	xvdi_remove_event_handler(devi, XS_OE_STATE);
560 
561 	/* we'll support backend running in domU later */
562 #ifdef	DOMU_BACKEND
563 	(void) xvdi_post_event(devi, XEN_HP_REMOVE);
564 #endif
565 
566 	list_destroy(&vdp->xdf_vreq_act);
567 	list_destroy(&vdp->xdf_gs_act);
568 	ddi_prop_remove_all(devi);
569 	xdf_kstat_delete(devi);
570 	ddi_remove_softintr(vdp->xdf_softintr_id);
571 	ddi_set_driver_private(devi, NULL);
572 	cv_destroy(&vdp->xdf_dev_cv);
573 	mutex_destroy(&vdp->xdf_cb_lk);
574 	mutex_destroy(&vdp->xdf_dev_lk);
575 	if (vdp->xdf_cache_flush_block != NULL)
576 		kmem_free(vdp->xdf_flush_mem, 2 * DEV_BSIZE);
577 	ddi_soft_state_free(vbd_ss, instance);
578 	return (DDI_SUCCESS);
579 }
580 
581 static int
582 xdf_suspend(dev_info_t *devi)
583 {
584 	xdf_t *vdp;
585 	int instance;
586 	enum xdf_state st;
587 
588 	instance = ddi_get_instance(devi);
589 
590 	if (xdfdebug & SUSRES_DBG)
591 		xen_printf("xdf_suspend: xdf#%d\n", instance);
592 
593 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
594 		return (DDI_FAILURE);
595 
596 	xvdi_suspend(devi);
597 
598 	mutex_enter(&vdp->xdf_cb_lk);
599 	mutex_enter(&vdp->xdf_dev_lk);
600 	st = vdp->xdf_status;
601 	/* change status to stop further I/O requests */
602 	if (st == XD_READY)
603 		vdp->xdf_status = XD_SUSPEND;
604 	mutex_exit(&vdp->xdf_dev_lk);
605 	mutex_exit(&vdp->xdf_cb_lk);
606 
607 	/* make sure no more I/O responses left in the ring buffer */
608 	if ((st == XD_INIT) || (st == XD_READY)) {
609 #ifdef XPV_HVM_DRIVER
610 		ec_unbind_evtchn(vdp->xdf_evtchn);
611 		xvdi_free_evtchn(devi);
612 #else /* !XPV_HVM_DRIVER */
613 		(void) ddi_remove_intr(devi, 0, NULL);
614 #endif /* !XPV_HVM_DRIVER */
615 		(void) xdf_drain_io(vdp);
616 		/*
617 		 * no need to teardown the ring buffer here
618 		 * it will be simply re-init'ed during resume when
619 		 * we call xvdi_alloc_ring
620 		 */
621 	}
622 
623 	if (xdfdebug & SUSRES_DBG)
624 		xen_printf("xdf_suspend: SUCCESS\n");
625 
626 	return (DDI_SUCCESS);
627 }
628 
629 /*ARGSUSED*/
630 static int
631 xdf_resume(dev_info_t *devi)
632 {
633 	xdf_t *vdp;
634 	int instance;
635 
636 	instance = ddi_get_instance(devi);
637 	if (xdfdebug & SUSRES_DBG)
638 		xen_printf("xdf_resume: xdf%d\n", instance);
639 
640 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
641 		return (DDI_FAILURE);
642 
643 	mutex_enter(&vdp->xdf_cb_lk);
644 
645 	if (xvdi_resume(devi) != DDI_SUCCESS) {
646 		mutex_exit(&vdp->xdf_cb_lk);
647 		return (DDI_FAILURE);
648 	}
649 
650 	mutex_enter(&vdp->xdf_dev_lk);
651 	ASSERT(vdp->xdf_status != XD_READY);
652 	vdp->xdf_status = XD_UNKNOWN;
653 	mutex_exit(&vdp->xdf_dev_lk);
654 
655 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
656 		mutex_exit(&vdp->xdf_cb_lk);
657 		return (DDI_FAILURE);
658 	}
659 
660 	mutex_exit(&vdp->xdf_cb_lk);
661 
662 	if (xdfdebug & SUSRES_DBG)
663 		xen_printf("xdf_resume: done\n");
664 	return (DDI_SUCCESS);
665 }
666 
667 /*ARGSUSED*/
668 static int
669 xdf_reset(dev_info_t *devi, ddi_reset_cmd_t cmd)
670 {
671 	xdf_t *vdp;
672 	int instance;
673 
674 	instance = ddi_get_instance(devi);
675 	DPRINTF(DDI_DBG, ("xdf%d: resetting\n", instance));
676 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
677 		return (DDI_FAILURE);
678 
679 	/*
680 	 * wait for any outstanding I/O to complete
681 	 */
682 	(void) xdf_drain_io(vdp);
683 
684 	DPRINTF(DDI_DBG, ("xdf%d: reset complete\n", instance));
685 	return (DDI_SUCCESS);
686 }
687 
688 static int
689 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
690 {
691 	minor_t	minor;
692 	xdf_t	*vdp;
693 	int part;
694 	ulong_t parbit;
695 	diskaddr_t p_blkct = 0;
696 	boolean_t firstopen;
697 	boolean_t nodelay;
698 
699 	minor = getminor(*devp);
700 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
701 		return (ENXIO);
702 
703 	nodelay = (flag & (FNDELAY | FNONBLOCK));
704 
705 	DPRINTF(DDI_DBG, ("xdf%d: opening\n", XDF_INST(minor)));
706 
707 	/* do cv_wait until connected or failed */
708 	mutex_enter(&vdp->xdf_dev_lk);
709 	if (!nodelay && (xdf_connect(vdp, B_TRUE) != XD_READY)) {
710 		mutex_exit(&vdp->xdf_dev_lk);
711 		return (ENXIO);
712 	}
713 
714 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
715 		mutex_exit(&vdp->xdf_dev_lk);
716 		return (EROFS);
717 	}
718 
719 	part = XDF_PART(minor);
720 	parbit = 1 << part;
721 	if ((vdp->xdf_vd_exclopen & parbit) ||
722 	    ((flag & FEXCL) && xdf_isopen(vdp, part))) {
723 		mutex_exit(&vdp->xdf_dev_lk);
724 		return (EBUSY);
725 	}
726 
727 	/* are we the first one to open this node? */
728 	firstopen = !xdf_isopen(vdp, -1);
729 
730 	if (otyp == OTYP_LYR)
731 		vdp->xdf_vd_lyropen[part]++;
732 
733 	vdp->xdf_vd_open[otyp] |= parbit;
734 
735 	if (flag & FEXCL)
736 		vdp->xdf_vd_exclopen |= parbit;
737 
738 	mutex_exit(&vdp->xdf_dev_lk);
739 
740 	/* force a re-validation */
741 	if (firstopen)
742 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
743 
744 	/*
745 	 * check size
746 	 * ignore CD/DVD which contains a zero-sized s0
747 	 */
748 	if (!nodelay && !XD_IS_CD(vdp) &&
749 	    ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
750 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0))) {
751 		(void) xdf_close(*devp, flag, otyp, credp);
752 		return (ENXIO);
753 	}
754 
755 	return (0);
756 }
757 
758 /*ARGSUSED*/
759 static int
760 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
761 {
762 	minor_t	minor;
763 	xdf_t	*vdp;
764 	int part;
765 	ulong_t parbit;
766 
767 	minor = getminor(dev);
768 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
769 		return (ENXIO);
770 
771 	mutex_enter(&vdp->xdf_dev_lk);
772 	part = XDF_PART(minor);
773 	if (!xdf_isopen(vdp, part)) {
774 		mutex_exit(&vdp->xdf_dev_lk);
775 		return (ENXIO);
776 	}
777 	parbit = 1 << part;
778 
779 	ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
780 	if (otyp == OTYP_LYR) {
781 		ASSERT(vdp->xdf_vd_lyropen[part] > 0);
782 		if (--vdp->xdf_vd_lyropen[part] == 0)
783 			vdp->xdf_vd_open[otyp] &= ~parbit;
784 	} else {
785 		vdp->xdf_vd_open[otyp] &= ~parbit;
786 	}
787 	vdp->xdf_vd_exclopen &= ~parbit;
788 
789 	mutex_exit(&vdp->xdf_dev_lk);
790 	return (0);
791 }
792 
793 static int
794 xdf_strategy(struct buf *bp)
795 {
796 	xdf_t	*vdp;
797 	minor_t minor;
798 	diskaddr_t p_blkct, p_blkst;
799 	ulong_t nblks;
800 	int part;
801 
802 	minor = getminor(bp->b_edev);
803 	part = XDF_PART(minor);
804 
805 	vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor));
806 	if ((vdp == NULL) || !xdf_isopen(vdp, part)) {
807 		bioerror(bp, ENXIO);
808 		bp->b_resid = bp->b_bcount;
809 		biodone(bp);
810 		return (0);
811 	}
812 
813 	/* Check for writes to a read only device */
814 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
815 		bioerror(bp, EROFS);
816 		bp->b_resid = bp->b_bcount;
817 		biodone(bp);
818 		return (0);
819 	}
820 
821 	/* Check if this I/O is accessing a partition or the entire disk */
822 	if ((long)bp->b_private == XB_SLICE_NONE) {
823 		/* This I/O is using an absolute offset */
824 		p_blkct = vdp->xdf_xdev_nblocks;
825 		p_blkst = 0;
826 	} else {
827 		/* This I/O is using a partition relative offset */
828 		if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
829 		    &p_blkst, NULL, NULL, NULL)) {
830 			bioerror(bp, ENXIO);
831 			bp->b_resid = bp->b_bcount;
832 			biodone(bp);
833 			return (0);
834 		}
835 	}
836 
837 	/* check for a starting block beyond the disk or partition limit */
838 	if (bp->b_blkno > p_blkct) {
839 		DPRINTF(IO_DBG, ("xdf: block %lld exceeds VBD size %"PRIu64,
840 		    (longlong_t)bp->b_blkno, (uint64_t)p_blkct));
841 		bioerror(bp, EINVAL);
842 		bp->b_resid = bp->b_bcount;
843 		biodone(bp);
844 		return (0);
845 	}
846 
847 	/* Legacy: don't set error flag at this case */
848 	if (bp->b_blkno == p_blkct) {
849 		bp->b_resid = bp->b_bcount;
850 		biodone(bp);
851 		return (0);
852 	}
853 
854 	/* Adjust for partial transfer */
855 	nblks = bp->b_bcount >> XB_BSHIFT;
856 	if ((bp->b_blkno + nblks) > p_blkct) {
857 		bp->b_resid = ((bp->b_blkno + nblks) - p_blkct) << XB_BSHIFT;
858 		bp->b_bcount -= bp->b_resid;
859 	}
860 
861 	DPRINTF(IO_DBG, ("xdf: strategy blk %lld len %lu\n",
862 	    (longlong_t)bp->b_blkno, (ulong_t)bp->b_bcount));
863 
864 	/* Fix up the buf struct */
865 	bp->b_flags |= B_BUSY;
866 	bp->av_forw = bp->av_back = NULL; /* not tagged with a v_req */
867 	bp->b_private = (void *)(uintptr_t)p_blkst;
868 
869 	mutex_enter(&vdp->xdf_dev_lk);
870 	if (vdp->xdf_xdev_iostat != NULL)
871 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
872 	if (vdp->xdf_f_act == NULL) {
873 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
874 	} else {
875 		vdp->xdf_l_act->av_forw = bp;
876 		vdp->xdf_l_act = bp;
877 	}
878 	mutex_exit(&vdp->xdf_dev_lk);
879 
880 	xdf_iostart(vdp);
881 	if (do_polled_io)
882 		(void) xdf_drain_io(vdp);
883 	return (0);
884 }
885 
886 /*ARGSUSED*/
887 static int
888 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
889 {
890 
891 	xdf_t	*vdp;
892 	minor_t minor;
893 	diskaddr_t p_blkcnt;
894 	int part;
895 
896 	minor = getminor(dev);
897 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
898 		return (ENXIO);
899 
900 	DPRINTF(IO_DBG, ("xdf: read offset 0x%"PRIx64"\n",
901 	    (int64_t)uiop->uio_offset));
902 
903 	part = XDF_PART(minor);
904 	if (!xdf_isopen(vdp, part))
905 		return (ENXIO);
906 
907 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
908 	    NULL, NULL, NULL, NULL))
909 		return (ENXIO);
910 
911 	if (U_INVAL(uiop))
912 		return (EINVAL);
913 
914 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
915 }
916 
917 /*ARGSUSED*/
918 static int
919 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
920 {
921 	xdf_t *vdp;
922 	minor_t minor;
923 	diskaddr_t p_blkcnt;
924 	int part;
925 
926 	minor = getminor(dev);
927 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
928 		return (ENXIO);
929 
930 	DPRINTF(IO_DBG, ("xdf: write offset 0x%"PRIx64"\n",
931 	    (int64_t)uiop->uio_offset));
932 
933 	part = XDF_PART(minor);
934 	if (!xdf_isopen(vdp, part))
935 		return (ENXIO);
936 
937 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
938 	    NULL, NULL, NULL, NULL))
939 		return (ENXIO);
940 
941 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
942 		return (ENOSPC);
943 
944 	if (U_INVAL(uiop))
945 		return (EINVAL);
946 
947 	return (physio(xdf_strategy, NULL, dev, B_WRITE, minphys, uiop));
948 }
949 
950 /*ARGSUSED*/
951 static int
952 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
953 {
954 	xdf_t	*vdp;
955 	minor_t minor;
956 	struct uio *uiop = aiop->aio_uio;
957 	diskaddr_t p_blkcnt;
958 	int part;
959 
960 	minor = getminor(dev);
961 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
962 		return (ENXIO);
963 
964 	part = XDF_PART(minor);
965 	if (!xdf_isopen(vdp, part))
966 		return (ENXIO);
967 
968 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
969 	    NULL, NULL, NULL, NULL))
970 		return (ENXIO);
971 
972 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
973 		return (ENOSPC);
974 
975 	if (U_INVAL(uiop))
976 		return (EINVAL);
977 
978 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, minphys, aiop));
979 }
980 
981 /*ARGSUSED*/
982 static int
983 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
984 {
985 	xdf_t *vdp;
986 	minor_t minor;
987 	struct uio *uiop = aiop->aio_uio;
988 	diskaddr_t p_blkcnt;
989 	int part;
990 
991 	minor = getminor(dev);
992 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
993 		return (ENXIO);
994 
995 	part = XDF_PART(minor);
996 	if (!xdf_isopen(vdp, part))
997 		return (ENXIO);
998 
999 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
1000 	    NULL, NULL, NULL, NULL))
1001 		return (ENXIO);
1002 
1003 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
1004 		return (ENOSPC);
1005 
1006 	if (U_INVAL(uiop))
1007 		return (EINVAL);
1008 
1009 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, minphys, aiop));
1010 }
1011 
1012 static int
1013 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1014 {
1015 	struct buf dumpbuf, *dbp;
1016 	xdf_t	*vdp;
1017 	minor_t minor;
1018 	int err = 0;
1019 	int part;
1020 	diskaddr_t p_blkcnt, p_blkst;
1021 
1022 	minor = getminor(dev);
1023 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
1024 		return (ENXIO);
1025 
1026 	DPRINTF(IO_DBG, ("xdf: dump addr (0x%p) blk (%ld) nblks (%d)\n",
1027 	    addr, blkno, nblk));
1028 
1029 	part = XDF_PART(minor);
1030 	if (!xdf_isopen(vdp, part))
1031 		return (ENXIO);
1032 
1033 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
1034 	    NULL, NULL, NULL))
1035 		return (ENXIO);
1036 
1037 	if ((blkno + nblk) > p_blkcnt) {
1038 		cmn_err(CE_WARN, "xdf: block %ld exceeds VBD size %"PRIu64,
1039 		    blkno + nblk, (uint64_t)p_blkcnt);
1040 		return (EINVAL);
1041 	}
1042 
1043 	dbp = &dumpbuf;
1044 	bioinit(dbp);
1045 	dbp->b_flags = B_BUSY;
1046 	dbp->b_un.b_addr = addr;
1047 	dbp->b_bcount = nblk << DEV_BSHIFT;
1048 	dbp->b_blkno = blkno;
1049 	dbp->b_edev = dev;
1050 	dbp->b_private = (void *)(uintptr_t)p_blkst;
1051 
1052 	mutex_enter(&vdp->xdf_dev_lk);
1053 	if (vdp->xdf_xdev_iostat != NULL)
1054 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1055 	if (vdp->xdf_f_act == NULL) {
1056 		vdp->xdf_f_act = vdp->xdf_l_act = dbp;
1057 	} else {
1058 		vdp->xdf_l_act->av_forw = dbp;
1059 		vdp->xdf_l_act = dbp;
1060 	}
1061 	dbp->av_forw = NULL;
1062 	dbp->av_back = NULL;
1063 	mutex_exit(&vdp->xdf_dev_lk);
1064 	xdf_iostart(vdp);
1065 	err = xdf_drain_io(vdp);
1066 	biofini(dbp);
1067 	return (err);
1068 }
1069 
1070 /*ARGSUSED*/
1071 static int
1072 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1073     int *rvalp)
1074 {
1075 	int instance;
1076 	xdf_t	*vdp;
1077 	minor_t minor;
1078 	int part;
1079 
1080 	minor = getminor(dev);
1081 	instance = XDF_INST(minor);
1082 
1083 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
1084 		return (ENXIO);
1085 
1086 	DPRINTF(IOCTL_DBG, ("xdf%d:ioctl: cmd %d (0x%x)\n",
1087 	    instance, cmd, cmd));
1088 
1089 	part = XDF_PART(minor);
1090 	if (!xdf_isopen(vdp, part))
1091 		return (ENXIO);
1092 
1093 	switch (cmd) {
1094 	case DKIOCGMEDIAINFO: {
1095 		struct dk_minfo	media_info;
1096 
1097 		media_info.dki_lbsize = DEV_BSIZE;
1098 		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
1099 		media_info.dki_media_type = DK_FIXED_DISK;
1100 
1101 		if (ddi_copyout(&media_info, (void *)arg,
1102 		    sizeof (struct dk_minfo), mode)) {
1103 			return (EFAULT);
1104 		} else {
1105 			return (0);
1106 		}
1107 	}
1108 
1109 	case DKIOCINFO: {
1110 		struct dk_cinfo info;
1111 
1112 		/* controller information */
1113 		if (XD_IS_CD(vdp))
1114 			info.dki_ctype = DKC_CDROM;
1115 		else
1116 			info.dki_ctype = DKC_VBD;
1117 
1118 		info.dki_cnum = 0;
1119 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
1120 
1121 		/* unit information */
1122 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
1123 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
1124 		info.dki_flags = DKI_FMTVOL;
1125 		info.dki_partition = part;
1126 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
1127 		info.dki_addr = 0;
1128 		info.dki_space = 0;
1129 		info.dki_prio = 0;
1130 		info.dki_vec = 0;
1131 
1132 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
1133 			return (EFAULT);
1134 		else
1135 			return (0);
1136 	}
1137 
1138 	case DKIOCSTATE: {
1139 		enum dkio_state	dkstate = DKIO_INSERTED;
1140 		if (ddi_copyout(&dkstate, (void *)arg, sizeof (dkstate),
1141 		    mode) != 0)
1142 			return (EFAULT);
1143 		return (0);
1144 	}
1145 
1146 	/*
1147 	 * is media removable?
1148 	 */
1149 	case DKIOCREMOVABLE: {
1150 		int i = XD_IS_RM(vdp) ? 1 : 0;
1151 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), mode))
1152 			return (EFAULT);
1153 		return (0);
1154 	}
1155 
1156 	case DKIOCG_PHYGEOM:
1157 	case DKIOCG_VIRTGEOM:
1158 	case DKIOCGGEOM:
1159 	case DKIOCSGEOM:
1160 	case DKIOCGAPART:
1161 	case DKIOCSAPART:
1162 	case DKIOCGVTOC:
1163 	case DKIOCSVTOC:
1164 	case DKIOCPARTINFO:
1165 	case DKIOCGMBOOT:
1166 	case DKIOCSMBOOT:
1167 	case DKIOCGETEFI:
1168 	case DKIOCSETEFI:
1169 	case DKIOCPARTITION: {
1170 		int rc;
1171 
1172 		rc = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
1173 		    rvalp, NULL);
1174 		return (rc);
1175 	}
1176 
1177 	case DKIOCGETWCE:
1178 		if (ddi_copyout(&vdp->xdf_wce, (void *)arg,
1179 		    sizeof (vdp->xdf_wce), mode))
1180 			return (EFAULT);
1181 		return (0);
1182 	case DKIOCSETWCE:
1183 		if (ddi_copyin((void *)arg, &vdp->xdf_wce,
1184 		    sizeof (vdp->xdf_wce), mode))
1185 			return (EFAULT);
1186 		return (0);
1187 	case DKIOCFLUSHWRITECACHE: {
1188 		int rc;
1189 		struct dk_callback *dkc = (struct dk_callback *)arg;
1190 
1191 		if (vdp->xdf_flush_supported) {
1192 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1193 			    NULL, 0, 0, (void *)dev);
1194 		} else if (vdp->xdf_feature_barrier &&
1195 		    !xdf_barrier_flush_disable) {
1196 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1197 			    vdp->xdf_cache_flush_block, xdf_flush_block,
1198 			    DEV_BSIZE, (void *)dev);
1199 		} else {
1200 			return (ENOTTY);
1201 		}
1202 		if ((mode & FKIOCTL) && (dkc != NULL) &&
1203 		    (dkc->dkc_callback != NULL)) {
1204 			(*dkc->dkc_callback)(dkc->dkc_cookie, rc);
1205 			/* need to return 0 after calling callback */
1206 			rc = 0;
1207 		}
1208 		return (rc);
1209 	}
1210 
1211 	default:
1212 		return (ENOTTY);
1213 	}
1214 }
1215 
1216 /*
1217  * xdf interrupt handler
1218  */
1219 static uint_t
1220 xdf_intr(caddr_t arg)
1221 {
1222 	xdf_t *vdp = (xdf_t *)arg;
1223 	xendev_ring_t *xbr;
1224 	blkif_response_t *resp;
1225 	int bioerr;
1226 	uint64_t id;
1227 	extern int do_polled_io;
1228 	uint8_t op;
1229 	uint16_t status;
1230 	ddi_acc_handle_t acchdl;
1231 
1232 	mutex_enter(&vdp->xdf_dev_lk);
1233 
1234 	if ((xbr = vdp->xdf_xb_ring) == NULL) {
1235 		mutex_exit(&vdp->xdf_dev_lk);
1236 		return (DDI_INTR_UNCLAIMED);
1237 	}
1238 
1239 	acchdl = vdp->xdf_xb_ring_hdl;
1240 
1241 	/*
1242 	 * complete all requests which have a response
1243 	 */
1244 	while (resp = xvdi_ring_get_response(xbr)) {
1245 		id = ddi_get64(acchdl, &resp->id);
1246 		op = ddi_get8(acchdl, &resp->operation);
1247 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
1248 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
1249 		    op, id, status));
1250 
1251 		/*
1252 		 * XXPV - close connection to the backend and restart
1253 		 */
1254 		if (status != BLKIF_RSP_OKAY) {
1255 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
1256 			    ddi_get_name_addr(vdp->xdf_dip),
1257 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
1258 			bioerr = EIO;
1259 		} else {
1260 			bioerr = 0;
1261 		}
1262 
1263 		xdf_iofini(vdp, id, bioerr);
1264 	}
1265 
1266 	mutex_exit(&vdp->xdf_dev_lk);
1267 
1268 	if (!do_polled_io)
1269 		xdf_iostart(vdp);
1270 
1271 	return (DDI_INTR_CLAIMED);
1272 }
1273 
1274 int xdf_fbrewrites;	/* how many times was our flush block rewritten */
1275 
1276 /*
1277  * Snarf new data if our flush block was re-written
1278  */
1279 static void
1280 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
1281 {
1282 	int nblks;
1283 	boolean_t mapin;
1284 
1285 	if (IS_WRITE_BARRIER(vdp, bp))
1286 		return; /* write was a flush write */
1287 
1288 	mapin = B_FALSE;
1289 	nblks = bp->b_bcount >> DEV_BSHIFT;
1290 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
1291 		xdf_fbrewrites++;
1292 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
1293 			mapin = B_TRUE;
1294 			bp_mapin(bp);
1295 		}
1296 		bcopy(bp->b_un.b_addr +
1297 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
1298 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
1299 		if (mapin)
1300 			bp_mapout(bp);
1301 	}
1302 }
1303 
1304 static void
1305 xdf_iofini(xdf_t *vdp, uint64_t id, int bioerr)
1306 {
1307 	ge_slot_t *gs = (ge_slot_t *)(uintptr_t)id;
1308 	v_req_t *vreq = gs->vreq;
1309 	buf_t *bp = vreq->v_buf;
1310 
1311 	gs_free(vdp, gs);
1312 	if (bioerr)
1313 		bioerror(bp, bioerr);
1314 	vreq->v_nslots--;
1315 	if (vreq->v_nslots != 0)
1316 		return;
1317 
1318 	XDF_UPDATE_IO_STAT(vdp, bp);
1319 	if (vdp->xdf_xdev_iostat != NULL)
1320 		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1321 
1322 	if (IS_ERROR(bp))
1323 		bp->b_resid = bp->b_bcount;
1324 
1325 	vreq_free(vdp, vreq);
1326 	biodone(bp);
1327 }
1328 
1329 /*
1330  * return value of xdf_prepare_rreq()
1331  * used in xdf_iostart()
1332  */
1333 #define	XF_PARTIAL	0 /* rreq is full, not all I/O in buf transferred */
1334 #define	XF_COMP		1 /* no more I/O left in buf */
1335 
1336 static void
1337 xdf_iostart(xdf_t *vdp)
1338 {
1339 	xendev_ring_t *xbr;
1340 	struct buf *bp;
1341 	blkif_request_t *rreq;
1342 	int retval;
1343 	int rreqready = 0;
1344 
1345 	xbr = vdp->xdf_xb_ring;
1346 
1347 	/*
1348 	 * populate the ring request(s)
1349 	 *
1350 	 * loop until there is no buf to transfer or no free slot
1351 	 * available in I/O ring
1352 	 */
1353 	mutex_enter(&vdp->xdf_dev_lk);
1354 
1355 	for (;;) {
1356 		if (vdp->xdf_status != XD_READY)
1357 			break;
1358 
1359 		/* active buf queue empty? */
1360 		if ((bp = vdp->xdf_f_act) == NULL)
1361 			break;
1362 
1363 		/* try to grab a vreq for this bp */
1364 		if ((BP2VREQ(bp) == NULL) && (vreq_get(vdp, bp) == NULL))
1365 				break;
1366 		/* alloc DMA/GTE resources */
1367 		if (vreq_setup(vdp, BP2VREQ(bp)) != DDI_SUCCESS)
1368 			break;
1369 
1370 		/* get next blkif_request in the ring */
1371 		if ((rreq = xvdi_ring_get_request(xbr)) == NULL)
1372 			break;
1373 		bzero(rreq, sizeof (blkif_request_t));
1374 
1375 		/* populate blkif_request with this buf */
1376 		rreqready++;
1377 		retval = xdf_prepare_rreq(vdp, bp, rreq);
1378 		if (retval == XF_COMP) {
1379 			/* finish this bp, switch to next one */
1380 			if (vdp->xdf_xdev_iostat != NULL)
1381 				kstat_waitq_to_runq(
1382 				    KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1383 			vdp->xdf_f_act = bp->av_forw;
1384 			bp->av_forw = NULL;
1385 		}
1386 	}
1387 
1388 	/*
1389 	 * Send the request(s) to the backend
1390 	 */
1391 	if (rreqready) {
1392 		if (xvdi_ring_push_request(xbr)) {
1393 			DPRINTF(IO_DBG, ("xdf_iostart: "
1394 			    "sent request(s) to backend\n"));
1395 			xvdi_notify_oe(vdp->xdf_dip);
1396 		}
1397 	}
1398 
1399 	mutex_exit(&vdp->xdf_dev_lk);
1400 }
1401 
1402 /*
1403  * populate a single blkif_request_t w/ a buf
1404  */
1405 static int
1406 xdf_prepare_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1407 {
1408 	int		rval;
1409 	grant_ref_t	gr;
1410 	uint8_t		fsect, lsect;
1411 	size_t		bcnt;
1412 	paddr_t		dma_addr;
1413 	off_t		blk_off;
1414 	dev_info_t	*dip = vdp->xdf_dip;
1415 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1416 	v_req_t		*vreq = BP2VREQ(bp);
1417 	uint64_t	blkno = vreq->v_blkno;
1418 	uint_t		ndmacs = vreq->v_ndmacs;
1419 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1420 	int		seg = 0;
1421 	int		isread = IS_READ(bp);
1422 
1423 	if (isread)
1424 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1425 	else {
1426 		switch (vreq->v_flush_diskcache) {
1427 		case FLUSH_DISKCACHE:
1428 			ddi_put8(acchdl, &rreq->operation,
1429 			    BLKIF_OP_FLUSH_DISKCACHE);
1430 			ddi_put16(acchdl, &rreq->handle, vdev);
1431 			ddi_put64(acchdl, &rreq->id,
1432 			    (uint64_t)(uintptr_t)(vreq->v_gs));
1433 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1434 			return (XF_COMP);
1435 		case WRITE_BARRIER:
1436 			ddi_put8(acchdl, &rreq->operation,
1437 			    BLKIF_OP_WRITE_BARRIER);
1438 			break;
1439 		default:
1440 			if (!vdp->xdf_wce)
1441 				ddi_put8(acchdl, &rreq->operation,
1442 				    BLKIF_OP_WRITE_BARRIER);
1443 			else
1444 				ddi_put8(acchdl, &rreq->operation,
1445 				    BLKIF_OP_WRITE);
1446 			break;
1447 		}
1448 	}
1449 
1450 	ddi_put16(acchdl, &rreq->handle, vdev);
1451 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1452 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(vreq->v_gs));
1453 
1454 	/*
1455 	 * loop until all segments are populated or no more dma cookie in buf
1456 	 */
1457 	for (;;) {
1458 	/*
1459 	 * Each segment of a blkif request can transfer up to
1460 	 * one 4K page of data.
1461 	 */
1462 		bcnt = vreq->v_dmac.dmac_size;
1463 		ASSERT(bcnt <= PAGESIZE);
1464 		ASSERT((bcnt % XB_BSIZE) == 0);
1465 		dma_addr = vreq->v_dmac.dmac_laddress;
1466 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1467 		ASSERT((blk_off & XB_BMASK) == 0);
1468 		fsect = blk_off >> XB_BSHIFT;
1469 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1470 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1471 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1472 		DPRINTF(IO_DBG, ("  ""seg%d: dmacS %lu blk_off %ld\n",
1473 		    seg, vreq->v_dmac.dmac_size, blk_off));
1474 		gr = gs_grant(vreq->v_gs, PATOMA(dma_addr) >> PAGESHIFT);
1475 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1476 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1477 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1478 		DPRINTF(IO_DBG, ("  ""seg%d: fs %d ls %d gr %d dma 0x%"PRIx64
1479 		    "\n", seg, fsect, lsect, gr, dma_addr));
1480 
1481 		blkno += (bcnt >> XB_BSHIFT);
1482 		seg++;
1483 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1484 		if (--ndmacs) {
1485 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1486 			continue;
1487 		}
1488 
1489 		vreq->v_status = VREQ_DMAWIN_DONE;
1490 		vreq->v_blkno = blkno;
1491 		if (vreq->v_dmaw + 1 == vreq->v_ndmaws)
1492 			/* last win */
1493 			rval = XF_COMP;
1494 		else
1495 			rval = XF_PARTIAL;
1496 		break;
1497 	}
1498 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1499 	DPRINTF(IO_DBG, ("xdf_prepare_rreq: request id=%"PRIx64" ready\n",
1500 	    rreq->id));
1501 
1502 	return (rval);
1503 }
1504 
1505 #define	XDF_QSEC	50000	/* .005 second */
1506 #define	XDF_POLLCNT	12	/* loop for 12 times before time out */
1507 
1508 static int
1509 xdf_drain_io(xdf_t *vdp)
1510 {
1511 	int pollc, rval;
1512 	xendev_ring_t *xbr;
1513 
1514 	if (xdfdebug & SUSRES_DBG)
1515 		xen_printf("xdf_drain_io: start\n");
1516 
1517 	mutex_enter(&vdp->xdf_dev_lk);
1518 
1519 	if ((vdp->xdf_status != XD_READY) && (vdp->xdf_status != XD_SUSPEND))
1520 		goto out;
1521 
1522 	rval = 0;
1523 	xbr = vdp->xdf_xb_ring;
1524 	ASSERT(xbr != NULL);
1525 
1526 	for (pollc = 0; pollc < XDF_POLLCNT; pollc++) {
1527 		if (xvdi_ring_has_unconsumed_responses(xbr)) {
1528 			mutex_exit(&vdp->xdf_dev_lk);
1529 			(void) xdf_intr((caddr_t)vdp);
1530 			mutex_enter(&vdp->xdf_dev_lk);
1531 		}
1532 		if (!xvdi_ring_has_incomp_request(xbr))
1533 			goto out;
1534 
1535 #ifndef	XPV_HVM_DRIVER
1536 		(void) HYPERVISOR_yield();
1537 #endif /* XPV_HVM_DRIVER */
1538 		/*
1539 		 * file-backed devices can be slow
1540 		 */
1541 		drv_usecwait(XDF_QSEC << pollc);
1542 	}
1543 	cmn_err(CE_WARN, "xdf_polled_io: timeout");
1544 	rval = EIO;
1545 out:
1546 	mutex_exit(&vdp->xdf_dev_lk);
1547 	if (xdfdebug & SUSRES_DBG)
1548 		xen_printf("xdf_drain_io: end, err=%d\n", rval);
1549 	return (rval);
1550 }
1551 
1552 /* ARGSUSED5 */
1553 int
1554 xdf_lb_rdwr(dev_info_t *devi, uchar_t cmd, void *bufp,
1555     diskaddr_t start, size_t reqlen, void *tg_cookie)
1556 {
1557 	xdf_t *vdp;
1558 	struct buf *bp;
1559 	int err = 0;
1560 
1561 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1562 	if (vdp == NULL)
1563 		return (ENXIO);
1564 
1565 	if ((start + (reqlen >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
1566 		return (EINVAL);
1567 
1568 	bp = getrbuf(KM_SLEEP);
1569 	if (cmd == TG_READ)
1570 		bp->b_flags = B_BUSY | B_READ;
1571 	else
1572 		bp->b_flags = B_BUSY | B_WRITE;
1573 	bp->b_un.b_addr = bufp;
1574 	bp->b_bcount = reqlen;
1575 	bp->b_blkno = start;
1576 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
1577 
1578 	mutex_enter(&vdp->xdf_dev_lk);
1579 	if (vdp->xdf_xdev_iostat != NULL)
1580 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1581 	if (vdp->xdf_f_act == NULL) {
1582 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
1583 	} else {
1584 		vdp->xdf_l_act->av_forw = bp;
1585 		vdp->xdf_l_act = bp;
1586 	}
1587 	mutex_exit(&vdp->xdf_dev_lk);
1588 	xdf_iostart(vdp);
1589 	err = biowait(bp);
1590 
1591 	ASSERT(bp->b_flags & B_DONE);
1592 
1593 	freerbuf(bp);
1594 	return (err);
1595 }
1596 
1597 /*
1598  * synthetic geometry
1599  */
1600 #define	XDF_NSECTS	256
1601 #define	XDF_NHEADS	16
1602 
1603 static void
1604 xdf_synthetic_pgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1605 {
1606 	xdf_t *vdp;
1607 	uint_t ncyl;
1608 
1609 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1610 
1611 	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1612 
1613 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1614 	geomp->g_acyl = 0;
1615 	geomp->g_nhead = XDF_NHEADS;
1616 	geomp->g_secsize = XB_BSIZE;
1617 	geomp->g_nsect = XDF_NSECTS;
1618 	geomp->g_intrlv = 0;
1619 	geomp->g_rpm = 7200;
1620 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1621 }
1622 
1623 static int
1624 xdf_lb_getcap(dev_info_t *devi, diskaddr_t *capp)
1625 {
1626 	xdf_t *vdp;
1627 
1628 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1629 
1630 	if (vdp == NULL)
1631 		return (ENXIO);
1632 
1633 	mutex_enter(&vdp->xdf_dev_lk);
1634 	*capp = vdp->xdf_pgeom.g_capacity;
1635 	DPRINTF(LBL_DBG, ("capacity %llu\n", *capp));
1636 	mutex_exit(&vdp->xdf_dev_lk);
1637 	return (0);
1638 }
1639 
1640 static int
1641 xdf_lb_getpgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1642 {
1643 	xdf_t *vdp;
1644 
1645 	if ((vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))) == NULL)
1646 		return (ENXIO);
1647 	*geomp = vdp->xdf_pgeom;
1648 	return (0);
1649 }
1650 
1651 /*
1652  * No real HBA, no geometry available from it
1653  */
1654 /*ARGSUSED*/
1655 static int
1656 xdf_lb_getvgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1657 {
1658 	return (EINVAL);
1659 }
1660 
1661 static int
1662 xdf_lb_getattribute(dev_info_t *devi, tg_attribute_t *tgattributep)
1663 {
1664 	xdf_t *vdp;
1665 
1666 	if (!(vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))))
1667 		return (ENXIO);
1668 
1669 	if (XD_IS_RO(vdp))
1670 		tgattributep->media_is_writable = 0;
1671 	else
1672 		tgattributep->media_is_writable = 1;
1673 	return (0);
1674 }
1675 
1676 /* ARGSUSED3 */
1677 int
1678 xdf_lb_getinfo(dev_info_t *devi, int cmd, void *arg, void *tg_cookie)
1679 {
1680 	switch (cmd) {
1681 	case TG_GETPHYGEOM:
1682 		return (xdf_lb_getpgeom(devi, (cmlb_geom_t *)arg));
1683 	case TG_GETVIRTGEOM:
1684 		return (xdf_lb_getvgeom(devi, (cmlb_geom_t *)arg));
1685 	case TG_GETCAPACITY:
1686 		return (xdf_lb_getcap(devi, (diskaddr_t *)arg));
1687 	case TG_GETBLOCKSIZE:
1688 		*(uint32_t *)arg = XB_BSIZE;
1689 		return (0);
1690 	case TG_GETATTR:
1691 		return (xdf_lb_getattribute(devi, (tg_attribute_t *)arg));
1692 	default:
1693 		return (ENOTTY);
1694 	}
1695 }
1696 
1697 /*
1698  * Kick-off connect process
1699  * Status should be XD_UNKNOWN or XD_CLOSED
1700  * On success, status will be changed to XD_INIT
1701  * On error, status won't be changed
1702  */
1703 static int
1704 xdf_start_connect(xdf_t *vdp)
1705 {
1706 	char *xsnode;
1707 	grant_ref_t gref;
1708 	xenbus_transaction_t xbt;
1709 	int rv;
1710 	dev_info_t *dip = vdp->xdf_dip;
1711 
1712 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == (domid_t)-1)
1713 		goto errout;
1714 
1715 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS) {
1716 		cmn_err(CE_WARN, "xdf@%s: failed to alloc event channel",
1717 		    ddi_get_name_addr(dip));
1718 		goto errout;
1719 	}
1720 	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1721 #ifdef XPV_HVM_DRIVER
1722 	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1723 #else /* !XPV_HVM_DRIVER */
1724 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1725 	    DDI_SUCCESS) {
1726 		cmn_err(CE_WARN, "xdf_start_connect: xdf@%s: "
1727 		    "failed to add intr handler", ddi_get_name_addr(dip));
1728 		goto errout1;
1729 	}
1730 #endif /* !XPV_HVM_DRIVER */
1731 
1732 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1733 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1734 	    DDI_SUCCESS) {
1735 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1736 		    ddi_get_name_addr(dip));
1737 		goto errout2;
1738 	}
1739 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1740 
1741 	/*
1742 	 * Write into xenstore the info needed by backend
1743 	 */
1744 	if ((xsnode = xvdi_get_xsname(dip)) == NULL) {
1745 		cmn_err(CE_WARN, "xdf@%s: "
1746 		    "failed to get xenstore node path",
1747 		    ddi_get_name_addr(dip));
1748 		goto fail_trans;
1749 	}
1750 trans_retry:
1751 	if (xenbus_transaction_start(&xbt)) {
1752 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1753 		    ddi_get_name_addr(dip));
1754 		xvdi_fatal_error(dip, EIO, "transaction start");
1755 		goto fail_trans;
1756 	}
1757 
1758 	if (rv = xenbus_printf(xbt, xsnode, "ring-ref", "%u", gref)) {
1759 		cmn_err(CE_WARN, "xdf@%s: failed to write ring-ref",
1760 		    ddi_get_name_addr(dip));
1761 		xvdi_fatal_error(dip, rv, "writing ring-ref");
1762 		goto abort_trans;
1763 	}
1764 
1765 	if (rv = xenbus_printf(xbt, xsnode, "event-channel", "%u",
1766 	    vdp->xdf_evtchn)) {
1767 		cmn_err(CE_WARN, "xdf@%s: failed to write event-channel",
1768 		    ddi_get_name_addr(dip));
1769 		xvdi_fatal_error(dip, rv, "writing event-channel");
1770 		goto abort_trans;
1771 	}
1772 
1773 	/*
1774 	 * "protocol" is written by the domain builder in the case of PV
1775 	 * domains. However, it is not written for HVM domains, so let's
1776 	 * write it here.
1777 	 */
1778 	if (rv = xenbus_printf(xbt, xsnode, "protocol", "%s",
1779 	    XEN_IO_PROTO_ABI_NATIVE)) {
1780 		cmn_err(CE_WARN, "xdf@%s: failed to write protocol",
1781 		    ddi_get_name_addr(dip));
1782 		xvdi_fatal_error(dip, rv, "writing protocol");
1783 		goto abort_trans;
1784 	}
1785 
1786 	if ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0) {
1787 		cmn_err(CE_WARN, "xdf@%s: "
1788 		    "failed to switch state to XenbusStateInitialised",
1789 		    ddi_get_name_addr(dip));
1790 		xvdi_fatal_error(dip, rv, "writing state");
1791 		goto abort_trans;
1792 	}
1793 
1794 	/* kick-off connect process */
1795 	if (rv = xenbus_transaction_end(xbt, 0)) {
1796 		if (rv == EAGAIN)
1797 			goto trans_retry;
1798 		cmn_err(CE_WARN, "xdf@%s: failed to end transaction",
1799 		    ddi_get_name_addr(dip));
1800 		xvdi_fatal_error(dip, rv, "completing transaction");
1801 		goto fail_trans;
1802 	}
1803 
1804 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1805 	mutex_enter(&vdp->xdf_dev_lk);
1806 	vdp->xdf_status = XD_INIT;
1807 	mutex_exit(&vdp->xdf_dev_lk);
1808 
1809 	return (DDI_SUCCESS);
1810 
1811 abort_trans:
1812 	(void) xenbus_transaction_end(xbt, 1);
1813 fail_trans:
1814 	xvdi_free_ring(vdp->xdf_xb_ring);
1815 errout2:
1816 #ifdef XPV_HVM_DRIVER
1817 	ec_unbind_evtchn(vdp->xdf_evtchn);
1818 #else /* !XPV_HVM_DRIVER */
1819 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1820 #endif /* !XPV_HVM_DRIVER */
1821 errout1:
1822 	xvdi_free_evtchn(dip);
1823 errout:
1824 	cmn_err(CE_WARN, "xdf@%s: fail to kick-off connecting",
1825 	    ddi_get_name_addr(dip));
1826 	return (DDI_FAILURE);
1827 }
1828 
1829 /*
1830  * Kick-off disconnect process
1831  * Status won't be changed
1832  */
1833 static int
1834 xdf_start_disconnect(xdf_t *vdp)
1835 {
1836 	if (xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed) > 0) {
1837 		cmn_err(CE_WARN, "xdf@%s: fail to kick-off disconnecting",
1838 		    ddi_get_name_addr(vdp->xdf_dip));
1839 		return (DDI_FAILURE);
1840 	}
1841 
1842 	return (DDI_SUCCESS);
1843 }
1844 
1845 int
1846 xdf_get_flush_block(xdf_t *vdp)
1847 {
1848 	/*
1849 	 * Get a DEV_BSIZE aligned bufer
1850 	 */
1851 	vdp->xdf_flush_mem = kmem_alloc(DEV_BSIZE * 2, KM_SLEEP);
1852 	vdp->xdf_cache_flush_block =
1853 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem), DEV_BSIZE);
1854 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1855 	    xdf_flush_block, DEV_BSIZE, NULL) != 0)
1856 		return (DDI_FAILURE);
1857 	return (DDI_SUCCESS);
1858 }
1859 
1860 /*
1861  * Finish other initialization after we've connected to backend
1862  * Status should be XD_INIT before calling this routine
1863  * On success, status should be changed to XD_READY
1864  * On error, status should stay XD_INIT
1865  */
1866 static int
1867 xdf_post_connect(xdf_t *vdp)
1868 {
1869 	int rv;
1870 	uint_t len;
1871 	char *type;
1872 	char *barrier;
1873 	dev_info_t *devi = vdp->xdf_dip;
1874 
1875 	/*
1876 	 * Determine if feature barrier is supported by backend
1877 	 */
1878 	if (xenbus_read(XBT_NULL, xvdi_get_oename(devi),
1879 	    "feature-barrier", (void **)&barrier, &len) == 0) {
1880 		vdp->xdf_feature_barrier = 1;
1881 		kmem_free(barrier, len);
1882 	} else {
1883 		cmn_err(CE_NOTE, "xdf@%s: failed to read feature-barrier",
1884 		    ddi_get_name_addr(vdp->xdf_dip));
1885 		vdp->xdf_feature_barrier = 0;
1886 	}
1887 
1888 	/* probe backend */
1889 	if (rv = xenbus_gather(XBT_NULL, xvdi_get_oename(devi),
1890 	    "sectors", "%"SCNu64, &vdp->xdf_xdev_nblocks,
1891 	    "info", "%u", &vdp->xdf_xdev_info, NULL)) {
1892 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1893 		    "cannot read backend info", ddi_get_name_addr(devi));
1894 		xvdi_fatal_error(devi, rv, "reading backend info");
1895 		return (DDI_FAILURE);
1896 	}
1897 
1898 	/*
1899 	 * Make sure that the device we're connecting isn't smaller than
1900 	 * the old connected device.
1901 	 */
1902 	if (vdp->xdf_xdev_nblocks < vdp->xdf_pgeom.g_capacity) {
1903 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1904 		    "backend disk device shrank", ddi_get_name_addr(devi));
1905 		/* XXX:  call xvdi_fatal_error() here? */
1906 		xvdi_fatal_error(devi, rv, "reading backend info");
1907 		return (DDI_FAILURE);
1908 	}
1909 
1910 	/*
1911 	 * Only update the physical geometry to reflect the new device
1912 	 * size if this is the first time we're connecting to the backend
1913 	 * device.  Once we assign a physical geometry to a device it stays
1914 	 * fixed until:
1915 	 *	- we get detach and re-attached (at which point we
1916 	 *	  automatically assign a new physical geometry).
1917 	 *	- someone calls TG_SETPHYGEOM to explicity set the
1918 	 *	  physical geometry.
1919 	 */
1920 	if (vdp->xdf_pgeom.g_capacity == 0)
1921 		xdf_synthetic_pgeom(devi, &vdp->xdf_pgeom);
1922 
1923 	/* fix disk type */
1924 	if (xenbus_read(XBT_NULL, xvdi_get_xsname(devi), "device-type",
1925 	    (void **)&type, &len) != 0) {
1926 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1927 		    "cannot read device-type", ddi_get_name_addr(devi));
1928 		xvdi_fatal_error(devi, rv, "reading device-type");
1929 		return (DDI_FAILURE);
1930 	}
1931 	if (strcmp(type, "cdrom") == 0)
1932 		vdp->xdf_xdev_info |= VDISK_CDROM;
1933 	kmem_free(type, len);
1934 
1935 	/*
1936 	 * We've created all the minor nodes via cmlb_attach() using default
1937 	 * value in xdf_attach() to make it possible to block in xdf_open(),
1938 	 * in case there's anyone (say, booting thread) ever trying to open
1939 	 * it before connected to backend. We will refresh all those minor
1940 	 * nodes w/ latest info we've got now when we are almost connected.
1941 	 *
1942 	 * Don't do this when xdf is already opened by someone (could happen
1943 	 * during resume), for that cmlb_attach() will invalid the label info
1944 	 * and confuse those who has already opened the node, which is bad.
1945 	 */
1946 	if (!xdf_isopen(vdp, -1) && (XD_IS_CD(vdp) || XD_IS_RM(vdp))) {
1947 		/* re-init cmlb w/ latest info we got from backend */
1948 		if (cmlb_attach(devi, &xdf_lb_ops,
1949 		    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
1950 		    XD_IS_RM(vdp), 1,
1951 		    XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
1952 #if defined(XPV_HVM_DRIVER)
1953 		    CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT |
1954 		    CMLB_INTERNAL_MINOR_NODES,
1955 #else /* !XPV_HVM_DRIVER */
1956 		    CMLB_FAKE_LABEL_ONE_PARTITION,
1957 #endif /* !XPV_HVM_DRIVER */
1958 		    vdp->xdf_vd_lbl, NULL) != 0) {
1959 			cmn_err(CE_WARN, "xdf@%s: cmlb attach failed",
1960 			    ddi_get_name_addr(devi));
1961 			return (DDI_FAILURE);
1962 		}
1963 	}
1964 
1965 	/* mark vbd is ready for I/O */
1966 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1967 	mutex_enter(&vdp->xdf_dev_lk);
1968 	vdp->xdf_status = XD_READY;
1969 	mutex_exit(&vdp->xdf_dev_lk);
1970 	/*
1971 	 * If backend has feature-barrier, see if it supports disk
1972 	 * cache flush op.
1973 	 */
1974 	vdp->xdf_flush_supported = 0;
1975 	if (vdp->xdf_feature_barrier) {
1976 		/*
1977 		 * Pretend we already know flush is supported so probe
1978 		 * will attempt the correct op.
1979 		 */
1980 		vdp->xdf_flush_supported = 1;
1981 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1982 			vdp->xdf_flush_supported = 1;
1983 		} else {
1984 			vdp->xdf_flush_supported = 0;
1985 			/*
1986 			 * If the other end does not support the cache flush op
1987 			 * then we must use a barrier-write to force disk
1988 			 * cache flushing.  Barrier writes require that a data
1989 			 * block actually be written.
1990 			 * Cache a block to barrier-write when we are
1991 			 * asked to perform a flush.
1992 			 * XXX - would it be better to just copy 1 block
1993 			 * (512 bytes) from whatever write we did last
1994 			 * and rewrite that block?
1995 			 */
1996 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS)
1997 				return (DDI_FAILURE);
1998 		}
1999 	}
2000 
2001 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", ddi_get_name_addr(devi),
2002 	    (uint64_t)vdp->xdf_xdev_nblocks);
2003 
2004 	return (DDI_SUCCESS);
2005 }
2006 
2007 /*
2008  * Finish other uninitialization after we've disconnected from backend
2009  * when status is XD_CLOSING or XD_INIT. After returns, status is XD_CLOSED
2010  */
2011 static void
2012 xdf_post_disconnect(xdf_t *vdp)
2013 {
2014 #ifdef XPV_HVM_DRIVER
2015 	ec_unbind_evtchn(vdp->xdf_evtchn);
2016 #else /* !XPV_HVM_DRIVER */
2017 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
2018 #endif /* !XPV_HVM_DRIVER */
2019 	xvdi_free_evtchn(vdp->xdf_dip);
2020 	xvdi_free_ring(vdp->xdf_xb_ring);
2021 	vdp->xdf_xb_ring = NULL;
2022 	vdp->xdf_xb_ring_hdl = NULL;
2023 	vdp->xdf_peer = (domid_t)-1;
2024 
2025 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
2026 	mutex_enter(&vdp->xdf_dev_lk);
2027 	vdp->xdf_status = XD_CLOSED;
2028 	mutex_exit(&vdp->xdf_dev_lk);
2029 }
2030 
2031 /*ARGSUSED*/
2032 static void
2033 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
2034 {
2035 	XenbusState new_state = *(XenbusState *)impl_data;
2036 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
2037 	boolean_t unexpect_die = B_FALSE;
2038 	int status;
2039 
2040 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
2041 	    ddi_get_name_addr(dip), new_state));
2042 
2043 	mutex_enter(&vdp->xdf_cb_lk);
2044 
2045 	if (xdf_check_state_transition(vdp, new_state) == DDI_FAILURE) {
2046 		mutex_exit(&vdp->xdf_cb_lk);
2047 		return;
2048 	}
2049 
2050 	switch (new_state) {
2051 	case XenbusStateInitialising:
2052 		ASSERT(vdp->xdf_status == XD_CLOSED);
2053 		/*
2054 		 * backend recovered from a previous failure,
2055 		 * kick-off connect process again
2056 		 */
2057 		if (xdf_start_connect(vdp) != DDI_SUCCESS) {
2058 			cmn_err(CE_WARN, "xdf@%s:"
2059 			    " failed to start reconnecting to backend",
2060 			    ddi_get_name_addr(dip));
2061 		}
2062 		break;
2063 	case XenbusStateConnected:
2064 		ASSERT(vdp->xdf_status == XD_INIT);
2065 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
2066 		/* finish final init after connect */
2067 		if (xdf_post_connect(vdp) != DDI_SUCCESS)
2068 			(void) xdf_start_disconnect(vdp);
2069 		break;
2070 	case XenbusStateClosing:
2071 		if (vdp->xdf_status == XD_READY) {
2072 			mutex_enter(&vdp->xdf_dev_lk);
2073 			if (xdf_isopen(vdp, -1)) {
2074 				cmn_err(CE_NOTE, "xdf@%s: hot-unplug failed, "
2075 				    "still in use", ddi_get_name_addr(dip));
2076 				mutex_exit(&vdp->xdf_dev_lk);
2077 				break;
2078 			} else {
2079 				vdp->xdf_status = XD_CLOSING;
2080 			}
2081 			mutex_exit(&vdp->xdf_dev_lk);
2082 		}
2083 		(void) xdf_start_disconnect(vdp);
2084 		break;
2085 	case XenbusStateClosed:
2086 		/* first check if BE closed unexpectedly */
2087 		mutex_enter(&vdp->xdf_dev_lk);
2088 		if (xdf_isopen(vdp, -1)) {
2089 			unexpect_die = B_TRUE;
2090 			unexpectedie(vdp);
2091 			cmn_err(CE_WARN, "xdf@%s: backend closed, "
2092 			    "reconnecting...", ddi_get_name_addr(dip));
2093 		}
2094 		mutex_exit(&vdp->xdf_dev_lk);
2095 
2096 		if (vdp->xdf_status == XD_READY) {
2097 			mutex_enter(&vdp->xdf_dev_lk);
2098 			vdp->xdf_status = XD_CLOSING;
2099 			mutex_exit(&vdp->xdf_dev_lk);
2100 
2101 #ifdef	DOMU_BACKEND
2102 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
2103 #endif
2104 
2105 			xdf_post_disconnect(vdp);
2106 			(void) xvdi_switch_state(dip, XBT_NULL,
2107 			    XenbusStateClosed);
2108 		} else if ((vdp->xdf_status == XD_INIT) ||
2109 		    (vdp->xdf_status == XD_CLOSING)) {
2110 			xdf_post_disconnect(vdp);
2111 		} else {
2112 			mutex_enter(&vdp->xdf_dev_lk);
2113 			vdp->xdf_status = XD_CLOSED;
2114 			mutex_exit(&vdp->xdf_dev_lk);
2115 		}
2116 	}
2117 
2118 	/* notify anybody waiting for oe state change */
2119 	mutex_enter(&vdp->xdf_dev_lk);
2120 	cv_broadcast(&vdp->xdf_dev_cv);
2121 	mutex_exit(&vdp->xdf_dev_lk);
2122 
2123 	status = vdp->xdf_status;
2124 	mutex_exit(&vdp->xdf_cb_lk);
2125 
2126 	if (status == XD_READY) {
2127 		xdf_iostart(vdp);
2128 	} else if ((status == XD_CLOSED) && !unexpect_die) {
2129 		/* interface is closed successfully, remove all minor nodes */
2130 		if (vdp->xdf_vd_lbl != NULL) {
2131 			cmlb_detach(vdp->xdf_vd_lbl, NULL);
2132 			cmlb_free_handle(&vdp->xdf_vd_lbl);
2133 			vdp->xdf_vd_lbl = NULL;
2134 		}
2135 	}
2136 }
2137 
2138 /* check if partition is open, -1 - check all partitions on the disk */
2139 static boolean_t
2140 xdf_isopen(xdf_t *vdp, int partition)
2141 {
2142 	int i;
2143 	ulong_t parbit;
2144 	boolean_t rval = B_FALSE;
2145 
2146 	ASSERT((partition == -1) ||
2147 	    ((partition >= 0) || (partition < XDF_PEXT)));
2148 
2149 	if (partition == -1)
2150 		parbit = (ulong_t)-1;
2151 	else
2152 		parbit = 1 << partition;
2153 
2154 	for (i = 0; i < OTYPCNT; i++) {
2155 		if (vdp->xdf_vd_open[i] & parbit)
2156 			rval = B_TRUE;
2157 	}
2158 
2159 	return (rval);
2160 }
2161 
2162 /*
2163  * Xdf_check_state_transition will check the XenbusState change to see
2164  * if the change is a valid transition or not.
2165  * The new state is written by backend domain, or by running xenstore-write
2166  * to change it manually in dom0
2167  */
2168 static int
2169 xdf_check_state_transition(xdf_t *vdp, XenbusState oestate)
2170 {
2171 	int status;
2172 	int stcheck;
2173 #define	STOK	0 /* need further process */
2174 #define	STNOP	1 /* no action need taking */
2175 #define	STBUG	2 /* unexpected state change, could be a bug */
2176 
2177 	status = vdp->xdf_status;
2178 	stcheck = STOK;
2179 
2180 	switch (status) {
2181 	case XD_UNKNOWN:
2182 		if ((oestate == XenbusStateUnknown)		||
2183 		    (oestate == XenbusStateConnected))
2184 			stcheck = STBUG;
2185 		else if ((oestate == XenbusStateInitialising)	||
2186 		    (oestate == XenbusStateInitWait)		||
2187 		    (oestate == XenbusStateInitialised))
2188 			stcheck = STNOP;
2189 		break;
2190 	case XD_INIT:
2191 		if (oestate == XenbusStateUnknown)
2192 			stcheck = STBUG;
2193 		else if ((oestate == XenbusStateInitialising)	||
2194 		    (oestate == XenbusStateInitWait)		||
2195 		    (oestate == XenbusStateInitialised))
2196 			stcheck = STNOP;
2197 		break;
2198 	case XD_READY:
2199 		if ((oestate == XenbusStateUnknown)		||
2200 		    (oestate == XenbusStateInitialising)	||
2201 		    (oestate == XenbusStateInitWait)		||
2202 		    (oestate == XenbusStateInitialised))
2203 			stcheck = STBUG;
2204 		else if (oestate == XenbusStateConnected)
2205 			stcheck = STNOP;
2206 		break;
2207 	case XD_CLOSING:
2208 		if ((oestate == XenbusStateUnknown)		||
2209 		    (oestate == XenbusStateInitialising)	||
2210 		    (oestate == XenbusStateInitWait)		||
2211 		    (oestate == XenbusStateInitialised)		||
2212 		    (oestate == XenbusStateConnected))
2213 			stcheck = STBUG;
2214 		else if (oestate == XenbusStateClosing)
2215 			stcheck = STNOP;
2216 		break;
2217 	case XD_CLOSED:
2218 		if ((oestate == XenbusStateUnknown)		||
2219 		    (oestate == XenbusStateConnected))
2220 			stcheck = STBUG;
2221 		else if ((oestate == XenbusStateInitWait)	||
2222 		    (oestate == XenbusStateInitialised)		||
2223 		    (oestate == XenbusStateClosing)		||
2224 		    (oestate == XenbusStateClosed))
2225 			stcheck = STNOP;
2226 		break;
2227 	case XD_SUSPEND:
2228 	default:
2229 			stcheck = STBUG;
2230 	}
2231 
2232 	if (stcheck == STOK)
2233 		return (DDI_SUCCESS);
2234 
2235 	if (stcheck == STBUG)
2236 		cmn_err(CE_NOTE, "xdf@%s: unexpected otherend "
2237 		    "state change to %d!, when status is %d",
2238 		    ddi_get_name_addr(vdp->xdf_dip), oestate, status);
2239 
2240 	return (DDI_FAILURE);
2241 }
2242 
2243 static int
2244 xdf_connect(xdf_t *vdp, boolean_t wait)
2245 {
2246 	ASSERT(mutex_owned(&vdp->xdf_dev_lk));
2247 	while (vdp->xdf_status != XD_READY) {
2248 		if (!wait || (vdp->xdf_status > XD_READY))
2249 			break;
2250 
2251 		if (cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk) == 0)
2252 			break;
2253 	}
2254 
2255 	return (vdp->xdf_status);
2256 }
2257 
2258 /*
2259  * callback func when DMA/GTE resources is available
2260  *
2261  * Note: we only register one callback function to grant table subsystem
2262  * since we only have one 'struct gnttab_free_callback' in xdf_t.
2263  */
2264 static int
2265 xdf_dmacallback(caddr_t arg)
2266 {
2267 	xdf_t *vdp = (xdf_t *)arg;
2268 	ASSERT(vdp != NULL);
2269 
2270 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
2271 	    ddi_get_name_addr(vdp->xdf_dip)));
2272 
2273 	ddi_trigger_softintr(vdp->xdf_softintr_id);
2274 	return (DDI_DMA_CALLBACK_DONE);
2275 }
2276 
2277 static uint_t
2278 xdf_iorestart(caddr_t arg)
2279 {
2280 	xdf_t *vdp = (xdf_t *)arg;
2281 
2282 	ASSERT(vdp != NULL);
2283 
2284 	mutex_enter(&vdp->xdf_dev_lk);
2285 	ASSERT(ISDMACBON(vdp));
2286 	SETDMACBOFF(vdp);
2287 	mutex_exit(&vdp->xdf_dev_lk);
2288 
2289 	xdf_iostart(vdp);
2290 
2291 	return (DDI_INTR_CLAIMED);
2292 }
2293 
2294 static void
2295 xdf_timeout_handler(void *arg)
2296 {
2297 	xdf_t *vdp = arg;
2298 
2299 	mutex_enter(&vdp->xdf_dev_lk);
2300 	vdp->xdf_timeout_id = 0;
2301 	mutex_exit(&vdp->xdf_dev_lk);
2302 
2303 	/* new timeout thread could be re-scheduled */
2304 	xdf_iostart(vdp);
2305 }
2306 
2307 /*
2308  * Alloc a vreq for this bp
2309  * bp->av_back contains the pointer to the vreq upon return
2310  */
2311 static v_req_t *
2312 vreq_get(xdf_t *vdp, buf_t *bp)
2313 {
2314 	v_req_t *vreq = NULL;
2315 
2316 	ASSERT(BP2VREQ(bp) == NULL);
2317 
2318 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
2319 	if (vreq == NULL) {
2320 		if (vdp->xdf_timeout_id == 0)
2321 			/* restart I/O after one second */
2322 			vdp->xdf_timeout_id =
2323 			    timeout(xdf_timeout_handler, vdp, hz);
2324 		return (NULL);
2325 	}
2326 	bzero(vreq, sizeof (v_req_t));
2327 
2328 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
2329 	bp->av_back = (buf_t *)vreq;
2330 	vreq->v_buf = bp;
2331 	vreq->v_status = VREQ_INIT;
2332 	/* init of other fields in vreq is up to the caller */
2333 
2334 	return (vreq);
2335 }
2336 
2337 static void
2338 vreq_free(xdf_t *vdp, v_req_t *vreq)
2339 {
2340 	buf_t *bp = vreq->v_buf;
2341 
2342 	list_remove(&vdp->xdf_vreq_act, (void *)vreq);
2343 
2344 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
2345 		goto done;
2346 
2347 	switch (vreq->v_status) {
2348 	case VREQ_DMAWIN_DONE:
2349 	case VREQ_GS_ALLOCED:
2350 	case VREQ_DMABUF_BOUND:
2351 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
2352 		/*FALLTHRU*/
2353 	case VREQ_DMAMEM_ALLOCED:
2354 		if (!ALIGNED_XFER(bp)) {
2355 			ASSERT(vreq->v_abuf != NULL);
2356 			if (!IS_ERROR(bp) && IS_READ(bp))
2357 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
2358 				    bp->b_bcount);
2359 			ddi_dma_mem_free(&vreq->v_align);
2360 		}
2361 		/*FALLTHRU*/
2362 	case VREQ_MEMDMAHDL_ALLOCED:
2363 		if (!ALIGNED_XFER(bp))
2364 			ddi_dma_free_handle(&vreq->v_memdmahdl);
2365 		/*FALLTHRU*/
2366 	case VREQ_DMAHDL_ALLOCED:
2367 		ddi_dma_free_handle(&vreq->v_dmahdl);
2368 		break;
2369 	default:
2370 		break;
2371 	}
2372 done:
2373 	vreq->v_buf->av_back = NULL;
2374 	kmem_cache_free(xdf_vreq_cache, vreq);
2375 }
2376 
/*
 * Initialize the DMA and grant table resources for the buf
 *
 * The setup is a resumable state machine keyed on vreq->v_status.  Each
 * stage records its completion in v_status before falling through to the
 * next one, so when a stage fails for lack of resources the function
 * returns DDI_FAILURE and a later call picks up at the stage that failed.
 *
 * Returns DDI_SUCCESS once the vreq is in VREQ_GS_ALLOCED state, and
 * DDI_FAILURE if a resource could not be obtained or v_status is not one
 * of the states handled here.
 */
static int
vreq_setup(xdf_t *vdp, v_req_t *vreq)
{
	int rc;
	ddi_dma_attr_t dmaattr;
	uint_t ndcs, ndws;
	ddi_dma_handle_t dh;
	ddi_dma_handle_t mdh;
	ddi_dma_cookie_t dc;
	ddi_acc_handle_t abh;
	caddr_t	aba;
	ge_slot_t *gs;
	size_t bufsz;
	off_t off;
	size_t sz;
	buf_t *bp = vreq->v_buf;
	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;

	switch (vreq->v_status) {
	case VREQ_INIT:
		/*
		 * A cache flush request needs no DMA resources: it only
		 * takes a ge_slot and goes straight to VREQ_GS_ALLOCED.
		 */
		if (IS_FLUSH_DISKCACHE(bp)) {
			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
				DPRINTF(DMA_DBG, (
				    "xdf@%s: get ge_slotfailed\n",
				    ddi_get_name_addr(vdp->xdf_dip)));
				return (DDI_FAILURE);
			}
			vreq->v_blkno = 0;
			vreq->v_nslots = 1;
			vreq->v_gs = gs;
			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
			vreq->v_status = VREQ_GS_ALLOCED;
			gs->vreq = vreq;
			return (DDI_SUCCESS);
		}

		if (IS_WRITE_BARRIER(vdp, bp))
			vreq->v_flush_diskcache = WRITE_BARRIER;
		/* b_private is added to b_blkno as a block offset, then reset */
		vreq->v_blkno = bp->b_blkno +
		    (diskaddr_t)(uintptr_t)bp->b_private;
		bp->b_private = NULL;
		/* See if we wrote new data to our flush block */
		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
			check_fbwrite(vdp, bp, vreq->v_blkno);
		vreq->v_status = VREQ_INIT_DONE;
		/*FALLTHRU*/

	case VREQ_INIT_DONE:
		/*
		 * alloc DMA handle
		 */
		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
		    xdf_dmacallback, (caddr_t)vdp, &dh);
		if (rc != DDI_SUCCESS) {
			/* xdf_dmacallback was passed as the wait callback */
			SETDMACBON(vdp);
			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
			    ddi_get_name_addr(vdp->xdf_dip)));
			return (DDI_FAILURE);
		}

		vreq->v_dmahdl = dh;
		vreq->v_status = VREQ_DMAHDL_ALLOCED;
		/*FALLTHRU*/

	case VREQ_DMAHDL_ALLOCED:
		/*
		 * alloc dma handle for 512-byte aligned buf
		 *
		 * Only done when ALIGNED_XFER(bp) is false; otherwise
		 * v_status stays at VREQ_DMAHDL_ALLOCED while this and the
		 * next stage are passed through.
		 */
		if (!ALIGNED_XFER(bp)) {
			/*
			 * XXPV: we need to temporarily enlarge the seg
			 * boundary and s/g length to work round CR6381968
			 */
			dmaattr = xb_dma_attr;
			dmaattr.dma_attr_seg = (uint64_t)-1;
			dmaattr.dma_attr_sgllen = INT_MAX;
			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
			    xdf_dmacallback, (caddr_t)vdp, &mdh);
			if (rc != DDI_SUCCESS) {
				SETDMACBON(vdp);
				DPRINTF(DMA_DBG, ("xdf@%s: unaligned buf DMA"
				    "handle alloc failed\n",
				    ddi_get_name_addr(vdp->xdf_dip)));
				return (DDI_FAILURE);
			}
			vreq->v_memdmahdl = mdh;
			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
		}
		/*FALLTHRU*/

	case VREQ_MEMDMAHDL_ALLOCED:
		/*
		 * alloc 512-byte aligned buf
		 */
		if (!ALIGNED_XFER(bp)) {
			/* the bcopy below needs a kernel mapping of the buf */
			if (bp->b_flags & (B_PAGEIO | B_PHYS))
				bp_mapin(bp);

			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
			    &aba, &bufsz, &abh);
			if (rc != DDI_SUCCESS) {
				SETDMACBON(vdp);
				DPRINTF(DMA_DBG, (
				    "xdf@%s: DMA mem allocation failed\n",
				    ddi_get_name_addr(vdp->xdf_dip)));
				return (DDI_FAILURE);
			}

			vreq->v_abuf = aba;
			vreq->v_align = abh;
			vreq->v_status = VREQ_DMAMEM_ALLOCED;

			ASSERT(bufsz >= bp->b_bcount);
			/*
			 * For a write, stage the caller's data in the
			 * aligned buffer now; for a read, vreq_free() copies
			 * the data back out.
			 */
			if (!IS_READ(bp))
				bcopy(bp->b_un.b_addr, vreq->v_abuf,
				    bp->b_bcount);
		}
		/*FALLTHRU*/

	case VREQ_DMAMEM_ALLOCED:
		/*
		 * dma bind
		 *
		 * Bind the buf itself for an aligned transfer, or the
		 * aligned bounce buffer otherwise.
		 */
		if (ALIGNED_XFER(bp)) {
			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
			    &dc, &ndcs);
		} else {
			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
		}
		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
			/* get num of dma windows */
			if (rc == DDI_DMA_PARTIAL_MAP) {
				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
				ASSERT(rc == DDI_SUCCESS);
			} else {
				ndws = 1;
			}
		} else {
			SETDMACBON(vdp);
			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
			    ddi_get_name_addr(vdp->xdf_dip)));
			return (DDI_FAILURE);
		}

		/* start at window 0; one slot is counted per DMA window */
		vreq->v_dmac = dc;
		vreq->v_dmaw = 0;
		vreq->v_ndmacs = ndcs;
		vreq->v_ndmaws = ndws;
		vreq->v_nslots = ndws;
		vreq->v_status = VREQ_DMABUF_BOUND;
		/*FALLTHRU*/

	case VREQ_DMABUF_BOUND:
		/*
		 * get ge_slot, callback is set upon failure from gs_get(),
		 * if not set previously
		 */
		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
			    ddi_get_name_addr(vdp->xdf_dip)));
			return (DDI_FAILURE);
		}

		vreq->v_gs = gs;
		gs->vreq = vreq;
		vreq->v_status = VREQ_GS_ALLOCED;
		break;

	case VREQ_GS_ALLOCED:
		/* nothing need to be done */
		break;

	case VREQ_DMAWIN_DONE:
		/*
		 * move to the next dma window
		 */
		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);

		/* get a ge_slot for this DMA window */
		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
			    ddi_get_name_addr(vdp->xdf_dip)));
			return (DDI_FAILURE);
		}

		vreq->v_gs = gs;
		gs->vreq = vreq;
		/* the window index only advances once a slot is in hand */
		vreq->v_dmaw++;
		rc = ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
		    &vreq->v_dmac, &vreq->v_ndmacs);
		ASSERT(rc == DDI_SUCCESS);
		vreq->v_status = VREQ_GS_ALLOCED;
		break;

	default:
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}
2586 
2587 static ge_slot_t *
2588 gs_get(xdf_t *vdp, int isread)
2589 {
2590 	grant_ref_t gh;
2591 	ge_slot_t *gs;
2592 
2593 	/* try to alloc GTEs needed in this slot, first */
2594 	if (gnttab_alloc_grant_references(
2595 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
2596 		if (vdp->xdf_gnt_callback.next == NULL) {
2597 			SETDMACBON(vdp);
2598 			gnttab_request_free_callback(
2599 			    &vdp->xdf_gnt_callback,
2600 			    (void (*)(void *))xdf_dmacallback,
2601 			    (void *)vdp,
2602 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
2603 		}
2604 		return (NULL);
2605 	}
2606 
2607 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
2608 	if (gs == NULL) {
2609 		gnttab_free_grant_references(gh);
2610 		if (vdp->xdf_timeout_id == 0)
2611 			/* restart I/O after one second */
2612 			vdp->xdf_timeout_id =
2613 			    timeout(xdf_timeout_handler, vdp, hz);
2614 		return (NULL);
2615 	}
2616 
2617 	/* init gs_slot */
2618 	list_insert_head(&vdp->xdf_gs_act, (void *)gs);
2619 	gs->oeid = vdp->xdf_peer;
2620 	gs->isread = isread;
2621 	gs->ghead = gh;
2622 	gs->ngrefs = 0;
2623 
2624 	return (gs);
2625 }
2626 
2627 static void
2628 gs_free(xdf_t *vdp, ge_slot_t *gs)
2629 {
2630 	int i;
2631 	grant_ref_t *gp = gs->ge;
2632 	int ngrefs = gs->ngrefs;
2633 	boolean_t isread = gs->isread;
2634 
2635 	list_remove(&vdp->xdf_gs_act, (void *)gs);
2636 
2637 	/* release all grant table entry resources used in this slot */
2638 	for (i = 0; i < ngrefs; i++, gp++)
2639 		gnttab_end_foreign_access(*gp, !isread, 0);
2640 	gnttab_free_grant_references(gs->ghead);
2641 
2642 	kmem_cache_free(xdf_gs_cache, (void *)gs);
2643 }
2644 
2645 static grant_ref_t
2646 gs_grant(ge_slot_t *gs, mfn_t mfn)
2647 {
2648 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->ghead);
2649 
2650 	ASSERT(gr != -1);
2651 	ASSERT(gs->ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
2652 	gs->ge[gs->ngrefs++] = gr;
2653 	gnttab_grant_foreign_access_ref(gr, gs->oeid, mfn, !gs->isread);
2654 
2655 	return (gr);
2656 }
2657 
2658 static void
2659 unexpectedie(xdf_t *vdp)
2660 {
2661 	/* clean up I/Os in ring that have responses */
2662 	if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
2663 		mutex_exit(&vdp->xdf_dev_lk);
2664 		(void) xdf_intr((caddr_t)vdp);
2665 		mutex_enter(&vdp->xdf_dev_lk);
2666 	}
2667 
2668 	/* free up all grant table entries */
2669 	while (!list_is_empty(&vdp->xdf_gs_act))
2670 		gs_free(vdp, list_head(&vdp->xdf_gs_act));
2671 
2672 	/*
2673 	 * move bp back to active list orderly
2674 	 * vreq_busy is updated in vreq_free()
2675 	 */
2676 	while (!list_is_empty(&vdp->xdf_vreq_act)) {
2677 		v_req_t *vreq = list_head(&vdp->xdf_vreq_act);
2678 		buf_t *bp = vreq->v_buf;
2679 
2680 		bp->av_back = NULL;
2681 		bp->b_resid = bp->b_bcount;
2682 		if (vdp->xdf_f_act == NULL) {
2683 			vdp->xdf_f_act = vdp->xdf_l_act = bp;
2684 		} else {
2685 			/* move to the head of list */
2686 			bp->av_forw = vdp->xdf_f_act;
2687 			vdp->xdf_f_act = bp;
2688 		}
2689 		if (vdp->xdf_xdev_iostat != NULL)
2690 			kstat_runq_back_to_waitq(
2691 			    KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
2692 		vreq_free(vdp, vreq);
2693 	}
2694 }
2695 
2696 static void
2697 xdfmin(struct buf *bp)
2698 {
2699 	if (bp->b_bcount > xdf_maxphys)
2700 		bp->b_bcount = xdf_maxphys;
2701 }
2702 
2703 void
2704 xdf_kstat_delete(dev_info_t *dip)
2705 {
2706 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2707 	kstat_t	*kstat;
2708 
2709 	/*
2710 	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
2711 	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
2712 	 * and the contents of the our kstat.  xdf_iostat_lk is used
2713 	 * to protect the allocation and freeing of the actual kstat.
2714 	 * xdf_dev_lk can't be used for this purpose because kstat
2715 	 * readers use it to access the contents of the kstat and
2716 	 * hence it can't be held when calling kstat_delete().
2717 	 */
2718 	mutex_enter(&vdp->xdf_iostat_lk);
2719 	mutex_enter(&vdp->xdf_dev_lk);
2720 
2721 	if (vdp->xdf_xdev_iostat == NULL) {
2722 		mutex_exit(&vdp->xdf_dev_lk);
2723 		mutex_exit(&vdp->xdf_iostat_lk);
2724 		return;
2725 	}
2726 
2727 	kstat = vdp->xdf_xdev_iostat;
2728 	vdp->xdf_xdev_iostat = NULL;
2729 	mutex_exit(&vdp->xdf_dev_lk);
2730 
2731 	kstat_delete(kstat);
2732 	mutex_exit(&vdp->xdf_iostat_lk);
2733 }
2734 
2735 int
2736 xdf_kstat_create(dev_info_t *dip, char *ks_module, int ks_instance)
2737 {
2738 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2739 
2740 	/* See comment about locking in xdf_kstat_delete(). */
2741 	mutex_enter(&vdp->xdf_iostat_lk);
2742 	mutex_enter(&vdp->xdf_dev_lk);
2743 
2744 	if (vdp->xdf_xdev_iostat != NULL) {
2745 		mutex_exit(&vdp->xdf_dev_lk);
2746 		mutex_exit(&vdp->xdf_iostat_lk);
2747 		return (-1);
2748 	}
2749 
2750 	if ((vdp->xdf_xdev_iostat = kstat_create(
2751 	    ks_module, ks_instance, NULL, "disk",
2752 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
2753 		mutex_exit(&vdp->xdf_dev_lk);
2754 		mutex_exit(&vdp->xdf_iostat_lk);
2755 		return (-1);
2756 	}
2757 
2758 	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
2759 	kstat_install(vdp->xdf_xdev_iostat);
2760 	mutex_exit(&vdp->xdf_dev_lk);
2761 	mutex_exit(&vdp->xdf_iostat_lk);
2762 
2763 	return (0);
2764 }
2765 
2766 #if defined(XPV_HVM_DRIVER)
2767 
/*
 * One entry per device instance registered through xdf_hvm_add(), so that
 * an instance can be looked up by device path or by dip.
 */
typedef struct xdf_hvm_entry {
	list_node_t	xdf_he_list;	/* linkage on xdf_hvm_list */
	char		*xdf_he_path;	/* copy of the ddi_pathname() of dip */
	dev_info_t	*xdf_he_dip;	/* the device instance */
} xdf_hvm_entry_t;

/* list of xdf_hvm_entry_t; searched and modified under xdf_hvm_list_lock */
static list_t xdf_hvm_list;
static kmutex_t xdf_hvm_list_lock;
2776 
2777 static xdf_hvm_entry_t *
2778 i_xdf_hvm_find(char *path, dev_info_t *dip)
2779 {
2780 	xdf_hvm_entry_t	*i;
2781 
2782 	ASSERT((path != NULL) || (dip != NULL));
2783 	ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2784 
2785 	i = list_head(&xdf_hvm_list);
2786 	while (i != NULL) {
2787 		if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2788 			i = list_next(&xdf_hvm_list, i);
2789 			continue;
2790 		}
2791 		if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2792 			i = list_next(&xdf_hvm_list, i);
2793 			continue;
2794 		}
2795 		break;
2796 	}
2797 	return (i);
2798 }
2799 
2800 dev_info_t *
2801 xdf_hvm_hold(char *path)
2802 {
2803 	xdf_hvm_entry_t	*i;
2804 	dev_info_t	*dip;
2805 
2806 	mutex_enter(&xdf_hvm_list_lock);
2807 	i = i_xdf_hvm_find(path, NULL);
2808 	if (i == NULL) {
2809 		mutex_exit(&xdf_hvm_list_lock);
2810 		return (B_FALSE);
2811 	}
2812 	ndi_hold_devi(dip = i->xdf_he_dip);
2813 	mutex_exit(&xdf_hvm_list_lock);
2814 	return (dip);
2815 }
2816 
2817 static void
2818 xdf_hvm_add(dev_info_t *dip)
2819 {
2820 	xdf_hvm_entry_t	*i;
2821 	char		*path;
2822 
2823 	/* figure out the path for the dip */
2824 	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2825 	(void) ddi_pathname(dip, path);
2826 
2827 	i = kmem_alloc(sizeof (*i), KM_SLEEP);
2828 	i->xdf_he_dip = dip;
2829 	i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2830 
2831 	mutex_enter(&xdf_hvm_list_lock);
2832 	ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2833 	ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2834 	list_insert_head(&xdf_hvm_list, i);
2835 	mutex_exit(&xdf_hvm_list_lock);
2836 
2837 	kmem_free(path, MAXPATHLEN);
2838 }
2839 
2840 static void
2841 xdf_hvm_rm(dev_info_t *dip)
2842 {
2843 	xdf_hvm_entry_t	*i;
2844 
2845 	mutex_enter(&xdf_hvm_list_lock);
2846 	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2847 	list_remove(&xdf_hvm_list, i);
2848 	mutex_exit(&xdf_hvm_list_lock);
2849 
2850 	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2851 	kmem_free(i, sizeof (*i));
2852 }
2853 
2854 static void
2855 xdf_hvm_init(void)
2856 {
2857 	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2858 	    offsetof(xdf_hvm_entry_t, xdf_he_list));
2859 	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2860 }
2861 
2862 static void
2863 xdf_hvm_fini(void)
2864 {
2865 	ASSERT(list_head(&xdf_hvm_list) == NULL);
2866 	list_destroy(&xdf_hvm_list);
2867 	mutex_destroy(&xdf_hvm_list_lock);
2868 }
2869 
2870 int
2871 xdf_hvm_connect(dev_info_t *dip)
2872 {
2873 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2874 	int	rv;
2875 
2876 	/* do cv_wait until connected or failed */
2877 	mutex_enter(&vdp->xdf_dev_lk);
2878 	rv = xdf_connect(vdp, B_TRUE);
2879 	mutex_exit(&vdp->xdf_dev_lk);
2880 	return ((rv == XD_READY) ? 0 : -1);
2881 }
2882 
2883 int
2884 xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2885 {
2886 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2887 
2888 	/* sanity check the requested physical geometry */
2889 	mutex_enter(&vdp->xdf_dev_lk);
2890 	if ((geomp->g_secsize != XB_BSIZE) ||
2891 	    (geomp->g_capacity == 0)) {
2892 		mutex_exit(&vdp->xdf_dev_lk);
2893 		return (EINVAL);
2894 	}
2895 
2896 	/*
2897 	 * If we've already connected to the backend device then make sure
2898 	 * we're not defining a physical geometry larger than our backend
2899 	 * device.
2900 	 */
2901 	if ((vdp->xdf_xdev_nblocks != 0) &&
2902 	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2903 		mutex_exit(&vdp->xdf_dev_lk);
2904 		return (EINVAL);
2905 	}
2906 
2907 	vdp->xdf_pgeom = *geomp;
2908 	mutex_exit(&vdp->xdf_dev_lk);
2909 
2910 	/* force a re-validation */
2911 	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2912 
2913 	return (0);
2914 }
2915 
2916 #endif /* XPV_HVM_DRIVER */
2917