xref: /titanic_51/usr/src/uts/common/xen/io/xdf.c (revision c5cd6260c3d6c06a9359df595ad9dddbfd00a80e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * xdf.c - Xen Virtual Block Device Driver
29  * TODO:
30  *	- support alternate block size (currently only DEV_BSIZE supported)
31  *	- revalidate geometry for removable devices
32  */
33 
34 #include <sys/ddi.h>
35 #include <sys/sunddi.h>
36 #include <sys/conf.h>
37 #include <sys/cmlb.h>
38 #include <sys/dkio.h>
39 #include <sys/promif.h>
40 #include <sys/sysmacros.h>
41 #include <sys/kstat.h>
42 #include <sys/mach_mmu.h>
43 #ifdef XPV_HVM_DRIVER
44 #include <sys/xpv_support.h>
45 #include <sys/sunndi.h>
46 #endif /* XPV_HVM_DRIVER */
47 #include <public/io/xenbus.h>
48 #include <xen/sys/xenbus_impl.h>
49 #include <xen/sys/xendev.h>
50 #include <sys/gnttab.h>
51 #include <sys/scsi/generic/inquiry.h>
52 #include <xen/io/blkif_impl.h>
53 #include <io/xdf.h>
54 
/*
 * Cache-flush request codes and the default block used to provoke a flush.
 */
#define	FLUSH_DISKCACHE		0x1
#define	WRITE_BARRIER		0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */

/* True when xdf_feature_barrier is set and xdf_flush_supported is clear. */
#define	USE_WRITE_BARRIER(vdp)						\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)

/* True when both xdf_feature_barrier and xdf_flush_supported are set. */
#define	USE_FLUSH_DISKCACHE(vdp)					\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)

/*
 * A non-read buf on a write-barrier device whose data address is the
 * device's xdf_cache_flush_block.
 */
#define	IS_WRITE_BARRIER(vdp, bp)					\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&			\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))

/*
 * A zero-length non-read buf on a flush-capable device.
 * NOTE: the expansion names `vdp' directly rather than taking it as a
 * parameter, so a variable called vdp must be in scope wherever this
 * macro is used.
 */
#define	IS_FLUSH_DISKCACHE(bp)						\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
67 
68 static void *vbd_ss;
69 static kmem_cache_t *xdf_vreq_cache;
70 static kmem_cache_t *xdf_gs_cache;
71 static int xdf_maxphys = XB_MAXPHYS;
72 int xdfdebug = 0;
73 extern int do_polled_io;
74 diskaddr_t xdf_flush_block = DEFAULT_FLUSH_BLOCK;
75 int	xdf_barrier_flush_disable = 0;
76 
77 /*
78  * dev_ops and cb_ops entrypoints
79  */
80 static int xdf_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
81 static int xdf_attach(dev_info_t *, ddi_attach_cmd_t);
82 static int xdf_detach(dev_info_t *, ddi_detach_cmd_t);
83 static int xdf_reset(dev_info_t *, ddi_reset_cmd_t);
84 static int xdf_open(dev_t *, int, int, cred_t *);
85 static int xdf_close(dev_t, int, int, struct cred *);
86 static int xdf_strategy(struct buf *);
87 static int xdf_read(dev_t, struct uio *, cred_t *);
88 static int xdf_aread(dev_t, struct aio_req *, cred_t *);
89 static int xdf_write(dev_t, struct uio *, cred_t *);
90 static int xdf_awrite(dev_t, struct aio_req *, cred_t *);
91 static int xdf_dump(dev_t, caddr_t, daddr_t, int);
92 static int xdf_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
93 static uint_t xdf_intr(caddr_t);
94 static int xdf_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
95     caddr_t, int *);
96 
97 /*
98  * misc private functions
99  */
100 static int xdf_suspend(dev_info_t *);
101 static int xdf_resume(dev_info_t *);
102 static int xdf_start_connect(xdf_t *);
103 static int xdf_start_disconnect(xdf_t *);
104 static int xdf_post_connect(xdf_t *);
105 static void xdf_post_disconnect(xdf_t *);
106 static void xdf_oe_change(dev_info_t *, ddi_eventcookie_t, void *, void *);
107 static void xdf_iostart(xdf_t *);
108 static void xdf_iofini(xdf_t *, uint64_t, int);
109 static int xdf_prepare_rreq(xdf_t *, struct buf *, blkif_request_t *);
110 static int xdf_drain_io(xdf_t *);
111 static boolean_t xdf_isopen(xdf_t *, int);
112 static int xdf_check_state_transition(xdf_t *, XenbusState);
113 static int xdf_connect(xdf_t *, boolean_t);
114 static int xdf_dmacallback(caddr_t);
115 static void xdf_timeout_handler(void *);
116 static uint_t xdf_iorestart(caddr_t);
117 static v_req_t *vreq_get(xdf_t *, buf_t *);
118 static void vreq_free(xdf_t *, v_req_t *);
119 static int vreq_setup(xdf_t *, v_req_t *);
120 static ge_slot_t *gs_get(xdf_t *, int);
121 static void gs_free(xdf_t *, ge_slot_t *);
122 static grant_ref_t gs_grant(ge_slot_t *, mfn_t);
123 static void unexpectedie(xdf_t *);
124 static void xdfmin(struct buf *);
125 static void xdf_synthetic_pgeom(dev_info_t *, cmlb_geom_t *);
126 extern int xdf_kstat_create(dev_info_t *, char *, int);
127 extern void xdf_kstat_delete(dev_info_t *);
128 
129 #if defined(XPV_HVM_DRIVER)
130 static void xdf_hvm_add(dev_info_t *);
131 static void xdf_hvm_rm(dev_info_t *);
132 static void xdf_hvm_init(void);
133 static void xdf_hvm_fini(void);
134 #endif /* XPV_HVM_DRIVER */
135 
136 static 	struct cb_ops xdf_cbops = {
137 	xdf_open,
138 	xdf_close,
139 	xdf_strategy,
140 	nodev,
141 	xdf_dump,
142 	xdf_read,
143 	xdf_write,
144 	xdf_ioctl,
145 	nodev,
146 	nodev,
147 	nodev,
148 	nochpoll,
149 	xdf_prop_op,
150 	NULL,
151 	D_MP | D_NEW | D_64BIT,
152 	CB_REV,
153 	xdf_aread,
154 	xdf_awrite
155 };
156 
157 struct dev_ops xdf_devops = {
158 	DEVO_REV,		/* devo_rev */
159 	0,			/* devo_refcnt */
160 	xdf_getinfo,		/* devo_getinfo */
161 	nulldev,		/* devo_identify */
162 	nulldev,		/* devo_probe */
163 	xdf_attach,		/* devo_attach */
164 	xdf_detach,		/* devo_detach */
165 	xdf_reset,		/* devo_reset */
166 	&xdf_cbops,		/* devo_cb_ops */
167 	(struct bus_ops *)NULL	/* devo_bus_ops */
168 };
169 
170 static struct modldrv modldrv = {
171 	&mod_driverops,		/* Type of module.  This one is a driver */
172 	"virtual block driver",	/* short description */
173 	&xdf_devops		/* driver specific ops */
174 };
175 
176 static struct modlinkage xdf_modlinkage = {
177 	MODREV_1, (void *)&modldrv, NULL
178 };
179 
180 /*
181  * I/O buffer DMA attributes
182  * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
183  */
184 static ddi_dma_attr_t xb_dma_attr = {
185 	DMA_ATTR_V0,
186 	(uint64_t)0,			/* lowest address */
187 	(uint64_t)0xffffffffffffffff,	/* highest usable address */
188 	(uint64_t)0xffffff,		/* DMA counter limit max */
189 	(uint64_t)XB_BSIZE,		/* alignment in bytes */
190 	XB_BSIZE - 1,			/* bitmap of burst sizes */
191 	XB_BSIZE,			/* min transfer */
192 	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
193 	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
194 	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
195 	XB_BSIZE,			/* granularity */
196 	0,				/* flags (reserved) */
197 };
198 
199 static ddi_device_acc_attr_t xc_acc_attr = {
200 	DDI_DEVICE_ATTR_V0,
201 	DDI_NEVERSWAP_ACC,
202 	DDI_STRICTORDER_ACC
203 };
204 
205 /* callbacks from commmon label */
206 
207 int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t, void *);
208 int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
209 
210 static cmlb_tg_ops_t xdf_lb_ops = {
211 	TG_DK_OPS_VERSION_1,
212 	xdf_lb_rdwr,
213 	xdf_lb_getinfo
214 };
215 
216 int
217 _init(void)
218 {
219 	int rc;
220 
221 	if ((rc = ddi_soft_state_init(&vbd_ss, sizeof (xdf_t), 0)) != 0)
222 		return (rc);
223 
224 	xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
225 	    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
226 	xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
227 	    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
228 
229 #if defined(XPV_HVM_DRIVER)
230 	xdf_hvm_init();
231 #endif /* XPV_HVM_DRIVER */
232 
233 	if ((rc = mod_install(&xdf_modlinkage)) != 0) {
234 #if defined(XPV_HVM_DRIVER)
235 		xdf_hvm_fini();
236 #endif /* XPV_HVM_DRIVER */
237 		kmem_cache_destroy(xdf_vreq_cache);
238 		kmem_cache_destroy(xdf_gs_cache);
239 		ddi_soft_state_fini(&vbd_ss);
240 		return (rc);
241 	}
242 
243 	return (rc);
244 }
245 
246 int
247 _fini(void)
248 {
249 
250 	int err;
251 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
252 		return (err);
253 
254 #if defined(XPV_HVM_DRIVER)
255 	xdf_hvm_fini();
256 #endif /* XPV_HVM_DRIVER */
257 
258 	kmem_cache_destroy(xdf_vreq_cache);
259 	kmem_cache_destroy(xdf_gs_cache);
260 	ddi_soft_state_fini(&vbd_ss);
261 
262 	return (0);
263 }
264 
265 int
266 _info(struct modinfo *modinfop)
267 {
268 	return (mod_info(&xdf_modlinkage, modinfop));
269 }
270 
271 /*ARGSUSED*/
272 static int
273 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
274 {
275 	int instance;
276 	xdf_t *vbdp;
277 
278 	instance = XDF_INST(getminor((dev_t)arg));
279 
280 	switch (cmd) {
281 	case DDI_INFO_DEVT2DEVINFO:
282 		if ((vbdp = ddi_get_soft_state(vbd_ss, instance)) == NULL) {
283 			*rp = NULL;
284 			return (DDI_FAILURE);
285 		}
286 		*rp = vbdp->xdf_dip;
287 		return (DDI_SUCCESS);
288 
289 	case DDI_INFO_DEVT2INSTANCE:
290 		*rp = (void *)(uintptr_t)instance;
291 		return (DDI_SUCCESS);
292 
293 	default:
294 		return (DDI_FAILURE);
295 	}
296 }
297 
298 static int
299 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
300 	char *name, caddr_t valuep, int *lengthp)
301 {
302 	xdf_t	*vdp;
303 
304 	if ((vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(dip))) == NULL)
305 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
306 		    name, valuep, lengthp));
307 
308 	return (cmlb_prop_op(vdp->xdf_vd_lbl,
309 	    dev, dip, prop_op, mod_flags, name, valuep, lengthp,
310 	    XDF_PART(getminor(dev)), NULL));
311 }
312 
313 static int
314 xdf_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
315 {
316 	xdf_t *vdp;
317 	ddi_iblock_cookie_t softibc;
318 	int instance;
319 
320 	xdfdebug = ddi_prop_get_int(DDI_DEV_T_ANY, devi, DDI_PROP_NOTPROM,
321 	    "xdfdebug", 0);
322 
323 	switch (cmd) {
324 		case DDI_ATTACH:
325 			break;
326 
327 		case DDI_RESUME:
328 			return (xdf_resume(devi));
329 
330 		default:
331 			return (DDI_FAILURE);
332 	}
333 
334 	instance = ddi_get_instance(devi);
335 	if (ddi_soft_state_zalloc(vbd_ss, instance) != DDI_SUCCESS)
336 		return (DDI_FAILURE);
337 
338 	DPRINTF(DDI_DBG, ("xdf%d: attaching\n", instance));
339 	vdp = ddi_get_soft_state(vbd_ss, instance);
340 	ddi_set_driver_private(devi, vdp);
341 	vdp->xdf_dip = devi;
342 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
343 
344 	if (ddi_get_iblock_cookie(devi, 0, &vdp->xdf_ibc) != DDI_SUCCESS) {
345 		cmn_err(CE_WARN, "xdf@%s: failed to get iblock cookie",
346 		    ddi_get_name_addr(devi));
347 		goto errout0;
348 	}
349 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)vdp->xdf_ibc);
350 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)vdp->xdf_ibc);
351 	mutex_init(&vdp->xdf_iostat_lk, NULL, MUTEX_DRIVER,
352 	    (void *)vdp->xdf_ibc);
353 
354 	if (ddi_get_soft_iblock_cookie(devi, DDI_SOFTINT_LOW, &softibc)
355 	    != DDI_SUCCESS) {
356 		cmn_err(CE_WARN, "xdf@%s: failed to get softintr iblock cookie",
357 		    ddi_get_name_addr(devi));
358 		goto errout0;
359 	}
360 	if (ddi_add_softintr(devi, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
361 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
362 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
363 		    ddi_get_name_addr(devi));
364 		goto errout0;
365 	}
366 
367 #if !defined(XPV_HVM_DRIVER)
368 	/* create kstat for iostat(1M) */
369 	if (xdf_kstat_create(devi, "xdf", instance) != 0) {
370 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
371 		    ddi_get_name_addr(devi));
372 		goto errout0;
373 	}
374 #endif /* !XPV_HVM_DRIVER */
375 
376 	/* driver handles kernel-issued IOCTLs */
377 	if (ddi_prop_create(DDI_DEV_T_NONE, devi, DDI_PROP_CANSLEEP,
378 	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
379 		cmn_err(CE_WARN, "xdf@%s: cannot create DDI_KERNEL_IOCTL prop",
380 		    ddi_get_name_addr(devi));
381 		goto errout0;
382 	}
383 
384 	/*
385 	 * Initialize the physical geometry stucture.  Note that currently
386 	 * we don't know the size of the backend device so the number
387 	 * of blocks on the device will be initialized to zero.  Once
388 	 * we connect to the backend device we'll update the physical
389 	 * geometry to reflect the real size of the device.
390 	 */
391 	xdf_synthetic_pgeom(devi, &vdp->xdf_pgeom);
392 
393 	/*
394 	 * create default device minor nodes: non-removable disk
395 	 * we will adjust minor nodes after we are connected w/ backend
396 	 */
397 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
398 	if (cmlb_attach(devi, &xdf_lb_ops, DTYPE_DIRECT, 0, 1,
399 	    DDI_NT_BLOCK_XVMD,
400 #if defined(XPV_HVM_DRIVER)
401 	    CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT |
402 	    CMLB_INTERNAL_MINOR_NODES,
403 #else /* !XPV_HVM_DRIVER */
404 	    CMLB_FAKE_LABEL_ONE_PARTITION,
405 #endif /* !XPV_HVM_DRIVER */
406 	    vdp->xdf_vd_lbl, NULL) != 0) {
407 		cmn_err(CE_WARN, "xdf@%s: default cmlb attach failed",
408 		    ddi_get_name_addr(devi));
409 		goto errout0;
410 	}
411 
412 	/*
413 	 * We ship with cache-enabled disks
414 	 */
415 	vdp->xdf_wce = 1;
416 
417 	mutex_enter(&vdp->xdf_cb_lk);
418 
419 	/* Watch backend XenbusState change */
420 	if (xvdi_add_event_handler(devi, XS_OE_STATE,
421 	    xdf_oe_change) != DDI_SUCCESS) {
422 		mutex_exit(&vdp->xdf_cb_lk);
423 		goto errout0;
424 	}
425 
426 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
427 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
428 		    ddi_get_name_addr(devi));
429 		(void) xdf_start_disconnect(vdp);
430 		mutex_exit(&vdp->xdf_cb_lk);
431 		goto errout1;
432 	}
433 
434 	mutex_exit(&vdp->xdf_cb_lk);
435 
436 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
437 	    offsetof(v_req_t, v_link));
438 	list_create(&vdp->xdf_gs_act, sizeof (ge_slot_t),
439 	    offsetof(ge_slot_t, link));
440 
441 #if defined(XPV_HVM_DRIVER)
442 	xdf_hvm_add(devi);
443 
444 	(void) ddi_prop_update_int(DDI_DEV_T_NONE, devi, DDI_NO_AUTODETACH, 1);
445 
446 	/*
447 	 * Report our version to dom0.
448 	 */
449 	if (xenbus_printf(XBT_NULL, "hvmpv/xdf", "version", "%d",
450 	    HVMPV_XDF_VERS))
451 		cmn_err(CE_WARN, "xdf: couldn't write version\n");
452 #endif /* XPV_HVM_DRIVER */
453 
454 	ddi_report_dev(devi);
455 
456 	DPRINTF(DDI_DBG, ("xdf%d: attached\n", instance));
457 
458 	return (DDI_SUCCESS);
459 
460 errout1:
461 	xvdi_remove_event_handler(devi, XS_OE_STATE);
462 errout0:
463 	if (vdp->xdf_vd_lbl != NULL) {
464 		cmlb_detach(vdp->xdf_vd_lbl, NULL);
465 		cmlb_free_handle(&vdp->xdf_vd_lbl);
466 		vdp->xdf_vd_lbl = NULL;
467 	}
468 #if !defined(XPV_HVM_DRIVER)
469 	xdf_kstat_delete(devi);
470 #endif /* !XPV_HVM_DRIVER */
471 	if (vdp->xdf_softintr_id != NULL)
472 		ddi_remove_softintr(vdp->xdf_softintr_id);
473 	if (vdp->xdf_ibc != NULL) {
474 		mutex_destroy(&vdp->xdf_cb_lk);
475 		mutex_destroy(&vdp->xdf_dev_lk);
476 	}
477 	cv_destroy(&vdp->xdf_dev_cv);
478 	ddi_soft_state_free(vbd_ss, instance);
479 	ddi_set_driver_private(devi, NULL);
480 	ddi_prop_remove_all(devi);
481 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(devi));
482 	return (DDI_FAILURE);
483 }
484 
485 static int
486 xdf_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
487 {
488 	xdf_t *vdp;
489 	int instance;
490 
491 	switch (cmd) {
492 
493 	case DDI_PM_SUSPEND:
494 		break;
495 
496 	case DDI_SUSPEND:
497 		return (xdf_suspend(devi));
498 
499 	case DDI_DETACH:
500 		break;
501 
502 	default:
503 		return (DDI_FAILURE);
504 	}
505 
506 	instance = ddi_get_instance(devi);
507 	DPRINTF(DDI_DBG, ("xdf%d: detaching\n", instance));
508 	vdp = ddi_get_soft_state(vbd_ss, instance);
509 
510 	if (vdp == NULL)
511 		return (DDI_FAILURE);
512 
513 	mutex_enter(&vdp->xdf_dev_lk);
514 	if (xdf_isopen(vdp, -1)) {
515 		mutex_exit(&vdp->xdf_dev_lk);
516 		return (DDI_FAILURE);
517 	}
518 
519 	if (vdp->xdf_status != XD_CLOSED) {
520 		mutex_exit(&vdp->xdf_dev_lk);
521 		return (DDI_FAILURE);
522 	}
523 
524 #if defined(XPV_HVM_DRIVER)
525 	xdf_hvm_rm(devi);
526 #endif /* XPV_HVM_DRIVER */
527 
528 	ASSERT(!ISDMACBON(vdp));
529 	mutex_exit(&vdp->xdf_dev_lk);
530 
531 	if (vdp->xdf_timeout_id != 0)
532 		(void) untimeout(vdp->xdf_timeout_id);
533 
534 	xvdi_remove_event_handler(devi, XS_OE_STATE);
535 
536 	/* we'll support backend running in domU later */
537 #ifdef	DOMU_BACKEND
538 	(void) xvdi_post_event(devi, XEN_HP_REMOVE);
539 #endif
540 
541 	list_destroy(&vdp->xdf_vreq_act);
542 	list_destroy(&vdp->xdf_gs_act);
543 	ddi_prop_remove_all(devi);
544 	xdf_kstat_delete(devi);
545 	ddi_remove_softintr(vdp->xdf_softintr_id);
546 	ddi_set_driver_private(devi, NULL);
547 	cv_destroy(&vdp->xdf_dev_cv);
548 	mutex_destroy(&vdp->xdf_cb_lk);
549 	mutex_destroy(&vdp->xdf_dev_lk);
550 	if (vdp->xdf_cache_flush_block != NULL)
551 		kmem_free(vdp->xdf_flush_mem, 2 * DEV_BSIZE);
552 	ddi_soft_state_free(vbd_ss, instance);
553 	return (DDI_SUCCESS);
554 }
555 
556 static int
557 xdf_suspend(dev_info_t *devi)
558 {
559 	xdf_t *vdp;
560 	int instance;
561 	enum xdf_state st;
562 
563 	instance = ddi_get_instance(devi);
564 
565 	if (xdfdebug & SUSRES_DBG)
566 		xen_printf("xdf_suspend: xdf#%d\n", instance);
567 
568 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
569 		return (DDI_FAILURE);
570 
571 	xvdi_suspend(devi);
572 
573 	mutex_enter(&vdp->xdf_cb_lk);
574 	mutex_enter(&vdp->xdf_dev_lk);
575 	st = vdp->xdf_status;
576 	/* change status to stop further I/O requests */
577 	if (st == XD_READY)
578 		vdp->xdf_status = XD_SUSPEND;
579 	mutex_exit(&vdp->xdf_dev_lk);
580 	mutex_exit(&vdp->xdf_cb_lk);
581 
582 	/* make sure no more I/O responses left in the ring buffer */
583 	if ((st == XD_INIT) || (st == XD_READY)) {
584 #ifdef XPV_HVM_DRIVER
585 		ec_unbind_evtchn(vdp->xdf_evtchn);
586 		xvdi_free_evtchn(devi);
587 #else /* !XPV_HVM_DRIVER */
588 		(void) ddi_remove_intr(devi, 0, NULL);
589 #endif /* !XPV_HVM_DRIVER */
590 		(void) xdf_drain_io(vdp);
591 		/*
592 		 * no need to teardown the ring buffer here
593 		 * it will be simply re-init'ed during resume when
594 		 * we call xvdi_alloc_ring
595 		 */
596 	}
597 
598 	if (xdfdebug & SUSRES_DBG)
599 		xen_printf("xdf_suspend: SUCCESS\n");
600 
601 	return (DDI_SUCCESS);
602 }
603 
604 /*ARGSUSED*/
605 static int
606 xdf_resume(dev_info_t *devi)
607 {
608 	xdf_t *vdp;
609 	int instance;
610 
611 	instance = ddi_get_instance(devi);
612 	if (xdfdebug & SUSRES_DBG)
613 		xen_printf("xdf_resume: xdf%d\n", instance);
614 
615 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
616 		return (DDI_FAILURE);
617 
618 	mutex_enter(&vdp->xdf_cb_lk);
619 
620 	if (xvdi_resume(devi) != DDI_SUCCESS) {
621 		mutex_exit(&vdp->xdf_cb_lk);
622 		return (DDI_FAILURE);
623 	}
624 
625 	mutex_enter(&vdp->xdf_dev_lk);
626 	ASSERT(vdp->xdf_status != XD_READY);
627 	vdp->xdf_status = XD_UNKNOWN;
628 	mutex_exit(&vdp->xdf_dev_lk);
629 
630 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
631 		mutex_exit(&vdp->xdf_cb_lk);
632 		return (DDI_FAILURE);
633 	}
634 
635 	mutex_exit(&vdp->xdf_cb_lk);
636 
637 	if (xdfdebug & SUSRES_DBG)
638 		xen_printf("xdf_resume: done\n");
639 	return (DDI_SUCCESS);
640 }
641 
642 /*ARGSUSED*/
643 static int
644 xdf_reset(dev_info_t *devi, ddi_reset_cmd_t cmd)
645 {
646 	xdf_t *vdp;
647 	int instance;
648 
649 	instance = ddi_get_instance(devi);
650 	DPRINTF(DDI_DBG, ("xdf%d: resetting\n", instance));
651 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
652 		return (DDI_FAILURE);
653 
654 	/*
655 	 * wait for any outstanding I/O to complete
656 	 */
657 	(void) xdf_drain_io(vdp);
658 
659 	DPRINTF(DDI_DBG, ("xdf%d: reset complete\n", instance));
660 	return (DDI_SUCCESS);
661 }
662 
663 static int
664 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
665 {
666 	minor_t	minor;
667 	xdf_t	*vdp;
668 	int part;
669 	ulong_t parbit;
670 	diskaddr_t p_blkct = 0;
671 	boolean_t firstopen;
672 	boolean_t nodelay;
673 
674 	minor = getminor(*devp);
675 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
676 		return (ENXIO);
677 
678 	nodelay = (flag & (FNDELAY | FNONBLOCK));
679 
680 	DPRINTF(DDI_DBG, ("xdf%d: opening\n", XDF_INST(minor)));
681 
682 	/* do cv_wait until connected or failed */
683 	mutex_enter(&vdp->xdf_dev_lk);
684 	if (!nodelay && (xdf_connect(vdp, B_TRUE) != XD_READY)) {
685 		mutex_exit(&vdp->xdf_dev_lk);
686 		return (ENXIO);
687 	}
688 
689 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
690 		mutex_exit(&vdp->xdf_dev_lk);
691 		return (EROFS);
692 	}
693 
694 	part = XDF_PART(minor);
695 	parbit = 1 << part;
696 	if ((vdp->xdf_vd_exclopen & parbit) ||
697 	    ((flag & FEXCL) && xdf_isopen(vdp, part))) {
698 		mutex_exit(&vdp->xdf_dev_lk);
699 		return (EBUSY);
700 	}
701 
702 	/* are we the first one to open this node? */
703 	firstopen = !xdf_isopen(vdp, -1);
704 
705 	if (otyp == OTYP_LYR)
706 		vdp->xdf_vd_lyropen[part]++;
707 
708 	vdp->xdf_vd_open[otyp] |= parbit;
709 
710 	if (flag & FEXCL)
711 		vdp->xdf_vd_exclopen |= parbit;
712 
713 	mutex_exit(&vdp->xdf_dev_lk);
714 
715 	/* force a re-validation */
716 	if (firstopen)
717 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
718 
719 	/*
720 	 * check size
721 	 * ignore CD/DVD which contains a zero-sized s0
722 	 */
723 	if (!nodelay && !XD_IS_CD(vdp) &&
724 	    ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
725 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0))) {
726 		(void) xdf_close(*devp, flag, otyp, credp);
727 		return (ENXIO);
728 	}
729 
730 	return (0);
731 }
732 
733 /*ARGSUSED*/
734 static int
735 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
736 {
737 	minor_t	minor;
738 	xdf_t	*vdp;
739 	int part;
740 	ulong_t parbit;
741 
742 	minor = getminor(dev);
743 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
744 		return (ENXIO);
745 
746 	mutex_enter(&vdp->xdf_dev_lk);
747 	part = XDF_PART(minor);
748 	if (!xdf_isopen(vdp, part)) {
749 		mutex_exit(&vdp->xdf_dev_lk);
750 		return (ENXIO);
751 	}
752 	parbit = 1 << part;
753 
754 	ASSERT((vdp->xdf_vd_open[otyp] & parbit) != 0);
755 	if (otyp == OTYP_LYR) {
756 		ASSERT(vdp->xdf_vd_lyropen[part] > 0);
757 		if (--vdp->xdf_vd_lyropen[part] == 0)
758 			vdp->xdf_vd_open[otyp] &= ~parbit;
759 	} else {
760 		vdp->xdf_vd_open[otyp] &= ~parbit;
761 	}
762 	vdp->xdf_vd_exclopen &= ~parbit;
763 
764 	mutex_exit(&vdp->xdf_dev_lk);
765 	return (0);
766 }
767 
768 static int
769 xdf_strategy(struct buf *bp)
770 {
771 	xdf_t	*vdp;
772 	minor_t minor;
773 	diskaddr_t p_blkct, p_blkst;
774 	ulong_t nblks;
775 	int part;
776 
777 	minor = getminor(bp->b_edev);
778 	part = XDF_PART(minor);
779 
780 	vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor));
781 	if ((vdp == NULL) || !xdf_isopen(vdp, part)) {
782 		bioerror(bp, ENXIO);
783 		bp->b_resid = bp->b_bcount;
784 		biodone(bp);
785 		return (0);
786 	}
787 
788 	/* Check for writes to a read only device */
789 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
790 		bioerror(bp, EROFS);
791 		bp->b_resid = bp->b_bcount;
792 		biodone(bp);
793 		return (0);
794 	}
795 
796 	/* Check if this I/O is accessing a partition or the entire disk */
797 	if ((long)bp->b_private == XB_SLICE_NONE) {
798 		/* This I/O is using an absolute offset */
799 		p_blkct = vdp->xdf_xdev_nblocks;
800 		p_blkst = 0;
801 	} else {
802 		/* This I/O is using a partition relative offset */
803 		if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
804 		    &p_blkst, NULL, NULL, NULL)) {
805 			bioerror(bp, ENXIO);
806 			bp->b_resid = bp->b_bcount;
807 			biodone(bp);
808 			return (0);
809 		}
810 	}
811 
812 	/* check for a starting block beyond the disk or partition limit */
813 	if (bp->b_blkno > p_blkct) {
814 		DPRINTF(IO_DBG, ("xdf: block %lld exceeds VBD size %"PRIu64,
815 		    (longlong_t)bp->b_blkno, (uint64_t)p_blkct));
816 		bioerror(bp, EINVAL);
817 		bp->b_resid = bp->b_bcount;
818 		biodone(bp);
819 		return (0);
820 	}
821 
822 	/* Legacy: don't set error flag at this case */
823 	if (bp->b_blkno == p_blkct) {
824 		bp->b_resid = bp->b_bcount;
825 		biodone(bp);
826 		return (0);
827 	}
828 
829 	/* Adjust for partial transfer */
830 	nblks = bp->b_bcount >> XB_BSHIFT;
831 	if ((bp->b_blkno + nblks) > p_blkct) {
832 		bp->b_resid = ((bp->b_blkno + nblks) - p_blkct) << XB_BSHIFT;
833 		bp->b_bcount -= bp->b_resid;
834 	}
835 
836 	DPRINTF(IO_DBG, ("xdf: strategy blk %lld len %lu\n",
837 	    (longlong_t)bp->b_blkno, (ulong_t)bp->b_bcount));
838 
839 	/* Fix up the buf struct */
840 	bp->b_flags |= B_BUSY;
841 	bp->av_forw = bp->av_back = NULL; /* not tagged with a v_req */
842 	bp->b_private = (void *)(uintptr_t)p_blkst;
843 
844 	mutex_enter(&vdp->xdf_dev_lk);
845 	if (vdp->xdf_xdev_iostat != NULL)
846 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
847 	if (vdp->xdf_f_act == NULL) {
848 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
849 	} else {
850 		vdp->xdf_l_act->av_forw = bp;
851 		vdp->xdf_l_act = bp;
852 	}
853 	mutex_exit(&vdp->xdf_dev_lk);
854 
855 	xdf_iostart(vdp);
856 	if (do_polled_io)
857 		(void) xdf_drain_io(vdp);
858 	return (0);
859 }
860 
861 /*ARGSUSED*/
862 static int
863 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
864 {
865 
866 	xdf_t	*vdp;
867 	minor_t minor;
868 	diskaddr_t p_blkcnt;
869 	int part;
870 
871 	minor = getminor(dev);
872 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
873 		return (ENXIO);
874 
875 	DPRINTF(IO_DBG, ("xdf: read offset 0x%"PRIx64"\n",
876 	    (int64_t)uiop->uio_offset));
877 
878 	part = XDF_PART(minor);
879 	if (!xdf_isopen(vdp, part))
880 		return (ENXIO);
881 
882 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
883 	    NULL, NULL, NULL, NULL))
884 		return (ENXIO);
885 
886 	if (U_INVAL(uiop))
887 		return (EINVAL);
888 
889 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
890 }
891 
892 /*ARGSUSED*/
893 static int
894 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
895 {
896 	xdf_t *vdp;
897 	minor_t minor;
898 	diskaddr_t p_blkcnt;
899 	int part;
900 
901 	minor = getminor(dev);
902 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
903 		return (ENXIO);
904 
905 	DPRINTF(IO_DBG, ("xdf: write offset 0x%"PRIx64"\n",
906 	    (int64_t)uiop->uio_offset));
907 
908 	part = XDF_PART(minor);
909 	if (!xdf_isopen(vdp, part))
910 		return (ENXIO);
911 
912 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
913 	    NULL, NULL, NULL, NULL))
914 		return (ENXIO);
915 
916 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
917 		return (ENOSPC);
918 
919 	if (U_INVAL(uiop))
920 		return (EINVAL);
921 
922 	return (physio(xdf_strategy, NULL, dev, B_WRITE, minphys, uiop));
923 }
924 
925 /*ARGSUSED*/
926 static int
927 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
928 {
929 	xdf_t	*vdp;
930 	minor_t minor;
931 	struct uio *uiop = aiop->aio_uio;
932 	diskaddr_t p_blkcnt;
933 	int part;
934 
935 	minor = getminor(dev);
936 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
937 		return (ENXIO);
938 
939 	part = XDF_PART(minor);
940 	if (!xdf_isopen(vdp, part))
941 		return (ENXIO);
942 
943 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
944 	    NULL, NULL, NULL, NULL))
945 		return (ENXIO);
946 
947 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
948 		return (ENOSPC);
949 
950 	if (U_INVAL(uiop))
951 		return (EINVAL);
952 
953 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, minphys, aiop));
954 }
955 
956 /*ARGSUSED*/
957 static int
958 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
959 {
960 	xdf_t *vdp;
961 	minor_t minor;
962 	struct uio *uiop = aiop->aio_uio;
963 	diskaddr_t p_blkcnt;
964 	int part;
965 
966 	minor = getminor(dev);
967 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
968 		return (ENXIO);
969 
970 	part = XDF_PART(minor);
971 	if (!xdf_isopen(vdp, part))
972 		return (ENXIO);
973 
974 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
975 	    NULL, NULL, NULL, NULL))
976 		return (ENXIO);
977 
978 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
979 		return (ENOSPC);
980 
981 	if (U_INVAL(uiop))
982 		return (EINVAL);
983 
984 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, minphys, aiop));
985 }
986 
987 static int
988 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
989 {
990 	struct buf dumpbuf, *dbp;
991 	xdf_t	*vdp;
992 	minor_t minor;
993 	int err = 0;
994 	int part;
995 	diskaddr_t p_blkcnt, p_blkst;
996 
997 	minor = getminor(dev);
998 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
999 		return (ENXIO);
1000 
1001 	DPRINTF(IO_DBG, ("xdf: dump addr (0x%p) blk (%ld) nblks (%d)\n",
1002 	    addr, blkno, nblk));
1003 
1004 	part = XDF_PART(minor);
1005 	if (!xdf_isopen(vdp, part))
1006 		return (ENXIO);
1007 
1008 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
1009 	    NULL, NULL, NULL))
1010 		return (ENXIO);
1011 
1012 	if ((blkno + nblk) > p_blkcnt) {
1013 		cmn_err(CE_WARN, "xdf: block %ld exceeds VBD size %"PRIu64,
1014 		    blkno + nblk, (uint64_t)p_blkcnt);
1015 		return (EINVAL);
1016 	}
1017 
1018 	dbp = &dumpbuf;
1019 	bioinit(dbp);
1020 	dbp->b_flags = B_BUSY;
1021 	dbp->b_un.b_addr = addr;
1022 	dbp->b_bcount = nblk << DEV_BSHIFT;
1023 	dbp->b_blkno = blkno;
1024 	dbp->b_edev = dev;
1025 	dbp->b_private = (void *)(uintptr_t)p_blkst;
1026 
1027 	mutex_enter(&vdp->xdf_dev_lk);
1028 	if (vdp->xdf_xdev_iostat != NULL)
1029 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1030 	if (vdp->xdf_f_act == NULL) {
1031 		vdp->xdf_f_act = vdp->xdf_l_act = dbp;
1032 	} else {
1033 		vdp->xdf_l_act->av_forw = dbp;
1034 		vdp->xdf_l_act = dbp;
1035 	}
1036 	dbp->av_forw = NULL;
1037 	dbp->av_back = NULL;
1038 	mutex_exit(&vdp->xdf_dev_lk);
1039 	xdf_iostart(vdp);
1040 	err = xdf_drain_io(vdp);
1041 	biofini(dbp);
1042 	return (err);
1043 }
1044 
1045 /*ARGSUSED*/
1046 static int
1047 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1048     int *rvalp)
1049 {
1050 	int instance;
1051 	xdf_t	*vdp;
1052 	minor_t minor;
1053 	int part;
1054 
1055 	minor = getminor(dev);
1056 	instance = XDF_INST(minor);
1057 
1058 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
1059 		return (ENXIO);
1060 
1061 	DPRINTF(IOCTL_DBG, ("xdf%d:ioctl: cmd %d (0x%x)\n",
1062 	    instance, cmd, cmd));
1063 
1064 	part = XDF_PART(minor);
1065 	if (!xdf_isopen(vdp, part))
1066 		return (ENXIO);
1067 
1068 	switch (cmd) {
1069 	case DKIOCGMEDIAINFO: {
1070 		struct dk_minfo	media_info;
1071 
1072 		media_info.dki_lbsize = DEV_BSIZE;
1073 		media_info.dki_capacity = vdp->xdf_pgeom.g_capacity;
1074 		media_info.dki_media_type = DK_FIXED_DISK;
1075 
1076 		if (ddi_copyout(&media_info, (void *)arg,
1077 		    sizeof (struct dk_minfo), mode)) {
1078 			return (EFAULT);
1079 		} else {
1080 			return (0);
1081 		}
1082 	}
1083 
1084 	case DKIOCINFO: {
1085 		struct dk_cinfo info;
1086 
1087 		/* controller information */
1088 		if (XD_IS_CD(vdp))
1089 			info.dki_ctype = DKC_CDROM;
1090 		else
1091 			info.dki_ctype = DKC_VBD;
1092 
1093 		info.dki_cnum = 0;
1094 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
1095 
1096 		/* unit information */
1097 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
1098 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
1099 		info.dki_flags = DKI_FMTVOL;
1100 		info.dki_partition = part;
1101 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
1102 		info.dki_addr = 0;
1103 		info.dki_space = 0;
1104 		info.dki_prio = 0;
1105 		info.dki_vec = 0;
1106 
1107 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
1108 			return (EFAULT);
1109 		else
1110 			return (0);
1111 	}
1112 
1113 	case DKIOCSTATE: {
1114 		enum dkio_state	dkstate = DKIO_INSERTED;
1115 		if (ddi_copyout(&dkstate, (void *)arg, sizeof (dkstate),
1116 		    mode) != 0)
1117 			return (EFAULT);
1118 		return (0);
1119 	}
1120 
1121 	/*
1122 	 * is media removable?
1123 	 */
1124 	case DKIOCREMOVABLE: {
1125 		int i = XD_IS_RM(vdp) ? 1 : 0;
1126 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), mode))
1127 			return (EFAULT);
1128 		return (0);
1129 	}
1130 
1131 	case DKIOCG_PHYGEOM:
1132 	case DKIOCG_VIRTGEOM:
1133 	case DKIOCGGEOM:
1134 	case DKIOCSGEOM:
1135 	case DKIOCGAPART:
1136 	case DKIOCSAPART:
1137 	case DKIOCGVTOC:
1138 	case DKIOCSVTOC:
1139 	case DKIOCPARTINFO:
1140 	case DKIOCGMBOOT:
1141 	case DKIOCSMBOOT:
1142 	case DKIOCGETEFI:
1143 	case DKIOCSETEFI:
1144 	case DKIOCPARTITION: {
1145 		int rc;
1146 
1147 		rc = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
1148 		    rvalp, NULL);
1149 		return (rc);
1150 	}
1151 
1152 	case DKIOCGETWCE:
1153 		if (ddi_copyout(&vdp->xdf_wce, (void *)arg,
1154 		    sizeof (vdp->xdf_wce), mode))
1155 			return (EFAULT);
1156 		return (0);
1157 	case DKIOCSETWCE:
1158 		if (ddi_copyin((void *)arg, &vdp->xdf_wce,
1159 		    sizeof (vdp->xdf_wce), mode))
1160 			return (EFAULT);
1161 		return (0);
1162 	case DKIOCFLUSHWRITECACHE: {
1163 		int rc;
1164 		struct dk_callback *dkc = (struct dk_callback *)arg;
1165 
1166 		if (vdp->xdf_flush_supported) {
1167 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1168 			    NULL, 0, 0, (void *)dev);
1169 		} else if (vdp->xdf_feature_barrier &&
1170 		    !xdf_barrier_flush_disable) {
1171 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1172 			    vdp->xdf_cache_flush_block, xdf_flush_block,
1173 			    DEV_BSIZE, (void *)dev);
1174 		} else {
1175 			return (ENOTTY);
1176 		}
1177 		if ((mode & FKIOCTL) && (dkc != NULL) &&
1178 		    (dkc->dkc_callback != NULL)) {
1179 			(*dkc->dkc_callback)(dkc->dkc_cookie, rc);
1180 			/* need to return 0 after calling callback */
1181 			rc = 0;
1182 		}
1183 		return (rc);
1184 	}
1185 
1186 	default:
1187 		return (ENOTTY);
1188 	}
1189 }
1190 
1191 /*
1192  * xdf interrupt handler
1193  */
1194 static uint_t
1195 xdf_intr(caddr_t arg)
1196 {
1197 	xdf_t *vdp = (xdf_t *)arg;
1198 	xendev_ring_t *xbr;
1199 	blkif_response_t *resp;
1200 	int bioerr;
1201 	uint64_t id;
1202 	extern int do_polled_io;
1203 	uint8_t op;
1204 	uint16_t status;
1205 	ddi_acc_handle_t acchdl;
1206 
1207 	mutex_enter(&vdp->xdf_dev_lk);
1208 
1209 	if ((xbr = vdp->xdf_xb_ring) == NULL) {
1210 		mutex_exit(&vdp->xdf_dev_lk);
1211 		return (DDI_INTR_UNCLAIMED);
1212 	}
1213 
1214 	acchdl = vdp->xdf_xb_ring_hdl;
1215 
1216 	/*
1217 	 * complete all requests which have a response
1218 	 */
1219 	while (resp = xvdi_ring_get_response(xbr)) {
1220 		id = ddi_get64(acchdl, &resp->id);
1221 		op = ddi_get8(acchdl, &resp->operation);
1222 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
1223 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
1224 		    op, id, status));
1225 
1226 		/*
1227 		 * XXPV - close connection to the backend and restart
1228 		 */
1229 		if (status != BLKIF_RSP_OKAY) {
1230 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
1231 			    ddi_get_name_addr(vdp->xdf_dip),
1232 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
1233 			bioerr = EIO;
1234 		} else {
1235 			bioerr = 0;
1236 		}
1237 
1238 		xdf_iofini(vdp, id, bioerr);
1239 	}
1240 
1241 	mutex_exit(&vdp->xdf_dev_lk);
1242 
1243 	if (!do_polled_io)
1244 		xdf_iostart(vdp);
1245 
1246 	return (DDI_INTR_CLAIMED);
1247 }
1248 
1249 int xdf_fbrewrites;	/* how many times was our flush block rewritten */
1250 
1251 /*
1252  * Snarf new data if our flush block was re-written
1253  */
1254 static void
1255 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
1256 {
1257 	int nblks;
1258 	boolean_t mapin;
1259 
1260 	if (IS_WRITE_BARRIER(vdp, bp))
1261 		return; /* write was a flush write */
1262 
1263 	mapin = B_FALSE;
1264 	nblks = bp->b_bcount >> DEV_BSHIFT;
1265 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
1266 		xdf_fbrewrites++;
1267 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
1268 			mapin = B_TRUE;
1269 			bp_mapin(bp);
1270 		}
1271 		bcopy(bp->b_un.b_addr +
1272 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
1273 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
1274 		if (mapin)
1275 			bp_mapout(bp);
1276 	}
1277 }
1278 
1279 static void
1280 xdf_iofini(xdf_t *vdp, uint64_t id, int bioerr)
1281 {
1282 	ge_slot_t *gs = (ge_slot_t *)(uintptr_t)id;
1283 	v_req_t *vreq = gs->vreq;
1284 	buf_t *bp = vreq->v_buf;
1285 
1286 	gs_free(vdp, gs);
1287 	if (bioerr)
1288 		bioerror(bp, bioerr);
1289 	vreq->v_nslots--;
1290 	if (vreq->v_nslots != 0)
1291 		return;
1292 
1293 	XDF_UPDATE_IO_STAT(vdp, bp);
1294 	if (vdp->xdf_xdev_iostat != NULL)
1295 		kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1296 
1297 	if (IS_ERROR(bp))
1298 		bp->b_resid = bp->b_bcount;
1299 
1300 	vreq_free(vdp, vreq);
1301 	biodone(bp);
1302 }
1303 
1304 /*
1305  * return value of xdf_prepare_rreq()
1306  * used in xdf_iostart()
1307  */
1308 #define	XF_PARTIAL	0 /* rreq is full, not all I/O in buf transferred */
1309 #define	XF_COMP		1 /* no more I/O left in buf */
1310 
1311 static void
1312 xdf_iostart(xdf_t *vdp)
1313 {
1314 	xendev_ring_t *xbr;
1315 	struct buf *bp;
1316 	blkif_request_t *rreq;
1317 	int retval;
1318 	int rreqready = 0;
1319 
1320 	xbr = vdp->xdf_xb_ring;
1321 
1322 	/*
1323 	 * populate the ring request(s)
1324 	 *
1325 	 * loop until there is no buf to transfer or no free slot
1326 	 * available in I/O ring
1327 	 */
1328 	mutex_enter(&vdp->xdf_dev_lk);
1329 
1330 	for (;;) {
1331 		if (vdp->xdf_status != XD_READY)
1332 			break;
1333 
1334 		/* active buf queue empty? */
1335 		if ((bp = vdp->xdf_f_act) == NULL)
1336 			break;
1337 
1338 		/* try to grab a vreq for this bp */
1339 		if ((BP2VREQ(bp) == NULL) && (vreq_get(vdp, bp) == NULL))
1340 				break;
1341 		/* alloc DMA/GTE resources */
1342 		if (vreq_setup(vdp, BP2VREQ(bp)) != DDI_SUCCESS)
1343 			break;
1344 
1345 		/* get next blkif_request in the ring */
1346 		if ((rreq = xvdi_ring_get_request(xbr)) == NULL)
1347 			break;
1348 		bzero(rreq, sizeof (blkif_request_t));
1349 
1350 		/* populate blkif_request with this buf */
1351 		rreqready++;
1352 		retval = xdf_prepare_rreq(vdp, bp, rreq);
1353 		if (retval == XF_COMP) {
1354 			/* finish this bp, switch to next one */
1355 			if (vdp->xdf_xdev_iostat != NULL)
1356 				kstat_waitq_to_runq(
1357 				    KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1358 			vdp->xdf_f_act = bp->av_forw;
1359 			bp->av_forw = NULL;
1360 		}
1361 	}
1362 
1363 	/*
1364 	 * Send the request(s) to the backend
1365 	 */
1366 	if (rreqready) {
1367 		if (xvdi_ring_push_request(xbr)) {
1368 			DPRINTF(IO_DBG, ("xdf_iostart: "
1369 			    "sent request(s) to backend\n"));
1370 			xvdi_notify_oe(vdp->xdf_dip);
1371 		}
1372 	}
1373 
1374 	mutex_exit(&vdp->xdf_dev_lk);
1375 }
1376 
1377 /*
1378  * populate a single blkif_request_t w/ a buf
1379  */
1380 static int
1381 xdf_prepare_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1382 {
1383 	int		rval;
1384 	grant_ref_t	gr;
1385 	uint8_t		fsect, lsect;
1386 	size_t		bcnt;
1387 	paddr_t		dma_addr;
1388 	off_t		blk_off;
1389 	dev_info_t	*dip = vdp->xdf_dip;
1390 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1391 	v_req_t		*vreq = BP2VREQ(bp);
1392 	uint64_t	blkno = vreq->v_blkno;
1393 	uint_t		ndmacs = vreq->v_ndmacs;
1394 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1395 	int		seg = 0;
1396 	int		isread = IS_READ(bp);
1397 
1398 	if (isread)
1399 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1400 	else {
1401 		switch (vreq->v_flush_diskcache) {
1402 		case FLUSH_DISKCACHE:
1403 			ddi_put8(acchdl, &rreq->operation,
1404 			    BLKIF_OP_FLUSH_DISKCACHE);
1405 			ddi_put16(acchdl, &rreq->handle, vdev);
1406 			ddi_put64(acchdl, &rreq->id,
1407 			    (uint64_t)(uintptr_t)(vreq->v_gs));
1408 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1409 			return (XF_COMP);
1410 		case WRITE_BARRIER:
1411 			ddi_put8(acchdl, &rreq->operation,
1412 			    BLKIF_OP_WRITE_BARRIER);
1413 			break;
1414 		default:
1415 			if (!vdp->xdf_wce)
1416 				ddi_put8(acchdl, &rreq->operation,
1417 				    BLKIF_OP_WRITE_BARRIER);
1418 			else
1419 				ddi_put8(acchdl, &rreq->operation,
1420 				    BLKIF_OP_WRITE);
1421 			break;
1422 		}
1423 	}
1424 
1425 	ddi_put16(acchdl, &rreq->handle, vdev);
1426 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1427 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(vreq->v_gs));
1428 
1429 	/*
1430 	 * loop until all segments are populated or no more dma cookie in buf
1431 	 */
1432 	for (;;) {
1433 	/*
1434 	 * Each segment of a blkif request can transfer up to
1435 	 * one 4K page of data.
1436 	 */
1437 		bcnt = vreq->v_dmac.dmac_size;
1438 		ASSERT(bcnt <= PAGESIZE);
1439 		ASSERT((bcnt % XB_BSIZE) == 0);
1440 		dma_addr = vreq->v_dmac.dmac_laddress;
1441 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1442 		ASSERT((blk_off & XB_BMASK) == 0);
1443 		fsect = blk_off >> XB_BSHIFT;
1444 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1445 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1446 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1447 		DPRINTF(IO_DBG, ("  ""seg%d: dmacS %lu blk_off %ld\n",
1448 		    seg, vreq->v_dmac.dmac_size, blk_off));
1449 		gr = gs_grant(vreq->v_gs, PATOMA(dma_addr) >> PAGESHIFT);
1450 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1451 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1452 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1453 		DPRINTF(IO_DBG, ("  ""seg%d: fs %d ls %d gr %d dma 0x%"PRIx64
1454 		    "\n", seg, fsect, lsect, gr, dma_addr));
1455 
1456 		blkno += (bcnt >> XB_BSHIFT);
1457 		seg++;
1458 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1459 		if (--ndmacs) {
1460 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1461 			continue;
1462 		}
1463 
1464 		vreq->v_status = VREQ_DMAWIN_DONE;
1465 		vreq->v_blkno = blkno;
1466 		if (vreq->v_dmaw + 1 == vreq->v_ndmaws)
1467 			/* last win */
1468 			rval = XF_COMP;
1469 		else
1470 			rval = XF_PARTIAL;
1471 		break;
1472 	}
1473 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1474 	DPRINTF(IO_DBG, ("xdf_prepare_rreq: request id=%"PRIx64" ready\n",
1475 	    rreq->id));
1476 
1477 	return (rval);
1478 }
1479 
1480 #define	XDF_QSEC	50000	/* .005 second */
1481 #define	XDF_POLLCNT	12	/* loop for 12 times before time out */
1482 
1483 static int
1484 xdf_drain_io(xdf_t *vdp)
1485 {
1486 	int pollc, rval;
1487 	xendev_ring_t *xbr;
1488 
1489 	if (xdfdebug & SUSRES_DBG)
1490 		xen_printf("xdf_drain_io: start\n");
1491 
1492 	mutex_enter(&vdp->xdf_dev_lk);
1493 
1494 	if ((vdp->xdf_status != XD_READY) && (vdp->xdf_status != XD_SUSPEND))
1495 		goto out;
1496 
1497 	rval = 0;
1498 	xbr = vdp->xdf_xb_ring;
1499 	ASSERT(xbr != NULL);
1500 
1501 	for (pollc = 0; pollc < XDF_POLLCNT; pollc++) {
1502 		if (xvdi_ring_has_unconsumed_responses(xbr)) {
1503 			mutex_exit(&vdp->xdf_dev_lk);
1504 			(void) xdf_intr((caddr_t)vdp);
1505 			mutex_enter(&vdp->xdf_dev_lk);
1506 		}
1507 		if (!xvdi_ring_has_incomp_request(xbr))
1508 			goto out;
1509 
1510 #ifndef	XPV_HVM_DRIVER
1511 		(void) HYPERVISOR_yield();
1512 #endif /* XPV_HVM_DRIVER */
1513 		/*
1514 		 * file-backed devices can be slow
1515 		 */
1516 		drv_usecwait(XDF_QSEC << pollc);
1517 	}
1518 	cmn_err(CE_WARN, "xdf_polled_io: timeout");
1519 	rval = EIO;
1520 out:
1521 	mutex_exit(&vdp->xdf_dev_lk);
1522 	if (xdfdebug & SUSRES_DBG)
1523 		xen_printf("xdf_drain_io: end, err=%d\n", rval);
1524 	return (rval);
1525 }
1526 
1527 /* ARGSUSED5 */
1528 int
1529 xdf_lb_rdwr(dev_info_t *devi, uchar_t cmd, void *bufp,
1530     diskaddr_t start, size_t reqlen, void *tg_cookie)
1531 {
1532 	xdf_t *vdp;
1533 	struct buf *bp;
1534 	int err = 0;
1535 
1536 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1537 	if (vdp == NULL)
1538 		return (ENXIO);
1539 
1540 	if ((start + (reqlen >> DEV_BSHIFT)) > vdp->xdf_pgeom.g_capacity)
1541 		return (EINVAL);
1542 
1543 	bp = getrbuf(KM_SLEEP);
1544 	if (cmd == TG_READ)
1545 		bp->b_flags = B_BUSY | B_READ;
1546 	else
1547 		bp->b_flags = B_BUSY | B_WRITE;
1548 	bp->b_un.b_addr = bufp;
1549 	bp->b_bcount = reqlen;
1550 	bp->b_blkno = start;
1551 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
1552 
1553 	mutex_enter(&vdp->xdf_dev_lk);
1554 	if (vdp->xdf_xdev_iostat != NULL)
1555 		kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1556 	if (vdp->xdf_f_act == NULL) {
1557 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
1558 	} else {
1559 		vdp->xdf_l_act->av_forw = bp;
1560 		vdp->xdf_l_act = bp;
1561 	}
1562 	mutex_exit(&vdp->xdf_dev_lk);
1563 	xdf_iostart(vdp);
1564 	err = biowait(bp);
1565 
1566 	ASSERT(bp->b_flags & B_DONE);
1567 
1568 	freerbuf(bp);
1569 	return (err);
1570 }
1571 
1572 /*
1573  * synthetic geometry
1574  */
1575 #define	XDF_NSECTS	256
1576 #define	XDF_NHEADS	16
1577 
1578 static void
1579 xdf_synthetic_pgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1580 {
1581 	xdf_t *vdp;
1582 	uint_t ncyl;
1583 
1584 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1585 
1586 	ncyl = vdp->xdf_xdev_nblocks / (XDF_NHEADS * XDF_NSECTS);
1587 
1588 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1589 	geomp->g_acyl = 0;
1590 	geomp->g_nhead = XDF_NHEADS;
1591 	geomp->g_secsize = XB_BSIZE;
1592 	geomp->g_nsect = XDF_NSECTS;
1593 	geomp->g_intrlv = 0;
1594 	geomp->g_rpm = 7200;
1595 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1596 }
1597 
1598 static int
1599 xdf_lb_getcap(dev_info_t *devi, diskaddr_t *capp)
1600 {
1601 	xdf_t *vdp;
1602 
1603 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1604 
1605 	if (vdp == NULL)
1606 		return (ENXIO);
1607 
1608 	mutex_enter(&vdp->xdf_dev_lk);
1609 	*capp = vdp->xdf_pgeom.g_capacity;
1610 	DPRINTF(LBL_DBG, ("capacity %llu\n", *capp));
1611 	mutex_exit(&vdp->xdf_dev_lk);
1612 	return (0);
1613 }
1614 
1615 static int
1616 xdf_lb_getpgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1617 {
1618 	xdf_t *vdp;
1619 
1620 	if ((vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))) == NULL)
1621 		return (ENXIO);
1622 	*geomp = vdp->xdf_pgeom;
1623 	return (0);
1624 }
1625 
1626 /*
1627  * No real HBA, no geometry available from it
1628  */
1629 /*ARGSUSED*/
1630 static int
1631 xdf_lb_getvgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1632 {
1633 	return (EINVAL);
1634 }
1635 
1636 static int
1637 xdf_lb_getattribute(dev_info_t *devi, tg_attribute_t *tgattributep)
1638 {
1639 	xdf_t *vdp;
1640 
1641 	if (!(vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))))
1642 		return (ENXIO);
1643 
1644 	if (XD_IS_RO(vdp))
1645 		tgattributep->media_is_writable = 0;
1646 	else
1647 		tgattributep->media_is_writable = 1;
1648 	return (0);
1649 }
1650 
1651 /* ARGSUSED3 */
1652 int
1653 xdf_lb_getinfo(dev_info_t *devi, int cmd, void *arg, void *tg_cookie)
1654 {
1655 	switch (cmd) {
1656 	case TG_GETPHYGEOM:
1657 		return (xdf_lb_getpgeom(devi, (cmlb_geom_t *)arg));
1658 	case TG_GETVIRTGEOM:
1659 		return (xdf_lb_getvgeom(devi, (cmlb_geom_t *)arg));
1660 	case TG_GETCAPACITY:
1661 		return (xdf_lb_getcap(devi, (diskaddr_t *)arg));
1662 	case TG_GETBLOCKSIZE:
1663 		*(uint32_t *)arg = XB_BSIZE;
1664 		return (0);
1665 	case TG_GETATTR:
1666 		return (xdf_lb_getattribute(devi, (tg_attribute_t *)arg));
1667 	default:
1668 		return (ENOTTY);
1669 	}
1670 }
1671 
1672 /*
1673  * Kick-off connect process
1674  * Status should be XD_UNKNOWN or XD_CLOSED
1675  * On success, status will be changed to XD_INIT
1676  * On error, status won't be changed
1677  */
1678 static int
1679 xdf_start_connect(xdf_t *vdp)
1680 {
1681 	char *xsnode;
1682 	grant_ref_t gref;
1683 	xenbus_transaction_t xbt;
1684 	int rv;
1685 	dev_info_t *dip = vdp->xdf_dip;
1686 
1687 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == (domid_t)-1)
1688 		goto errout;
1689 
1690 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS) {
1691 		cmn_err(CE_WARN, "xdf@%s: failed to alloc event channel",
1692 		    ddi_get_name_addr(dip));
1693 		goto errout;
1694 	}
1695 	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1696 #ifdef XPV_HVM_DRIVER
1697 	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1698 #else /* !XPV_HVM_DRIVER */
1699 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1700 	    DDI_SUCCESS) {
1701 		cmn_err(CE_WARN, "xdf_start_connect: xdf@%s: "
1702 		    "failed to add intr handler", ddi_get_name_addr(dip));
1703 		goto errout1;
1704 	}
1705 #endif /* !XPV_HVM_DRIVER */
1706 
1707 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1708 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1709 	    DDI_SUCCESS) {
1710 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1711 		    ddi_get_name_addr(dip));
1712 		goto errout2;
1713 	}
1714 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1715 
1716 	/*
1717 	 * Write into xenstore the info needed by backend
1718 	 */
1719 	if ((xsnode = xvdi_get_xsname(dip)) == NULL) {
1720 		cmn_err(CE_WARN, "xdf@%s: "
1721 		    "failed to get xenstore node path",
1722 		    ddi_get_name_addr(dip));
1723 		goto fail_trans;
1724 	}
1725 trans_retry:
1726 	if (xenbus_transaction_start(&xbt)) {
1727 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1728 		    ddi_get_name_addr(dip));
1729 		xvdi_fatal_error(dip, EIO, "transaction start");
1730 		goto fail_trans;
1731 	}
1732 
1733 	if (rv = xenbus_printf(xbt, xsnode, "ring-ref", "%u", gref)) {
1734 		cmn_err(CE_WARN, "xdf@%s: failed to write ring-ref",
1735 		    ddi_get_name_addr(dip));
1736 		xvdi_fatal_error(dip, rv, "writing ring-ref");
1737 		goto abort_trans;
1738 	}
1739 
1740 	if (rv = xenbus_printf(xbt, xsnode, "event-channel", "%u",
1741 	    vdp->xdf_evtchn)) {
1742 		cmn_err(CE_WARN, "xdf@%s: failed to write event-channel",
1743 		    ddi_get_name_addr(dip));
1744 		xvdi_fatal_error(dip, rv, "writing event-channel");
1745 		goto abort_trans;
1746 	}
1747 
1748 	/*
1749 	 * "protocol" is written by the domain builder in the case of PV
1750 	 * domains. However, it is not written for HVM domains, so let's
1751 	 * write it here.
1752 	 */
1753 	if (rv = xenbus_printf(xbt, xsnode, "protocol", "%s",
1754 	    XEN_IO_PROTO_ABI_NATIVE)) {
1755 		cmn_err(CE_WARN, "xdf@%s: failed to write protocol",
1756 		    ddi_get_name_addr(dip));
1757 		xvdi_fatal_error(dip, rv, "writing protocol");
1758 		goto abort_trans;
1759 	}
1760 
1761 	if ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0) {
1762 		cmn_err(CE_WARN, "xdf@%s: "
1763 		    "failed to switch state to XenbusStateInitialised",
1764 		    ddi_get_name_addr(dip));
1765 		xvdi_fatal_error(dip, rv, "writing state");
1766 		goto abort_trans;
1767 	}
1768 
1769 	/* kick-off connect process */
1770 	if (rv = xenbus_transaction_end(xbt, 0)) {
1771 		if (rv == EAGAIN)
1772 			goto trans_retry;
1773 		cmn_err(CE_WARN, "xdf@%s: failed to end transaction",
1774 		    ddi_get_name_addr(dip));
1775 		xvdi_fatal_error(dip, rv, "completing transaction");
1776 		goto fail_trans;
1777 	}
1778 
1779 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1780 	mutex_enter(&vdp->xdf_dev_lk);
1781 	vdp->xdf_status = XD_INIT;
1782 	mutex_exit(&vdp->xdf_dev_lk);
1783 
1784 	return (DDI_SUCCESS);
1785 
1786 abort_trans:
1787 	(void) xenbus_transaction_end(xbt, 1);
1788 fail_trans:
1789 	xvdi_free_ring(vdp->xdf_xb_ring);
1790 errout2:
1791 #ifdef XPV_HVM_DRIVER
1792 	ec_unbind_evtchn(vdp->xdf_evtchn);
1793 #else /* !XPV_HVM_DRIVER */
1794 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1795 #endif /* !XPV_HVM_DRIVER */
1796 errout1:
1797 	xvdi_free_evtchn(dip);
1798 errout:
1799 	cmn_err(CE_WARN, "xdf@%s: fail to kick-off connecting",
1800 	    ddi_get_name_addr(dip));
1801 	return (DDI_FAILURE);
1802 }
1803 
1804 /*
1805  * Kick-off disconnect process
1806  * Status won't be changed
1807  */
1808 static int
1809 xdf_start_disconnect(xdf_t *vdp)
1810 {
1811 	if (xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed) > 0) {
1812 		cmn_err(CE_WARN, "xdf@%s: fail to kick-off disconnecting",
1813 		    ddi_get_name_addr(vdp->xdf_dip));
1814 		return (DDI_FAILURE);
1815 	}
1816 
1817 	return (DDI_SUCCESS);
1818 }
1819 
1820 int
1821 xdf_get_flush_block(xdf_t *vdp)
1822 {
1823 	/*
1824 	 * Get a DEV_BSIZE aligned bufer
1825 	 */
1826 	vdp->xdf_flush_mem = kmem_alloc(DEV_BSIZE * 2, KM_SLEEP);
1827 	vdp->xdf_cache_flush_block =
1828 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem), DEV_BSIZE);
1829 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1830 	    xdf_flush_block, DEV_BSIZE, NULL) != 0)
1831 		return (DDI_FAILURE);
1832 	return (DDI_SUCCESS);
1833 }
1834 
1835 /*
1836  * Finish other initialization after we've connected to backend
1837  * Status should be XD_INIT before calling this routine
1838  * On success, status should be changed to XD_READY
1839  * On error, status should stay XD_INIT
1840  */
1841 static int
1842 xdf_post_connect(xdf_t *vdp)
1843 {
1844 	int rv;
1845 	uint_t len;
1846 	char *type;
1847 	char *barrier;
1848 	dev_info_t *devi = vdp->xdf_dip;
1849 
1850 	/*
1851 	 * Determine if feature barrier is supported by backend
1852 	 */
1853 	if (xenbus_read(XBT_NULL, xvdi_get_oename(devi),
1854 	    "feature-barrier", (void **)&barrier, &len) == 0) {
1855 		vdp->xdf_feature_barrier = 1;
1856 		kmem_free(barrier, len);
1857 	} else {
1858 		cmn_err(CE_NOTE, "xdf@%s: failed to read feature-barrier",
1859 		    ddi_get_name_addr(vdp->xdf_dip));
1860 		vdp->xdf_feature_barrier = 0;
1861 	}
1862 
1863 	/* probe backend */
1864 	if (rv = xenbus_gather(XBT_NULL, xvdi_get_oename(devi),
1865 	    "sectors", "%"SCNu64, &vdp->xdf_xdev_nblocks,
1866 	    "info", "%u", &vdp->xdf_xdev_info, NULL)) {
1867 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1868 		    "cannot read backend info", ddi_get_name_addr(devi));
1869 		xvdi_fatal_error(devi, rv, "reading backend info");
1870 		return (DDI_FAILURE);
1871 	}
1872 
1873 	/*
1874 	 * Make sure that the device we're connecting isn't smaller than
1875 	 * the old connected device.
1876 	 */
1877 	if (vdp->xdf_xdev_nblocks < vdp->xdf_pgeom.g_capacity) {
1878 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1879 		    "backend disk device shrank", ddi_get_name_addr(devi));
1880 		/* XXX:  call xvdi_fatal_error() here? */
1881 		xvdi_fatal_error(devi, rv, "reading backend info");
1882 		return (DDI_FAILURE);
1883 	}
1884 
1885 	/*
1886 	 * Only update the physical geometry to reflect the new device
1887 	 * size if this is the first time we're connecting to the backend
1888 	 * device.  Once we assign a physical geometry to a device it stays
1889 	 * fixed until:
1890 	 *	- we get detach and re-attached (at which point we
1891 	 *	  automatically assign a new physical geometry).
1892 	 *	- someone calls TG_SETPHYGEOM to explicity set the
1893 	 *	  physical geometry.
1894 	 */
1895 	if (vdp->xdf_pgeom.g_capacity == 0)
1896 		xdf_synthetic_pgeom(devi, &vdp->xdf_pgeom);
1897 
1898 	/* fix disk type */
1899 	if (xenbus_read(XBT_NULL, xvdi_get_xsname(devi), "device-type",
1900 	    (void **)&type, &len) != 0) {
1901 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1902 		    "cannot read device-type", ddi_get_name_addr(devi));
1903 		xvdi_fatal_error(devi, rv, "reading device-type");
1904 		return (DDI_FAILURE);
1905 	}
1906 	if (strcmp(type, "cdrom") == 0)
1907 		vdp->xdf_xdev_info |= VDISK_CDROM;
1908 	kmem_free(type, len);
1909 
1910 	/*
1911 	 * We've created all the minor nodes via cmlb_attach() using default
1912 	 * value in xdf_attach() to make it possible to block in xdf_open(),
1913 	 * in case there's anyone (say, booting thread) ever trying to open
1914 	 * it before connected to backend. We will refresh all those minor
1915 	 * nodes w/ latest info we've got now when we are almost connected.
1916 	 *
1917 	 * Don't do this when xdf is already opened by someone (could happen
1918 	 * during resume), for that cmlb_attach() will invalid the label info
1919 	 * and confuse those who has already opened the node, which is bad.
1920 	 */
1921 	if (!xdf_isopen(vdp, -1) && (XD_IS_CD(vdp) || XD_IS_RM(vdp))) {
1922 		/* re-init cmlb w/ latest info we got from backend */
1923 		if (cmlb_attach(devi, &xdf_lb_ops,
1924 		    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
1925 		    XD_IS_RM(vdp), 1,
1926 		    XD_IS_CD(vdp) ? DDI_NT_CD_XVMD : DDI_NT_BLOCK_XVMD,
1927 #if defined(XPV_HVM_DRIVER)
1928 		    CMLB_CREATE_ALTSLICE_VTOC_16_DTYPE_DIRECT |
1929 		    CMLB_INTERNAL_MINOR_NODES,
1930 #else /* !XPV_HVM_DRIVER */
1931 		    CMLB_FAKE_LABEL_ONE_PARTITION,
1932 #endif /* !XPV_HVM_DRIVER */
1933 		    vdp->xdf_vd_lbl, NULL) != 0) {
1934 			cmn_err(CE_WARN, "xdf@%s: cmlb attach failed",
1935 			    ddi_get_name_addr(devi));
1936 			return (DDI_FAILURE);
1937 		}
1938 	}
1939 
1940 	/* mark vbd is ready for I/O */
1941 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1942 	mutex_enter(&vdp->xdf_dev_lk);
1943 	vdp->xdf_status = XD_READY;
1944 	mutex_exit(&vdp->xdf_dev_lk);
1945 	/*
1946 	 * If backend has feature-barrier, see if it supports disk
1947 	 * cache flush op.
1948 	 */
1949 	vdp->xdf_flush_supported = 0;
1950 	if (vdp->xdf_feature_barrier) {
1951 		/*
1952 		 * Pretend we already know flush is supported so probe
1953 		 * will attempt the correct op.
1954 		 */
1955 		vdp->xdf_flush_supported = 1;
1956 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1957 			vdp->xdf_flush_supported = 1;
1958 		} else {
1959 			vdp->xdf_flush_supported = 0;
1960 			/*
1961 			 * If the other end does not support the cache flush op
1962 			 * then we must use a barrier-write to force disk
1963 			 * cache flushing.  Barrier writes require that a data
1964 			 * block actually be written.
1965 			 * Cache a block to barrier-write when we are
1966 			 * asked to perform a flush.
1967 			 * XXX - would it be better to just copy 1 block
1968 			 * (512 bytes) from whatever write we did last
1969 			 * and rewrite that block?
1970 			 */
1971 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS)
1972 				return (DDI_FAILURE);
1973 		}
1974 	}
1975 
1976 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", ddi_get_name_addr(devi),
1977 	    (uint64_t)vdp->xdf_xdev_nblocks);
1978 
1979 	return (DDI_SUCCESS);
1980 }
1981 
1982 /*
1983  * Finish other uninitialization after we've disconnected from backend
1984  * when status is XD_CLOSING or XD_INIT. After returns, status is XD_CLOSED
1985  */
1986 static void
1987 xdf_post_disconnect(xdf_t *vdp)
1988 {
1989 #ifdef XPV_HVM_DRIVER
1990 	ec_unbind_evtchn(vdp->xdf_evtchn);
1991 #else /* !XPV_HVM_DRIVER */
1992 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1993 #endif /* !XPV_HVM_DRIVER */
1994 	xvdi_free_evtchn(vdp->xdf_dip);
1995 	xvdi_free_ring(vdp->xdf_xb_ring);
1996 	vdp->xdf_xb_ring = NULL;
1997 	vdp->xdf_xb_ring_hdl = NULL;
1998 	vdp->xdf_peer = (domid_t)-1;
1999 
2000 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
2001 	mutex_enter(&vdp->xdf_dev_lk);
2002 	vdp->xdf_status = XD_CLOSED;
2003 	mutex_exit(&vdp->xdf_dev_lk);
2004 }
2005 
2006 /*ARGSUSED*/
2007 static void
2008 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
2009 {
2010 	XenbusState new_state = *(XenbusState *)impl_data;
2011 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
2012 	boolean_t unexpect_die = B_FALSE;
2013 	int status;
2014 
2015 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
2016 	    ddi_get_name_addr(dip), new_state));
2017 
2018 	mutex_enter(&vdp->xdf_cb_lk);
2019 
2020 	if (xdf_check_state_transition(vdp, new_state) == DDI_FAILURE) {
2021 		mutex_exit(&vdp->xdf_cb_lk);
2022 		return;
2023 	}
2024 
2025 	switch (new_state) {
2026 	case XenbusStateInitialising:
2027 		ASSERT(vdp->xdf_status == XD_CLOSED);
2028 		/*
2029 		 * backend recovered from a previous failure,
2030 		 * kick-off connect process again
2031 		 */
2032 		if (xdf_start_connect(vdp) != DDI_SUCCESS) {
2033 			cmn_err(CE_WARN, "xdf@%s:"
2034 			    " failed to start reconnecting to backend",
2035 			    ddi_get_name_addr(dip));
2036 		}
2037 		break;
2038 	case XenbusStateConnected:
2039 		ASSERT(vdp->xdf_status == XD_INIT);
2040 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
2041 		/* finish final init after connect */
2042 		if (xdf_post_connect(vdp) != DDI_SUCCESS)
2043 			(void) xdf_start_disconnect(vdp);
2044 		break;
2045 	case XenbusStateClosing:
2046 		if (vdp->xdf_status == XD_READY) {
2047 			mutex_enter(&vdp->xdf_dev_lk);
2048 			if (xdf_isopen(vdp, -1)) {
2049 				cmn_err(CE_NOTE, "xdf@%s: hot-unplug failed, "
2050 				    "still in use", ddi_get_name_addr(dip));
2051 				mutex_exit(&vdp->xdf_dev_lk);
2052 				break;
2053 			} else {
2054 				vdp->xdf_status = XD_CLOSING;
2055 			}
2056 			mutex_exit(&vdp->xdf_dev_lk);
2057 		}
2058 		(void) xdf_start_disconnect(vdp);
2059 		break;
2060 	case XenbusStateClosed:
2061 		/* first check if BE closed unexpectedly */
2062 		mutex_enter(&vdp->xdf_dev_lk);
2063 		if (xdf_isopen(vdp, -1)) {
2064 			unexpect_die = B_TRUE;
2065 			unexpectedie(vdp);
2066 			cmn_err(CE_WARN, "xdf@%s: backend closed, "
2067 			    "reconnecting...", ddi_get_name_addr(dip));
2068 		}
2069 		mutex_exit(&vdp->xdf_dev_lk);
2070 
2071 		if (vdp->xdf_status == XD_READY) {
2072 			mutex_enter(&vdp->xdf_dev_lk);
2073 			vdp->xdf_status = XD_CLOSING;
2074 			mutex_exit(&vdp->xdf_dev_lk);
2075 
2076 #ifdef	DOMU_BACKEND
2077 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
2078 #endif
2079 
2080 			xdf_post_disconnect(vdp);
2081 			(void) xvdi_switch_state(dip, XBT_NULL,
2082 			    XenbusStateClosed);
2083 		} else if ((vdp->xdf_status == XD_INIT) ||
2084 		    (vdp->xdf_status == XD_CLOSING)) {
2085 			xdf_post_disconnect(vdp);
2086 		} else {
2087 			mutex_enter(&vdp->xdf_dev_lk);
2088 			vdp->xdf_status = XD_CLOSED;
2089 			mutex_exit(&vdp->xdf_dev_lk);
2090 		}
2091 	}
2092 
2093 	/* notify anybody waiting for oe state change */
2094 	mutex_enter(&vdp->xdf_dev_lk);
2095 	cv_broadcast(&vdp->xdf_dev_cv);
2096 	mutex_exit(&vdp->xdf_dev_lk);
2097 
2098 	status = vdp->xdf_status;
2099 	mutex_exit(&vdp->xdf_cb_lk);
2100 
2101 	if (status == XD_READY) {
2102 		xdf_iostart(vdp);
2103 	} else if ((status == XD_CLOSED) && !unexpect_die) {
2104 		/* interface is closed successfully, remove all minor nodes */
2105 		if (vdp->xdf_vd_lbl != NULL) {
2106 			cmlb_detach(vdp->xdf_vd_lbl, NULL);
2107 			cmlb_free_handle(&vdp->xdf_vd_lbl);
2108 			vdp->xdf_vd_lbl = NULL;
2109 		}
2110 	}
2111 }
2112 
2113 /* check if partition is open, -1 - check all partitions on the disk */
2114 static boolean_t
2115 xdf_isopen(xdf_t *vdp, int partition)
2116 {
2117 	int i;
2118 	ulong_t parbit;
2119 	boolean_t rval = B_FALSE;
2120 
2121 	ASSERT((partition == -1) ||
2122 	    ((partition >= 0) || (partition < XDF_PEXT)));
2123 
2124 	if (partition == -1)
2125 		parbit = (ulong_t)-1;
2126 	else
2127 		parbit = 1 << partition;
2128 
2129 	for (i = 0; i < OTYPCNT; i++) {
2130 		if (vdp->xdf_vd_open[i] & parbit)
2131 			rval = B_TRUE;
2132 	}
2133 
2134 	return (rval);
2135 }
2136 
2137 /*
2138  * Xdf_check_state_transition will check the XenbusState change to see
2139  * if the change is a valid transition or not.
2140  * The new state is written by backend domain, or by running xenstore-write
2141  * to change it manually in dom0
2142  */
2143 static int
2144 xdf_check_state_transition(xdf_t *vdp, XenbusState oestate)
2145 {
2146 	int status;
2147 	int stcheck;
2148 #define	STOK	0 /* need further process */
2149 #define	STNOP	1 /* no action need taking */
2150 #define	STBUG	2 /* unexpected state change, could be a bug */
2151 
2152 	status = vdp->xdf_status;
2153 	stcheck = STOK;
2154 
2155 	switch (status) {
2156 	case XD_UNKNOWN:
2157 		if ((oestate == XenbusStateUnknown)		||
2158 		    (oestate == XenbusStateConnected))
2159 			stcheck = STBUG;
2160 		else if ((oestate == XenbusStateInitialising)	||
2161 		    (oestate == XenbusStateInitWait)		||
2162 		    (oestate == XenbusStateInitialised))
2163 			stcheck = STNOP;
2164 		break;
2165 	case XD_INIT:
2166 		if (oestate == XenbusStateUnknown)
2167 			stcheck = STBUG;
2168 		else if ((oestate == XenbusStateInitialising)	||
2169 		    (oestate == XenbusStateInitWait)		||
2170 		    (oestate == XenbusStateInitialised))
2171 			stcheck = STNOP;
2172 		break;
2173 	case XD_READY:
2174 		if ((oestate == XenbusStateUnknown)		||
2175 		    (oestate == XenbusStateInitialising)	||
2176 		    (oestate == XenbusStateInitWait)		||
2177 		    (oestate == XenbusStateInitialised))
2178 			stcheck = STBUG;
2179 		else if (oestate == XenbusStateConnected)
2180 			stcheck = STNOP;
2181 		break;
2182 	case XD_CLOSING:
2183 		if ((oestate == XenbusStateUnknown)		||
2184 		    (oestate == XenbusStateInitialising)	||
2185 		    (oestate == XenbusStateInitWait)		||
2186 		    (oestate == XenbusStateInitialised)		||
2187 		    (oestate == XenbusStateConnected))
2188 			stcheck = STBUG;
2189 		else if (oestate == XenbusStateClosing)
2190 			stcheck = STNOP;
2191 		break;
2192 	case XD_CLOSED:
2193 		if ((oestate == XenbusStateUnknown)		||
2194 		    (oestate == XenbusStateConnected))
2195 			stcheck = STBUG;
2196 		else if ((oestate == XenbusStateInitWait)	||
2197 		    (oestate == XenbusStateInitialised)		||
2198 		    (oestate == XenbusStateClosing)		||
2199 		    (oestate == XenbusStateClosed))
2200 			stcheck = STNOP;
2201 		break;
2202 	case XD_SUSPEND:
2203 	default:
2204 			stcheck = STBUG;
2205 	}
2206 
2207 	if (stcheck == STOK)
2208 		return (DDI_SUCCESS);
2209 
2210 	if (stcheck == STBUG)
2211 		cmn_err(CE_NOTE, "xdf@%s: unexpected otherend "
2212 		    "state change to %d!, when status is %d",
2213 		    ddi_get_name_addr(vdp->xdf_dip), oestate, status);
2214 
2215 	return (DDI_FAILURE);
2216 }
2217 
2218 static int
2219 xdf_connect(xdf_t *vdp, boolean_t wait)
2220 {
2221 	ASSERT(mutex_owned(&vdp->xdf_dev_lk));
2222 	while (vdp->xdf_status != XD_READY) {
2223 		if (!wait || (vdp->xdf_status > XD_READY))
2224 			break;
2225 
2226 		if (cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk) == 0)
2227 			break;
2228 	}
2229 
2230 	return (vdp->xdf_status);
2231 }
2232 
2233 /*
2234  * callback func when DMA/GTE resources is available
2235  *
2236  * Note: we only register one callback function to grant table subsystem
2237  * since we only have one 'struct gnttab_free_callback' in xdf_t.
2238  */
2239 static int
2240 xdf_dmacallback(caddr_t arg)
2241 {
2242 	xdf_t *vdp = (xdf_t *)arg;
2243 	ASSERT(vdp != NULL);
2244 
2245 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
2246 	    ddi_get_name_addr(vdp->xdf_dip)));
2247 
2248 	ddi_trigger_softintr(vdp->xdf_softintr_id);
2249 	return (DDI_DMA_CALLBACK_DONE);
2250 }
2251 
2252 static uint_t
2253 xdf_iorestart(caddr_t arg)
2254 {
2255 	xdf_t *vdp = (xdf_t *)arg;
2256 
2257 	ASSERT(vdp != NULL);
2258 
2259 	mutex_enter(&vdp->xdf_dev_lk);
2260 	ASSERT(ISDMACBON(vdp));
2261 	SETDMACBOFF(vdp);
2262 	mutex_exit(&vdp->xdf_dev_lk);
2263 
2264 	xdf_iostart(vdp);
2265 
2266 	return (DDI_INTR_CLAIMED);
2267 }
2268 
2269 static void
2270 xdf_timeout_handler(void *arg)
2271 {
2272 	xdf_t *vdp = arg;
2273 
2274 	mutex_enter(&vdp->xdf_dev_lk);
2275 	vdp->xdf_timeout_id = 0;
2276 	mutex_exit(&vdp->xdf_dev_lk);
2277 
2278 	/* new timeout thread could be re-scheduled */
2279 	xdf_iostart(vdp);
2280 }
2281 
2282 /*
2283  * Alloc a vreq for this bp
2284  * bp->av_back contains the pointer to the vreq upon return
2285  */
2286 static v_req_t *
2287 vreq_get(xdf_t *vdp, buf_t *bp)
2288 {
2289 	v_req_t *vreq = NULL;
2290 
2291 	ASSERT(BP2VREQ(bp) == NULL);
2292 
2293 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
2294 	if (vreq == NULL) {
2295 		if (vdp->xdf_timeout_id == 0)
2296 			/* restart I/O after one second */
2297 			vdp->xdf_timeout_id =
2298 			    timeout(xdf_timeout_handler, vdp, hz);
2299 		return (NULL);
2300 	}
2301 	bzero(vreq, sizeof (v_req_t));
2302 
2303 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
2304 	bp->av_back = (buf_t *)vreq;
2305 	vreq->v_buf = bp;
2306 	vreq->v_status = VREQ_INIT;
2307 	/* init of other fields in vreq is up to the caller */
2308 
2309 	return (vreq);
2310 }
2311 
2312 static void
2313 vreq_free(xdf_t *vdp, v_req_t *vreq)
2314 {
2315 	buf_t *bp = vreq->v_buf;
2316 
2317 	list_remove(&vdp->xdf_vreq_act, (void *)vreq);
2318 
2319 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
2320 		goto done;
2321 
2322 	switch (vreq->v_status) {
2323 	case VREQ_DMAWIN_DONE:
2324 	case VREQ_GS_ALLOCED:
2325 	case VREQ_DMABUF_BOUND:
2326 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
2327 		/*FALLTHRU*/
2328 	case VREQ_DMAMEM_ALLOCED:
2329 		if (!ALIGNED_XFER(bp)) {
2330 			ASSERT(vreq->v_abuf != NULL);
2331 			if (!IS_ERROR(bp) && IS_READ(bp))
2332 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
2333 				    bp->b_bcount);
2334 			ddi_dma_mem_free(&vreq->v_align);
2335 		}
2336 		/*FALLTHRU*/
2337 	case VREQ_MEMDMAHDL_ALLOCED:
2338 		if (!ALIGNED_XFER(bp))
2339 			ddi_dma_free_handle(&vreq->v_memdmahdl);
2340 		/*FALLTHRU*/
2341 	case VREQ_DMAHDL_ALLOCED:
2342 		ddi_dma_free_handle(&vreq->v_dmahdl);
2343 		break;
2344 	default:
2345 		break;
2346 	}
2347 done:
2348 	vreq->v_buf->av_back = NULL;
2349 	kmem_cache_free(xdf_vreq_cache, vreq);
2350 }
2351 
2352 /*
2353  * Initalize the DMA and grant table resources for the buf
2354  */
2355 static int
2356 vreq_setup(xdf_t *vdp, v_req_t *vreq)
2357 {
2358 	int rc;
2359 	ddi_dma_attr_t dmaattr;
2360 	uint_t ndcs, ndws;
2361 	ddi_dma_handle_t dh;
2362 	ddi_dma_handle_t mdh;
2363 	ddi_dma_cookie_t dc;
2364 	ddi_acc_handle_t abh;
2365 	caddr_t	aba;
2366 	ge_slot_t *gs;
2367 	size_t bufsz;
2368 	off_t off;
2369 	size_t sz;
2370 	buf_t *bp = vreq->v_buf;
2371 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
2372 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
2373 
2374 	switch (vreq->v_status) {
2375 	case VREQ_INIT:
2376 		if (IS_FLUSH_DISKCACHE(bp)) {
2377 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2378 				DPRINTF(DMA_DBG, (
2379 				    "xdf@%s: get ge_slotfailed\n",
2380 				    ddi_get_name_addr(vdp->xdf_dip)));
2381 				return (DDI_FAILURE);
2382 			}
2383 			vreq->v_blkno = 0;
2384 			vreq->v_nslots = 1;
2385 			vreq->v_gs = gs;
2386 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
2387 			vreq->v_status = VREQ_GS_ALLOCED;
2388 			gs->vreq = vreq;
2389 			return (DDI_SUCCESS);
2390 		}
2391 
2392 		if (IS_WRITE_BARRIER(vdp, bp))
2393 			vreq->v_flush_diskcache = WRITE_BARRIER;
2394 		vreq->v_blkno = bp->b_blkno +
2395 		    (diskaddr_t)(uintptr_t)bp->b_private;
2396 		bp->b_private = NULL;
2397 		/* See if we wrote new data to our flush block */
2398 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
2399 			check_fbwrite(vdp, bp, vreq->v_blkno);
2400 		vreq->v_status = VREQ_INIT_DONE;
2401 		/*FALLTHRU*/
2402 
2403 	case VREQ_INIT_DONE:
2404 		/*
2405 		 * alloc DMA handle
2406 		 */
2407 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
2408 		    xdf_dmacallback, (caddr_t)vdp, &dh);
2409 		if (rc != DDI_SUCCESS) {
2410 			SETDMACBON(vdp);
2411 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
2412 			    ddi_get_name_addr(vdp->xdf_dip)));
2413 			return (DDI_FAILURE);
2414 		}
2415 
2416 		vreq->v_dmahdl = dh;
2417 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
2418 		/*FALLTHRU*/
2419 
2420 	case VREQ_DMAHDL_ALLOCED:
2421 		/*
2422 		 * alloc dma handle for 512-byte aligned buf
2423 		 */
2424 		if (!ALIGNED_XFER(bp)) {
2425 			/*
2426 			 * XXPV: we need to temporarily enlarge the seg
2427 			 * boundary and s/g length to work round CR6381968
2428 			 */
2429 			dmaattr = xb_dma_attr;
2430 			dmaattr.dma_attr_seg = (uint64_t)-1;
2431 			dmaattr.dma_attr_sgllen = INT_MAX;
2432 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
2433 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
2434 			if (rc != DDI_SUCCESS) {
2435 				SETDMACBON(vdp);
2436 				DPRINTF(DMA_DBG, ("xdf@%s: unaligned buf DMA"
2437 				    "handle alloc failed\n",
2438 				    ddi_get_name_addr(vdp->xdf_dip)));
2439 				return (DDI_FAILURE);
2440 			}
2441 			vreq->v_memdmahdl = mdh;
2442 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
2443 		}
2444 		/*FALLTHRU*/
2445 
2446 	case VREQ_MEMDMAHDL_ALLOCED:
2447 		/*
2448 		 * alloc 512-byte aligned buf
2449 		 */
2450 		if (!ALIGNED_XFER(bp)) {
2451 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
2452 				bp_mapin(bp);
2453 
2454 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
2455 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
2456 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
2457 			    &aba, &bufsz, &abh);
2458 			if (rc != DDI_SUCCESS) {
2459 				SETDMACBON(vdp);
2460 				DPRINTF(DMA_DBG, (
2461 				    "xdf@%s: DMA mem allocation failed\n",
2462 				    ddi_get_name_addr(vdp->xdf_dip)));
2463 				return (DDI_FAILURE);
2464 			}
2465 
2466 			vreq->v_abuf = aba;
2467 			vreq->v_align = abh;
2468 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
2469 
2470 			ASSERT(bufsz >= bp->b_bcount);
2471 			if (!IS_READ(bp))
2472 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
2473 				    bp->b_bcount);
2474 		}
2475 		/*FALLTHRU*/
2476 
2477 	case VREQ_DMAMEM_ALLOCED:
2478 		/*
2479 		 * dma bind
2480 		 */
2481 		if (ALIGNED_XFER(bp)) {
2482 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
2483 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
2484 			    &dc, &ndcs);
2485 		} else {
2486 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
2487 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
2488 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
2489 		}
2490 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
2491 			/* get num of dma windows */
2492 			if (rc == DDI_DMA_PARTIAL_MAP) {
2493 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
2494 				ASSERT(rc == DDI_SUCCESS);
2495 			} else {
2496 				ndws = 1;
2497 			}
2498 		} else {
2499 			SETDMACBON(vdp);
2500 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
2501 			    ddi_get_name_addr(vdp->xdf_dip)));
2502 			return (DDI_FAILURE);
2503 		}
2504 
2505 		vreq->v_dmac = dc;
2506 		vreq->v_dmaw = 0;
2507 		vreq->v_ndmacs = ndcs;
2508 		vreq->v_ndmaws = ndws;
2509 		vreq->v_nslots = ndws;
2510 		vreq->v_status = VREQ_DMABUF_BOUND;
2511 		/*FALLTHRU*/
2512 
2513 	case VREQ_DMABUF_BOUND:
2514 		/*
2515 		 * get ge_slot, callback is set upon failure from gs_get(),
2516 		 * if not set previously
2517 		 */
2518 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2519 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2520 			    ddi_get_name_addr(vdp->xdf_dip)));
2521 			return (DDI_FAILURE);
2522 		}
2523 
2524 		vreq->v_gs = gs;
2525 		gs->vreq = vreq;
2526 		vreq->v_status = VREQ_GS_ALLOCED;
2527 		break;
2528 
2529 	case VREQ_GS_ALLOCED:
2530 		/* nothing need to be done */
2531 		break;
2532 
2533 	case VREQ_DMAWIN_DONE:
2534 		/*
2535 		 * move to the next dma window
2536 		 */
2537 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
2538 
2539 		/* get a ge_slot for this DMA window */
2540 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2541 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2542 			    ddi_get_name_addr(vdp->xdf_dip)));
2543 			return (DDI_FAILURE);
2544 		}
2545 
2546 		vreq->v_gs = gs;
2547 		gs->vreq = vreq;
2548 		vreq->v_dmaw++;
2549 		rc = ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
2550 		    &vreq->v_dmac, &vreq->v_ndmacs);
2551 		ASSERT(rc == DDI_SUCCESS);
2552 		vreq->v_status = VREQ_GS_ALLOCED;
2553 		break;
2554 
2555 	default:
2556 		return (DDI_FAILURE);
2557 	}
2558 
2559 	return (DDI_SUCCESS);
2560 }
2561 
2562 static ge_slot_t *
2563 gs_get(xdf_t *vdp, int isread)
2564 {
2565 	grant_ref_t gh;
2566 	ge_slot_t *gs;
2567 
2568 	/* try to alloc GTEs needed in this slot, first */
2569 	if (gnttab_alloc_grant_references(
2570 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
2571 		if (vdp->xdf_gnt_callback.next == NULL) {
2572 			SETDMACBON(vdp);
2573 			gnttab_request_free_callback(
2574 			    &vdp->xdf_gnt_callback,
2575 			    (void (*)(void *))xdf_dmacallback,
2576 			    (void *)vdp,
2577 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
2578 		}
2579 		return (NULL);
2580 	}
2581 
2582 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
2583 	if (gs == NULL) {
2584 		gnttab_free_grant_references(gh);
2585 		if (vdp->xdf_timeout_id == 0)
2586 			/* restart I/O after one second */
2587 			vdp->xdf_timeout_id =
2588 			    timeout(xdf_timeout_handler, vdp, hz);
2589 		return (NULL);
2590 	}
2591 
2592 	/* init gs_slot */
2593 	list_insert_head(&vdp->xdf_gs_act, (void *)gs);
2594 	gs->oeid = vdp->xdf_peer;
2595 	gs->isread = isread;
2596 	gs->ghead = gh;
2597 	gs->ngrefs = 0;
2598 
2599 	return (gs);
2600 }
2601 
2602 static void
2603 gs_free(xdf_t *vdp, ge_slot_t *gs)
2604 {
2605 	int i;
2606 	grant_ref_t *gp = gs->ge;
2607 	int ngrefs = gs->ngrefs;
2608 	boolean_t isread = gs->isread;
2609 
2610 	list_remove(&vdp->xdf_gs_act, (void *)gs);
2611 
2612 	/* release all grant table entry resources used in this slot */
2613 	for (i = 0; i < ngrefs; i++, gp++)
2614 		gnttab_end_foreign_access(*gp, !isread, 0);
2615 	gnttab_free_grant_references(gs->ghead);
2616 
2617 	kmem_cache_free(xdf_gs_cache, (void *)gs);
2618 }
2619 
2620 static grant_ref_t
2621 gs_grant(ge_slot_t *gs, mfn_t mfn)
2622 {
2623 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->ghead);
2624 
2625 	ASSERT(gr != -1);
2626 	ASSERT(gs->ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
2627 	gs->ge[gs->ngrefs++] = gr;
2628 	gnttab_grant_foreign_access_ref(gr, gs->oeid, mfn, !gs->isread);
2629 
2630 	return (gr);
2631 }
2632 
2633 static void
2634 unexpectedie(xdf_t *vdp)
2635 {
2636 	/* clean up I/Os in ring that have responses */
2637 	if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
2638 		mutex_exit(&vdp->xdf_dev_lk);
2639 		(void) xdf_intr((caddr_t)vdp);
2640 		mutex_enter(&vdp->xdf_dev_lk);
2641 	}
2642 
2643 	/* free up all grant table entries */
2644 	while (!list_is_empty(&vdp->xdf_gs_act))
2645 		gs_free(vdp, list_head(&vdp->xdf_gs_act));
2646 
2647 	/*
2648 	 * move bp back to active list orderly
2649 	 * vreq_busy is updated in vreq_free()
2650 	 */
2651 	while (!list_is_empty(&vdp->xdf_vreq_act)) {
2652 		v_req_t *vreq = list_head(&vdp->xdf_vreq_act);
2653 		buf_t *bp = vreq->v_buf;
2654 
2655 		bp->av_back = NULL;
2656 		bp->b_resid = bp->b_bcount;
2657 		if (vdp->xdf_f_act == NULL) {
2658 			vdp->xdf_f_act = vdp->xdf_l_act = bp;
2659 		} else {
2660 			/* move to the head of list */
2661 			bp->av_forw = vdp->xdf_f_act;
2662 			vdp->xdf_f_act = bp;
2663 		}
2664 		if (vdp->xdf_xdev_iostat != NULL)
2665 			kstat_runq_back_to_waitq(
2666 			    KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
2667 		vreq_free(vdp, vreq);
2668 	}
2669 }
2670 
2671 static void
2672 xdfmin(struct buf *bp)
2673 {
2674 	if (bp->b_bcount > xdf_maxphys)
2675 		bp->b_bcount = xdf_maxphys;
2676 }
2677 
2678 void
2679 xdf_kstat_delete(dev_info_t *dip)
2680 {
2681 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2682 	kstat_t	*kstat;
2683 
2684 	/*
2685 	 * The locking order here is xdf_iostat_lk and then xdf_dev_lk.
2686 	 * xdf_dev_lk is used to protect the xdf_xdev_iostat pointer
2687 	 * and the contents of the our kstat.  xdf_iostat_lk is used
2688 	 * to protect the allocation and freeing of the actual kstat.
2689 	 * xdf_dev_lk can't be used for this purpose because kstat
2690 	 * readers use it to access the contents of the kstat and
2691 	 * hence it can't be held when calling kstat_delete().
2692 	 */
2693 	mutex_enter(&vdp->xdf_iostat_lk);
2694 	mutex_enter(&vdp->xdf_dev_lk);
2695 
2696 	if (vdp->xdf_xdev_iostat == NULL) {
2697 		mutex_exit(&vdp->xdf_dev_lk);
2698 		mutex_exit(&vdp->xdf_iostat_lk);
2699 		return;
2700 	}
2701 
2702 	kstat = vdp->xdf_xdev_iostat;
2703 	vdp->xdf_xdev_iostat = NULL;
2704 	mutex_exit(&vdp->xdf_dev_lk);
2705 
2706 	kstat_delete(kstat);
2707 	mutex_exit(&vdp->xdf_iostat_lk);
2708 }
2709 
2710 int
2711 xdf_kstat_create(dev_info_t *dip, char *ks_module, int ks_instance)
2712 {
2713 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2714 
2715 	/* See comment about locking in xdf_kstat_delete(). */
2716 	mutex_enter(&vdp->xdf_iostat_lk);
2717 	mutex_enter(&vdp->xdf_dev_lk);
2718 
2719 	if (vdp->xdf_xdev_iostat != NULL) {
2720 		mutex_exit(&vdp->xdf_dev_lk);
2721 		mutex_exit(&vdp->xdf_iostat_lk);
2722 		return (-1);
2723 	}
2724 
2725 	if ((vdp->xdf_xdev_iostat = kstat_create(
2726 	    ks_module, ks_instance, NULL, "disk",
2727 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) == NULL) {
2728 		mutex_exit(&vdp->xdf_dev_lk);
2729 		mutex_exit(&vdp->xdf_iostat_lk);
2730 		return (-1);
2731 	}
2732 
2733 	vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
2734 	kstat_install(vdp->xdf_xdev_iostat);
2735 	mutex_exit(&vdp->xdf_dev_lk);
2736 	mutex_exit(&vdp->xdf_iostat_lk);
2737 
2738 	return (0);
2739 }
2740 
2741 #if defined(XPV_HVM_DRIVER)
2742 
/*
 * One entry per xdf device instance registered through xdf_hvm_add().
 * Entries can be looked up by device path or by dip, see i_xdf_hvm_find().
 */
typedef struct xdf_hvm_entry {
	list_node_t	xdf_he_list;	/* linkage on xdf_hvm_list */
	char		*xdf_he_path;	/* copy of the dip's device path */
	dev_info_t	*xdf_he_dip;	/* the device instance itself */
} xdf_hvm_entry_t;

/* list of xdf_hvm_entry_t; xdf_hvm_list_lock is held while accessing it */
static list_t xdf_hvm_list;
static kmutex_t xdf_hvm_list_lock;
2751 
2752 static xdf_hvm_entry_t *
2753 i_xdf_hvm_find(char *path, dev_info_t *dip)
2754 {
2755 	xdf_hvm_entry_t	*i;
2756 
2757 	ASSERT((path != NULL) || (dip != NULL));
2758 	ASSERT(MUTEX_HELD(&xdf_hvm_list_lock));
2759 
2760 	i = list_head(&xdf_hvm_list);
2761 	while (i != NULL) {
2762 		if ((path != NULL) && strcmp(i->xdf_he_path, path) != 0) {
2763 			i = list_next(&xdf_hvm_list, i);
2764 			continue;
2765 		}
2766 		if ((dip != NULL) && (i->xdf_he_dip != dip)) {
2767 			i = list_next(&xdf_hvm_list, i);
2768 			continue;
2769 		}
2770 		break;
2771 	}
2772 	return (i);
2773 }
2774 
2775 dev_info_t *
2776 xdf_hvm_hold(char *path)
2777 {
2778 	xdf_hvm_entry_t	*i;
2779 	dev_info_t	*dip;
2780 
2781 	mutex_enter(&xdf_hvm_list_lock);
2782 	i = i_xdf_hvm_find(path, NULL);
2783 	if (i == NULL) {
2784 		mutex_exit(&xdf_hvm_list_lock);
2785 		return (B_FALSE);
2786 	}
2787 	ndi_hold_devi(dip = i->xdf_he_dip);
2788 	mutex_exit(&xdf_hvm_list_lock);
2789 	return (dip);
2790 }
2791 
2792 static void
2793 xdf_hvm_add(dev_info_t *dip)
2794 {
2795 	xdf_hvm_entry_t	*i;
2796 	char		*path;
2797 
2798 	/* figure out the path for the dip */
2799 	path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
2800 	(void) ddi_pathname(dip, path);
2801 
2802 	i = kmem_alloc(sizeof (*i), KM_SLEEP);
2803 	i->xdf_he_dip = dip;
2804 	i->xdf_he_path = i_ddi_strdup(path, KM_SLEEP);
2805 
2806 	mutex_enter(&xdf_hvm_list_lock);
2807 	ASSERT(i_xdf_hvm_find(path, NULL) == NULL);
2808 	ASSERT(i_xdf_hvm_find(NULL, dip) == NULL);
2809 	list_insert_head(&xdf_hvm_list, i);
2810 	mutex_exit(&xdf_hvm_list_lock);
2811 
2812 	kmem_free(path, MAXPATHLEN);
2813 }
2814 
2815 static void
2816 xdf_hvm_rm(dev_info_t *dip)
2817 {
2818 	xdf_hvm_entry_t	*i;
2819 
2820 	mutex_enter(&xdf_hvm_list_lock);
2821 	VERIFY((i = i_xdf_hvm_find(NULL, dip)) != NULL);
2822 	list_remove(&xdf_hvm_list, i);
2823 	mutex_exit(&xdf_hvm_list_lock);
2824 
2825 	kmem_free(i->xdf_he_path, strlen(i->xdf_he_path) + 1);
2826 	kmem_free(i, sizeof (*i));
2827 }
2828 
2829 static void
2830 xdf_hvm_init(void)
2831 {
2832 	list_create(&xdf_hvm_list, sizeof (xdf_hvm_entry_t),
2833 	    offsetof(xdf_hvm_entry_t, xdf_he_list));
2834 	mutex_init(&xdf_hvm_list_lock, NULL, MUTEX_DEFAULT, NULL);
2835 }
2836 
2837 static void
2838 xdf_hvm_fini(void)
2839 {
2840 	ASSERT(list_head(&xdf_hvm_list) == NULL);
2841 	list_destroy(&xdf_hvm_list);
2842 	mutex_destroy(&xdf_hvm_list_lock);
2843 }
2844 
2845 int
2846 xdf_hvm_connect(dev_info_t *dip)
2847 {
2848 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2849 	int	rv;
2850 
2851 	/* do cv_wait until connected or failed */
2852 	mutex_enter(&vdp->xdf_dev_lk);
2853 	rv = xdf_connect(vdp, B_TRUE);
2854 	mutex_exit(&vdp->xdf_dev_lk);
2855 	return ((rv == XD_READY) ? 0 : -1);
2856 }
2857 
2858 int
2859 xdf_hvm_setpgeom(dev_info_t *dip, cmlb_geom_t *geomp)
2860 {
2861 	xdf_t	*vdp = (xdf_t *)ddi_get_driver_private(dip);
2862 
2863 	/* sanity check the requested physical geometry */
2864 	mutex_enter(&vdp->xdf_dev_lk);
2865 	if ((geomp->g_secsize != XB_BSIZE) ||
2866 	    (geomp->g_capacity == 0)) {
2867 		mutex_exit(&vdp->xdf_dev_lk);
2868 		return (EINVAL);
2869 	}
2870 
2871 	/*
2872 	 * If we've already connected to the backend device then make sure
2873 	 * we're not defining a physical geometry larger than our backend
2874 	 * device.
2875 	 */
2876 	if ((vdp->xdf_xdev_nblocks != 0) &&
2877 	    (geomp->g_capacity > vdp->xdf_xdev_nblocks)) {
2878 		mutex_exit(&vdp->xdf_dev_lk);
2879 		return (EINVAL);
2880 	}
2881 
2882 	vdp->xdf_pgeom = *geomp;
2883 	mutex_exit(&vdp->xdf_dev_lk);
2884 
2885 	/* force a re-validation */
2886 	cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
2887 
2888 	return (0);
2889 }
2890 
2891 #endif /* XPV_HVM_DRIVER */
2892