xref: /illumos-gate/usr/src/uts/common/xen/io/xdf.c (revision da6c28aaf62fa55f0fdb8004aa40f88f23bf53f0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * xdf.c - Xen Virtual Block Device Driver
29  * TODO:
30  *	- support alternate block size (currently only DEV_BSIZE supported)
31  *	- revalidate geometry for removable devices
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 #include "xdf.h"
37 
/* cache-flush mechanism identifiers */
38 #define	FLUSH_DISKCACHE	0x1
39 #define	WRITE_BARRIER	0x2
40 #define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */
/* true when barriers are available but an explicit flush is not */
41 #define	USE_WRITE_BARRIER(vdp)				\
42 	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
/* true when both the barrier feature and an explicit flush are available */
43 #define	USE_FLUSH_DISKCACHE(vdp)			\
44 	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
/*
 * A write whose data pointer is the per-device xdf_cache_flush_block buffer
 * is treated as a barrier request when write barriers are in use.
 */
45 #define	IS_WRITE_BARRIER(vdp, bp)			\
46 	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&	\
47 	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
/*
 * A zero-length write is treated as a cache-flush request.
 * NOTE: the expansion refers to `vdp', which is not a macro parameter, so a
 * variable of that name must be in scope wherever this macro is used.
 */
48 #define	IS_FLUSH_DISKCACHE(bp)				\
49 	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
50 
51 static void *vbd_ss;	/* soft state anchor, one xdf_t per instance */
52 static kmem_cache_t *xdf_vreq_cache;	/* cache of v_req_t, see _init() */
53 static kmem_cache_t *xdf_gs_cache;	/* cache of ge_slot_t, see _init() */
54 static int xdf_maxphys = XB_MAXPHYS;
55 int xdfdebug = 0;	/* reloaded from the "xdfdebug" property in xdf_attach() */
56 extern int do_polled_io;
/* block that is rewritten to force a flush when only barriers are available */
57 diskaddr_t xdf_flush_block = DEFAULT_FLUSH_BLOCK;
/* non-zero: DKIOCFLUSHWRITECACHE returns ENOTTY unless flush is supported */
58 int	xdf_barrier_flush_disable = 0;
59 
60 /*
61  * dev_ops and cb_ops entrypoints
62  */
63 static int xdf_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
64 static int xdf_attach(dev_info_t *, ddi_attach_cmd_t);
65 static int xdf_detach(dev_info_t *, ddi_detach_cmd_t);
66 static int xdf_reset(dev_info_t *, ddi_reset_cmd_t);
67 static int xdf_open(dev_t *, int, int, cred_t *);
68 static int xdf_close(dev_t, int, int, struct cred *);
69 static int xdf_strategy(struct buf *);
70 static int xdf_read(dev_t, struct uio *, cred_t *);
71 static int xdf_aread(dev_t, struct aio_req *, cred_t *);
72 static int xdf_write(dev_t, struct uio *, cred_t *);
73 static int xdf_awrite(dev_t, struct aio_req *, cred_t *);
74 static int xdf_dump(dev_t, caddr_t, daddr_t, int);
75 static int xdf_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
/* interrupt handler; it is not referenced from xdf_cbops or xdf_devops */
76 static uint_t xdf_intr(caddr_t);
77 static int xdf_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
78     caddr_t, int *);
79 
80 /*
81  * misc private functions
82  */
/* DDI_SUSPEND / DDI_RESUME handling, called from xdf_detach()/xdf_attach() */
83 static int xdf_suspend(dev_info_t *);
84 static int xdf_resume(dev_info_t *);
85 static int xdf_start_connect(xdf_t *);
86 static int xdf_start_disconnect(xdf_t *);
87 static int xdf_post_connect(xdf_t *);
88 static void xdf_post_disconnect(xdf_t *);
/* XS_OE_STATE event handler registered by xdf_attach() */
89 static void xdf_oe_change(dev_info_t *, ddi_eventcookie_t, void *, void *);
90 static void xdf_iostart(xdf_t *);
91 static void xdf_iofini(xdf_t *, uint64_t, int);
92 static int xdf_prepare_rreq(xdf_t *, struct buf *, blkif_request_t *);
93 static int xdf_drain_io(xdf_t *);
94 static boolean_t xdf_isopen(xdf_t *, int);
95 static int xdf_check_state_transition(xdf_t *, XenbusState);
96 static int xdf_connect(xdf_t *, boolean_t);
97 static int xdf_dmacallback(caddr_t);
98 static void xdf_timeout_handler(void *);
/* soft interrupt handler registered by xdf_attach() */
99 static uint_t xdf_iorestart(caddr_t);
100 static v_req_t *vreq_get(xdf_t *, buf_t *);
101 static void vreq_free(xdf_t *, v_req_t *);
102 static int vreq_setup(xdf_t *, v_req_t *);
103 static ge_slot_t *gs_get(xdf_t *, int);
104 static void gs_free(xdf_t *, ge_slot_t *);
105 static grant_ref_t gs_grant(ge_slot_t *, mfn_t);
106 static void unexpectedie(xdf_t *);
/* mincnt routine handed to physio() by xdf_read() */
107 static void xdfmin(struct buf *);
108 
/* character/block entry points; field order follows struct cb_ops */
109 static 	struct cb_ops xdf_cbops = {
110 	xdf_open,		/* cb_open */
111 	xdf_close,		/* cb_close */
112 	xdf_strategy,		/* cb_strategy */
113 	nodev,			/* cb_print */
114 	xdf_dump,		/* cb_dump */
115 	xdf_read,		/* cb_read */
116 	xdf_write,		/* cb_write */
117 	xdf_ioctl,		/* cb_ioctl */
118 	nodev,			/* cb_devmap */
119 	nodev,			/* cb_mmap */
120 	nodev,			/* cb_segmap */
121 	nochpoll,		/* cb_chpoll */
122 	xdf_prop_op,		/* cb_prop_op */
123 	NULL,			/* cb_str: not a STREAMS driver */
124 	D_MP | D_NEW | D_64BIT,	/* cb_flag */
125 	CB_REV,			/* cb_rev */
126 	xdf_aread,		/* cb_aread */
127 	xdf_awrite		/* cb_awrite */
128 };
129 
/* autoconfiguration entry points; referenced from modldrv below */
130 struct dev_ops xdf_devops = {
131 	DEVO_REV,		/* devo_rev */
132 	0,			/* devo_refcnt */
133 	xdf_getinfo,		/* devo_getinfo */
134 	nulldev,		/* devo_identify */
135 	nulldev,		/* devo_probe */
136 	xdf_attach,		/* devo_attach */
137 	xdf_detach,		/* devo_detach */
138 	xdf_reset,		/* devo_reset */
139 	&xdf_cbops,		/* devo_cb_ops */
140 	(struct bus_ops *)NULL	/* devo_bus_ops */
141 };
142 
/* loadable-driver description; linked into xdf_modlinkage */
143 static struct modldrv modldrv = {
144 	&mod_driverops,		/* Type of module.  This one is a driver */
145 	"virtual block driver %I%",	/* short description */
146 	&xdf_devops		/* driver specific ops */
147 };
148 
/* module linkage passed to mod_install(), mod_remove() and mod_info() */
149 static struct modlinkage xdf_modlinkage = {
150 	MODREV_1, (void *)&modldrv, NULL
151 };
152 
153 /*
154  * I/O buffer DMA attributes
155  * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
156  */
157 static ddi_dma_attr_t xb_dma_attr = {
158 	DMA_ATTR_V0,			/* version of this structure */
159 	(uint64_t)0,			/* lowest address */
160 	(uint64_t)0xffffffffffffffff,	/* highest usable address */
161 	(uint64_t)0xffffff,		/* DMA counter limit max */
162 	(uint64_t)XB_BSIZE,		/* alignment in bytes */
163 	XB_BSIZE - 1,			/* bitmap of burst sizes */
164 	XB_BSIZE,			/* min transfer */
165 	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
166 	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
167 	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
168 	XB_BSIZE,			/* granularity */
169 	0,				/* flags (reserved) */
170 };
171 
/* device access attributes: no byte swapping, strictly ordered accesses */
172 static ddi_device_acc_attr_t xc_acc_attr = {
173 	DDI_DEVICE_ATTR_V0,	/* devacc_attr_version */
174 	DDI_NEVERSWAP_ACC,	/* devacc_attr_endian_flags */
175 	DDI_STRICTORDER_ACC	/* devacc_attr_dataorder */
176 };
177 
178 /* callbacks from common label */
179 
/* target-driver callbacks handed to cmlb through xdf_lb_ops */
180 static int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
181 	void *);
182 static int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
183 
/* ops vector passed to cmlb_attach() in xdf_attach() */
184 static cmlb_tg_ops_t xdf_lb_ops = {
185 	TG_DK_OPS_VERSION_1,	/* tg_version */
186 	xdf_lb_rdwr,		/* tg_rdwr */
187 	xdf_lb_getinfo		/* tg_getinfo */
188 };
189 
190 int
191 _init(void)
192 {
193 	int rc;
194 
195 	if ((rc = ddi_soft_state_init(&vbd_ss, sizeof (xdf_t), 0)) == 0) {
196 		xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
197 		    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
198 		ASSERT(xdf_vreq_cache != NULL);
199 		xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
200 		    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
201 		ASSERT(xdf_gs_cache != NULL);
202 		if ((rc = mod_install(&xdf_modlinkage)) != 0) {
203 			kmem_cache_destroy(xdf_vreq_cache);
204 			kmem_cache_destroy(xdf_gs_cache);
205 			ddi_soft_state_fini(&vbd_ss);
206 		}
207 	}
208 
209 	return (rc);
210 }
211 
212 int
213 _fini(void)
214 {
215 	int err;
216 
217 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
218 		return (err);
219 
220 	kmem_cache_destroy(xdf_vreq_cache);
221 	kmem_cache_destroy(xdf_gs_cache);
222 	ddi_soft_state_fini(&vbd_ss);
223 
224 	return (0);
225 }
226 
227 int
228 _info(struct modinfo *modinfop)
229 {
230 	return (mod_info(&xdf_modlinkage, modinfop));
231 }
232 
233 /*ARGSUSED*/
234 static int
235 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
236 {
237 	int instance;
238 	xdf_t *vbdp;
239 
240 	instance = XDF_INST(getminor((dev_t)arg));
241 
242 	switch (cmd) {
243 	case DDI_INFO_DEVT2DEVINFO:
244 		if ((vbdp = ddi_get_soft_state(vbd_ss, instance)) == NULL) {
245 			*rp = NULL;
246 			return (DDI_FAILURE);
247 		}
248 		*rp = vbdp->xdf_dip;
249 		return (DDI_SUCCESS);
250 
251 	case DDI_INFO_DEVT2INSTANCE:
252 		*rp = (void *)(uintptr_t)instance;
253 		return (DDI_SUCCESS);
254 
255 	default:
256 		return (DDI_FAILURE);
257 	}
258 }
259 
260 static int
261 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
262 	char *name, caddr_t valuep, int *lengthp)
263 {
264 	int instance = ddi_get_instance(dip);
265 	xdf_t *vdp;
266 	diskaddr_t p_blkcnt;
267 
268 	/*
269 	 * xdf dynamic properties are device specific and size oriented.
270 	 * Requests issued under conditions where size is valid are passed
271 	 * to ddi_prop_op_nblocks with the size information, otherwise the
272 	 * request is passed to ddi_prop_op.
273 	 */
274 	vdp = ddi_get_soft_state(vbd_ss, instance);
275 
276 	if ((dev == DDI_DEV_T_ANY) || (vdp == NULL))
277 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
278 		    name, valuep, lengthp));
279 
280 	/* do cv_wait until connected or failed */
281 	mutex_enter(&vdp->xdf_dev_lk);
282 	if (xdf_connect(vdp, B_TRUE) != XD_READY) {
283 		mutex_exit(&vdp->xdf_dev_lk);
284 		goto out;
285 	}
286 	mutex_exit(&vdp->xdf_dev_lk);
287 
288 	if (cmlb_partinfo(vdp->xdf_vd_lbl, XDF_PART(getminor(dev)), &p_blkcnt,
289 	    NULL, NULL, NULL, NULL) == 0)
290 		return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
291 		    name, valuep, lengthp, (uint64_t)p_blkcnt));
292 
293 out:
294 	return (ddi_prop_op(dev, dip, prop_op, mod_flags, name, valuep,
295 	    lengthp));
296 }
297 
298 static int
299 xdf_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
300 {
301 	xdf_t *vdp;
302 	ddi_iblock_cookie_t ibc;
303 	ddi_iblock_cookie_t softibc;
304 	int instance;
305 
306 	xdfdebug = ddi_prop_get_int(DDI_DEV_T_ANY, devi, DDI_PROP_NOTPROM,
307 	    "xdfdebug", 0);
308 
309 	switch (cmd) {
310 		case DDI_ATTACH:
311 			break;
312 
313 		case DDI_RESUME:
314 			return (xdf_resume(devi));
315 
316 		default:
317 			return (DDI_FAILURE);
318 	}
319 
320 	instance = ddi_get_instance(devi);
321 	if (ddi_soft_state_zalloc(vbd_ss, instance) != DDI_SUCCESS)
322 		return (DDI_FAILURE);
323 
324 	DPRINTF(DDI_DBG, ("xdf%d: attaching\n", instance));
325 	vdp = ddi_get_soft_state(vbd_ss, instance);
326 	vdp->xdf_dip = devi;
327 	if (ddi_get_iblock_cookie(devi, 0, &ibc) != DDI_SUCCESS) {
328 		cmn_err(CE_WARN, "xdf@%s: failed to get iblock cookie",
329 		    ddi_get_name_addr(devi));
330 		goto errout1;
331 	}
332 
333 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
334 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
335 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
336 	ddi_set_driver_private(devi, vdp);
337 
338 	if (ddi_get_soft_iblock_cookie(devi, DDI_SOFTINT_LOW, &softibc)
339 	    != DDI_SUCCESS) {
340 		cmn_err(CE_WARN, "xdf@%s: failed to get softintr iblock cookie",
341 		    ddi_get_name_addr(devi));
342 		goto errout2;
343 	}
344 	if (ddi_add_softintr(devi, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
345 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
346 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
347 		    ddi_get_name_addr(devi));
348 		goto errout2;
349 	}
350 
351 	/*
352 	 * create kstat for iostat(1M)
353 	 */
354 	if ((vdp->xdf_xdev_iostat = kstat_create("xdf", instance, NULL, "disk",
355 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
356 		vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
357 		kstat_install(vdp->xdf_xdev_iostat);
358 	} else {
359 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
360 		    ddi_get_name_addr(devi));
361 		goto errout3;
362 	}
363 
364 	/*
365 	 * driver handles kernel-issued IOCTLs
366 	 */
367 	if (ddi_prop_create(DDI_DEV_T_NONE, devi, DDI_PROP_CANSLEEP,
368 	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
369 		cmn_err(CE_WARN, "xdf@%s: cannot create DDI_KERNEL_IOCTL prop",
370 		    ddi_get_name_addr(devi));
371 		goto errout4;
372 	}
373 
374 	/*
375 	 * create default device minor nodes: non-removable disk
376 	 * we will adjust minor nodes after we are connected w/ backend
377 	 */
378 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
379 	if (cmlb_attach(devi, &xdf_lb_ops, DTYPE_DIRECT, 0, 1, DDI_NT_BLOCK,
380 	    CMLB_FAKE_LABEL_ONE_PARTITION, vdp->xdf_vd_lbl, NULL) != 0) {
381 		cmn_err(CE_WARN, "xdf@%s: default cmlb attach failed",
382 		    ddi_get_name_addr(devi));
383 		goto errout5;
384 	}
385 
386 	/*
387 	 * We ship with cache-enabled disks
388 	 */
389 	vdp->xdf_wce = 1;
390 
391 	mutex_enter(&vdp->xdf_cb_lk);
392 
393 	/* Watch backend XenbusState change */
394 	if (xvdi_add_event_handler(devi, XS_OE_STATE,
395 	    xdf_oe_change) != DDI_SUCCESS) {
396 		mutex_exit(&vdp->xdf_cb_lk);
397 		goto errout6;
398 	}
399 
400 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
401 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
402 		    ddi_get_name_addr(devi));
403 		(void) xdf_start_disconnect(vdp);
404 		mutex_exit(&vdp->xdf_cb_lk);
405 		goto errout7;
406 	}
407 
408 	mutex_exit(&vdp->xdf_cb_lk);
409 
410 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
411 	    offsetof(v_req_t, v_link));
412 	list_create(&vdp->xdf_gs_act, sizeof (ge_slot_t),
413 	    offsetof(ge_slot_t, link));
414 
415 	ddi_report_dev(devi);
416 	DPRINTF(DDI_DBG, ("xdf%d: attached\n", instance));
417 
418 	return (DDI_SUCCESS);
419 
420 errout7:
421 	xvdi_remove_event_handler(devi, XS_OE_STATE);
422 errout6:
423 	cmlb_detach(vdp->xdf_vd_lbl, NULL);
424 errout5:
425 	cmlb_free_handle(&vdp->xdf_vd_lbl);
426 	ddi_prop_remove_all(devi);
427 errout4:
428 	kstat_delete(vdp->xdf_xdev_iostat);
429 errout3:
430 	ddi_remove_softintr(vdp->xdf_softintr_id);
431 errout2:
432 	ddi_set_driver_private(devi, NULL);
433 	cv_destroy(&vdp->xdf_dev_cv);
434 	mutex_destroy(&vdp->xdf_cb_lk);
435 	mutex_destroy(&vdp->xdf_dev_lk);
436 errout1:
437 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(devi));
438 	ddi_soft_state_free(vbd_ss, instance);
439 	return (DDI_FAILURE);
440 }
441 
442 static int
443 xdf_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
444 {
445 	xdf_t *vdp;
446 	int instance;
447 
448 	switch (cmd) {
449 
450 	case DDI_PM_SUSPEND:
451 		break;
452 
453 	case DDI_SUSPEND:
454 		return (xdf_suspend(devi));
455 
456 	case DDI_DETACH:
457 		break;
458 
459 	default:
460 		return (DDI_FAILURE);
461 	}
462 
463 	instance = ddi_get_instance(devi);
464 	DPRINTF(DDI_DBG, ("xdf%d: detaching\n", instance));
465 	vdp = ddi_get_soft_state(vbd_ss, instance);
466 
467 	if (vdp == NULL)
468 		return (DDI_FAILURE);
469 
470 	mutex_enter(&vdp->xdf_dev_lk);
471 	if (xdf_isopen(vdp, -1)) {
472 		mutex_exit(&vdp->xdf_dev_lk);
473 		return (DDI_FAILURE);
474 	}
475 
476 	if (vdp->xdf_status != XD_CLOSED) {
477 		mutex_exit(&vdp->xdf_dev_lk);
478 		return (DDI_FAILURE);
479 	}
480 
481 	ASSERT(!ISDMACBON(vdp));
482 	mutex_exit(&vdp->xdf_dev_lk);
483 
484 	if (vdp->xdf_timeout_id != 0)
485 		(void) untimeout(vdp->xdf_timeout_id);
486 
487 	xvdi_remove_event_handler(devi, XS_OE_STATE);
488 
489 	/* we'll support backend running in domU later */
490 #ifdef	DOMU_BACKEND
491 	(void) xvdi_post_event(devi, XEN_HP_REMOVE);
492 #endif
493 
494 	list_destroy(&vdp->xdf_vreq_act);
495 	list_destroy(&vdp->xdf_gs_act);
496 	ddi_prop_remove_all(devi);
497 	kstat_delete(vdp->xdf_xdev_iostat);
498 	ddi_remove_softintr(vdp->xdf_softintr_id);
499 	ddi_set_driver_private(devi, NULL);
500 	cv_destroy(&vdp->xdf_dev_cv);
501 	mutex_destroy(&vdp->xdf_cb_lk);
502 	mutex_destroy(&vdp->xdf_dev_lk);
503 	if (vdp->xdf_cache_flush_block != NULL)
504 		kmem_free(vdp->xdf_flush_mem, 2 * DEV_BSIZE);
505 	ddi_soft_state_free(vbd_ss, instance);
506 	return (DDI_SUCCESS);
507 }
508 
509 static int
510 xdf_suspend(dev_info_t *devi)
511 {
512 	xdf_t *vdp;
513 	int instance;
514 	enum xdf_state st;
515 
516 	instance = ddi_get_instance(devi);
517 
518 	if (xdfdebug & SUSRES_DBG)
519 		xen_printf("xdf_suspend: xdf#%d\n", instance);
520 
521 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
522 		return (DDI_FAILURE);
523 
524 	xvdi_suspend(devi);
525 
526 	mutex_enter(&vdp->xdf_cb_lk);
527 	mutex_enter(&vdp->xdf_dev_lk);
528 	st = vdp->xdf_status;
529 	/* change status to stop further I/O requests */
530 	if (st == XD_READY)
531 		vdp->xdf_status = XD_SUSPEND;
532 	mutex_exit(&vdp->xdf_dev_lk);
533 	mutex_exit(&vdp->xdf_cb_lk);
534 
535 	/* make sure no more I/O responses left in the ring buffer */
536 	if ((st == XD_INIT) || (st == XD_READY)) {
537 		(void) ddi_remove_intr(devi, 0, NULL);
538 		(void) xdf_drain_io(vdp);
539 		/*
540 		 * no need to teardown the ring buffer here
541 		 * it will be simply re-init'ed during resume when
542 		 * we call xvdi_alloc_ring
543 		 */
544 	}
545 
546 	if (xdfdebug & SUSRES_DBG)
547 		xen_printf("xdf_suspend: SUCCESS\n");
548 
549 	return (DDI_SUCCESS);
550 }
551 
552 /*ARGSUSED*/
553 static int
554 xdf_resume(dev_info_t *devi)
555 {
556 	xdf_t *vdp;
557 	int instance;
558 
559 	instance = ddi_get_instance(devi);
560 	if (xdfdebug & SUSRES_DBG)
561 		xen_printf("xdf_resume: xdf%d\n", instance);
562 
563 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
564 		return (DDI_FAILURE);
565 
566 	mutex_enter(&vdp->xdf_cb_lk);
567 
568 	if (xvdi_resume(devi) != DDI_SUCCESS) {
569 		mutex_exit(&vdp->xdf_cb_lk);
570 		return (DDI_FAILURE);
571 	}
572 
573 	mutex_enter(&vdp->xdf_dev_lk);
574 	ASSERT(vdp->xdf_status != XD_READY);
575 	vdp->xdf_status = XD_UNKNOWN;
576 	mutex_exit(&vdp->xdf_dev_lk);
577 
578 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
579 		mutex_exit(&vdp->xdf_cb_lk);
580 		return (DDI_FAILURE);
581 	}
582 
583 	mutex_exit(&vdp->xdf_cb_lk);
584 
585 	if (xdfdebug & SUSRES_DBG)
586 		xen_printf("xdf_resume: done\n");
587 	return (DDI_SUCCESS);
588 }
589 
590 /*ARGSUSED*/
591 static int
592 xdf_reset(dev_info_t *devi, ddi_reset_cmd_t cmd)
593 {
594 	xdf_t *vdp;
595 	int instance;
596 
597 	instance = ddi_get_instance(devi);
598 	DPRINTF(DDI_DBG, ("xdf%d: resetting\n", instance));
599 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
600 		return (DDI_FAILURE);
601 
602 	/*
603 	 * wait for any outstanding I/O to complete
604 	 */
605 	(void) xdf_drain_io(vdp);
606 
607 	DPRINTF(DDI_DBG, ("xdf%d: reset complete\n", instance));
608 	return (DDI_SUCCESS);
609 }
610 
611 static int
612 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
613 {
614 	minor_t	minor;
615 	xdf_t	*vdp;
616 	int part;
617 	ulong_t parbit;
618 	diskaddr_t p_blkct = 0;
619 	boolean_t firstopen;
620 
621 	minor = getminor(*devp);
622 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
623 		return (ENXIO);
624 
625 	DPRINTF(DDI_DBG, ("xdf%d: opening\n", XDF_INST(minor)));
626 
627 	/* do cv_wait until connected or failed */
628 	mutex_enter(&vdp->xdf_dev_lk);
629 	if (xdf_connect(vdp, B_TRUE) != XD_READY) {
630 		mutex_exit(&vdp->xdf_dev_lk);
631 		return (ENXIO);
632 	}
633 
634 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
635 		mutex_exit(&vdp->xdf_dev_lk);
636 		return (EROFS);
637 	}
638 
639 	part = XDF_PART(minor);
640 	parbit = 1 << part;
641 	if (vdp->xdf_vd_exclopen & parbit) {
642 		mutex_exit(&vdp->xdf_dev_lk);
643 		return (EBUSY);
644 	}
645 
646 	/* are we the first one to open this node? */
647 	firstopen = !xdf_isopen(vdp, -1);
648 
649 	if ((flag & FEXCL) && !firstopen) {
650 		mutex_exit(&vdp->xdf_dev_lk);
651 		return (EBUSY);
652 	}
653 
654 	if (otyp == OTYP_LYR)
655 		vdp->xdf_vd_lyropen[part]++;
656 
657 	vdp->xdf_vd_open[otyp] |= parbit;
658 
659 	if (flag & FEXCL)
660 		vdp->xdf_vd_exclopen |= parbit;
661 
662 	mutex_exit(&vdp->xdf_dev_lk);
663 
664 	/* force a re-validation */
665 	if (firstopen)
666 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
667 
668 	/*
669 	 * check size
670 	 * ignore CD/DVD which contains a zero-sized s0
671 	 */
672 	if (!(flag & (FNDELAY | FNONBLOCK)) && !XD_IS_CD(vdp) &&
673 	    ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
674 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0))) {
675 		(void) xdf_close(*devp, flag, otyp, credp);
676 		return (ENXIO);
677 	}
678 
679 	return (0);
680 }
681 
682 /*ARGSUSED*/
683 static int
684 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
685 {
686 	minor_t	minor;
687 	xdf_t	*vdp;
688 	int part;
689 	ulong_t parbit;
690 
691 	minor = getminor(dev);
692 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
693 		return (ENXIO);
694 
695 	mutex_enter(&vdp->xdf_dev_lk);
696 	part = XDF_PART(minor);
697 	if (!xdf_isopen(vdp, part)) {
698 		mutex_exit(&vdp->xdf_dev_lk);
699 		return (ENXIO);
700 	}
701 	parbit = 1 << part;
702 
703 	if (otyp == OTYP_LYR) {
704 		if (vdp->xdf_vd_lyropen[part] != 0)
705 			vdp->xdf_vd_lyropen[part]--;
706 		if (vdp->xdf_vd_lyropen[part] == 0)
707 			vdp->xdf_vd_open[OTYP_LYR] &= ~parbit;
708 	} else {
709 		vdp->xdf_vd_open[otyp] &= ~parbit;
710 	}
711 	vdp->xdf_vd_exclopen &= ~parbit;
712 
713 	mutex_exit(&vdp->xdf_dev_lk);
714 	return (0);
715 }
716 
717 static int
718 xdf_strategy(struct buf *bp)
719 {
720 	xdf_t	*vdp;
721 	minor_t minor;
722 	diskaddr_t p_blkct, p_blkst;
723 	ulong_t nblks;
724 	int part;
725 
726 	minor = getminor(bp->b_edev);
727 	part = XDF_PART(minor);
728 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) ||
729 	    !xdf_isopen(vdp, part) ||
730 	    cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
731 	    &p_blkst, NULL, NULL, NULL)) {
732 		bioerror(bp, ENXIO);
733 		bp->b_resid = bp->b_bcount;
734 		biodone(bp);
735 		return (0);
736 	}
737 
738 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
739 		bioerror(bp, EROFS);
740 		bp->b_resid = bp->b_bcount;
741 		biodone(bp);
742 		return (0);
743 	}
744 
745 	/*
746 	 * starting beyond partition
747 	 */
748 	if (bp->b_blkno > p_blkct) {
749 		DPRINTF(IO_DBG, ("xdf: block %lld exceeds VBD size %"PRIu64,
750 		    (longlong_t)bp->b_blkno, (uint64_t)p_blkct));
751 		bioerror(bp, EINVAL);
752 		bp->b_resid = bp->b_bcount;
753 		biodone(bp);
754 		return (0);
755 	}
756 
757 	/* Legacy: don't set error flag at this case */
758 	if (bp->b_blkno == p_blkct) {
759 		bp->b_resid = bp->b_bcount;
760 		biodone(bp);
761 		return (0);
762 	}
763 
764 	/*
765 	 * adjust for partial transfer
766 	 */
767 	nblks = bp->b_bcount >> XB_BSHIFT;
768 	if ((bp->b_blkno + nblks) > p_blkct) {
769 		bp->b_resid = ((bp->b_blkno + nblks) - p_blkct) << XB_BSHIFT;
770 		bp->b_bcount -= bp->b_resid;
771 	}
772 
773 
774 	DPRINTF(IO_DBG, ("xdf: strategy blk %lld len %lu\n",
775 	    (longlong_t)bp->b_blkno, (ulong_t)bp->b_bcount));
776 
777 	mutex_enter(&vdp->xdf_dev_lk);
778 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
779 	if (vdp->xdf_f_act == NULL) {
780 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
781 	} else {
782 		vdp->xdf_l_act->av_forw = bp;
783 		vdp->xdf_l_act = bp;
784 	}
785 	bp->av_forw = NULL;
786 	bp->av_back = NULL; /* not tagged with a v_req */
787 	bp->b_private = (void *)(uintptr_t)p_blkst;
788 	mutex_exit(&vdp->xdf_dev_lk);
789 	xdf_iostart(vdp);
790 	if (do_polled_io)
791 		(void) xdf_drain_io(vdp);
792 	return (0);
793 }
794 
795 /*ARGSUSED*/
796 static int
797 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
798 {
799 
800 	xdf_t	*vdp;
801 	minor_t minor;
802 	diskaddr_t p_blkcnt;
803 	int part;
804 
805 	minor = getminor(dev);
806 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
807 		return (ENXIO);
808 
809 	DPRINTF(IO_DBG, ("xdf: read offset 0x%"PRIx64"\n",
810 	    (int64_t)uiop->uio_offset));
811 
812 	part = XDF_PART(minor);
813 	if (!xdf_isopen(vdp, part))
814 		return (ENXIO);
815 
816 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
817 	    NULL, NULL, NULL, NULL))
818 		return (ENXIO);
819 
820 	if (U_INVAL(uiop))
821 		return (EINVAL);
822 
823 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
824 }
825 
826 /*ARGSUSED*/
827 static int
828 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
829 {
830 	xdf_t *vdp;
831 	minor_t minor;
832 	diskaddr_t p_blkcnt;
833 	int part;
834 
835 	minor = getminor(dev);
836 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
837 		return (ENXIO);
838 
839 	DPRINTF(IO_DBG, ("xdf: write offset 0x%"PRIx64"\n",
840 	    (int64_t)uiop->uio_offset));
841 
842 	part = XDF_PART(minor);
843 	if (!xdf_isopen(vdp, part))
844 		return (ENXIO);
845 
846 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
847 	    NULL, NULL, NULL, NULL))
848 		return (ENXIO);
849 
850 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
851 		return (ENOSPC);
852 
853 	if (U_INVAL(uiop))
854 		return (EINVAL);
855 
856 	return (physio(xdf_strategy, NULL, dev, B_WRITE, minphys, uiop));
857 }
858 
859 /*ARGSUSED*/
860 static int
861 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
862 {
863 	xdf_t	*vdp;
864 	minor_t minor;
865 	struct uio *uiop = aiop->aio_uio;
866 	diskaddr_t p_blkcnt;
867 	int part;
868 
869 	minor = getminor(dev);
870 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
871 		return (ENXIO);
872 
873 	part = XDF_PART(minor);
874 	if (!xdf_isopen(vdp, part))
875 		return (ENXIO);
876 
877 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
878 	    NULL, NULL, NULL, NULL))
879 		return (ENXIO);
880 
881 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
882 		return (ENOSPC);
883 
884 	if (U_INVAL(uiop))
885 		return (EINVAL);
886 
887 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, minphys, aiop));
888 }
889 
890 /*ARGSUSED*/
891 static int
892 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
893 {
894 	xdf_t *vdp;
895 	minor_t minor;
896 	struct uio *uiop = aiop->aio_uio;
897 	diskaddr_t p_blkcnt;
898 	int part;
899 
900 	minor = getminor(dev);
901 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
902 		return (ENXIO);
903 
904 	part = XDF_PART(minor);
905 	if (!xdf_isopen(vdp, part))
906 		return (ENXIO);
907 
908 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
909 	    NULL, NULL, NULL, NULL))
910 		return (ENXIO);
911 
912 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
913 		return (ENOSPC);
914 
915 	if (U_INVAL(uiop))
916 		return (EINVAL);
917 
918 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, minphys, aiop));
919 }
920 
921 static int
922 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
923 {
924 	struct buf dumpbuf, *dbp;
925 	xdf_t	*vdp;
926 	minor_t minor;
927 	int err = 0;
928 	int part;
929 	diskaddr_t p_blkcnt, p_blkst;
930 
931 	minor = getminor(dev);
932 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
933 		return (ENXIO);
934 
935 	DPRINTF(IO_DBG, ("xdf: dump addr (0x%p) blk (%ld) nblks (%d)\n",
936 	    addr, blkno, nblk));
937 
938 	part = XDF_PART(minor);
939 	if (!xdf_isopen(vdp, part))
940 		return (ENXIO);
941 
942 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
943 	    NULL, NULL, NULL))
944 		return (ENXIO);
945 
946 	if ((blkno + nblk) > p_blkcnt) {
947 		cmn_err(CE_WARN, "xdf: block %ld exceeds VBD size %"PRIu64,
948 		    blkno + nblk, (uint64_t)vdp->xdf_xdev_nblocks);
949 		return (EINVAL);
950 	}
951 
952 	dbp = &dumpbuf;
953 	bioinit(dbp);
954 	dbp->b_flags = B_BUSY;
955 	dbp->b_un.b_addr = addr;
956 	dbp->b_bcount	= nblk << DEV_BSHIFT;
957 	dbp->b_resid = 0;
958 	dbp->b_blkno = blkno;
959 	dbp->b_edev = dev;
960 	dbp->b_private = (void *)(uintptr_t)p_blkst;
961 
962 	mutex_enter(&vdp->xdf_dev_lk);
963 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
964 	if (vdp->xdf_f_act == NULL) {
965 		vdp->xdf_f_act = vdp->xdf_l_act = dbp;
966 	} else {
967 		vdp->xdf_l_act->av_forw = dbp;
968 		vdp->xdf_l_act = dbp;
969 	}
970 	dbp->av_forw = NULL;
971 	dbp->av_back = NULL;
972 	mutex_exit(&vdp->xdf_dev_lk);
973 	xdf_iostart(vdp);
974 	err = xdf_drain_io(vdp);
975 	biofini(dbp);
976 	return (err);
977 }
978 
979 /*ARGSUSED*/
980 static int
981 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
982     int *rvalp)
983 {
984 	int instance;
985 	xdf_t	*vdp;
986 	minor_t minor;
987 	int part;
988 
989 	minor = getminor(dev);
990 	instance = XDF_INST(minor);
991 
992 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
993 		return (ENXIO);
994 
995 	DPRINTF(IOCTL_DBG, ("xdf%d:ioctl: cmd %d (0x%x)\n",
996 	    instance, cmd, cmd));
997 
998 	part = XDF_PART(minor);
999 	if (!xdf_isopen(vdp, part))
1000 		return (ENXIO);
1001 
1002 	switch (cmd) {
1003 	case DKIOCGMEDIAINFO: {
1004 		struct dk_minfo	media_info;
1005 
1006 		media_info.dki_lbsize = DEV_BSIZE;
1007 		media_info.dki_capacity = vdp->xdf_xdev_nblocks;
1008 		media_info.dki_media_type = DK_FIXED_DISK;
1009 
1010 		if (ddi_copyout(&media_info, (void *)arg,
1011 		    sizeof (struct dk_minfo), mode)) {
1012 			return (EFAULT);
1013 		} else {
1014 			return (0);
1015 		}
1016 	}
1017 
1018 	case DKIOCINFO: {
1019 		struct dk_cinfo info;
1020 
1021 		/* controller information */
1022 		if (XD_IS_CD(vdp))
1023 			info.dki_ctype = DKC_CDROM;
1024 		else
1025 			info.dki_ctype = DKC_VBD;
1026 
1027 		info.dki_cnum = 0;
1028 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
1029 
1030 		/* unit information */
1031 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
1032 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
1033 		info.dki_flags = DKI_FMTVOL;
1034 		info.dki_partition = part;
1035 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
1036 		info.dki_addr = 0;
1037 		info.dki_space = 0;
1038 		info.dki_prio = 0;
1039 		info.dki_vec = 0;
1040 
1041 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
1042 			return (EFAULT);
1043 		else
1044 			return (0);
1045 	}
1046 
1047 	case DKIOCSTATE: {
1048 		enum dkio_state	dkstate = DKIO_INSERTED;
1049 		if (ddi_copyout(&dkstate, (void *)arg, sizeof (dkstate),
1050 		    mode) != 0)
1051 			return (EFAULT);
1052 		return (0);
1053 	}
1054 
1055 	/*
1056 	 * is media removable?
1057 	 */
1058 	case DKIOCREMOVABLE: {
1059 		int i = XD_IS_RM(vdp) ? 1 : 0;
1060 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), mode))
1061 			return (EFAULT);
1062 		return (0);
1063 	}
1064 
1065 	case DKIOCG_PHYGEOM:
1066 	case DKIOCG_VIRTGEOM:
1067 	case DKIOCGGEOM:
1068 	case DKIOCSGEOM:
1069 	case DKIOCGAPART:
1070 	case DKIOCGVTOC:
1071 	case DKIOCSVTOC:
1072 	case DKIOCPARTINFO:
1073 	case DKIOCGETEFI:
1074 	case DKIOCSETEFI:
1075 	case DKIOCPARTITION: {
1076 		int rc;
1077 
1078 		rc = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
1079 		    rvalp, NULL);
1080 		return (rc);
1081 	}
1082 
1083 	case DKIOCGETWCE:
1084 		if (ddi_copyout(&vdp->xdf_wce, (void *)arg,
1085 		    sizeof (vdp->xdf_wce), mode))
1086 			return (EFAULT);
1087 		return (0);
1088 	case DKIOCSETWCE:
1089 		if (ddi_copyin((void *)arg, &vdp->xdf_wce,
1090 		    sizeof (vdp->xdf_wce), mode))
1091 			return (EFAULT);
1092 		return (0);
1093 	case DKIOCFLUSHWRITECACHE: {
1094 		int rc;
1095 		struct dk_callback *dkc = (struct dk_callback *)arg;
1096 
1097 		if (vdp->xdf_flush_supported) {
1098 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1099 			    NULL, 0, 0, (void *)dev);
1100 		} else {
1101 			if (xdf_barrier_flush_disable)
1102 				return (ENOTTY);
1103 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1104 			    vdp->xdf_cache_flush_block, xdf_flush_block,
1105 			    DEV_BSIZE, (void *)dev);
1106 		}
1107 		if ((mode & FKIOCTL) && (dkc != NULL) &&
1108 		    (dkc->dkc_callback != NULL)) {
1109 			(*dkc->dkc_callback)(dkc->dkc_cookie, rc);
1110 			/* need to return 0 after calling callback */
1111 			rc = 0;
1112 		}
1113 		return (rc);
1114 	}
1115 
1116 	default:
1117 		return (ENOTTY);
1118 	}
1119 }
1120 
1121 /*
1122  * xdf interrupt handler
1123  */
1124 static uint_t
1125 xdf_intr(caddr_t arg)
1126 {
1127 	xdf_t *vdp = (xdf_t *)arg;
1128 	xendev_ring_t *xbr;
1129 	blkif_response_t *resp;
1130 	int bioerr = 0;
1131 	uint64_t id;
1132 	extern int do_polled_io;
1133 	uint8_t op;
1134 	uint16_t status;
1135 	ddi_acc_handle_t acchdl;
1136 
1137 	mutex_enter(&vdp->xdf_dev_lk);
1138 
1139 	if ((xbr = vdp->xdf_xb_ring) == NULL) {
1140 		mutex_exit(&vdp->xdf_dev_lk);
1141 		return (DDI_INTR_UNCLAIMED);
1142 	}
1143 
1144 	acchdl = vdp->xdf_xb_ring_hdl;
1145 
1146 	/*
1147 	 * complete all requests which have a response
1148 	 */
1149 	while (resp = xvdi_ring_get_response(xbr)) {
1150 		id = ddi_get64(acchdl, &resp->id);
1151 		op = ddi_get8(acchdl, &resp->operation);
1152 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
1153 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
1154 		    op, id, status));
1155 
1156 		/*
1157 		 * XXPV - close connection to the backend and restart
1158 		 */
1159 		if (status != BLKIF_RSP_OKAY) {
1160 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
1161 			    ddi_get_name_addr(vdp->xdf_dip),
1162 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
1163 			bioerr = EIO;
1164 		}
1165 
1166 		xdf_iofini(vdp, id, bioerr);
1167 	}
1168 
1169 	mutex_exit(&vdp->xdf_dev_lk);
1170 
1171 	if (!do_polled_io)
1172 		xdf_iostart(vdp);
1173 
1174 	return (DDI_INTR_CLAIMED);
1175 }
1176 
1177 int xdf_fbrewrites;	/* how many times was our flush block rewritten */
1178 
1179 /*
1180  * Snarf new data if our flush block was re-written
1181  */
1182 static void
1183 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
1184 {
1185 	int nblks;
1186 	boolean_t mapin;
1187 
1188 	if (IS_WRITE_BARRIER(vdp, bp))
1189 		return; /* write was a flush write */
1190 
1191 	mapin = B_FALSE;
1192 	nblks = bp->b_bcount >> DEV_BSHIFT;
1193 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
1194 		xdf_fbrewrites++;
1195 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
1196 			mapin = B_TRUE;
1197 			bp_mapin(bp);
1198 		}
1199 		bcopy(bp->b_un.b_addr +
1200 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
1201 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
1202 		if (mapin)
1203 			bp_mapout(bp);
1204 	}
1205 }
1206 
1207 static void
1208 xdf_iofini(xdf_t *vdp, uint64_t id, int bioerr)
1209 {
1210 	ge_slot_t *gs = (ge_slot_t *)(uintptr_t)id;
1211 	v_req_t *vreq = gs->vreq;
1212 	buf_t *bp = vreq->v_buf;
1213 
1214 	gs_free(vdp, gs);
1215 	if (bioerr)
1216 		bioerror(bp, bioerr);
1217 	vreq->v_nslots--;
1218 	if (vreq->v_nslots != 0)
1219 		return;
1220 
1221 	XDF_UPDATE_IO_STAT(vdp, bp);
1222 	kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1223 
1224 	if (IS_ERROR(bp))
1225 		bp->b_resid = bp->b_bcount;
1226 
1227 	vreq_free(vdp, vreq);
1228 	biodone(bp);
1229 }
1230 
1231 /*
1232  * return value of xdf_prepare_rreq()
1233  * used in xdf_iostart()
1234  */
1235 #define	XF_PARTIAL	0 /* rreq is full, not all I/O in buf transferred */
1236 #define	XF_COMP		1 /* no more I/O left in buf */
1237 
1238 static void
1239 xdf_iostart(xdf_t *vdp)
1240 {
1241 	xendev_ring_t *xbr;
1242 	struct buf *bp;
1243 	blkif_request_t *rreq;
1244 	int retval;
1245 	int rreqready = 0;
1246 
1247 	xbr = vdp->xdf_xb_ring;
1248 
1249 	/*
1250 	 * populate the ring request(s)
1251 	 *
1252 	 * loop until there is no buf to transfer or no free slot
1253 	 * available in I/O ring
1254 	 */
1255 	for (;;) {
1256 		mutex_enter(&vdp->xdf_dev_lk);
1257 
1258 		if (vdp->xdf_status != XD_READY)
1259 			break;
1260 
1261 		/* active buf queue empty? */
1262 		if ((bp = vdp->xdf_f_act) == NULL)
1263 			break;
1264 
1265 		/* try to grab a vreq for this bp */
1266 		if ((BP2VREQ(bp) == NULL) && (vreq_get(vdp, bp) == NULL))
1267 				break;
1268 		/* alloc DMA/GTE resources */
1269 		if (vreq_setup(vdp, BP2VREQ(bp)) != DDI_SUCCESS)
1270 			break;
1271 
1272 		/* get next blkif_request in the ring */
1273 		if ((rreq = xvdi_ring_get_request(xbr)) == NULL)
1274 			break;
1275 		bzero(rreq, sizeof (blkif_request_t));
1276 
1277 		/* populate blkif_request with this buf */
1278 		rreqready++;
1279 		retval = xdf_prepare_rreq(vdp, bp, rreq);
1280 		if (retval == XF_COMP) {
1281 			/* finish this bp, switch to next one */
1282 			kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1283 			vdp->xdf_f_act = bp->av_forw;
1284 			bp->av_forw = NULL;
1285 		}
1286 
1287 		mutex_exit(&vdp->xdf_dev_lk);
1288 	}
1289 
1290 	/*
1291 	 * Send the request(s) to the backend
1292 	 */
1293 	if (rreqready) {
1294 		if (xvdi_ring_push_request(xbr)) {
1295 			DPRINTF(IO_DBG, ("xdf_iostart: "
1296 			    "sent request(s) to backend\n"));
1297 			xvdi_notify_oe(vdp->xdf_dip);
1298 		}
1299 	}
1300 
1301 	mutex_exit(&vdp->xdf_dev_lk);
1302 }
1303 
1304 /*
1305  * populate a single blkif_request_t w/ a buf
1306  */
1307 static int
1308 xdf_prepare_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1309 {
1310 	int		rval;
1311 	grant_ref_t	gr;
1312 	uint8_t		fsect, lsect;
1313 	size_t		bcnt;
1314 	paddr_t		dma_addr;
1315 	off_t		blk_off;
1316 	dev_info_t	*dip = vdp->xdf_dip;
1317 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1318 	v_req_t		*vreq = BP2VREQ(bp);
1319 	uint64_t	blkno = vreq->v_blkno;
1320 	uint_t		ndmacs = vreq->v_ndmacs;
1321 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1322 	int		seg = 0;
1323 	int		isread = IS_READ(bp);
1324 
1325 	if (isread)
1326 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1327 	else {
1328 		switch (vreq->v_flush_diskcache) {
1329 		case FLUSH_DISKCACHE:
1330 			ddi_put8(acchdl, &rreq->operation,
1331 			    BLKIF_OP_FLUSH_DISKCACHE);
1332 			ddi_put16(acchdl, &rreq->handle, vdev);
1333 			ddi_put64(acchdl, &rreq->id,
1334 			    (uint64_t)(uintptr_t)(vreq->v_gs));
1335 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1336 			return (XF_COMP);
1337 		case WRITE_BARRIER:
1338 			ddi_put8(acchdl, &rreq->operation,
1339 			    BLKIF_OP_WRITE_BARRIER);
1340 			break;
1341 		default:
1342 			if (!vdp->xdf_wce)
1343 				ddi_put8(acchdl, &rreq->operation,
1344 				    BLKIF_OP_WRITE_BARRIER);
1345 			else
1346 				ddi_put8(acchdl, &rreq->operation,
1347 				    BLKIF_OP_WRITE);
1348 			break;
1349 		}
1350 	}
1351 
1352 	ddi_put16(acchdl, &rreq->handle, vdev);
1353 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1354 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(vreq->v_gs));
1355 
1356 	/*
1357 	 * loop until all segments are populated or no more dma cookie in buf
1358 	 */
1359 	for (;;) {
1360 	/*
1361 	 * Each segment of a blkif request can transfer up to
1362 	 * one 4K page of data.
1363 	 */
1364 		bcnt = vreq->v_dmac.dmac_size;
1365 		ASSERT(bcnt <= PAGESIZE);
1366 		ASSERT((bcnt % XB_BSIZE) == 0);
1367 		dma_addr = vreq->v_dmac.dmac_laddress;
1368 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1369 		ASSERT((blk_off & XB_BMASK) == 0);
1370 		fsect = blk_off >> XB_BSHIFT;
1371 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1372 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1373 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1374 		DPRINTF(IO_DBG, ("  ""seg%d: dmacS %lu blk_off %ld\n",
1375 		    seg, vreq->v_dmac.dmac_size, blk_off));
1376 		gr = gs_grant(vreq->v_gs, PATOMA(dma_addr) >> PAGESHIFT);
1377 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1378 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1379 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1380 		DPRINTF(IO_DBG, ("  ""seg%d: fs %d ls %d gr %d dma 0x%"PRIx64
1381 		    "\n", seg, fsect, lsect, gr, dma_addr));
1382 
1383 		blkno += (bcnt >> XB_BSHIFT);
1384 		seg++;
1385 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1386 		if (--ndmacs) {
1387 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1388 			continue;
1389 		}
1390 
1391 		vreq->v_status = VREQ_DMAWIN_DONE;
1392 		vreq->v_blkno = blkno;
1393 		if (vreq->v_dmaw + 1 == vreq->v_ndmaws)
1394 			/* last win */
1395 			rval = XF_COMP;
1396 		else
1397 			rval = XF_PARTIAL;
1398 		break;
1399 	}
1400 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1401 	DPRINTF(IO_DBG, ("xdf_prepare_rreq: request id=%"PRIx64" ready\n",
1402 	    rreq->id));
1403 
1404 	return (rval);
1405 }
1406 
1407 #define	XDF_QSEC	50000	/* .005 second */
1408 #define	XDF_POLLCNT	12	/* loop for 12 times before time out */
1409 
1410 static int
1411 xdf_drain_io(xdf_t *vdp)
1412 {
1413 	int pollc, rval;
1414 	xendev_ring_t *xbr;
1415 
1416 	if (xdfdebug & SUSRES_DBG)
1417 		xen_printf("xdf_drain_io: start\n");
1418 
1419 	mutex_enter(&vdp->xdf_dev_lk);
1420 
1421 	if ((vdp->xdf_status != XD_READY) && (vdp->xdf_status != XD_SUSPEND))
1422 		goto out;
1423 
1424 	rval = 0;
1425 	xbr = vdp->xdf_xb_ring;
1426 	ASSERT(xbr != NULL);
1427 
1428 	for (pollc = 0; pollc < XDF_POLLCNT; pollc++) {
1429 		if (xvdi_ring_has_unconsumed_responses(xbr)) {
1430 			mutex_exit(&vdp->xdf_dev_lk);
1431 			(void) xdf_intr((caddr_t)vdp);
1432 			mutex_enter(&vdp->xdf_dev_lk);
1433 		}
1434 		if (!xvdi_ring_has_incomp_request(xbr))
1435 			goto out;
1436 
1437 		(void) HYPERVISOR_yield();
1438 		/*
1439 		 * file-backed devices can be slow
1440 		 */
1441 		drv_usecwait(XDF_QSEC << pollc);
1442 	}
1443 	cmn_err(CE_WARN, "xdf_polled_io: timeout");
1444 	rval = EIO;
1445 out:
1446 	mutex_exit(&vdp->xdf_dev_lk);
1447 	if (xdfdebug & SUSRES_DBG)
1448 		xen_printf("xdf_drain_io: end, err=%d\n", rval);
1449 	return (rval);
1450 }
1451 
1452 /* ARGSUSED5 */
1453 static int
1454 xdf_lb_rdwr(dev_info_t *devi, uchar_t cmd, void *bufp,
1455     diskaddr_t start, size_t reqlen, void *tg_cookie)
1456 {
1457 	xdf_t *vdp;
1458 	struct buf *bp;
1459 	int err = 0;
1460 
1461 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1462 	if (vdp == NULL)
1463 		return (ENXIO);
1464 
1465 	if ((start + (reqlen >> DEV_BSHIFT)) > vdp->xdf_xdev_nblocks)
1466 		return (EINVAL);
1467 
1468 	bp = getrbuf(KM_SLEEP);
1469 	if (cmd == TG_READ)
1470 		bp->b_flags = B_BUSY | B_READ;
1471 	else
1472 		bp->b_flags = B_BUSY | B_WRITE;
1473 	bp->b_un.b_addr = bufp;
1474 	bp->b_bcount = reqlen;
1475 	bp->b_resid = 0;
1476 	bp->b_blkno = start;
1477 	bp->av_forw = NULL;
1478 	bp->av_back = NULL;
1479 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
1480 
1481 	mutex_enter(&vdp->xdf_dev_lk);
1482 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1483 	if (vdp->xdf_f_act == NULL) {
1484 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
1485 	} else {
1486 		vdp->xdf_l_act->av_forw = bp;
1487 		vdp->xdf_l_act = bp;
1488 	}
1489 	mutex_exit(&vdp->xdf_dev_lk);
1490 	xdf_iostart(vdp);
1491 	err = biowait(bp);
1492 
1493 	ASSERT(bp->b_flags & B_DONE);
1494 
1495 	freerbuf(bp);
1496 	return (err);
1497 }
1498 
1499 /*
1500  * synthetic geometry
1501  */
1502 #define	XDF_NSECTS	256
1503 #define	XDF_NHEADS	16
1504 
1505 static int
1506 xdf_lb_getcap(dev_info_t *devi, diskaddr_t *capp)
1507 {
1508 	xdf_t *vdp;
1509 
1510 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1511 
1512 	if (vdp == NULL)
1513 		return (ENXIO);
1514 
1515 	mutex_enter(&vdp->xdf_dev_lk);
1516 	*capp = vdp->xdf_xdev_nblocks;
1517 	DPRINTF(LBL_DBG, ("capacity %llu\n", *capp));
1518 	mutex_exit(&vdp->xdf_dev_lk);
1519 	return (0);
1520 }
1521 
1522 static int
1523 xdf_lb_getpgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1524 {
1525 	xdf_t *vdp;
1526 	uint_t ncyl;
1527 	uint_t spc = XDF_NHEADS * XDF_NSECTS;
1528 
1529 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1530 
1531 	if (vdp == NULL)
1532 		return (ENXIO);
1533 
1534 	ncyl = vdp->xdf_xdev_nblocks / spc;
1535 
1536 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1537 	geomp->g_acyl = 0;
1538 	geomp->g_nhead = XDF_NHEADS;
1539 	geomp->g_secsize = XB_BSIZE;
1540 	geomp->g_nsect = XDF_NSECTS;
1541 	geomp->g_intrlv = 0;
1542 	geomp->g_rpm = 7200;
1543 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1544 	return (0);
1545 }
1546 
1547 /*
1548  * No real HBA, no geometry available from it
1549  */
1550 /*ARGSUSED*/
1551 static int
1552 xdf_lb_getvgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1553 {
1554 	return (EINVAL);
1555 }
1556 
1557 static int
1558 xdf_lb_getattribute(dev_info_t *devi, tg_attribute_t *tgattributep)
1559 {
1560 	xdf_t *vdp;
1561 
1562 	if (!(vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))))
1563 		return (ENXIO);
1564 
1565 	if (XD_IS_RO(vdp))
1566 		tgattributep->media_is_writable = 0;
1567 	else
1568 		tgattributep->media_is_writable = 1;
1569 	return (0);
1570 }
1571 
1572 /* ARGSUSED3 */
1573 static int
1574 xdf_lb_getinfo(dev_info_t *devi, int cmd, void *arg, void *tg_cookie)
1575 {
1576 	switch (cmd) {
1577 	case TG_GETPHYGEOM:
1578 		return (xdf_lb_getpgeom(devi, (cmlb_geom_t *)arg));
1579 	case TG_GETVIRTGEOM:
1580 		return (xdf_lb_getvgeom(devi, (cmlb_geom_t *)arg));
1581 	case TG_GETCAPACITY:
1582 		return (xdf_lb_getcap(devi, (diskaddr_t *)arg));
1583 	case TG_GETBLOCKSIZE:
1584 		*(uint32_t *)arg = XB_BSIZE;
1585 		return (0);
1586 	case TG_GETATTR:
1587 		return (xdf_lb_getattribute(devi, (tg_attribute_t *)arg));
1588 	default:
1589 		return (ENOTTY);
1590 	}
1591 }
1592 
1593 /*
1594  * Kick-off connect process
1595  * Status should be XD_UNKNOWN or XD_CLOSED
1596  * On success, status will be changed to XD_INIT
1597  * On error, status won't be changed
1598  */
1599 static int
1600 xdf_start_connect(xdf_t *vdp)
1601 {
1602 	char *xsnode;
1603 	grant_ref_t gref;
1604 	xenbus_transaction_t xbt;
1605 	int rv;
1606 	dev_info_t *dip = vdp->xdf_dip;
1607 
1608 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == (domid_t)-1)
1609 		goto errout;
1610 
1611 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS) {
1612 		cmn_err(CE_WARN, "xdf@%s: failed to alloc event channel",
1613 		    ddi_get_name_addr(dip));
1614 		goto errout;
1615 	}
1616 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1617 	    DDI_SUCCESS) {
1618 		cmn_err(CE_WARN, "xdf_start_connect: xdf@%s: "
1619 		    "failed to add intr handler", ddi_get_name_addr(dip));
1620 		goto errout1;
1621 	}
1622 
1623 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1624 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1625 	    DDI_SUCCESS) {
1626 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1627 		    ddi_get_name_addr(dip));
1628 		goto errout2;
1629 	}
1630 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1631 
1632 	/*
1633 	 * Write into xenstore the info needed by backend
1634 	 */
1635 	if ((xsnode = xvdi_get_xsname(dip)) == NULL) {
1636 		cmn_err(CE_WARN, "xdf@%s: "
1637 		    "failed to get xenstore node path",
1638 		    ddi_get_name_addr(dip));
1639 		goto fail_trans;
1640 	}
1641 trans_retry:
1642 	if (xenbus_transaction_start(&xbt)) {
1643 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1644 		    ddi_get_name_addr(dip));
1645 		xvdi_fatal_error(dip, EIO, "transaction start");
1646 		goto fail_trans;
1647 	}
1648 
1649 	if (rv = xenbus_printf(xbt, xsnode, "ring-ref", "%u", gref)) {
1650 		cmn_err(CE_WARN, "xdf@%s: failed to write ring-ref",
1651 		    ddi_get_name_addr(dip));
1652 		xvdi_fatal_error(dip, rv, "writing ring-ref");
1653 		goto abort_trans;
1654 	}
1655 
1656 	if (rv = xenbus_printf(xbt, xsnode, "event-channel", "%u",
1657 	    xvdi_get_evtchn(dip))) {
1658 		cmn_err(CE_WARN, "xdf@%s: failed to write event-channel",
1659 		    ddi_get_name_addr(dip));
1660 		xvdi_fatal_error(dip, rv, "writing event-channel");
1661 		goto abort_trans;
1662 	}
1663 
1664 	if ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0) {
1665 		cmn_err(CE_WARN, "xdf@%s: "
1666 		    "failed to switch state to XenbusStateInitialised",
1667 		    ddi_get_name_addr(dip));
1668 		xvdi_fatal_error(dip, rv, "writing state");
1669 		goto abort_trans;
1670 	}
1671 
1672 	/* kick-off connect process */
1673 	if (rv = xenbus_transaction_end(xbt, 0)) {
1674 		if (rv == EAGAIN)
1675 			goto trans_retry;
1676 		cmn_err(CE_WARN, "xdf@%s: failed to end transaction",
1677 		    ddi_get_name_addr(dip));
1678 		xvdi_fatal_error(dip, rv, "completing transaction");
1679 		goto fail_trans;
1680 	}
1681 
1682 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1683 	mutex_enter(&vdp->xdf_dev_lk);
1684 	vdp->xdf_status = XD_INIT;
1685 	mutex_exit(&vdp->xdf_dev_lk);
1686 
1687 	return (DDI_SUCCESS);
1688 
1689 abort_trans:
1690 	(void) xenbus_transaction_end(xbt, 1);
1691 fail_trans:
1692 	xvdi_free_ring(vdp->xdf_xb_ring);
1693 errout2:
1694 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1695 errout1:
1696 	xvdi_free_evtchn(dip);
1697 errout:
1698 	cmn_err(CE_WARN, "xdf@%s: fail to kick-off connecting",
1699 	    ddi_get_name_addr(dip));
1700 	return (DDI_FAILURE);
1701 }
1702 
1703 /*
1704  * Kick-off disconnect process
1705  * Status won't be changed
1706  */
1707 static int
1708 xdf_start_disconnect(xdf_t *vdp)
1709 {
1710 	if (xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed) > 0) {
1711 		cmn_err(CE_WARN, "xdf@%s: fail to kick-off disconnecting",
1712 		    ddi_get_name_addr(vdp->xdf_dip));
1713 		return (DDI_FAILURE);
1714 	}
1715 
1716 	return (DDI_SUCCESS);
1717 }
1718 
1719 int
1720 xdf_get_flush_block(xdf_t *vdp)
1721 {
1722 	/*
1723 	 * Get a DEV_BSIZE aligned bufer
1724 	 */
1725 	vdp->xdf_flush_mem = kmem_alloc(DEV_BSIZE * 2, KM_SLEEP);
1726 	vdp->xdf_cache_flush_block =
1727 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem), DEV_BSIZE);
1728 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1729 	    xdf_flush_block, DEV_BSIZE, NULL) != 0)
1730 		return (DDI_FAILURE);
1731 	return (DDI_SUCCESS);
1732 }
1733 
1734 /*
1735  * Finish other initialization after we've connected to backend
1736  * Status should be XD_INIT before calling this routine
1737  * On success, status should be changed to XD_READY
1738  * On error, status should stay XD_INIT
1739  */
1740 static int
1741 xdf_post_connect(xdf_t *vdp)
1742 {
1743 	int rv;
1744 	uint_t len;
1745 	char *type;
1746 	char *barrier;
1747 	dev_info_t *devi = vdp->xdf_dip;
1748 
1749 	/*
1750 	 * Determine if feature barrier is supported by backend
1751 	 */
1752 	if (xenbus_read(XBT_NULL, xvdi_get_oename(devi),
1753 	    "feature-barrier", (void **)&barrier, &len) == 0) {
1754 		vdp->xdf_feature_barrier = 1;
1755 		kmem_free(barrier, len);
1756 	} else {
1757 		cmn_err(CE_NOTE, "xdf@%s: failed to read feature-barrier",
1758 		    ddi_get_name_addr(vdp->xdf_dip));
1759 		vdp->xdf_feature_barrier = 0;
1760 	}
1761 
1762 	/* probe backend */
1763 	if (rv = xenbus_gather(XBT_NULL, xvdi_get_oename(devi),
1764 	    "sectors", "%"SCNu64, &vdp->xdf_xdev_nblocks,
1765 	    "info", "%u", &vdp->xdf_xdev_info, NULL)) {
1766 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1767 		    "cannot read backend info", ddi_get_name_addr(devi));
1768 		xvdi_fatal_error(devi, rv, "reading backend info");
1769 		return (DDI_FAILURE);
1770 	}
1771 
1772 	/* fix disk type */
1773 	if (xenbus_read(XBT_NULL, xvdi_get_xsname(devi), "device-type",
1774 	    (void **)&type, &len) != 0) {
1775 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1776 		    "cannot read device-type", ddi_get_name_addr(devi));
1777 		xvdi_fatal_error(devi, rv, "reading device-type");
1778 		return (DDI_FAILURE);
1779 	}
1780 	if (strcmp(type, "cdrom") == 0)
1781 		vdp->xdf_xdev_info |= VDISK_CDROM;
1782 	kmem_free(type, len);
1783 
1784 	/*
1785 	 * We've created all the minor nodes via cmlb_attach() using default
1786 	 * value in xdf_attach() to make it possbile to block in xdf_open(),
1787 	 * in case there's anyone (say, booting thread) ever trying to open
1788 	 * it before connected to backend. We will refresh all those minor
1789 	 * nodes w/ latest info we've got now when we are almost connected.
1790 	 *
1791 	 * Don't do this when xdf is already opened by someone (could happen
1792 	 * during resume), for that cmlb_attach() will invalid the label info
1793 	 * and confuse those who has already opened the node, which is bad.
1794 	 */
1795 	if (!xdf_isopen(vdp, -1) && (XD_IS_CD(vdp) || XD_IS_RM(vdp))) {
1796 		/* re-init cmlb w/ latest info we got from backend */
1797 		if (cmlb_attach(devi, &xdf_lb_ops,
1798 		    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
1799 		    XD_IS_RM(vdp), 1, DDI_NT_BLOCK,
1800 		    CMLB_FAKE_LABEL_ONE_PARTITION,
1801 		    vdp->xdf_vd_lbl, NULL) != 0) {
1802 			cmn_err(CE_WARN, "xdf@%s: cmlb attach failed",
1803 			    ddi_get_name_addr(devi));
1804 			return (DDI_FAILURE);
1805 		}
1806 	}
1807 
1808 	/* mark vbd is ready for I/O */
1809 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1810 	mutex_enter(&vdp->xdf_dev_lk);
1811 	vdp->xdf_status = XD_READY;
1812 	mutex_exit(&vdp->xdf_dev_lk);
1813 	/*
1814 	 * If backend has feature-barrier, see if it supports disk
1815 	 * cache flush op.
1816 	 */
1817 	vdp->xdf_flush_supported = 0;
1818 	if (vdp->xdf_feature_barrier) {
1819 		/*
1820 		 * Pretend we already know flush is supported so probe
1821 		 * will attempt the correct op.
1822 		 */
1823 		vdp->xdf_flush_supported = 1;
1824 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1825 			vdp->xdf_flush_supported = 1;
1826 		} else {
1827 			vdp->xdf_flush_supported = 0;
1828 			/*
1829 			 * If the other end does not support the cache flush op
1830 			 * then we must use a barrier-write to force disk
1831 			 * cache flushing.  Barrier writes require that a data
1832 			 * block actually be written.
1833 			 * Cache a block to barrier-write when we are
1834 			 * asked to perform a flush.
1835 			 * XXX - would it be better to just copy 1 block
1836 			 * (512 bytes) from whatever write we did last
1837 			 * and rewrite that block?
1838 			 */
1839 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS)
1840 				return (DDI_FAILURE);
1841 		}
1842 	}
1843 
1844 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", ddi_get_name_addr(devi),
1845 	    (uint64_t)vdp->xdf_xdev_nblocks);
1846 
1847 	return (DDI_SUCCESS);
1848 }
1849 
1850 /*
1851  * Finish other uninitialization after we've disconnected from backend
1852  * when status is XD_CLOSING or XD_INIT. After returns, status is XD_CLOSED
1853  */
1854 static void
1855 xdf_post_disconnect(xdf_t *vdp)
1856 {
1857 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1858 	xvdi_free_evtchn(vdp->xdf_dip);
1859 	xvdi_free_ring(vdp->xdf_xb_ring);
1860 	vdp->xdf_xb_ring = NULL;
1861 	vdp->xdf_xb_ring_hdl = NULL;
1862 	vdp->xdf_peer = (domid_t)-1;
1863 
1864 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1865 	mutex_enter(&vdp->xdf_dev_lk);
1866 	vdp->xdf_status = XD_CLOSED;
1867 	mutex_exit(&vdp->xdf_dev_lk);
1868 }
1869 
1870 /*ARGSUSED*/
1871 static void
1872 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
1873 {
1874 	XenbusState new_state = *(XenbusState *)impl_data;
1875 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
1876 	boolean_t unexpect_die = B_FALSE;
1877 	int status;
1878 
1879 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
1880 	    ddi_get_name_addr(dip), new_state));
1881 
1882 	mutex_enter(&vdp->xdf_cb_lk);
1883 
1884 	if (xdf_check_state_transition(vdp, new_state) == DDI_FAILURE) {
1885 		mutex_exit(&vdp->xdf_cb_lk);
1886 		return;
1887 	}
1888 
1889 	switch (new_state) {
1890 	case XenbusStateInitialising:
1891 		ASSERT(vdp->xdf_status == XD_CLOSED);
1892 		/*
1893 		 * backend recovered from a previous failure,
1894 		 * kick-off connect process again
1895 		 */
1896 		if (xdf_start_connect(vdp) != DDI_SUCCESS) {
1897 			cmn_err(CE_WARN, "xdf@%s:"
1898 			    " failed to start reconnecting to backend",
1899 			    ddi_get_name_addr(dip));
1900 		}
1901 		break;
1902 	case XenbusStateConnected:
1903 		ASSERT(vdp->xdf_status == XD_INIT);
1904 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1905 		/* finish final init after connect */
1906 		if (xdf_post_connect(vdp) != DDI_SUCCESS)
1907 			(void) xdf_start_disconnect(vdp);
1908 		break;
1909 	case XenbusStateClosing:
1910 		if (vdp->xdf_status == XD_READY) {
1911 			mutex_enter(&vdp->xdf_dev_lk);
1912 			if (xdf_isopen(vdp, -1)) {
1913 				cmn_err(CE_NOTE, "xdf@%s: hot-unplug failed, "
1914 				    "still in use", ddi_get_name_addr(dip));
1915 				mutex_exit(&vdp->xdf_dev_lk);
1916 				break;
1917 			} else {
1918 				vdp->xdf_status = XD_CLOSING;
1919 			}
1920 			mutex_exit(&vdp->xdf_dev_lk);
1921 		}
1922 		(void) xdf_start_disconnect(vdp);
1923 		break;
1924 	case XenbusStateClosed:
1925 		/* first check if BE closed unexpectedly */
1926 		mutex_enter(&vdp->xdf_dev_lk);
1927 		if (xdf_isopen(vdp, -1)) {
1928 			unexpect_die = B_TRUE;
1929 			unexpectedie(vdp);
1930 			cmn_err(CE_WARN, "xdf@%s: backend closed, "
1931 			    "reconnecting...", ddi_get_name_addr(dip));
1932 		}
1933 		mutex_exit(&vdp->xdf_dev_lk);
1934 
1935 		if (vdp->xdf_status == XD_READY) {
1936 			mutex_enter(&vdp->xdf_dev_lk);
1937 			vdp->xdf_status = XD_CLOSING;
1938 			mutex_exit(&vdp->xdf_dev_lk);
1939 
1940 #ifdef	DOMU_BACKEND
1941 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1942 #endif
1943 
1944 			xdf_post_disconnect(vdp);
1945 			(void) xvdi_switch_state(dip, XBT_NULL,
1946 			    XenbusStateClosed);
1947 		} else if ((vdp->xdf_status == XD_INIT) ||
1948 		    (vdp->xdf_status == XD_CLOSING)) {
1949 			xdf_post_disconnect(vdp);
1950 		} else {
1951 			mutex_enter(&vdp->xdf_dev_lk);
1952 			vdp->xdf_status = XD_CLOSED;
1953 			mutex_exit(&vdp->xdf_dev_lk);
1954 		}
1955 	}
1956 
1957 	/* notify anybody waiting for oe state change */
1958 	mutex_enter(&vdp->xdf_dev_lk);
1959 	cv_broadcast(&vdp->xdf_dev_cv);
1960 	mutex_exit(&vdp->xdf_dev_lk);
1961 
1962 	status = vdp->xdf_status;
1963 	mutex_exit(&vdp->xdf_cb_lk);
1964 
1965 	if (status == XD_READY) {
1966 		xdf_iostart(vdp);
1967 	} else if ((status == XD_CLOSED) && !unexpect_die) {
1968 		/* interface is closed successfully, remove all minor nodes */
1969 		cmlb_detach(vdp->xdf_vd_lbl, NULL);
1970 		cmlb_free_handle(&vdp->xdf_vd_lbl);
1971 	}
1972 }
1973 
1974 /* check if partition is open, -1 - check all partitions on the disk */
1975 static boolean_t
1976 xdf_isopen(xdf_t *vdp, int partition)
1977 {
1978 	int i;
1979 	ulong_t parbit;
1980 	boolean_t rval = B_FALSE;
1981 
1982 	if (partition == -1)
1983 		parbit = (ulong_t)-1;
1984 	else
1985 		parbit = 1 << partition;
1986 
1987 	for (i = 0; i < OTYPCNT; i++) {
1988 		if (vdp->xdf_vd_open[i] & parbit)
1989 			rval = B_TRUE;
1990 	}
1991 
1992 	return (rval);
1993 }
1994 
1995 /*
1996  * Xdf_check_state_transition will check the XenbusState change to see
1997  * if the change is a valid transition or not.
1998  * The new state is written by backend domain, or by running xenstore-write
1999  * to change it manually in dom0
2000  */
2001 static int
2002 xdf_check_state_transition(xdf_t *vdp, XenbusState oestate)
2003 {
2004 	int status;
2005 	int stcheck;
2006 #define	STOK	0 /* need further process */
2007 #define	STNOP	1 /* no action need taking */
2008 #define	STBUG	2 /* unexpected state change, could be a bug */
2009 
2010 	status = vdp->xdf_status;
2011 	stcheck = STOK;
2012 
2013 	switch (status) {
2014 	case XD_UNKNOWN:
2015 		if ((oestate == XenbusStateUnknown)		||
2016 		    (oestate == XenbusStateConnected))
2017 			stcheck = STBUG;
2018 		else if ((oestate == XenbusStateInitialising)	||
2019 		    (oestate == XenbusStateInitWait)		||
2020 		    (oestate == XenbusStateInitialised))
2021 			stcheck = STNOP;
2022 		break;
2023 	case XD_INIT:
2024 		if (oestate == XenbusStateUnknown)
2025 			stcheck = STBUG;
2026 		else if ((oestate == XenbusStateInitialising)	||
2027 		    (oestate == XenbusStateInitWait)		||
2028 		    (oestate == XenbusStateInitialised))
2029 			stcheck = STNOP;
2030 		break;
2031 	case XD_READY:
2032 		if ((oestate == XenbusStateUnknown)		||
2033 		    (oestate == XenbusStateInitialising)	||
2034 		    (oestate == XenbusStateInitWait)		||
2035 		    (oestate == XenbusStateInitialised))
2036 			stcheck = STBUG;
2037 		else if (oestate == XenbusStateConnected)
2038 			stcheck = STNOP;
2039 		break;
2040 	case XD_CLOSING:
2041 		if ((oestate == XenbusStateUnknown)		||
2042 		    (oestate == XenbusStateInitialising)	||
2043 		    (oestate == XenbusStateInitWait)		||
2044 		    (oestate == XenbusStateInitialised)		||
2045 		    (oestate == XenbusStateConnected))
2046 			stcheck = STBUG;
2047 		else if (oestate == XenbusStateClosing)
2048 			stcheck = STNOP;
2049 		break;
2050 	case XD_CLOSED:
2051 		if ((oestate == XenbusStateUnknown)		||
2052 		    (oestate == XenbusStateConnected))
2053 			stcheck = STBUG;
2054 		else if ((oestate == XenbusStateInitWait)	||
2055 		    (oestate == XenbusStateInitialised)		||
2056 		    (oestate == XenbusStateClosing)		||
2057 		    (oestate == XenbusStateClosed))
2058 			stcheck = STNOP;
2059 		break;
2060 	case XD_SUSPEND:
2061 	default:
2062 			stcheck = STBUG;
2063 	}
2064 
2065 	if (stcheck == STOK)
2066 		return (DDI_SUCCESS);
2067 
2068 	if (stcheck == STBUG)
2069 		cmn_err(CE_NOTE, "xdf@%s: unexpected otherend "
2070 		    "state change to %d!, when status is %d",
2071 		    ddi_get_name_addr(vdp->xdf_dip), oestate, status);
2072 
2073 	return (DDI_FAILURE);
2074 }
2075 
2076 static int
2077 xdf_connect(xdf_t *vdp, boolean_t wait)
2078 {
2079 	ASSERT(mutex_owned(&vdp->xdf_dev_lk));
2080 	while (vdp->xdf_status != XD_READY) {
2081 		if (!wait || (vdp->xdf_status > XD_READY))
2082 			break;
2083 
2084 		if (cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk) == 0)
2085 			break;
2086 	}
2087 
2088 	return (vdp->xdf_status);
2089 }
2090 
2091 /*
2092  * callback func when DMA/GTE resources is available
2093  *
2094  * Note: we only register one callback function to grant table subsystem
2095  * since we only have one 'struct gnttab_free_callback' in xdf_t.
2096  */
2097 static int
2098 xdf_dmacallback(caddr_t arg)
2099 {
2100 	xdf_t *vdp = (xdf_t *)arg;
2101 	ASSERT(vdp != NULL);
2102 
2103 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
2104 	    ddi_get_name_addr(vdp->xdf_dip)));
2105 
2106 	ddi_trigger_softintr(vdp->xdf_softintr_id);
2107 	return (DDI_DMA_CALLBACK_DONE);
2108 }
2109 
2110 static uint_t
2111 xdf_iorestart(caddr_t arg)
2112 {
2113 	xdf_t *vdp = (xdf_t *)arg;
2114 
2115 	ASSERT(vdp != NULL);
2116 
2117 	mutex_enter(&vdp->xdf_dev_lk);
2118 	ASSERT(ISDMACBON(vdp));
2119 	SETDMACBOFF(vdp);
2120 	mutex_exit(&vdp->xdf_dev_lk);
2121 
2122 	xdf_iostart(vdp);
2123 
2124 	return (DDI_INTR_CLAIMED);
2125 }
2126 
2127 static void
2128 xdf_timeout_handler(void *arg)
2129 {
2130 	xdf_t *vdp = arg;
2131 
2132 	mutex_enter(&vdp->xdf_dev_lk);
2133 	vdp->xdf_timeout_id = 0;
2134 	mutex_exit(&vdp->xdf_dev_lk);
2135 
2136 	/* new timeout thread could be re-scheduled */
2137 	xdf_iostart(vdp);
2138 }
2139 
2140 /*
2141  * Alloc a vreq for this bp
2142  * bp->av_back contains the pointer to the vreq upon return
2143  */
2144 static v_req_t *
2145 vreq_get(xdf_t *vdp, buf_t *bp)
2146 {
2147 	v_req_t *vreq = NULL;
2148 
2149 	ASSERT(BP2VREQ(bp) == NULL);
2150 
2151 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
2152 	if (vreq == NULL) {
2153 		if (vdp->xdf_timeout_id == 0)
2154 			/* restart I/O after one second */
2155 			vdp->xdf_timeout_id =
2156 			    timeout(xdf_timeout_handler, vdp, hz);
2157 		return (NULL);
2158 	}
2159 	bzero(vreq, sizeof (v_req_t));
2160 
2161 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
2162 	bp->av_back = (buf_t *)vreq;
2163 	vreq->v_buf = bp;
2164 	vreq->v_status = VREQ_INIT;
2165 	/* init of other fields in vreq is up to the caller */
2166 
2167 	return (vreq);
2168 }
2169 
2170 static void
2171 vreq_free(xdf_t *vdp, v_req_t *vreq)
2172 {
2173 	buf_t *bp = vreq->v_buf;
2174 
2175 	list_remove(&vdp->xdf_vreq_act, (void *)vreq);
2176 
2177 	switch (vreq->v_status) {
2178 	case VREQ_DMAWIN_DONE:
2179 	case VREQ_GS_ALLOCED:
2180 	case VREQ_DMABUF_BOUND:
2181 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
2182 		/*FALLTHRU*/
2183 	case VREQ_DMAMEM_ALLOCED:
2184 		if (!ALIGNED_XFER(bp)) {
2185 			ASSERT(vreq->v_abuf != NULL);
2186 			if (!IS_ERROR(bp) && IS_READ(bp))
2187 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
2188 				    bp->b_bcount);
2189 			ddi_dma_mem_free(&vreq->v_align);
2190 		}
2191 		/*FALLTHRU*/
2192 	case VREQ_MEMDMAHDL_ALLOCED:
2193 		if (!ALIGNED_XFER(bp))
2194 			ddi_dma_free_handle(&vreq->v_memdmahdl);
2195 		/*FALLTHRU*/
2196 	case VREQ_DMAHDL_ALLOCED:
2197 		ddi_dma_free_handle(&vreq->v_dmahdl);
2198 		break;
2199 	default:
2200 		break;
2201 	}
2202 	vreq->v_buf->av_back = NULL;
2203 	kmem_cache_free(xdf_vreq_cache, vreq);
2204 }
2205 
2206 /*
2207  * Initalize the DMA and grant table resources for the buf
2208  */
2209 static int
2210 vreq_setup(xdf_t *vdp, v_req_t *vreq)
2211 {
2212 	int rc;
2213 	ddi_dma_attr_t dmaattr;
2214 	uint_t ndcs, ndws;
2215 	ddi_dma_handle_t dh;
2216 	ddi_dma_handle_t mdh;
2217 	ddi_dma_cookie_t dc;
2218 	ddi_acc_handle_t abh;
2219 	caddr_t	aba;
2220 	ge_slot_t *gs;
2221 	size_t bufsz;
2222 	off_t off;
2223 	size_t sz;
2224 	buf_t *bp = vreq->v_buf;
2225 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
2226 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
2227 
2228 	switch (vreq->v_status) {
2229 	case VREQ_INIT:
2230 		if (IS_FLUSH_DISKCACHE(bp)) {
2231 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2232 				DPRINTF(DMA_DBG, (
2233 				    "xdf@%s: get ge_slotfailed\n",
2234 				    ddi_get_name_addr(vdp->xdf_dip)));
2235 				return (DDI_FAILURE);
2236 			}
2237 			vreq->v_blkno = 0;
2238 			vreq->v_nslots = 1;
2239 			vreq->v_gs = gs;
2240 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
2241 			gs->vreq = vreq;
2242 			return (DDI_SUCCESS);
2243 		}
2244 
2245 		if (IS_WRITE_BARRIER(vdp, bp))
2246 			vreq->v_flush_diskcache = WRITE_BARRIER;
2247 		vreq->v_blkno = bp->b_blkno +
2248 		    (diskaddr_t)(uintptr_t)bp->b_private;
2249 		bp->b_private = NULL;
2250 		/* See if we wrote new data to our flush block */
2251 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
2252 			check_fbwrite(vdp, bp, vreq->v_blkno);
2253 		vreq->v_status = VREQ_INIT_DONE;
2254 		/*FALLTHRU*/
2255 
2256 	case VREQ_INIT_DONE:
2257 		/*
2258 		 * alloc DMA handle
2259 		 */
2260 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
2261 		    xdf_dmacallback, (caddr_t)vdp, &dh);
2262 		if (rc != DDI_SUCCESS) {
2263 			SETDMACBON(vdp);
2264 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
2265 			    ddi_get_name_addr(vdp->xdf_dip)));
2266 			return (DDI_FAILURE);
2267 		}
2268 
2269 		vreq->v_dmahdl = dh;
2270 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
2271 		/*FALLTHRU*/
2272 
2273 	case VREQ_DMAHDL_ALLOCED:
2274 		/*
2275 		 * alloc dma handle for 512-byte aligned buf
2276 		 */
2277 		if (!ALIGNED_XFER(bp)) {
2278 			/*
2279 			 * XXPV: we need to temporarily enlarge the seg
2280 			 * boundary and s/g length to work round CR6381968
2281 			 */
2282 			dmaattr = xb_dma_attr;
2283 			dmaattr.dma_attr_seg = (uint64_t)-1;
2284 			dmaattr.dma_attr_sgllen = INT_MAX;
2285 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
2286 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
2287 			if (rc != DDI_SUCCESS) {
2288 				SETDMACBON(vdp);
2289 				DPRINTF(DMA_DBG, ("xdf@%s: unaligned buf DMA"
2290 				    "handle alloc failed\n",
2291 				    ddi_get_name_addr(vdp->xdf_dip)));
2292 				return (DDI_FAILURE);
2293 			}
2294 			vreq->v_memdmahdl = mdh;
2295 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
2296 		}
2297 		/*FALLTHRU*/
2298 
2299 	case VREQ_MEMDMAHDL_ALLOCED:
2300 		/*
2301 		 * alloc 512-byte aligned buf
2302 		 */
2303 		if (!ALIGNED_XFER(bp)) {
2304 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
2305 				bp_mapin(bp);
2306 
2307 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
2308 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
2309 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
2310 			    &aba, &bufsz, &abh);
2311 			if (rc != DDI_SUCCESS) {
2312 				SETDMACBON(vdp);
2313 				DPRINTF(DMA_DBG, (
2314 				    "xdf@%s: DMA mem allocation failed\n",
2315 				    ddi_get_name_addr(vdp->xdf_dip)));
2316 				return (DDI_FAILURE);
2317 			}
2318 
2319 			vreq->v_abuf = aba;
2320 			vreq->v_align = abh;
2321 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
2322 
2323 			ASSERT(bufsz >= bp->b_bcount);
2324 			if (!IS_READ(bp))
2325 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
2326 				    bp->b_bcount);
2327 		}
2328 		/*FALLTHRU*/
2329 
2330 	case VREQ_DMAMEM_ALLOCED:
2331 		/*
2332 		 * dma bind
2333 		 */
2334 		if (ALIGNED_XFER(bp)) {
2335 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
2336 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
2337 			    &dc, &ndcs);
2338 		} else {
2339 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
2340 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
2341 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
2342 		}
2343 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
2344 			/* get num of dma windows */
2345 			if (rc == DDI_DMA_PARTIAL_MAP) {
2346 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
2347 				ASSERT(rc == DDI_SUCCESS);
2348 			} else {
2349 				ndws = 1;
2350 			}
2351 		} else {
2352 			SETDMACBON(vdp);
2353 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
2354 			    ddi_get_name_addr(vdp->xdf_dip)));
2355 			return (DDI_FAILURE);
2356 		}
2357 
2358 		vreq->v_dmac = dc;
2359 		vreq->v_dmaw = 0;
2360 		vreq->v_ndmacs = ndcs;
2361 		vreq->v_ndmaws = ndws;
2362 		vreq->v_nslots = ndws;
2363 		vreq->v_status = VREQ_DMABUF_BOUND;
2364 		/*FALLTHRU*/
2365 
2366 	case VREQ_DMABUF_BOUND:
2367 		/*
2368 		 * get ge_slot, callback is set upon failure from gs_get(),
2369 		 * if not set previously
2370 		 */
2371 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2372 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2373 			    ddi_get_name_addr(vdp->xdf_dip)));
2374 			return (DDI_FAILURE);
2375 		}
2376 
2377 		vreq->v_gs = gs;
2378 		gs->vreq = vreq;
2379 		vreq->v_status = VREQ_GS_ALLOCED;
2380 		break;
2381 
2382 	case VREQ_GS_ALLOCED:
2383 		/* nothing need to be done */
2384 		break;
2385 
2386 	case VREQ_DMAWIN_DONE:
2387 		/*
2388 		 * move to the next dma window
2389 		 */
2390 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
2391 
2392 		/* get a ge_slot for this DMA window */
2393 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2394 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2395 			    ddi_get_name_addr(vdp->xdf_dip)));
2396 			return (DDI_FAILURE);
2397 		}
2398 
2399 		vreq->v_gs = gs;
2400 		gs->vreq = vreq;
2401 		vreq->v_dmaw++;
2402 		rc = ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
2403 		    &vreq->v_dmac, &vreq->v_ndmacs);
2404 		ASSERT(rc == DDI_SUCCESS);
2405 		vreq->v_status = VREQ_GS_ALLOCED;
2406 		break;
2407 
2408 	default:
2409 		return (DDI_FAILURE);
2410 	}
2411 
2412 	return (DDI_SUCCESS);
2413 }
2414 
2415 static ge_slot_t *
2416 gs_get(xdf_t *vdp, int isread)
2417 {
2418 	grant_ref_t gh;
2419 	ge_slot_t *gs;
2420 
2421 	/* try to alloc GTEs needed in this slot, first */
2422 	if (gnttab_alloc_grant_references(
2423 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
2424 		if (vdp->xdf_gnt_callback.next == NULL) {
2425 			SETDMACBON(vdp);
2426 			gnttab_request_free_callback(
2427 			    &vdp->xdf_gnt_callback,
2428 			    (void (*)(void *))xdf_dmacallback,
2429 			    (void *)vdp,
2430 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
2431 		}
2432 		return (NULL);
2433 	}
2434 
2435 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
2436 	if (gs == NULL) {
2437 		gnttab_free_grant_references(gh);
2438 		if (vdp->xdf_timeout_id == 0)
2439 			/* restart I/O after one second */
2440 			vdp->xdf_timeout_id =
2441 			    timeout(xdf_timeout_handler, vdp, hz);
2442 		return (NULL);
2443 	}
2444 
2445 	/* init gs_slot */
2446 	list_insert_head(&vdp->xdf_gs_act, (void *)gs);
2447 	gs->oeid = vdp->xdf_peer;
2448 	gs->isread = isread;
2449 	gs->ghead = gh;
2450 	gs->ngrefs = 0;
2451 
2452 	return (gs);
2453 }
2454 
2455 static void
2456 gs_free(xdf_t *vdp, ge_slot_t *gs)
2457 {
2458 	int i;
2459 	grant_ref_t *gp = gs->ge;
2460 	int ngrefs = gs->ngrefs;
2461 	boolean_t isread = gs->isread;
2462 
2463 	list_remove(&vdp->xdf_gs_act, (void *)gs);
2464 
2465 	/* release all grant table entry resources used in this slot */
2466 	for (i = 0; i < ngrefs; i++, gp++)
2467 		gnttab_end_foreign_access(*gp, !isread, 0);
2468 	gnttab_free_grant_references(gs->ghead);
2469 
2470 	kmem_cache_free(xdf_gs_cache, (void *)gs);
2471 }
2472 
2473 static grant_ref_t
2474 gs_grant(ge_slot_t *gs, mfn_t mfn)
2475 {
2476 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->ghead);
2477 
2478 	ASSERT(gr != -1);
2479 	ASSERT(gs->ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
2480 	gs->ge[gs->ngrefs++] = gr;
2481 	gnttab_grant_foreign_access_ref(gr, gs->oeid, mfn, !gs->isread);
2482 
2483 	return (gr);
2484 }
2485 
2486 static void
2487 unexpectedie(xdf_t *vdp)
2488 {
2489 	/* clean up I/Os in ring that have responses */
2490 	if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
2491 		mutex_exit(&vdp->xdf_dev_lk);
2492 		(void) xdf_intr((caddr_t)vdp);
2493 		mutex_enter(&vdp->xdf_dev_lk);
2494 	}
2495 
2496 	/* free up all grant table entries */
2497 	while (!list_is_empty(&vdp->xdf_gs_act))
2498 		gs_free(vdp, list_head(&vdp->xdf_gs_act));
2499 
2500 	/*
2501 	 * move bp back to active list orderly
2502 	 * vreq_busy is updated in vreq_free()
2503 	 */
2504 	while (!list_is_empty(&vdp->xdf_vreq_act)) {
2505 		v_req_t *vreq = list_head(&vdp->xdf_vreq_act);
2506 		buf_t *bp = vreq->v_buf;
2507 
2508 		bp->av_back = NULL;
2509 		bp->b_resid = bp->b_bcount;
2510 		if (vdp->xdf_f_act == NULL) {
2511 			vdp->xdf_f_act = vdp->xdf_l_act = bp;
2512 		} else {
2513 			/* move to the head of list */
2514 			bp->av_forw = vdp->xdf_f_act;
2515 			vdp->xdf_f_act = bp;
2516 		}
2517 		kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
2518 		vreq_free(vdp, vreq);
2519 	}
2520 }
2521 
2522 static void
2523 xdfmin(struct buf *bp)
2524 {
2525 	if (bp->b_bcount > xdf_maxphys)
2526 		bp->b_bcount = xdf_maxphys;
2527 }
2528