xref: /titanic_50/usr/src/uts/common/xen/io/xdf.c (revision 4421e67684faea98cd9bffa503bdc3779557762f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * xdf.c - Xen Virtual Block Device Driver
29  * TODO:
30  *	- support alternate block size (currently only DEV_BSIZE supported)
31  *	- revalidate geometry for removable devices
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 #include "xdf.h"
37 
/*
 * Cache flush support.
 *
 * Both USE_WRITE_BARRIER() and USE_FLUSH_DISKCACHE() require
 * xdf_feature_barrier to be set; xdf_flush_supported selects which of
 * the two applies to a given device.
 */
#define	FLUSH_DISKCACHE	0x1
#define	WRITE_BARRIER	0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */

/* barrier feature set, flush not supported */
#define	USE_WRITE_BARRIER(vdp)				\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)

/* barrier feature set, flush supported */
#define	USE_FLUSH_DISKCACHE(vdp)			\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)

/*
 * True for a non-read buf, on a write-barrier device, whose data pointer
 * is the device's private flush block.
 */
#define	IS_WRITE_BARRIER(vdp, bp)			\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&	\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))

/*
 * True for a zero-length non-read buf on a flush-capable device.
 * NOTE: only `bp' is a macro parameter; `vdp' is taken from the scope in
 * which the macro is expanded, so a variable of that name must be visible.
 */
#define	IS_FLUSH_DISKCACHE(bp)				\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
50 
51 static void *vbd_ss;
52 static kmem_cache_t *xdf_vreq_cache;
53 static kmem_cache_t *xdf_gs_cache;
54 static int xdf_maxphys = XB_MAXPHYS;
55 int xdfdebug = 0;
56 extern int do_polled_io;
57 diskaddr_t xdf_flush_block = DEFAULT_FLUSH_BLOCK;
58 int	xdf_barrier_flush_disable = 0;
59 
60 /*
61  * dev_ops and cb_ops entrypoints
62  */
63 static int xdf_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
64 static int xdf_attach(dev_info_t *, ddi_attach_cmd_t);
65 static int xdf_detach(dev_info_t *, ddi_detach_cmd_t);
66 static int xdf_reset(dev_info_t *, ddi_reset_cmd_t);
67 static int xdf_open(dev_t *, int, int, cred_t *);
68 static int xdf_close(dev_t, int, int, struct cred *);
69 static int xdf_strategy(struct buf *);
70 static int xdf_read(dev_t, struct uio *, cred_t *);
71 static int xdf_aread(dev_t, struct aio_req *, cred_t *);
72 static int xdf_write(dev_t, struct uio *, cred_t *);
73 static int xdf_awrite(dev_t, struct aio_req *, cred_t *);
74 static int xdf_dump(dev_t, caddr_t, daddr_t, int);
75 static int xdf_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
76 static uint_t xdf_intr(caddr_t);
77 static int xdf_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
78     caddr_t, int *);
79 
80 /*
81  * misc private functions
82  */
83 static int xdf_suspend(dev_info_t *);
84 static int xdf_resume(dev_info_t *);
85 static int xdf_start_connect(xdf_t *);
86 static int xdf_start_disconnect(xdf_t *);
87 static int xdf_post_connect(xdf_t *);
88 static void xdf_post_disconnect(xdf_t *);
89 static void xdf_oe_change(dev_info_t *, ddi_eventcookie_t, void *, void *);
90 static void xdf_iostart(xdf_t *);
91 static void xdf_iofini(xdf_t *, uint64_t, int);
92 static int xdf_prepare_rreq(xdf_t *, struct buf *, blkif_request_t *);
93 static int xdf_drain_io(xdf_t *);
94 static boolean_t xdf_isopen(xdf_t *, int);
95 static int xdf_check_state_transition(xdf_t *, XenbusState);
96 static int xdf_connect(xdf_t *, boolean_t);
97 static int xdf_dmacallback(caddr_t);
98 static void xdf_timeout_handler(void *);
99 static uint_t xdf_iorestart(caddr_t);
100 static v_req_t *vreq_get(xdf_t *, buf_t *);
101 static void vreq_free(xdf_t *, v_req_t *);
102 static int vreq_setup(xdf_t *, v_req_t *);
103 static ge_slot_t *gs_get(xdf_t *, int);
104 static void gs_free(xdf_t *, ge_slot_t *);
105 static grant_ref_t gs_grant(ge_slot_t *, mfn_t);
106 static void unexpectedie(xdf_t *);
107 static void xdfmin(struct buf *);
108 
109 static 	struct cb_ops xdf_cbops = {
110 	xdf_open,
111 	xdf_close,
112 	xdf_strategy,
113 	nodev,
114 	xdf_dump,
115 	xdf_read,
116 	xdf_write,
117 	xdf_ioctl,
118 	nodev,
119 	nodev,
120 	nodev,
121 	nochpoll,
122 	xdf_prop_op,
123 	NULL,
124 	D_MP | D_NEW | D_64BIT,
125 	CB_REV,
126 	xdf_aread,
127 	xdf_awrite
128 };
129 
130 struct dev_ops xdf_devops = {
131 	DEVO_REV,		/* devo_rev */
132 	0,			/* devo_refcnt */
133 	xdf_getinfo,		/* devo_getinfo */
134 	nulldev,		/* devo_identify */
135 	nulldev,		/* devo_probe */
136 	xdf_attach,		/* devo_attach */
137 	xdf_detach,		/* devo_detach */
138 	xdf_reset,		/* devo_reset */
139 	&xdf_cbops,		/* devo_cb_ops */
140 	(struct bus_ops *)NULL	/* devo_bus_ops */
141 };
142 
143 static struct modldrv modldrv = {
144 	&mod_driverops,		/* Type of module.  This one is a driver */
145 	"virtual block driver %I%",	/* short description */
146 	&xdf_devops		/* driver specific ops */
147 };
148 
149 static struct modlinkage xdf_modlinkage = {
150 	MODREV_1, (void *)&modldrv, NULL
151 };
152 
153 /*
154  * I/O buffer DMA attributes
155  * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
156  */
157 static ddi_dma_attr_t xb_dma_attr = {
158 	DMA_ATTR_V0,
159 	(uint64_t)0,			/* lowest address */
160 	(uint64_t)0xffffffffffffffff,	/* highest usable address */
161 	(uint64_t)0xffffff,		/* DMA counter limit max */
162 	(uint64_t)XB_BSIZE,		/* alignment in bytes */
163 	XB_BSIZE - 1,			/* bitmap of burst sizes */
164 	XB_BSIZE,			/* min transfer */
165 	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
166 	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
167 	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
168 	XB_BSIZE,			/* granularity */
169 	0,				/* flags (reserved) */
170 };
171 
172 static ddi_device_acc_attr_t xc_acc_attr = {
173 	DDI_DEVICE_ATTR_V0,
174 	DDI_NEVERSWAP_ACC,
175 	DDI_STRICTORDER_ACC
176 };
177 
178 /* callbacks from commmon label */
179 
180 static int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
181 	void *);
182 static int xdf_lb_getinfo(dev_info_t *, int, void *, void *);
183 
184 static cmlb_tg_ops_t xdf_lb_ops = {
185 	TG_DK_OPS_VERSION_1,
186 	xdf_lb_rdwr,
187 	xdf_lb_getinfo
188 };
189 
190 int
191 _init(void)
192 {
193 	int rc;
194 
195 	if ((rc = ddi_soft_state_init(&vbd_ss, sizeof (xdf_t), 0)) == 0) {
196 		xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
197 		    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
198 		ASSERT(xdf_vreq_cache != NULL);
199 		xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
200 		    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
201 		ASSERT(xdf_gs_cache != NULL);
202 		if ((rc = mod_install(&xdf_modlinkage)) != 0) {
203 			kmem_cache_destroy(xdf_vreq_cache);
204 			kmem_cache_destroy(xdf_gs_cache);
205 			ddi_soft_state_fini(&vbd_ss);
206 		}
207 	}
208 
209 	return (rc);
210 }
211 
212 int
213 _fini(void)
214 {
215 	int err;
216 
217 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
218 		return (err);
219 
220 	kmem_cache_destroy(xdf_vreq_cache);
221 	kmem_cache_destroy(xdf_gs_cache);
222 	ddi_soft_state_fini(&vbd_ss);
223 
224 	return (0);
225 }
226 
227 int
228 _info(struct modinfo *modinfop)
229 {
230 	return (mod_info(&xdf_modlinkage, modinfop));
231 }
232 
233 /*ARGSUSED*/
234 static int
235 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
236 {
237 	int instance;
238 	xdf_t *vbdp;
239 
240 	instance = XDF_INST(getminor((dev_t)arg));
241 
242 	switch (cmd) {
243 	case DDI_INFO_DEVT2DEVINFO:
244 		if ((vbdp = ddi_get_soft_state(vbd_ss, instance)) == NULL) {
245 			*rp = NULL;
246 			return (DDI_FAILURE);
247 		}
248 		*rp = vbdp->xdf_dip;
249 		return (DDI_SUCCESS);
250 
251 	case DDI_INFO_DEVT2INSTANCE:
252 		*rp = (void *)(uintptr_t)instance;
253 		return (DDI_SUCCESS);
254 
255 	default:
256 		return (DDI_FAILURE);
257 	}
258 }
259 
260 static int
261 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
262 	char *name, caddr_t valuep, int *lengthp)
263 {
264 	int instance = ddi_get_instance(dip);
265 	xdf_t *vdp;
266 	diskaddr_t p_blkcnt;
267 
268 	/*
269 	 * xdf dynamic properties are device specific and size oriented.
270 	 * Requests issued under conditions where size is valid are passed
271 	 * to ddi_prop_op_nblocks with the size information, otherwise the
272 	 * request is passed to ddi_prop_op.
273 	 */
274 	vdp = ddi_get_soft_state(vbd_ss, instance);
275 
276 	if ((dev == DDI_DEV_T_ANY) || (vdp == NULL))
277 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
278 		    name, valuep, lengthp));
279 
280 	/* do cv_wait until connected or failed */
281 	mutex_enter(&vdp->xdf_dev_lk);
282 	if (xdf_connect(vdp, B_TRUE) != XD_READY) {
283 		mutex_exit(&vdp->xdf_dev_lk);
284 		goto out;
285 	}
286 	mutex_exit(&vdp->xdf_dev_lk);
287 
288 	if (cmlb_partinfo(vdp->xdf_vd_lbl, XDF_PART(getminor(dev)), &p_blkcnt,
289 	    NULL, NULL, NULL, NULL) == 0)
290 		return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
291 		    name, valuep, lengthp, (uint64_t)p_blkcnt));
292 
293 out:
294 	return (ddi_prop_op(dev, dip, prop_op, mod_flags, name, valuep,
295 	    lengthp));
296 }
297 
298 static int
299 xdf_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
300 {
301 	xdf_t *vdp;
302 	ddi_iblock_cookie_t ibc;
303 	ddi_iblock_cookie_t softibc;
304 	int instance;
305 
306 	xdfdebug = ddi_prop_get_int(DDI_DEV_T_ANY, devi, DDI_PROP_NOTPROM,
307 	    "xdfdebug", 0);
308 
309 	switch (cmd) {
310 		case DDI_ATTACH:
311 			break;
312 
313 		case DDI_RESUME:
314 			return (xdf_resume(devi));
315 
316 		default:
317 			return (DDI_FAILURE);
318 	}
319 
320 	instance = ddi_get_instance(devi);
321 	if (ddi_soft_state_zalloc(vbd_ss, instance) != DDI_SUCCESS)
322 		return (DDI_FAILURE);
323 
324 	DPRINTF(DDI_DBG, ("xdf%d: attaching\n", instance));
325 	vdp = ddi_get_soft_state(vbd_ss, instance);
326 	vdp->xdf_dip = devi;
327 	if (ddi_get_iblock_cookie(devi, 0, &ibc) != DDI_SUCCESS) {
328 		cmn_err(CE_WARN, "xdf@%s: failed to get iblock cookie",
329 		    ddi_get_name_addr(devi));
330 		goto errout1;
331 	}
332 
333 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
334 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
335 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
336 	ddi_set_driver_private(devi, vdp);
337 
338 	if (ddi_get_soft_iblock_cookie(devi, DDI_SOFTINT_LOW, &softibc)
339 	    != DDI_SUCCESS) {
340 		cmn_err(CE_WARN, "xdf@%s: failed to get softintr iblock cookie",
341 		    ddi_get_name_addr(devi));
342 		goto errout2;
343 	}
344 	if (ddi_add_softintr(devi, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
345 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
346 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
347 		    ddi_get_name_addr(devi));
348 		goto errout2;
349 	}
350 
351 	/*
352 	 * create kstat for iostat(1M)
353 	 */
354 	if ((vdp->xdf_xdev_iostat = kstat_create("xdf", instance, NULL, "disk",
355 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
356 		vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
357 		kstat_install(vdp->xdf_xdev_iostat);
358 	} else {
359 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
360 		    ddi_get_name_addr(devi));
361 		goto errout3;
362 	}
363 
364 	/*
365 	 * driver handles kernel-issued IOCTLs
366 	 */
367 	if (ddi_prop_create(DDI_DEV_T_NONE, devi, DDI_PROP_CANSLEEP,
368 	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
369 		cmn_err(CE_WARN, "xdf@%s: cannot create DDI_KERNEL_IOCTL prop",
370 		    ddi_get_name_addr(devi));
371 		goto errout4;
372 	}
373 
374 	/*
375 	 * create default device minor nodes: non-removable disk
376 	 * we will adjust minor nodes after we are connected w/ backend
377 	 */
378 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
379 	if (cmlb_attach(devi, &xdf_lb_ops, DTYPE_DIRECT, 0, 1, DDI_NT_BLOCK,
380 	    CMLB_FAKE_LABEL_ONE_PARTITION, vdp->xdf_vd_lbl, NULL) != 0) {
381 		cmn_err(CE_WARN, "xdf@%s: default cmlb attach failed",
382 		    ddi_get_name_addr(devi));
383 		goto errout5;
384 	}
385 
386 	/*
387 	 * We ship with cache-enabled disks
388 	 */
389 	vdp->xdf_wce = 1;
390 
391 	mutex_enter(&vdp->xdf_cb_lk);
392 
393 	/* Watch backend XenbusState change */
394 	if (xvdi_add_event_handler(devi, XS_OE_STATE,
395 	    xdf_oe_change) != DDI_SUCCESS) {
396 		mutex_exit(&vdp->xdf_cb_lk);
397 		goto errout6;
398 	}
399 
400 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
401 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
402 		    ddi_get_name_addr(devi));
403 		(void) xdf_start_disconnect(vdp);
404 		mutex_exit(&vdp->xdf_cb_lk);
405 		goto errout7;
406 	}
407 
408 	mutex_exit(&vdp->xdf_cb_lk);
409 
410 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
411 	    offsetof(v_req_t, v_link));
412 	list_create(&vdp->xdf_gs_act, sizeof (ge_slot_t),
413 	    offsetof(ge_slot_t, link));
414 
415 	ddi_report_dev(devi);
416 	DPRINTF(DDI_DBG, ("xdf%d: attached\n", instance));
417 
418 	return (DDI_SUCCESS);
419 
420 errout7:
421 	xvdi_remove_event_handler(devi, XS_OE_STATE);
422 errout6:
423 	cmlb_detach(vdp->xdf_vd_lbl, NULL);
424 errout5:
425 	cmlb_free_handle(&vdp->xdf_vd_lbl);
426 	ddi_prop_remove_all(devi);
427 errout4:
428 	kstat_delete(vdp->xdf_xdev_iostat);
429 errout3:
430 	ddi_remove_softintr(vdp->xdf_softintr_id);
431 errout2:
432 	ddi_set_driver_private(devi, NULL);
433 	cv_destroy(&vdp->xdf_dev_cv);
434 	mutex_destroy(&vdp->xdf_cb_lk);
435 	mutex_destroy(&vdp->xdf_dev_lk);
436 errout1:
437 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(devi));
438 	ddi_soft_state_free(vbd_ss, instance);
439 	return (DDI_FAILURE);
440 }
441 
442 static int
443 xdf_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
444 {
445 	xdf_t *vdp;
446 	int instance;
447 
448 	switch (cmd) {
449 
450 	case DDI_PM_SUSPEND:
451 		break;
452 
453 	case DDI_SUSPEND:
454 		return (xdf_suspend(devi));
455 
456 	case DDI_DETACH:
457 		break;
458 
459 	default:
460 		return (DDI_FAILURE);
461 	}
462 
463 	instance = ddi_get_instance(devi);
464 	DPRINTF(DDI_DBG, ("xdf%d: detaching\n", instance));
465 	vdp = ddi_get_soft_state(vbd_ss, instance);
466 
467 	if (vdp == NULL)
468 		return (DDI_FAILURE);
469 
470 	mutex_enter(&vdp->xdf_dev_lk);
471 	if (xdf_isopen(vdp, -1)) {
472 		mutex_exit(&vdp->xdf_dev_lk);
473 		return (DDI_FAILURE);
474 	}
475 
476 	if (vdp->xdf_status != XD_CLOSED) {
477 		mutex_exit(&vdp->xdf_dev_lk);
478 		return (DDI_FAILURE);
479 	}
480 
481 	ASSERT(!ISDMACBON(vdp));
482 	mutex_exit(&vdp->xdf_dev_lk);
483 
484 	if (vdp->xdf_timeout_id != 0)
485 		(void) untimeout(vdp->xdf_timeout_id);
486 
487 	xvdi_remove_event_handler(devi, XS_OE_STATE);
488 
489 	/* we'll support backend running in domU later */
490 #ifdef	DOMU_BACKEND
491 	(void) xvdi_post_event(devi, XEN_HP_REMOVE);
492 #endif
493 
494 	list_destroy(&vdp->xdf_vreq_act);
495 	list_destroy(&vdp->xdf_gs_act);
496 	ddi_prop_remove_all(devi);
497 	kstat_delete(vdp->xdf_xdev_iostat);
498 	ddi_remove_softintr(vdp->xdf_softintr_id);
499 	ddi_set_driver_private(devi, NULL);
500 	cv_destroy(&vdp->xdf_dev_cv);
501 	mutex_destroy(&vdp->xdf_cb_lk);
502 	mutex_destroy(&vdp->xdf_dev_lk);
503 	if (vdp->xdf_cache_flush_block != NULL)
504 		kmem_free(vdp->xdf_flush_mem, 2 * DEV_BSIZE);
505 	ddi_soft_state_free(vbd_ss, instance);
506 	return (DDI_SUCCESS);
507 }
508 
509 static int
510 xdf_suspend(dev_info_t *devi)
511 {
512 	xdf_t *vdp;
513 	int instance;
514 	enum xdf_state st;
515 
516 	instance = ddi_get_instance(devi);
517 
518 	if (xdfdebug & SUSRES_DBG)
519 		xen_printf("xdf_suspend: xdf#%d\n", instance);
520 
521 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
522 		return (DDI_FAILURE);
523 
524 	xvdi_suspend(devi);
525 
526 	mutex_enter(&vdp->xdf_cb_lk);
527 	mutex_enter(&vdp->xdf_dev_lk);
528 	st = vdp->xdf_status;
529 	/* change status to stop further I/O requests */
530 	if (st == XD_READY)
531 		vdp->xdf_status = XD_SUSPEND;
532 	mutex_exit(&vdp->xdf_dev_lk);
533 	mutex_exit(&vdp->xdf_cb_lk);
534 
535 	/* make sure no more I/O responses left in the ring buffer */
536 	if ((st == XD_INIT) || (st == XD_READY)) {
537 		(void) ddi_remove_intr(devi, 0, NULL);
538 		(void) xdf_drain_io(vdp);
539 		/*
540 		 * no need to teardown the ring buffer here
541 		 * it will be simply re-init'ed during resume when
542 		 * we call xvdi_alloc_ring
543 		 */
544 	}
545 
546 	if (xdfdebug & SUSRES_DBG)
547 		xen_printf("xdf_suspend: SUCCESS\n");
548 
549 	return (DDI_SUCCESS);
550 }
551 
552 /*ARGSUSED*/
553 static int
554 xdf_resume(dev_info_t *devi)
555 {
556 	xdf_t *vdp;
557 	int instance;
558 
559 	instance = ddi_get_instance(devi);
560 	if (xdfdebug & SUSRES_DBG)
561 		xen_printf("xdf_resume: xdf%d\n", instance);
562 
563 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
564 		return (DDI_FAILURE);
565 
566 	mutex_enter(&vdp->xdf_cb_lk);
567 
568 	if (xvdi_resume(devi) != DDI_SUCCESS) {
569 		mutex_exit(&vdp->xdf_cb_lk);
570 		return (DDI_FAILURE);
571 	}
572 
573 	mutex_enter(&vdp->xdf_dev_lk);
574 	ASSERT(vdp->xdf_status != XD_READY);
575 	vdp->xdf_status = XD_UNKNOWN;
576 	mutex_exit(&vdp->xdf_dev_lk);
577 
578 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
579 		mutex_exit(&vdp->xdf_cb_lk);
580 		return (DDI_FAILURE);
581 	}
582 
583 	mutex_exit(&vdp->xdf_cb_lk);
584 
585 	if (xdfdebug & SUSRES_DBG)
586 		xen_printf("xdf_resume: done\n");
587 	return (DDI_SUCCESS);
588 }
589 
590 /*ARGSUSED*/
591 static int
592 xdf_reset(dev_info_t *devi, ddi_reset_cmd_t cmd)
593 {
594 	xdf_t *vdp;
595 	int instance;
596 
597 	instance = ddi_get_instance(devi);
598 	DPRINTF(DDI_DBG, ("xdf%d: resetting\n", instance));
599 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
600 		return (DDI_FAILURE);
601 
602 	/*
603 	 * wait for any outstanding I/O to complete
604 	 */
605 	(void) xdf_drain_io(vdp);
606 
607 	DPRINTF(DDI_DBG, ("xdf%d: reset complete\n", instance));
608 	return (DDI_SUCCESS);
609 }
610 
611 static int
612 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
613 {
614 	minor_t	minor;
615 	xdf_t	*vdp;
616 	int part;
617 	ulong_t parbit;
618 	diskaddr_t p_blkct = 0;
619 	boolean_t firstopen;
620 	boolean_t nodelay;
621 
622 	nodelay = (flag & (FNDELAY | FNONBLOCK));
623 	minor = getminor(*devp);
624 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
625 		return (ENXIO);
626 
627 	DPRINTF(DDI_DBG, ("xdf%d: opening\n", XDF_INST(minor)));
628 
629 	/* do cv_wait until connected or failed */
630 	mutex_enter(&vdp->xdf_dev_lk);
631 	if (!nodelay && (xdf_connect(vdp, B_TRUE) != XD_READY)) {
632 		mutex_exit(&vdp->xdf_dev_lk);
633 		return (ENXIO);
634 	}
635 
636 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
637 		mutex_exit(&vdp->xdf_dev_lk);
638 		return (EROFS);
639 	}
640 
641 	part = XDF_PART(minor);
642 	parbit = 1 << part;
643 	if (vdp->xdf_vd_exclopen & parbit) {
644 		mutex_exit(&vdp->xdf_dev_lk);
645 		return (EBUSY);
646 	}
647 
648 	/* are we the first one to open this node? */
649 	firstopen = !xdf_isopen(vdp, -1);
650 
651 	if ((flag & FEXCL) && !firstopen) {
652 		mutex_exit(&vdp->xdf_dev_lk);
653 		return (EBUSY);
654 	}
655 
656 	if (otyp == OTYP_LYR)
657 		vdp->xdf_vd_lyropen[part]++;
658 
659 	vdp->xdf_vd_open[otyp] |= parbit;
660 
661 	if (flag & FEXCL)
662 		vdp->xdf_vd_exclopen |= parbit;
663 
664 	mutex_exit(&vdp->xdf_dev_lk);
665 
666 	/* force a re-validation */
667 	if (firstopen)
668 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
669 
670 	/*
671 	 * check size
672 	 * ignore CD/DVD which contains a zero-sized s0
673 	 */
674 	if (!nodelay && !XD_IS_CD(vdp) &&
675 	    ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
676 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0))) {
677 		(void) xdf_close(*devp, flag, otyp, credp);
678 		return (ENXIO);
679 	}
680 
681 	return (0);
682 }
683 
684 /*ARGSUSED*/
685 static int
686 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
687 {
688 	minor_t	minor;
689 	xdf_t	*vdp;
690 	int part;
691 	ulong_t parbit;
692 
693 	minor = getminor(dev);
694 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
695 		return (ENXIO);
696 
697 	mutex_enter(&vdp->xdf_dev_lk);
698 	part = XDF_PART(minor);
699 	if (!xdf_isopen(vdp, part)) {
700 		mutex_exit(&vdp->xdf_dev_lk);
701 		return (ENXIO);
702 	}
703 	parbit = 1 << part;
704 
705 	if (otyp == OTYP_LYR) {
706 		if (vdp->xdf_vd_lyropen[part] != 0)
707 			vdp->xdf_vd_lyropen[part]--;
708 		if (vdp->xdf_vd_lyropen[part] == 0)
709 			vdp->xdf_vd_open[OTYP_LYR] &= ~parbit;
710 	} else {
711 		vdp->xdf_vd_open[otyp] &= ~parbit;
712 	}
713 	vdp->xdf_vd_exclopen &= ~parbit;
714 
715 	mutex_exit(&vdp->xdf_dev_lk);
716 	return (0);
717 }
718 
719 static int
720 xdf_strategy(struct buf *bp)
721 {
722 	xdf_t	*vdp;
723 	minor_t minor;
724 	diskaddr_t p_blkct, p_blkst;
725 	ulong_t nblks;
726 	int part;
727 
728 	minor = getminor(bp->b_edev);
729 	part = XDF_PART(minor);
730 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) ||
731 	    !xdf_isopen(vdp, part) ||
732 	    cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
733 	    &p_blkst, NULL, NULL, NULL)) {
734 		bioerror(bp, ENXIO);
735 		bp->b_resid = bp->b_bcount;
736 		biodone(bp);
737 		return (0);
738 	}
739 
740 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
741 		bioerror(bp, EROFS);
742 		bp->b_resid = bp->b_bcount;
743 		biodone(bp);
744 		return (0);
745 	}
746 
747 	/*
748 	 * starting beyond partition
749 	 */
750 	if (bp->b_blkno > p_blkct) {
751 		DPRINTF(IO_DBG, ("xdf: block %lld exceeds VBD size %"PRIu64,
752 		    (longlong_t)bp->b_blkno, (uint64_t)p_blkct));
753 		bioerror(bp, EINVAL);
754 		bp->b_resid = bp->b_bcount;
755 		biodone(bp);
756 		return (0);
757 	}
758 
759 	/* Legacy: don't set error flag at this case */
760 	if (bp->b_blkno == p_blkct) {
761 		bp->b_resid = bp->b_bcount;
762 		biodone(bp);
763 		return (0);
764 	}
765 
766 	/*
767 	 * adjust for partial transfer
768 	 */
769 	nblks = bp->b_bcount >> XB_BSHIFT;
770 	if ((bp->b_blkno + nblks) > p_blkct) {
771 		bp->b_resid = ((bp->b_blkno + nblks) - p_blkct) << XB_BSHIFT;
772 		bp->b_bcount -= bp->b_resid;
773 	}
774 
775 
776 	DPRINTF(IO_DBG, ("xdf: strategy blk %lld len %lu\n",
777 	    (longlong_t)bp->b_blkno, (ulong_t)bp->b_bcount));
778 
779 	mutex_enter(&vdp->xdf_dev_lk);
780 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
781 	if (vdp->xdf_f_act == NULL) {
782 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
783 	} else {
784 		vdp->xdf_l_act->av_forw = bp;
785 		vdp->xdf_l_act = bp;
786 	}
787 	bp->av_forw = NULL;
788 	bp->av_back = NULL; /* not tagged with a v_req */
789 	bp->b_private = (void *)(uintptr_t)p_blkst;
790 	mutex_exit(&vdp->xdf_dev_lk);
791 	xdf_iostart(vdp);
792 	if (do_polled_io)
793 		(void) xdf_drain_io(vdp);
794 	return (0);
795 }
796 
797 /*ARGSUSED*/
798 static int
799 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
800 {
801 
802 	xdf_t	*vdp;
803 	minor_t minor;
804 	diskaddr_t p_blkcnt;
805 	int part;
806 
807 	minor = getminor(dev);
808 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
809 		return (ENXIO);
810 
811 	DPRINTF(IO_DBG, ("xdf: read offset 0x%"PRIx64"\n",
812 	    (int64_t)uiop->uio_offset));
813 
814 	part = XDF_PART(minor);
815 	if (!xdf_isopen(vdp, part))
816 		return (ENXIO);
817 
818 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
819 	    NULL, NULL, NULL, NULL))
820 		return (ENXIO);
821 
822 	if (U_INVAL(uiop))
823 		return (EINVAL);
824 
825 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
826 }
827 
828 /*ARGSUSED*/
829 static int
830 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
831 {
832 	xdf_t *vdp;
833 	minor_t minor;
834 	diskaddr_t p_blkcnt;
835 	int part;
836 
837 	minor = getminor(dev);
838 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
839 		return (ENXIO);
840 
841 	DPRINTF(IO_DBG, ("xdf: write offset 0x%"PRIx64"\n",
842 	    (int64_t)uiop->uio_offset));
843 
844 	part = XDF_PART(minor);
845 	if (!xdf_isopen(vdp, part))
846 		return (ENXIO);
847 
848 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
849 	    NULL, NULL, NULL, NULL))
850 		return (ENXIO);
851 
852 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
853 		return (ENOSPC);
854 
855 	if (U_INVAL(uiop))
856 		return (EINVAL);
857 
858 	return (physio(xdf_strategy, NULL, dev, B_WRITE, minphys, uiop));
859 }
860 
861 /*ARGSUSED*/
862 static int
863 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
864 {
865 	xdf_t	*vdp;
866 	minor_t minor;
867 	struct uio *uiop = aiop->aio_uio;
868 	diskaddr_t p_blkcnt;
869 	int part;
870 
871 	minor = getminor(dev);
872 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
873 		return (ENXIO);
874 
875 	part = XDF_PART(minor);
876 	if (!xdf_isopen(vdp, part))
877 		return (ENXIO);
878 
879 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
880 	    NULL, NULL, NULL, NULL))
881 		return (ENXIO);
882 
883 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
884 		return (ENOSPC);
885 
886 	if (U_INVAL(uiop))
887 		return (EINVAL);
888 
889 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, minphys, aiop));
890 }
891 
892 /*ARGSUSED*/
893 static int
894 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
895 {
896 	xdf_t *vdp;
897 	minor_t minor;
898 	struct uio *uiop = aiop->aio_uio;
899 	diskaddr_t p_blkcnt;
900 	int part;
901 
902 	minor = getminor(dev);
903 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
904 		return (ENXIO);
905 
906 	part = XDF_PART(minor);
907 	if (!xdf_isopen(vdp, part))
908 		return (ENXIO);
909 
910 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
911 	    NULL, NULL, NULL, NULL))
912 		return (ENXIO);
913 
914 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
915 		return (ENOSPC);
916 
917 	if (U_INVAL(uiop))
918 		return (EINVAL);
919 
920 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, minphys, aiop));
921 }
922 
923 static int
924 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
925 {
926 	struct buf dumpbuf, *dbp;
927 	xdf_t	*vdp;
928 	minor_t minor;
929 	int err = 0;
930 	int part;
931 	diskaddr_t p_blkcnt, p_blkst;
932 
933 	minor = getminor(dev);
934 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
935 		return (ENXIO);
936 
937 	DPRINTF(IO_DBG, ("xdf: dump addr (0x%p) blk (%ld) nblks (%d)\n",
938 	    addr, blkno, nblk));
939 
940 	part = XDF_PART(minor);
941 	if (!xdf_isopen(vdp, part))
942 		return (ENXIO);
943 
944 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
945 	    NULL, NULL, NULL))
946 		return (ENXIO);
947 
948 	if ((blkno + nblk) > p_blkcnt) {
949 		cmn_err(CE_WARN, "xdf: block %ld exceeds VBD size %"PRIu64,
950 		    blkno + nblk, (uint64_t)vdp->xdf_xdev_nblocks);
951 		return (EINVAL);
952 	}
953 
954 	dbp = &dumpbuf;
955 	bioinit(dbp);
956 	dbp->b_flags = B_BUSY;
957 	dbp->b_un.b_addr = addr;
958 	dbp->b_bcount	= nblk << DEV_BSHIFT;
959 	dbp->b_resid = 0;
960 	dbp->b_blkno = blkno;
961 	dbp->b_edev = dev;
962 	dbp->b_private = (void *)(uintptr_t)p_blkst;
963 
964 	mutex_enter(&vdp->xdf_dev_lk);
965 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
966 	if (vdp->xdf_f_act == NULL) {
967 		vdp->xdf_f_act = vdp->xdf_l_act = dbp;
968 	} else {
969 		vdp->xdf_l_act->av_forw = dbp;
970 		vdp->xdf_l_act = dbp;
971 	}
972 	dbp->av_forw = NULL;
973 	dbp->av_back = NULL;
974 	mutex_exit(&vdp->xdf_dev_lk);
975 	xdf_iostart(vdp);
976 	err = xdf_drain_io(vdp);
977 	biofini(dbp);
978 	return (err);
979 }
980 
981 /*ARGSUSED*/
982 static int
983 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
984     int *rvalp)
985 {
986 	int instance;
987 	xdf_t	*vdp;
988 	minor_t minor;
989 	int part;
990 
991 	minor = getminor(dev);
992 	instance = XDF_INST(minor);
993 
994 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
995 		return (ENXIO);
996 
997 	DPRINTF(IOCTL_DBG, ("xdf%d:ioctl: cmd %d (0x%x)\n",
998 	    instance, cmd, cmd));
999 
1000 	part = XDF_PART(minor);
1001 	if (!xdf_isopen(vdp, part))
1002 		return (ENXIO);
1003 
1004 	switch (cmd) {
1005 	case DKIOCGMEDIAINFO: {
1006 		struct dk_minfo	media_info;
1007 
1008 		media_info.dki_lbsize = DEV_BSIZE;
1009 		media_info.dki_capacity = vdp->xdf_xdev_nblocks;
1010 		media_info.dki_media_type = DK_FIXED_DISK;
1011 
1012 		if (ddi_copyout(&media_info, (void *)arg,
1013 		    sizeof (struct dk_minfo), mode)) {
1014 			return (EFAULT);
1015 		} else {
1016 			return (0);
1017 		}
1018 	}
1019 
1020 	case DKIOCINFO: {
1021 		struct dk_cinfo info;
1022 
1023 		/* controller information */
1024 		if (XD_IS_CD(vdp))
1025 			info.dki_ctype = DKC_CDROM;
1026 		else
1027 			info.dki_ctype = DKC_VBD;
1028 
1029 		info.dki_cnum = 0;
1030 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
1031 
1032 		/* unit information */
1033 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
1034 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
1035 		info.dki_flags = DKI_FMTVOL;
1036 		info.dki_partition = part;
1037 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
1038 		info.dki_addr = 0;
1039 		info.dki_space = 0;
1040 		info.dki_prio = 0;
1041 		info.dki_vec = 0;
1042 
1043 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
1044 			return (EFAULT);
1045 		else
1046 			return (0);
1047 	}
1048 
1049 	case DKIOCSTATE: {
1050 		enum dkio_state	dkstate = DKIO_INSERTED;
1051 		if (ddi_copyout(&dkstate, (void *)arg, sizeof (dkstate),
1052 		    mode) != 0)
1053 			return (EFAULT);
1054 		return (0);
1055 	}
1056 
1057 	/*
1058 	 * is media removable?
1059 	 */
1060 	case DKIOCREMOVABLE: {
1061 		int i = XD_IS_RM(vdp) ? 1 : 0;
1062 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), mode))
1063 			return (EFAULT);
1064 		return (0);
1065 	}
1066 
1067 	case DKIOCG_PHYGEOM:
1068 	case DKIOCG_VIRTGEOM:
1069 	case DKIOCGGEOM:
1070 	case DKIOCSGEOM:
1071 	case DKIOCGAPART:
1072 	case DKIOCGVTOC:
1073 	case DKIOCSVTOC:
1074 	case DKIOCPARTINFO:
1075 	case DKIOCGETEFI:
1076 	case DKIOCSETEFI:
1077 	case DKIOCPARTITION: {
1078 		int rc;
1079 
1080 		rc = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
1081 		    rvalp, NULL);
1082 		return (rc);
1083 	}
1084 
1085 	case DKIOCGETWCE:
1086 		if (ddi_copyout(&vdp->xdf_wce, (void *)arg,
1087 		    sizeof (vdp->xdf_wce), mode))
1088 			return (EFAULT);
1089 		return (0);
1090 	case DKIOCSETWCE:
1091 		if (ddi_copyin((void *)arg, &vdp->xdf_wce,
1092 		    sizeof (vdp->xdf_wce), mode))
1093 			return (EFAULT);
1094 		return (0);
1095 	case DKIOCFLUSHWRITECACHE: {
1096 		int rc;
1097 		struct dk_callback *dkc = (struct dk_callback *)arg;
1098 
1099 		if (vdp->xdf_flush_supported) {
1100 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1101 			    NULL, 0, 0, (void *)dev);
1102 		} else {
1103 			if (xdf_barrier_flush_disable)
1104 				return (ENOTTY);
1105 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1106 			    vdp->xdf_cache_flush_block, xdf_flush_block,
1107 			    DEV_BSIZE, (void *)dev);
1108 		}
1109 		if ((mode & FKIOCTL) && (dkc != NULL) &&
1110 		    (dkc->dkc_callback != NULL)) {
1111 			(*dkc->dkc_callback)(dkc->dkc_cookie, rc);
1112 			/* need to return 0 after calling callback */
1113 			rc = 0;
1114 		}
1115 		return (rc);
1116 	}
1117 
1118 	default:
1119 		return (ENOTTY);
1120 	}
1121 }
1122 
1123 /*
1124  * xdf interrupt handler
1125  */
1126 static uint_t
1127 xdf_intr(caddr_t arg)
1128 {
1129 	xdf_t *vdp = (xdf_t *)arg;
1130 	xendev_ring_t *xbr;
1131 	blkif_response_t *resp;
1132 	int bioerr = 0;
1133 	uint64_t id;
1134 	extern int do_polled_io;
1135 	uint8_t op;
1136 	uint16_t status;
1137 	ddi_acc_handle_t acchdl;
1138 
1139 	mutex_enter(&vdp->xdf_dev_lk);
1140 
1141 	if ((xbr = vdp->xdf_xb_ring) == NULL) {
1142 		mutex_exit(&vdp->xdf_dev_lk);
1143 		return (DDI_INTR_UNCLAIMED);
1144 	}
1145 
1146 	acchdl = vdp->xdf_xb_ring_hdl;
1147 
1148 	/*
1149 	 * complete all requests which have a response
1150 	 */
1151 	while (resp = xvdi_ring_get_response(xbr)) {
1152 		id = ddi_get64(acchdl, &resp->id);
1153 		op = ddi_get8(acchdl, &resp->operation);
1154 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
1155 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
1156 		    op, id, status));
1157 
1158 		/*
1159 		 * XXPV - close connection to the backend and restart
1160 		 */
1161 		if (status != BLKIF_RSP_OKAY) {
1162 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
1163 			    ddi_get_name_addr(vdp->xdf_dip),
1164 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
1165 			bioerr = EIO;
1166 		}
1167 
1168 		xdf_iofini(vdp, id, bioerr);
1169 	}
1170 
1171 	mutex_exit(&vdp->xdf_dev_lk);
1172 
1173 	if (!do_polled_io)
1174 		xdf_iostart(vdp);
1175 
1176 	return (DDI_INTR_CLAIMED);
1177 }
1178 
1179 int xdf_fbrewrites;	/* how many times was our flush block rewritten */
1180 
1181 /*
1182  * Snarf new data if our flush block was re-written
1183  */
1184 static void
1185 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
1186 {
1187 	int nblks;
1188 	boolean_t mapin;
1189 
1190 	if (IS_WRITE_BARRIER(vdp, bp))
1191 		return; /* write was a flush write */
1192 
1193 	mapin = B_FALSE;
1194 	nblks = bp->b_bcount >> DEV_BSHIFT;
1195 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
1196 		xdf_fbrewrites++;
1197 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
1198 			mapin = B_TRUE;
1199 			bp_mapin(bp);
1200 		}
1201 		bcopy(bp->b_un.b_addr +
1202 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
1203 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
1204 		if (mapin)
1205 			bp_mapout(bp);
1206 	}
1207 }
1208 
1209 static void
1210 xdf_iofini(xdf_t *vdp, uint64_t id, int bioerr)
1211 {
1212 	ge_slot_t *gs = (ge_slot_t *)(uintptr_t)id;
1213 	v_req_t *vreq = gs->vreq;
1214 	buf_t *bp = vreq->v_buf;
1215 
1216 	gs_free(vdp, gs);
1217 	if (bioerr)
1218 		bioerror(bp, bioerr);
1219 	vreq->v_nslots--;
1220 	if (vreq->v_nslots != 0)
1221 		return;
1222 
1223 	XDF_UPDATE_IO_STAT(vdp, bp);
1224 	kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1225 
1226 	if (IS_ERROR(bp))
1227 		bp->b_resid = bp->b_bcount;
1228 
1229 	vreq_free(vdp, vreq);
1230 	biodone(bp);
1231 }
1232 
/*
 * Return values of xdf_prepare_rreq().
 * xdf_iostart() uses them to decide whether the buf at the head of the
 * active queue is finished (XF_COMP) or must stay queued for another
 * ring request (XF_PARTIAL).
 */
#define	XF_PARTIAL	0 /* rreq is full, not all I/O in buf transferred */
#define	XF_COMP		1 /* no more I/O left in buf */
1239 
1240 static void
1241 xdf_iostart(xdf_t *vdp)
1242 {
1243 	xendev_ring_t *xbr;
1244 	struct buf *bp;
1245 	blkif_request_t *rreq;
1246 	int retval;
1247 	int rreqready = 0;
1248 
1249 	xbr = vdp->xdf_xb_ring;
1250 
1251 	/*
1252 	 * populate the ring request(s)
1253 	 *
1254 	 * loop until there is no buf to transfer or no free slot
1255 	 * available in I/O ring
1256 	 */
1257 	mutex_enter(&vdp->xdf_dev_lk);
1258 
1259 	for (;;) {
1260 		if (vdp->xdf_status != XD_READY)
1261 			break;
1262 
1263 		/* active buf queue empty? */
1264 		if ((bp = vdp->xdf_f_act) == NULL)
1265 			break;
1266 
1267 		/* try to grab a vreq for this bp */
1268 		if ((BP2VREQ(bp) == NULL) && (vreq_get(vdp, bp) == NULL))
1269 				break;
1270 		/* alloc DMA/GTE resources */
1271 		if (vreq_setup(vdp, BP2VREQ(bp)) != DDI_SUCCESS)
1272 			break;
1273 
1274 		/* get next blkif_request in the ring */
1275 		if ((rreq = xvdi_ring_get_request(xbr)) == NULL)
1276 			break;
1277 		bzero(rreq, sizeof (blkif_request_t));
1278 
1279 		/* populate blkif_request with this buf */
1280 		rreqready++;
1281 		retval = xdf_prepare_rreq(vdp, bp, rreq);
1282 		if (retval == XF_COMP) {
1283 			/* finish this bp, switch to next one */
1284 			kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1285 			vdp->xdf_f_act = bp->av_forw;
1286 			bp->av_forw = NULL;
1287 		}
1288 	}
1289 
1290 	/*
1291 	 * Send the request(s) to the backend
1292 	 */
1293 	if (rreqready) {
1294 		if (xvdi_ring_push_request(xbr)) {
1295 			DPRINTF(IO_DBG, ("xdf_iostart: "
1296 			    "sent request(s) to backend\n"));
1297 			xvdi_notify_oe(vdp->xdf_dip);
1298 		}
1299 	}
1300 
1301 	mutex_exit(&vdp->xdf_dev_lk);
1302 }
1303 
/*
 * populate a single blkif_request_t w/ a buf
 *
 * The operation is chosen from the buf direction and the vreq's
 * v_flush_diskcache setting; one request segment is then filled in for
 * every DMA cookie of the vreq's current DMA window.
 *
 * Returns XF_COMP when the request is a disk-cache flush or when the
 * window just consumed is the last one of the vreq, XF_PARTIAL when
 * further windows remain.
 */
static int
xdf_prepare_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
{
	int		rval;
	grant_ref_t	gr;
	uint8_t		fsect, lsect;
	size_t		bcnt;
	paddr_t		dma_addr;
	off_t		blk_off;
	dev_info_t	*dip = vdp->xdf_dip;
	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
	v_req_t		*vreq = BP2VREQ(bp);
	uint64_t	blkno = vreq->v_blkno;
	uint_t		ndmacs = vreq->v_ndmacs;
	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
	int		seg = 0;
	int		isread = IS_READ(bp);

	if (isread)
		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
	else {
		switch (vreq->v_flush_diskcache) {
		case FLUSH_DISKCACHE:
			/*
			 * A cache flush carries no data: fill in the header
			 * with zero segments and report the buf as done.
			 */
			ddi_put8(acchdl, &rreq->operation,
			    BLKIF_OP_FLUSH_DISKCACHE);
			ddi_put16(acchdl, &rreq->handle, vdev);
			ddi_put64(acchdl, &rreq->id,
			    (uint64_t)(uintptr_t)(vreq->v_gs));
			ddi_put8(acchdl, &rreq->nr_segments, 0);
			return (XF_COMP);
		case WRITE_BARRIER:
			ddi_put8(acchdl, &rreq->operation,
			    BLKIF_OP_WRITE_BARRIER);
			break;
		default:
			/*
			 * Ordinary write: issued as a barrier write while
			 * xdf_wce is clear, as a plain write otherwise.
			 */
			if (!vdp->xdf_wce)
				ddi_put8(acchdl, &rreq->operation,
				    BLKIF_OP_WRITE_BARRIER);
			else
				ddi_put8(acchdl, &rreq->operation,
				    BLKIF_OP_WRITE);
			break;
		}
	}

	/* the request id is the ge_slot pointer; xdf_iofini() casts it back */
	ddi_put16(acchdl, &rreq->handle, vdev);
	ddi_put64(acchdl, &rreq->sector_number, blkno);
	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(vreq->v_gs));

	/*
	 * loop until all segments are populated or no more dma cookie in buf
	 */
	for (;;) {
	/*
	 * Each segment of a blkif request can transfer up to
	 * one 4K page of data.
	 */
		bcnt = vreq->v_dmac.dmac_size;
		ASSERT(bcnt <= PAGESIZE);
		ASSERT((bcnt % XB_BSIZE) == 0);
		dma_addr = vreq->v_dmac.dmac_laddress;
		/*
		 * fsect/lsect are the first and last XB_BSIZE-sized sectors
		 * covered by this cookie, counted from the start of the
		 * region selected by XB_SEGOFFSET.
		 */
		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
		ASSERT((blk_off & XB_BMASK) == 0);
		fsect = blk_off >> XB_BSHIFT;
		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
		DPRINTF(IO_DBG, ("  ""seg%d: dmacS %lu blk_off %ld\n",
		    seg, vreq->v_dmac.dmac_size, blk_off));
		gr = gs_grant(vreq->v_gs, PATOMA(dma_addr) >> PAGESHIFT);
		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
		DPRINTF(IO_DBG, ("  ""seg%d: fs %d ls %d gr %d dma 0x%"PRIx64
		    "\n", seg, fsect, lsect, gr, dma_addr));

		blkno += (bcnt >> XB_BSHIFT);
		seg++;
		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
		/* more cookies in this window: fetch the next one */
		if (--ndmacs) {
			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
			continue;
		}

		/*
		 * Window exhausted: remember where the next window starts
		 * and tell the caller whether any window is left.
		 */
		vreq->v_status = VREQ_DMAWIN_DONE;
		vreq->v_blkno = blkno;
		if (vreq->v_dmaw + 1 == vreq->v_ndmaws)
			/* last win */
			rval = XF_COMP;
		else
			rval = XF_PARTIAL;
		break;
	}
	ddi_put8(acchdl,  &rreq->nr_segments, seg);
	/*
	 * NOTE(review): this debug print reads rreq->id directly rather than
	 * through the access handle used everywhere else in this function.
	 */
	DPRINTF(IO_DBG, ("xdf_prepare_rreq: request id=%"PRIx64" ready\n",
	    rreq->id));

	return (rval);
}
1406 
/*
 * Polling parameters for xdf_drain_io(): the delay handed to
 * drv_usecwait() is XDF_QSEC shifted left by the pass number, for at
 * most XDF_POLLCNT passes.
 */
#define	XDF_QSEC	50000	/* base delay in usec, i.e. .05 second */
#define	XDF_POLLCNT	12	/* loop for 12 times before time out */
1409 
1410 static int
1411 xdf_drain_io(xdf_t *vdp)
1412 {
1413 	int pollc, rval;
1414 	xendev_ring_t *xbr;
1415 
1416 	if (xdfdebug & SUSRES_DBG)
1417 		xen_printf("xdf_drain_io: start\n");
1418 
1419 	mutex_enter(&vdp->xdf_dev_lk);
1420 
1421 	if ((vdp->xdf_status != XD_READY) && (vdp->xdf_status != XD_SUSPEND))
1422 		goto out;
1423 
1424 	rval = 0;
1425 	xbr = vdp->xdf_xb_ring;
1426 	ASSERT(xbr != NULL);
1427 
1428 	for (pollc = 0; pollc < XDF_POLLCNT; pollc++) {
1429 		if (xvdi_ring_has_unconsumed_responses(xbr)) {
1430 			mutex_exit(&vdp->xdf_dev_lk);
1431 			(void) xdf_intr((caddr_t)vdp);
1432 			mutex_enter(&vdp->xdf_dev_lk);
1433 		}
1434 		if (!xvdi_ring_has_incomp_request(xbr))
1435 			goto out;
1436 
1437 		(void) HYPERVISOR_yield();
1438 		/*
1439 		 * file-backed devices can be slow
1440 		 */
1441 		drv_usecwait(XDF_QSEC << pollc);
1442 	}
1443 	cmn_err(CE_WARN, "xdf_polled_io: timeout");
1444 	rval = EIO;
1445 out:
1446 	mutex_exit(&vdp->xdf_dev_lk);
1447 	if (xdfdebug & SUSRES_DBG)
1448 		xen_printf("xdf_drain_io: end, err=%d\n", rval);
1449 	return (rval);
1450 }
1451 
1452 /* ARGSUSED5 */
1453 static int
1454 xdf_lb_rdwr(dev_info_t *devi, uchar_t cmd, void *bufp,
1455     diskaddr_t start, size_t reqlen, void *tg_cookie)
1456 {
1457 	xdf_t *vdp;
1458 	struct buf *bp;
1459 	int err = 0;
1460 
1461 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1462 	if (vdp == NULL)
1463 		return (ENXIO);
1464 
1465 	if ((start + (reqlen >> DEV_BSHIFT)) > vdp->xdf_xdev_nblocks)
1466 		return (EINVAL);
1467 
1468 	bp = getrbuf(KM_SLEEP);
1469 	if (cmd == TG_READ)
1470 		bp->b_flags = B_BUSY | B_READ;
1471 	else
1472 		bp->b_flags = B_BUSY | B_WRITE;
1473 	bp->b_un.b_addr = bufp;
1474 	bp->b_bcount = reqlen;
1475 	bp->b_resid = 0;
1476 	bp->b_blkno = start;
1477 	bp->av_forw = NULL;
1478 	bp->av_back = NULL;
1479 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
1480 
1481 	mutex_enter(&vdp->xdf_dev_lk);
1482 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1483 	if (vdp->xdf_f_act == NULL) {
1484 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
1485 	} else {
1486 		vdp->xdf_l_act->av_forw = bp;
1487 		vdp->xdf_l_act = bp;
1488 	}
1489 	mutex_exit(&vdp->xdf_dev_lk);
1490 	xdf_iostart(vdp);
1491 	err = biowait(bp);
1492 
1493 	ASSERT(bp->b_flags & B_DONE);
1494 
1495 	freerbuf(bp);
1496 	return (err);
1497 }
1498 
/*
 * synthetic geometry
 *
 * Sectors per track and number of heads reported by xdf_lb_getpgeom();
 * the cylinder count is derived there from the device size.
 */
#define	XDF_NSECTS	256
#define	XDF_NHEADS	16
1504 
1505 static int
1506 xdf_lb_getcap(dev_info_t *devi, diskaddr_t *capp)
1507 {
1508 	xdf_t *vdp;
1509 
1510 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1511 
1512 	if (vdp == NULL)
1513 		return (ENXIO);
1514 
1515 	mutex_enter(&vdp->xdf_dev_lk);
1516 	*capp = vdp->xdf_xdev_nblocks;
1517 	DPRINTF(LBL_DBG, ("capacity %llu\n", *capp));
1518 	mutex_exit(&vdp->xdf_dev_lk);
1519 	return (0);
1520 }
1521 
1522 static int
1523 xdf_lb_getpgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1524 {
1525 	xdf_t *vdp;
1526 	uint_t ncyl;
1527 	uint_t spc = XDF_NHEADS * XDF_NSECTS;
1528 
1529 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1530 
1531 	if (vdp == NULL)
1532 		return (ENXIO);
1533 
1534 	ncyl = vdp->xdf_xdev_nblocks / spc;
1535 
1536 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1537 	geomp->g_acyl = 0;
1538 	geomp->g_nhead = XDF_NHEADS;
1539 	geomp->g_secsize = XB_BSIZE;
1540 	geomp->g_nsect = XDF_NSECTS;
1541 	geomp->g_intrlv = 0;
1542 	geomp->g_rpm = 7200;
1543 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1544 	return (0);
1545 }
1546 
/*
 * No real HBA, no geometry available from it
 *
 * Virtual geometry is therefore never provided: both arguments are
 * ignored and EINVAL is always returned.
 */
/*ARGSUSED*/
static int
xdf_lb_getvgeom(dev_info_t *devi, cmlb_geom_t *geomp)
{
	return (EINVAL);
}
1556 
1557 static int
1558 xdf_lb_getattribute(dev_info_t *devi, tg_attribute_t *tgattributep)
1559 {
1560 	xdf_t *vdp;
1561 
1562 	if (!(vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))))
1563 		return (ENXIO);
1564 
1565 	if (XD_IS_RO(vdp))
1566 		tgattributep->media_is_writable = 0;
1567 	else
1568 		tgattributep->media_is_writable = 1;
1569 	return (0);
1570 }
1571 
1572 /* ARGSUSED3 */
1573 static int
1574 xdf_lb_getinfo(dev_info_t *devi, int cmd, void *arg, void *tg_cookie)
1575 {
1576 	switch (cmd) {
1577 	case TG_GETPHYGEOM:
1578 		return (xdf_lb_getpgeom(devi, (cmlb_geom_t *)arg));
1579 	case TG_GETVIRTGEOM:
1580 		return (xdf_lb_getvgeom(devi, (cmlb_geom_t *)arg));
1581 	case TG_GETCAPACITY:
1582 		return (xdf_lb_getcap(devi, (diskaddr_t *)arg));
1583 	case TG_GETBLOCKSIZE:
1584 		*(uint32_t *)arg = XB_BSIZE;
1585 		return (0);
1586 	case TG_GETATTR:
1587 		return (xdf_lb_getattribute(devi, (tg_attribute_t *)arg));
1588 	default:
1589 		return (ENOTTY);
1590 	}
1591 }
1592 
1593 /*
1594  * Kick-off connect process
1595  * Status should be XD_UNKNOWN or XD_CLOSED
1596  * On success, status will be changed to XD_INIT
1597  * On error, status won't be changed
1598  */
1599 static int
1600 xdf_start_connect(xdf_t *vdp)
1601 {
1602 	char *xsnode;
1603 	grant_ref_t gref;
1604 	xenbus_transaction_t xbt;
1605 	int rv;
1606 	dev_info_t *dip = vdp->xdf_dip;
1607 
1608 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == (domid_t)-1)
1609 		goto errout;
1610 
1611 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS) {
1612 		cmn_err(CE_WARN, "xdf@%s: failed to alloc event channel",
1613 		    ddi_get_name_addr(dip));
1614 		goto errout;
1615 	}
1616 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1617 	    DDI_SUCCESS) {
1618 		cmn_err(CE_WARN, "xdf_start_connect: xdf@%s: "
1619 		    "failed to add intr handler", ddi_get_name_addr(dip));
1620 		goto errout1;
1621 	}
1622 
1623 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1624 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1625 	    DDI_SUCCESS) {
1626 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1627 		    ddi_get_name_addr(dip));
1628 		goto errout2;
1629 	}
1630 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1631 
1632 	/*
1633 	 * Write into xenstore the info needed by backend
1634 	 */
1635 	if ((xsnode = xvdi_get_xsname(dip)) == NULL) {
1636 		cmn_err(CE_WARN, "xdf@%s: "
1637 		    "failed to get xenstore node path",
1638 		    ddi_get_name_addr(dip));
1639 		goto fail_trans;
1640 	}
1641 trans_retry:
1642 	if (xenbus_transaction_start(&xbt)) {
1643 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1644 		    ddi_get_name_addr(dip));
1645 		xvdi_fatal_error(dip, EIO, "transaction start");
1646 		goto fail_trans;
1647 	}
1648 
1649 	if (rv = xenbus_printf(xbt, xsnode, "ring-ref", "%u", gref)) {
1650 		cmn_err(CE_WARN, "xdf@%s: failed to write ring-ref",
1651 		    ddi_get_name_addr(dip));
1652 		xvdi_fatal_error(dip, rv, "writing ring-ref");
1653 		goto abort_trans;
1654 	}
1655 
1656 	if (rv = xenbus_printf(xbt, xsnode, "event-channel", "%u",
1657 	    xvdi_get_evtchn(dip))) {
1658 		cmn_err(CE_WARN, "xdf@%s: failed to write event-channel",
1659 		    ddi_get_name_addr(dip));
1660 		xvdi_fatal_error(dip, rv, "writing event-channel");
1661 		goto abort_trans;
1662 	}
1663 
1664 	if ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0) {
1665 		cmn_err(CE_WARN, "xdf@%s: "
1666 		    "failed to switch state to XenbusStateInitialised",
1667 		    ddi_get_name_addr(dip));
1668 		xvdi_fatal_error(dip, rv, "writing state");
1669 		goto abort_trans;
1670 	}
1671 
1672 	/* kick-off connect process */
1673 	if (rv = xenbus_transaction_end(xbt, 0)) {
1674 		if (rv == EAGAIN)
1675 			goto trans_retry;
1676 		cmn_err(CE_WARN, "xdf@%s: failed to end transaction",
1677 		    ddi_get_name_addr(dip));
1678 		xvdi_fatal_error(dip, rv, "completing transaction");
1679 		goto fail_trans;
1680 	}
1681 
1682 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1683 	mutex_enter(&vdp->xdf_dev_lk);
1684 	vdp->xdf_status = XD_INIT;
1685 	mutex_exit(&vdp->xdf_dev_lk);
1686 
1687 	return (DDI_SUCCESS);
1688 
1689 abort_trans:
1690 	(void) xenbus_transaction_end(xbt, 1);
1691 fail_trans:
1692 	xvdi_free_ring(vdp->xdf_xb_ring);
1693 errout2:
1694 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1695 errout1:
1696 	xvdi_free_evtchn(dip);
1697 errout:
1698 	cmn_err(CE_WARN, "xdf@%s: fail to kick-off connecting",
1699 	    ddi_get_name_addr(dip));
1700 	return (DDI_FAILURE);
1701 }
1702 
1703 /*
1704  * Kick-off disconnect process
1705  * Status won't be changed
1706  */
1707 static int
1708 xdf_start_disconnect(xdf_t *vdp)
1709 {
1710 	if (xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed) > 0) {
1711 		cmn_err(CE_WARN, "xdf@%s: fail to kick-off disconnecting",
1712 		    ddi_get_name_addr(vdp->xdf_dip));
1713 		return (DDI_FAILURE);
1714 	}
1715 
1716 	return (DDI_SUCCESS);
1717 }
1718 
1719 int
1720 xdf_get_flush_block(xdf_t *vdp)
1721 {
1722 	/*
1723 	 * Get a DEV_BSIZE aligned bufer
1724 	 */
1725 	vdp->xdf_flush_mem = kmem_alloc(DEV_BSIZE * 2, KM_SLEEP);
1726 	vdp->xdf_cache_flush_block =
1727 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem), DEV_BSIZE);
1728 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1729 	    xdf_flush_block, DEV_BSIZE, NULL) != 0)
1730 		return (DDI_FAILURE);
1731 	return (DDI_SUCCESS);
1732 }
1733 
1734 /*
1735  * Finish other initialization after we've connected to backend
1736  * Status should be XD_INIT before calling this routine
1737  * On success, status should be changed to XD_READY
1738  * On error, status should stay XD_INIT
1739  */
1740 static int
1741 xdf_post_connect(xdf_t *vdp)
1742 {
1743 	int rv;
1744 	uint_t len;
1745 	char *type;
1746 	char *barrier;
1747 	dev_info_t *devi = vdp->xdf_dip;
1748 
1749 	/*
1750 	 * Determine if feature barrier is supported by backend
1751 	 */
1752 	if (xenbus_read(XBT_NULL, xvdi_get_oename(devi),
1753 	    "feature-barrier", (void **)&barrier, &len) == 0) {
1754 		vdp->xdf_feature_barrier = 1;
1755 		kmem_free(barrier, len);
1756 	} else {
1757 		cmn_err(CE_NOTE, "xdf@%s: failed to read feature-barrier",
1758 		    ddi_get_name_addr(vdp->xdf_dip));
1759 		vdp->xdf_feature_barrier = 0;
1760 	}
1761 
1762 	/* probe backend */
1763 	if (rv = xenbus_gather(XBT_NULL, xvdi_get_oename(devi),
1764 	    "sectors", "%"SCNu64, &vdp->xdf_xdev_nblocks,
1765 	    "info", "%u", &vdp->xdf_xdev_info, NULL)) {
1766 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1767 		    "cannot read backend info", ddi_get_name_addr(devi));
1768 		xvdi_fatal_error(devi, rv, "reading backend info");
1769 		return (DDI_FAILURE);
1770 	}
1771 
1772 	/* fix disk type */
1773 	if (xenbus_read(XBT_NULL, xvdi_get_xsname(devi), "device-type",
1774 	    (void **)&type, &len) != 0) {
1775 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1776 		    "cannot read device-type", ddi_get_name_addr(devi));
1777 		xvdi_fatal_error(devi, rv, "reading device-type");
1778 		return (DDI_FAILURE);
1779 	}
1780 	if (strcmp(type, "cdrom") == 0)
1781 		vdp->xdf_xdev_info |= VDISK_CDROM;
1782 	kmem_free(type, len);
1783 
1784 	/*
1785 	 * We've created all the minor nodes via cmlb_attach() using default
1786 	 * value in xdf_attach() to make it possbile to block in xdf_open(),
1787 	 * in case there's anyone (say, booting thread) ever trying to open
1788 	 * it before connected to backend. We will refresh all those minor
1789 	 * nodes w/ latest info we've got now when we are almost connected.
1790 	 *
1791 	 * Don't do this when xdf is already opened by someone (could happen
1792 	 * during resume), for that cmlb_attach() will invalid the label info
1793 	 * and confuse those who has already opened the node, which is bad.
1794 	 */
1795 	if (!xdf_isopen(vdp, -1) && (XD_IS_CD(vdp) || XD_IS_RM(vdp))) {
1796 		/* re-init cmlb w/ latest info we got from backend */
1797 		if (cmlb_attach(devi, &xdf_lb_ops,
1798 		    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
1799 		    XD_IS_RM(vdp), 1, DDI_NT_BLOCK,
1800 		    CMLB_FAKE_LABEL_ONE_PARTITION,
1801 		    vdp->xdf_vd_lbl, NULL) != 0) {
1802 			cmn_err(CE_WARN, "xdf@%s: cmlb attach failed",
1803 			    ddi_get_name_addr(devi));
1804 			return (DDI_FAILURE);
1805 		}
1806 	}
1807 
1808 	/* mark vbd is ready for I/O */
1809 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1810 	mutex_enter(&vdp->xdf_dev_lk);
1811 	vdp->xdf_status = XD_READY;
1812 	mutex_exit(&vdp->xdf_dev_lk);
1813 	/*
1814 	 * If backend has feature-barrier, see if it supports disk
1815 	 * cache flush op.
1816 	 */
1817 	vdp->xdf_flush_supported = 0;
1818 	if (vdp->xdf_feature_barrier) {
1819 		/*
1820 		 * Pretend we already know flush is supported so probe
1821 		 * will attempt the correct op.
1822 		 */
1823 		vdp->xdf_flush_supported = 1;
1824 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1825 			vdp->xdf_flush_supported = 1;
1826 		} else {
1827 			vdp->xdf_flush_supported = 0;
1828 			/*
1829 			 * If the other end does not support the cache flush op
1830 			 * then we must use a barrier-write to force disk
1831 			 * cache flushing.  Barrier writes require that a data
1832 			 * block actually be written.
1833 			 * Cache a block to barrier-write when we are
1834 			 * asked to perform a flush.
1835 			 * XXX - would it be better to just copy 1 block
1836 			 * (512 bytes) from whatever write we did last
1837 			 * and rewrite that block?
1838 			 */
1839 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS)
1840 				return (DDI_FAILURE);
1841 		}
1842 	}
1843 
1844 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", ddi_get_name_addr(devi),
1845 	    (uint64_t)vdp->xdf_xdev_nblocks);
1846 
1847 	return (DDI_SUCCESS);
1848 }
1849 
/*
 * Finish other uninitialization after we've disconnected from backend
 * when status is XD_CLOSING or XD_INIT. After returns, status is XD_CLOSED
 *
 * Removes the interrupt handler, frees the event channel and the ring,
 * clears the ring pointers and the peer id, and sets the status.
 * The caller must hold xdf_cb_lk.
 */
static void
xdf_post_disconnect(xdf_t *vdp)
{
	/* the handler is removed before the ring it uses is freed */
	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
	xvdi_free_evtchn(vdp->xdf_dip);
	xvdi_free_ring(vdp->xdf_xb_ring);
	/* xdf_intr() treats a NULL ring as "no ring" */
	vdp->xdf_xb_ring = NULL;
	vdp->xdf_xb_ring_hdl = NULL;
	vdp->xdf_peer = (domid_t)-1;

	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
	mutex_enter(&vdp->xdf_dev_lk);
	vdp->xdf_status = XD_CLOSED;
	mutex_exit(&vdp->xdf_dev_lk);
}
1869 
/*
 * Handle a state change of the other end.
 *
 * impl_data points to the new XenbusState.  With xdf_cb_lk held, the
 * change is first validated by xdf_check_state_transition() and, when it
 * needs processing, drives our side of the connect/disconnect handshake.
 * Afterwards anyone sleeping on xdf_dev_cv is woken up; then, outside
 * xdf_cb_lk, I/O is restarted if we ended up XD_READY, or the cmlb handle
 * is torn down if we ended up XD_CLOSED without the backend having gone
 * away while the device was open.
 */
/*ARGSUSED*/
static void
xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
{
	XenbusState new_state = *(XenbusState *)impl_data;
	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
	boolean_t unexpect_die = B_FALSE;
	int status;

	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
	    ddi_get_name_addr(dip), new_state));

	mutex_enter(&vdp->xdf_cb_lk);

	/* changes that are no-ops or unexpected are dropped here */
	if (xdf_check_state_transition(vdp, new_state) == DDI_FAILURE) {
		mutex_exit(&vdp->xdf_cb_lk);
		return;
	}

	/*
	 * NOTE(review): xdf_status is read below while holding only
	 * xdf_cb_lk; every write to it in this file is made with
	 * xdf_dev_lk held as well.  States with no case label fall
	 * through the switch untouched (there is no default).
	 */
	switch (new_state) {
	case XenbusStateInitialising:
		ASSERT(vdp->xdf_status == XD_CLOSED);
		/*
		 * backend recovered from a previous failure,
		 * kick-off connect process again
		 */
		if (xdf_start_connect(vdp) != DDI_SUCCESS) {
			cmn_err(CE_WARN, "xdf@%s:"
			    " failed to start reconnecting to backend",
			    ddi_get_name_addr(dip));
		}
		break;
	case XenbusStateConnected:
		ASSERT(vdp->xdf_status == XD_INIT);
		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
		/* finish final init after connect */
		if (xdf_post_connect(vdp) != DDI_SUCCESS)
			(void) xdf_start_disconnect(vdp);
		break;
	case XenbusStateClosing:
		if (vdp->xdf_status == XD_READY) {
			mutex_enter(&vdp->xdf_dev_lk);
			/* refuse to go away while any partition is open */
			if (xdf_isopen(vdp, -1)) {
				cmn_err(CE_NOTE, "xdf@%s: hot-unplug failed, "
				    "still in use", ddi_get_name_addr(dip));
				mutex_exit(&vdp->xdf_dev_lk);
				break;
			} else {
				vdp->xdf_status = XD_CLOSING;
			}
			mutex_exit(&vdp->xdf_dev_lk);
		}
		(void) xdf_start_disconnect(vdp);
		break;
	case XenbusStateClosed:
		/* first check if BE closed unexpectedly */
		mutex_enter(&vdp->xdf_dev_lk);
		if (xdf_isopen(vdp, -1)) {
			unexpect_die = B_TRUE;
			unexpectedie(vdp);
			cmn_err(CE_WARN, "xdf@%s: backend closed, "
			    "reconnecting...", ddi_get_name_addr(dip));
		}
		mutex_exit(&vdp->xdf_dev_lk);

		if (vdp->xdf_status == XD_READY) {
			/* closed without a Closing step: do both here */
			mutex_enter(&vdp->xdf_dev_lk);
			vdp->xdf_status = XD_CLOSING;
			mutex_exit(&vdp->xdf_dev_lk);

#ifdef	DOMU_BACKEND
			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
#endif

			xdf_post_disconnect(vdp);
			(void) xvdi_switch_state(dip, XBT_NULL,
			    XenbusStateClosed);
		} else if ((vdp->xdf_status == XD_INIT) ||
		    (vdp->xdf_status == XD_CLOSING)) {
			xdf_post_disconnect(vdp);
		} else {
			/* nothing to release, just record the state */
			mutex_enter(&vdp->xdf_dev_lk);
			vdp->xdf_status = XD_CLOSED;
			mutex_exit(&vdp->xdf_dev_lk);
		}
	}

	/* notify anybody waiting for oe state change */
	mutex_enter(&vdp->xdf_dev_lk);
	cv_broadcast(&vdp->xdf_dev_cv);
	mutex_exit(&vdp->xdf_dev_lk);

	/* sample the status before xdf_cb_lk is dropped */
	status = vdp->xdf_status;
	mutex_exit(&vdp->xdf_cb_lk);

	if (status == XD_READY) {
		xdf_iostart(vdp);
	} else if ((status == XD_CLOSED) && !unexpect_die) {
		/* interface is closed successfully, remove all minor nodes */
		cmlb_detach(vdp->xdf_vd_lbl, NULL);
		cmlb_free_handle(&vdp->xdf_vd_lbl);
	}
}
1973 
1974 /* check if partition is open, -1 - check all partitions on the disk */
1975 static boolean_t
1976 xdf_isopen(xdf_t *vdp, int partition)
1977 {
1978 	int i;
1979 	ulong_t parbit;
1980 	boolean_t rval = B_FALSE;
1981 
1982 	if (partition == -1)
1983 		parbit = (ulong_t)-1;
1984 	else
1985 		parbit = 1 << partition;
1986 
1987 	for (i = 0; i < OTYPCNT; i++) {
1988 		if (vdp->xdf_vd_open[i] & parbit)
1989 			rval = B_TRUE;
1990 	}
1991 
1992 	return (rval);
1993 }
1994 
1995 /*
1996  * Xdf_check_state_transition will check the XenbusState change to see
1997  * if the change is a valid transition or not.
1998  * The new state is written by backend domain, or by running xenstore-write
1999  * to change it manually in dom0
2000  */
2001 static int
2002 xdf_check_state_transition(xdf_t *vdp, XenbusState oestate)
2003 {
2004 	int status;
2005 	int stcheck;
2006 #define	STOK	0 /* need further process */
2007 #define	STNOP	1 /* no action need taking */
2008 #define	STBUG	2 /* unexpected state change, could be a bug */
2009 
2010 	status = vdp->xdf_status;
2011 	stcheck = STOK;
2012 
2013 	switch (status) {
2014 	case XD_UNKNOWN:
2015 		if ((oestate == XenbusStateUnknown)		||
2016 		    (oestate == XenbusStateConnected))
2017 			stcheck = STBUG;
2018 		else if ((oestate == XenbusStateInitialising)	||
2019 		    (oestate == XenbusStateInitWait)		||
2020 		    (oestate == XenbusStateInitialised))
2021 			stcheck = STNOP;
2022 		break;
2023 	case XD_INIT:
2024 		if (oestate == XenbusStateUnknown)
2025 			stcheck = STBUG;
2026 		else if ((oestate == XenbusStateInitialising)	||
2027 		    (oestate == XenbusStateInitWait)		||
2028 		    (oestate == XenbusStateInitialised))
2029 			stcheck = STNOP;
2030 		break;
2031 	case XD_READY:
2032 		if ((oestate == XenbusStateUnknown)		||
2033 		    (oestate == XenbusStateInitialising)	||
2034 		    (oestate == XenbusStateInitWait)		||
2035 		    (oestate == XenbusStateInitialised))
2036 			stcheck = STBUG;
2037 		else if (oestate == XenbusStateConnected)
2038 			stcheck = STNOP;
2039 		break;
2040 	case XD_CLOSING:
2041 		if ((oestate == XenbusStateUnknown)		||
2042 		    (oestate == XenbusStateInitialising)	||
2043 		    (oestate == XenbusStateInitWait)		||
2044 		    (oestate == XenbusStateInitialised)		||
2045 		    (oestate == XenbusStateConnected))
2046 			stcheck = STBUG;
2047 		else if (oestate == XenbusStateClosing)
2048 			stcheck = STNOP;
2049 		break;
2050 	case XD_CLOSED:
2051 		if ((oestate == XenbusStateUnknown)		||
2052 		    (oestate == XenbusStateConnected))
2053 			stcheck = STBUG;
2054 		else if ((oestate == XenbusStateInitWait)	||
2055 		    (oestate == XenbusStateInitialised)		||
2056 		    (oestate == XenbusStateClosing)		||
2057 		    (oestate == XenbusStateClosed))
2058 			stcheck = STNOP;
2059 		break;
2060 	case XD_SUSPEND:
2061 	default:
2062 			stcheck = STBUG;
2063 	}
2064 
2065 	if (stcheck == STOK)
2066 		return (DDI_SUCCESS);
2067 
2068 	if (stcheck == STBUG)
2069 		cmn_err(CE_NOTE, "xdf@%s: unexpected otherend "
2070 		    "state change to %d!, when status is %d",
2071 		    ddi_get_name_addr(vdp->xdf_dip), oestate, status);
2072 
2073 	return (DDI_FAILURE);
2074 }
2075 
2076 static int
2077 xdf_connect(xdf_t *vdp, boolean_t wait)
2078 {
2079 	ASSERT(mutex_owned(&vdp->xdf_dev_lk));
2080 	while (vdp->xdf_status != XD_READY) {
2081 		if (!wait || (vdp->xdf_status > XD_READY))
2082 			break;
2083 
2084 		if (cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk) == 0)
2085 			break;
2086 	}
2087 
2088 	return (vdp->xdf_status);
2089 }
2090 
/*
 * callback func when DMA/GTE resources is available
 *
 * Note: we only register one callback function to grant table subsystem
 * since we only have one 'struct gnttab_free_callback' in xdf_t.
 *
 * arg is the xdf_t of the instance.  The callback does no I/O itself; it
 * only triggers the instance's soft interrupt and always returns
 * DDI_DMA_CALLBACK_DONE.
 */
static int
xdf_dmacallback(caddr_t arg)
{
	xdf_t *vdp = (xdf_t *)arg;
	ASSERT(vdp != NULL);

	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
	    ddi_get_name_addr(vdp->xdf_dip)));

	ddi_trigger_softintr(vdp->xdf_softintr_id);
	return (DDI_DMA_CALLBACK_DONE);
}
2109 
/*
 * Restart I/O for the instance passed in arg.
 *
 * Clears the "DMA callback on" flag under xdf_dev_lk (asserting that it
 * was set) and then calls xdf_iostart().  Always returns
 * DDI_INTR_CLAIMED.
 */
static uint_t
xdf_iorestart(caddr_t arg)
{
	xdf_t *vdp = (xdf_t *)arg;

	ASSERT(vdp != NULL);

	mutex_enter(&vdp->xdf_dev_lk);
	ASSERT(ISDMACBON(vdp));
	SETDMACBOFF(vdp);
	mutex_exit(&vdp->xdf_dev_lk);

	/* xdf_iostart() takes xdf_dev_lk itself, so call it unlocked */
	xdf_iostart(vdp);

	return (DDI_INTR_CLAIMED);
}
2126 
/*
 * timeout(9F) handler armed by vreq_get() when no vreq could be
 * allocated: retries I/O for the instance passed in arg.
 */
static void
xdf_timeout_handler(void *arg)
{
	xdf_t *vdp = arg;

	/* a zero id lets vreq_get() arm a new timeout if it fails again */
	mutex_enter(&vdp->xdf_dev_lk);
	vdp->xdf_timeout_id = 0;
	mutex_exit(&vdp->xdf_dev_lk);

	/* new timeout thread could be re-scheduled */
	xdf_iostart(vdp);
}
2139 
2140 /*
2141  * Alloc a vreq for this bp
2142  * bp->av_back contains the pointer to the vreq upon return
2143  */
2144 static v_req_t *
2145 vreq_get(xdf_t *vdp, buf_t *bp)
2146 {
2147 	v_req_t *vreq = NULL;
2148 
2149 	ASSERT(BP2VREQ(bp) == NULL);
2150 
2151 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
2152 	if (vreq == NULL) {
2153 		if (vdp->xdf_timeout_id == 0)
2154 			/* restart I/O after one second */
2155 			vdp->xdf_timeout_id =
2156 			    timeout(xdf_timeout_handler, vdp, hz);
2157 		return (NULL);
2158 	}
2159 	bzero(vreq, sizeof (v_req_t));
2160 
2161 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
2162 	bp->av_back = (buf_t *)vreq;
2163 	vreq->v_buf = bp;
2164 	vreq->v_status = VREQ_INIT;
2165 	/* init of other fields in vreq is up to the caller */
2166 
2167 	return (vreq);
2168 }
2169 
2170 static void
2171 vreq_free(xdf_t *vdp, v_req_t *vreq)
2172 {
2173 	buf_t *bp = vreq->v_buf;
2174 
2175 	list_remove(&vdp->xdf_vreq_act, (void *)vreq);
2176 
2177 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
2178 		goto done;
2179 
2180 	switch (vreq->v_status) {
2181 	case VREQ_DMAWIN_DONE:
2182 	case VREQ_GS_ALLOCED:
2183 	case VREQ_DMABUF_BOUND:
2184 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
2185 		/*FALLTHRU*/
2186 	case VREQ_DMAMEM_ALLOCED:
2187 		if (!ALIGNED_XFER(bp)) {
2188 			ASSERT(vreq->v_abuf != NULL);
2189 			if (!IS_ERROR(bp) && IS_READ(bp))
2190 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
2191 				    bp->b_bcount);
2192 			ddi_dma_mem_free(&vreq->v_align);
2193 		}
2194 		/*FALLTHRU*/
2195 	case VREQ_MEMDMAHDL_ALLOCED:
2196 		if (!ALIGNED_XFER(bp))
2197 			ddi_dma_free_handle(&vreq->v_memdmahdl);
2198 		/*FALLTHRU*/
2199 	case VREQ_DMAHDL_ALLOCED:
2200 		ddi_dma_free_handle(&vreq->v_dmahdl);
2201 		break;
2202 	default:
2203 		break;
2204 	}
2205 done:
2206 	vreq->v_buf->av_back = NULL;
2207 	kmem_cache_free(xdf_vreq_cache, vreq);
2208 }
2209 
2210 /*
2211  * Initalize the DMA and grant table resources for the buf
2212  */
2213 static int
2214 vreq_setup(xdf_t *vdp, v_req_t *vreq)
2215 {
2216 	int rc;
2217 	ddi_dma_attr_t dmaattr;
2218 	uint_t ndcs, ndws;
2219 	ddi_dma_handle_t dh;
2220 	ddi_dma_handle_t mdh;
2221 	ddi_dma_cookie_t dc;
2222 	ddi_acc_handle_t abh;
2223 	caddr_t	aba;
2224 	ge_slot_t *gs;
2225 	size_t bufsz;
2226 	off_t off;
2227 	size_t sz;
2228 	buf_t *bp = vreq->v_buf;
2229 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
2230 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
2231 
2232 	switch (vreq->v_status) {
2233 	case VREQ_INIT:
2234 		if (IS_FLUSH_DISKCACHE(bp)) {
2235 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2236 				DPRINTF(DMA_DBG, (
2237 				    "xdf@%s: get ge_slotfailed\n",
2238 				    ddi_get_name_addr(vdp->xdf_dip)));
2239 				return (DDI_FAILURE);
2240 			}
2241 			vreq->v_blkno = 0;
2242 			vreq->v_nslots = 1;
2243 			vreq->v_gs = gs;
2244 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
2245 			vreq->v_status = VREQ_GS_ALLOCED;
2246 			gs->vreq = vreq;
2247 			return (DDI_SUCCESS);
2248 		}
2249 
2250 		if (IS_WRITE_BARRIER(vdp, bp))
2251 			vreq->v_flush_diskcache = WRITE_BARRIER;
2252 		vreq->v_blkno = bp->b_blkno +
2253 		    (diskaddr_t)(uintptr_t)bp->b_private;
2254 		bp->b_private = NULL;
2255 		/* See if we wrote new data to our flush block */
2256 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
2257 			check_fbwrite(vdp, bp, vreq->v_blkno);
2258 		vreq->v_status = VREQ_INIT_DONE;
2259 		/*FALLTHRU*/
2260 
2261 	case VREQ_INIT_DONE:
2262 		/*
2263 		 * alloc DMA handle
2264 		 */
2265 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
2266 		    xdf_dmacallback, (caddr_t)vdp, &dh);
2267 		if (rc != DDI_SUCCESS) {
2268 			SETDMACBON(vdp);
2269 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
2270 			    ddi_get_name_addr(vdp->xdf_dip)));
2271 			return (DDI_FAILURE);
2272 		}
2273 
2274 		vreq->v_dmahdl = dh;
2275 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
2276 		/*FALLTHRU*/
2277 
2278 	case VREQ_DMAHDL_ALLOCED:
2279 		/*
2280 		 * alloc dma handle for 512-byte aligned buf
2281 		 */
2282 		if (!ALIGNED_XFER(bp)) {
2283 			/*
2284 			 * XXPV: we need to temporarily enlarge the seg
2285 			 * boundary and s/g length to work round CR6381968
2286 			 */
2287 			dmaattr = xb_dma_attr;
2288 			dmaattr.dma_attr_seg = (uint64_t)-1;
2289 			dmaattr.dma_attr_sgllen = INT_MAX;
2290 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
2291 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
2292 			if (rc != DDI_SUCCESS) {
2293 				SETDMACBON(vdp);
2294 				DPRINTF(DMA_DBG, ("xdf@%s: unaligned buf DMA"
2295 				    "handle alloc failed\n",
2296 				    ddi_get_name_addr(vdp->xdf_dip)));
2297 				return (DDI_FAILURE);
2298 			}
2299 			vreq->v_memdmahdl = mdh;
2300 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
2301 		}
2302 		/*FALLTHRU*/
2303 
2304 	case VREQ_MEMDMAHDL_ALLOCED:
2305 		/*
2306 		 * alloc 512-byte aligned buf
2307 		 */
2308 		if (!ALIGNED_XFER(bp)) {
2309 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
2310 				bp_mapin(bp);
2311 
2312 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
2313 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
2314 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
2315 			    &aba, &bufsz, &abh);
2316 			if (rc != DDI_SUCCESS) {
2317 				SETDMACBON(vdp);
2318 				DPRINTF(DMA_DBG, (
2319 				    "xdf@%s: DMA mem allocation failed\n",
2320 				    ddi_get_name_addr(vdp->xdf_dip)));
2321 				return (DDI_FAILURE);
2322 			}
2323 
2324 			vreq->v_abuf = aba;
2325 			vreq->v_align = abh;
2326 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
2327 
2328 			ASSERT(bufsz >= bp->b_bcount);
2329 			if (!IS_READ(bp))
2330 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
2331 				    bp->b_bcount);
2332 		}
2333 		/*FALLTHRU*/
2334 
2335 	case VREQ_DMAMEM_ALLOCED:
2336 		/*
2337 		 * dma bind
2338 		 */
2339 		if (ALIGNED_XFER(bp)) {
2340 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
2341 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
2342 			    &dc, &ndcs);
2343 		} else {
2344 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
2345 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
2346 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
2347 		}
2348 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
2349 			/* get num of dma windows */
2350 			if (rc == DDI_DMA_PARTIAL_MAP) {
2351 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
2352 				ASSERT(rc == DDI_SUCCESS);
2353 			} else {
2354 				ndws = 1;
2355 			}
2356 		} else {
2357 			SETDMACBON(vdp);
2358 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
2359 			    ddi_get_name_addr(vdp->xdf_dip)));
2360 			return (DDI_FAILURE);
2361 		}
2362 
2363 		vreq->v_dmac = dc;
2364 		vreq->v_dmaw = 0;
2365 		vreq->v_ndmacs = ndcs;
2366 		vreq->v_ndmaws = ndws;
2367 		vreq->v_nslots = ndws;
2368 		vreq->v_status = VREQ_DMABUF_BOUND;
2369 		/*FALLTHRU*/
2370 
2371 	case VREQ_DMABUF_BOUND:
2372 		/*
2373 		 * get ge_slot, callback is set upon failure from gs_get(),
2374 		 * if not set previously
2375 		 */
2376 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2377 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2378 			    ddi_get_name_addr(vdp->xdf_dip)));
2379 			return (DDI_FAILURE);
2380 		}
2381 
2382 		vreq->v_gs = gs;
2383 		gs->vreq = vreq;
2384 		vreq->v_status = VREQ_GS_ALLOCED;
2385 		break;
2386 
2387 	case VREQ_GS_ALLOCED:
2388 		/* nothing need to be done */
2389 		break;
2390 
2391 	case VREQ_DMAWIN_DONE:
2392 		/*
2393 		 * move to the next dma window
2394 		 */
2395 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
2396 
2397 		/* get a ge_slot for this DMA window */
2398 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2399 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2400 			    ddi_get_name_addr(vdp->xdf_dip)));
2401 			return (DDI_FAILURE);
2402 		}
2403 
2404 		vreq->v_gs = gs;
2405 		gs->vreq = vreq;
2406 		vreq->v_dmaw++;
2407 		rc = ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
2408 		    &vreq->v_dmac, &vreq->v_ndmacs);
2409 		ASSERT(rc == DDI_SUCCESS);
2410 		vreq->v_status = VREQ_GS_ALLOCED;
2411 		break;
2412 
2413 	default:
2414 		return (DDI_FAILURE);
2415 	}
2416 
2417 	return (DDI_SUCCESS);
2418 }
2419 
2420 static ge_slot_t *
2421 gs_get(xdf_t *vdp, int isread)
2422 {
2423 	grant_ref_t gh;
2424 	ge_slot_t *gs;
2425 
2426 	/* try to alloc GTEs needed in this slot, first */
2427 	if (gnttab_alloc_grant_references(
2428 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
2429 		if (vdp->xdf_gnt_callback.next == NULL) {
2430 			SETDMACBON(vdp);
2431 			gnttab_request_free_callback(
2432 			    &vdp->xdf_gnt_callback,
2433 			    (void (*)(void *))xdf_dmacallback,
2434 			    (void *)vdp,
2435 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
2436 		}
2437 		return (NULL);
2438 	}
2439 
2440 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
2441 	if (gs == NULL) {
2442 		gnttab_free_grant_references(gh);
2443 		if (vdp->xdf_timeout_id == 0)
2444 			/* restart I/O after one second */
2445 			vdp->xdf_timeout_id =
2446 			    timeout(xdf_timeout_handler, vdp, hz);
2447 		return (NULL);
2448 	}
2449 
2450 	/* init gs_slot */
2451 	list_insert_head(&vdp->xdf_gs_act, (void *)gs);
2452 	gs->oeid = vdp->xdf_peer;
2453 	gs->isread = isread;
2454 	gs->ghead = gh;
2455 	gs->ngrefs = 0;
2456 
2457 	return (gs);
2458 }
2459 
2460 static void
2461 gs_free(xdf_t *vdp, ge_slot_t *gs)
2462 {
2463 	int i;
2464 	grant_ref_t *gp = gs->ge;
2465 	int ngrefs = gs->ngrefs;
2466 	boolean_t isread = gs->isread;
2467 
2468 	list_remove(&vdp->xdf_gs_act, (void *)gs);
2469 
2470 	/* release all grant table entry resources used in this slot */
2471 	for (i = 0; i < ngrefs; i++, gp++)
2472 		gnttab_end_foreign_access(*gp, !isread, 0);
2473 	gnttab_free_grant_references(gs->ghead);
2474 
2475 	kmem_cache_free(xdf_gs_cache, (void *)gs);
2476 }
2477 
2478 static grant_ref_t
2479 gs_grant(ge_slot_t *gs, mfn_t mfn)
2480 {
2481 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->ghead);
2482 
2483 	ASSERT(gr != -1);
2484 	ASSERT(gs->ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
2485 	gs->ge[gs->ngrefs++] = gr;
2486 	gnttab_grant_foreign_access_ref(gr, gs->oeid, mfn, !gs->isread);
2487 
2488 	return (gr);
2489 }
2490 
2491 static void
2492 unexpectedie(xdf_t *vdp)
2493 {
2494 	/* clean up I/Os in ring that have responses */
2495 	if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
2496 		mutex_exit(&vdp->xdf_dev_lk);
2497 		(void) xdf_intr((caddr_t)vdp);
2498 		mutex_enter(&vdp->xdf_dev_lk);
2499 	}
2500 
2501 	/* free up all grant table entries */
2502 	while (!list_is_empty(&vdp->xdf_gs_act))
2503 		gs_free(vdp, list_head(&vdp->xdf_gs_act));
2504 
2505 	/*
2506 	 * move bp back to active list orderly
2507 	 * vreq_busy is updated in vreq_free()
2508 	 */
2509 	while (!list_is_empty(&vdp->xdf_vreq_act)) {
2510 		v_req_t *vreq = list_head(&vdp->xdf_vreq_act);
2511 		buf_t *bp = vreq->v_buf;
2512 
2513 		bp->av_back = NULL;
2514 		bp->b_resid = bp->b_bcount;
2515 		if (vdp->xdf_f_act == NULL) {
2516 			vdp->xdf_f_act = vdp->xdf_l_act = bp;
2517 		} else {
2518 			/* move to the head of list */
2519 			bp->av_forw = vdp->xdf_f_act;
2520 			vdp->xdf_f_act = bp;
2521 		}
2522 		kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
2523 		vreq_free(vdp, vreq);
2524 	}
2525 }
2526 
2527 static void
2528 xdfmin(struct buf *bp)
2529 {
2530 	if (bp->b_bcount > xdf_maxphys)
2531 		bp->b_bcount = xdf_maxphys;
2532 }
2533