xref: /illumos-gate/usr/src/uts/common/xen/io/xdf.c (revision f8c3982ab1838a24e4b671d13329f52bbbebc2a7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * xdf.c - Xen Virtual Block Device Driver
29  * TODO:
30  *	- support alternate block size (currently only DEV_BSIZE supported)
31  *	- revalidate geometry for removable devices
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/dditypes.h>
40 #include <sys/sunddi.h>
41 #include <sys/list.h>
42 #include <sys/cmlb.h>
43 #include <sys/dkio.h>
44 #include <sys/vtoc.h>
45 #include <sys/modctl.h>
46 #include <sys/bootconf.h>
47 #include <sys/promif.h>
48 #include <sys/sysmacros.h>
49 #include <sys/kstat.h>
50 #include <sys/mach_mmu.h>
51 #ifdef XPV_HVM_DRIVER
52 #include <sys/xpv_support.h>
53 #endif
54 #include <public/io/xenbus.h>
55 #include <xen/sys/xenbus_impl.h>
56 #include <xen/sys/xendev.h>
57 #include <sys/gnttab.h>
58 #include <sys/scsi/generic/inquiry.h>
59 #include <io/xdf.h>
60 
#define	FLUSH_DISKCACHE	0x1
#define	WRITE_BARRIER	0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */
/*
 * True when the device has xdf_feature_barrier set but does not have
 * xdf_flush_supported set.
 */
#define	USE_WRITE_BARRIER(vdp)				\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
/* True when both xdf_feature_barrier and xdf_flush_supported are set. */
#define	USE_FLUSH_DISKCACHE(vdp)			\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
/*
 * A buf counts as a write barrier when it is not a read, write barriers
 * are in use for the device, and its data address is the device's
 * xdf_cache_flush_block.
 */
#define	IS_WRITE_BARRIER(vdp, bp)			\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&	\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
/*
 * A buf counts as a disk-cache flush when it is not a read, cache
 * flushes are in use for the device, and it carries no data
 * (b_bcount == 0).
 *
 * NOTE: the expansion refers to `vdp', which is not a parameter of this
 * macro; a variable of that name must be in scope at every use.
 */
#define	IS_FLUSH_DISKCACHE(bp)				\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))
73 
/* soft state anchor; one xdf_t per instance (see _init) */
static void *vbd_ss;
/* kmem cache of v_req_t objects, created in _init */
static kmem_cache_t *xdf_vreq_cache;
/* kmem cache of ge_slot_t objects, created in _init */
static kmem_cache_t *xdf_gs_cache;
static int xdf_maxphys = XB_MAXPHYS;
/* debug mask; refreshed from the "xdfdebug" property in xdf_attach */
int xdfdebug = 0;
/* when non-zero, xdf_strategy drains the I/O it just queued */
extern int do_polled_io;
/* block number written by DKIOCFLUSHWRITECACHE on the barrier path */
diskaddr_t xdf_flush_block = DEFAULT_FLUSH_BLOCK;
/* non-zero disables the barrier path of DKIOCFLUSHWRITECACHE */
int	xdf_barrier_flush_disable = 0;
82 
/*
 * dev_ops and cb_ops entrypoints (most are wired into xdf_cbops and
 * xdf_devops below), plus the interrupt handler
 */
static int xdf_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int xdf_attach(dev_info_t *, ddi_attach_cmd_t);
static int xdf_detach(dev_info_t *, ddi_detach_cmd_t);
static int xdf_reset(dev_info_t *, ddi_reset_cmd_t);
static int xdf_open(dev_t *, int, int, cred_t *);
static int xdf_close(dev_t, int, int, struct cred *);
static int xdf_strategy(struct buf *);
static int xdf_read(dev_t, struct uio *, cred_t *);
static int xdf_aread(dev_t, struct aio_req *, cred_t *);
static int xdf_write(dev_t, struct uio *, cred_t *);
static int xdf_awrite(dev_t, struct aio_req *, cred_t *);
static int xdf_dump(dev_t, caddr_t, daddr_t, int);
static int xdf_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static uint_t xdf_intr(caddr_t);
static int xdf_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
    caddr_t, int *);

/*
 * misc private functions
 */
static int xdf_suspend(dev_info_t *);
static int xdf_resume(dev_info_t *);
static int xdf_start_connect(xdf_t *);
static int xdf_start_disconnect(xdf_t *);
static int xdf_post_connect(xdf_t *);
static void xdf_post_disconnect(xdf_t *);
static void xdf_oe_change(dev_info_t *, ddi_eventcookie_t, void *, void *);
static void xdf_iostart(xdf_t *);
static void xdf_iofini(xdf_t *, uint64_t, int);
static int xdf_prepare_rreq(xdf_t *, struct buf *, blkif_request_t *);
static int xdf_drain_io(xdf_t *);
static boolean_t xdf_isopen(xdf_t *, int);
static int xdf_check_state_transition(xdf_t *, XenbusState);
static int xdf_connect(xdf_t *, boolean_t);
static int xdf_dmacallback(caddr_t);
static void xdf_timeout_handler(void *);
static uint_t xdf_iorestart(caddr_t);
/* v_req_t and ge_slot_t get/free helpers */
static v_req_t *vreq_get(xdf_t *, buf_t *);
static void vreq_free(xdf_t *, v_req_t *);
static int vreq_setup(xdf_t *, v_req_t *);
static ge_slot_t *gs_get(xdf_t *, int);
static void gs_free(xdf_t *, ge_slot_t *);
static grant_ref_t gs_grant(ge_slot_t *, mfn_t);
static void unexpectedie(xdf_t *);
/* minphys-style routine handed to physio() by xdf_read */
static void xdfmin(struct buf *);
131 
/*
 * Character/block entry points.  Field names in the comments follow the
 * positional order of struct cb_ops.
 */
static 	struct cb_ops xdf_cbops = {
	xdf_open,		/* cb_open */
	xdf_close,		/* cb_close */
	xdf_strategy,		/* cb_strategy */
	nodev,			/* cb_print */
	xdf_dump,		/* cb_dump */
	xdf_read,		/* cb_read */
	xdf_write,		/* cb_write */
	xdf_ioctl,		/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	nochpoll,		/* cb_chpoll */
	xdf_prop_op,		/* cb_prop_op */
	NULL,			/* cb_str: not a STREAMS driver */
	D_MP | D_NEW | D_64BIT,	/* cb_flag */
	CB_REV,			/* cb_rev */
	xdf_aread,		/* cb_aread */
	xdf_awrite		/* cb_awrite */
};
152 
/* Device operations vector; referenced from modldrv below. */
struct dev_ops xdf_devops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	xdf_getinfo,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	xdf_attach,		/* devo_attach */
	xdf_detach,		/* devo_detach */
	xdf_reset,		/* devo_reset */
	&xdf_cbops,		/* devo_cb_ops */
	(struct bus_ops *)NULL	/* devo_bus_ops */
};
165 
/* Driver linkage record; the single entry listed in xdf_modlinkage. */
static struct modldrv modldrv = {
	&mod_driverops,		/* Type of module.  This one is a driver */
	"virtual block driver %I%",	/* short description */
	&xdf_devops		/* driver specific ops */
};
171 
/*
 * Module linkage passed to mod_install(), mod_remove() and mod_info()
 * by _init(), _fini() and _info().
 */
static struct modlinkage xdf_modlinkage = {
	MODREV_1, (void *)&modldrv, NULL
};
175 
/*
 * I/O buffer DMA attributes
 * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
 *
 * Segments are capped at one page (PAGEOFFSET) and the scatter/gather
 * list length is BLKIF_MAX_SEGMENTS_PER_REQUEST.
 */
static ddi_dma_attr_t xb_dma_attr = {
	DMA_ATTR_V0,			/* attribute structure version */
	(uint64_t)0,			/* lowest address */
	(uint64_t)0xffffffffffffffff,	/* highest usable address */
	(uint64_t)0xffffff,		/* DMA counter limit max */
	(uint64_t)XB_BSIZE,		/* alignment in bytes */
	XB_BSIZE - 1,			/* bitmap of burst sizes */
	XB_BSIZE,			/* min transfer */
	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
	XB_BSIZE,			/* granularity */
	0,				/* flags (reserved) */
};
194 
/* Device access attributes: no byte swapping, strictly ordered access. */
static ddi_device_acc_attr_t xc_acc_attr = {
	DDI_DEVICE_ATTR_V0,	/* attribute structure version */
	DDI_NEVERSWAP_ACC,	/* endianness handling */
	DDI_STRICTORDER_ACC	/* data ordering */
};
200 
/* callbacks from common label */
202 
/* target read/write routine used by cmlb; also called from xdf_ioctl */
static int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
	void *);
/* target information query routine used by cmlb */
static int xdf_lb_getinfo(dev_info_t *, int, void *, void *);

/* target ops vector handed to cmlb_attach() in xdf_attach() */
static cmlb_tg_ops_t xdf_lb_ops = {
	TG_DK_OPS_VERSION_1,
	xdf_lb_rdwr,
	xdf_lb_getinfo
};
212 
213 int
214 _init(void)
215 {
216 	int rc;
217 
218 	if ((rc = ddi_soft_state_init(&vbd_ss, sizeof (xdf_t), 0)) == 0) {
219 		xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
220 		    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
221 		ASSERT(xdf_vreq_cache != NULL);
222 		xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
223 		    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
224 		ASSERT(xdf_gs_cache != NULL);
225 		if ((rc = mod_install(&xdf_modlinkage)) != 0) {
226 			kmem_cache_destroy(xdf_vreq_cache);
227 			kmem_cache_destroy(xdf_gs_cache);
228 			ddi_soft_state_fini(&vbd_ss);
229 		}
230 	}
231 
232 	return (rc);
233 }
234 
235 int
236 _fini(void)
237 {
238 	int err;
239 
240 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
241 		return (err);
242 
243 	kmem_cache_destroy(xdf_vreq_cache);
244 	kmem_cache_destroy(xdf_gs_cache);
245 	ddi_soft_state_fini(&vbd_ss);
246 
247 	return (0);
248 }
249 
250 int
251 _info(struct modinfo *modinfop)
252 {
253 	return (mod_info(&xdf_modlinkage, modinfop));
254 }
255 
256 /*ARGSUSED*/
257 static int
258 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
259 {
260 	int instance;
261 	xdf_t *vbdp;
262 
263 	instance = XDF_INST(getminor((dev_t)arg));
264 
265 	switch (cmd) {
266 	case DDI_INFO_DEVT2DEVINFO:
267 		if ((vbdp = ddi_get_soft_state(vbd_ss, instance)) == NULL) {
268 			*rp = NULL;
269 			return (DDI_FAILURE);
270 		}
271 		*rp = vbdp->xdf_dip;
272 		return (DDI_SUCCESS);
273 
274 	case DDI_INFO_DEVT2INSTANCE:
275 		*rp = (void *)(uintptr_t)instance;
276 		return (DDI_SUCCESS);
277 
278 	default:
279 		return (DDI_FAILURE);
280 	}
281 }
282 
283 static int
284 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
285 	char *name, caddr_t valuep, int *lengthp)
286 {
287 	int instance = ddi_get_instance(dip);
288 	xdf_t *vdp;
289 	diskaddr_t p_blkcnt;
290 
291 	/*
292 	 * xdf dynamic properties are device specific and size oriented.
293 	 * Requests issued under conditions where size is valid are passed
294 	 * to ddi_prop_op_nblocks with the size information, otherwise the
295 	 * request is passed to ddi_prop_op.
296 	 */
297 	vdp = ddi_get_soft_state(vbd_ss, instance);
298 
299 	if ((dev == DDI_DEV_T_ANY) || (vdp == NULL))
300 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
301 		    name, valuep, lengthp));
302 
303 	/* do cv_wait until connected or failed */
304 	mutex_enter(&vdp->xdf_dev_lk);
305 	if (xdf_connect(vdp, B_TRUE) != XD_READY) {
306 		mutex_exit(&vdp->xdf_dev_lk);
307 		goto out;
308 	}
309 	mutex_exit(&vdp->xdf_dev_lk);
310 
311 	if (cmlb_partinfo(vdp->xdf_vd_lbl, XDF_PART(getminor(dev)), &p_blkcnt,
312 	    NULL, NULL, NULL, NULL) == 0)
313 		return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
314 		    name, valuep, lengthp, (uint64_t)p_blkcnt));
315 
316 out:
317 	return (ddi_prop_op(dev, dip, prop_op, mod_flags, name, valuep,
318 	    lengthp));
319 }
320 
321 static int
322 xdf_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
323 {
324 	xdf_t *vdp;
325 	ddi_iblock_cookie_t ibc;
326 	ddi_iblock_cookie_t softibc;
327 	int instance;
328 #if defined(XPV_HVM_DRIVER) && defined(__i386)
329 	/* XXX: 6609126 32-bit xdf driver panics on a 64-bit dom0 */
330 	extern int xen_is_64bit;
331 
332 	if (xen_is_64bit) {
333 		cmn_err(CE_WARN, "xdf cannot be used in 32-bit domUs on a"
334 		    " 64-bit dom0.");
335 		return (DDI_FAILURE);
336 	}
337 #endif
338 
339 	xdfdebug = ddi_prop_get_int(DDI_DEV_T_ANY, devi, DDI_PROP_NOTPROM,
340 	    "xdfdebug", 0);
341 
342 	switch (cmd) {
343 		case DDI_ATTACH:
344 			break;
345 
346 		case DDI_RESUME:
347 			return (xdf_resume(devi));
348 
349 		default:
350 			return (DDI_FAILURE);
351 	}
352 
353 	instance = ddi_get_instance(devi);
354 	if (ddi_soft_state_zalloc(vbd_ss, instance) != DDI_SUCCESS)
355 		return (DDI_FAILURE);
356 
357 	DPRINTF(DDI_DBG, ("xdf%d: attaching\n", instance));
358 	vdp = ddi_get_soft_state(vbd_ss, instance);
359 	vdp->xdf_dip = devi;
360 	if (ddi_get_iblock_cookie(devi, 0, &ibc) != DDI_SUCCESS) {
361 		cmn_err(CE_WARN, "xdf@%s: failed to get iblock cookie",
362 		    ddi_get_name_addr(devi));
363 		goto errout1;
364 	}
365 
366 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
367 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
368 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
369 	ddi_set_driver_private(devi, vdp);
370 
371 	if (ddi_get_soft_iblock_cookie(devi, DDI_SOFTINT_LOW, &softibc)
372 	    != DDI_SUCCESS) {
373 		cmn_err(CE_WARN, "xdf@%s: failed to get softintr iblock cookie",
374 		    ddi_get_name_addr(devi));
375 		goto errout2;
376 	}
377 	if (ddi_add_softintr(devi, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
378 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
379 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
380 		    ddi_get_name_addr(devi));
381 		goto errout2;
382 	}
383 
384 	/*
385 	 * create kstat for iostat(1M)
386 	 */
387 	if ((vdp->xdf_xdev_iostat = kstat_create("xdf", instance, NULL, "disk",
388 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
389 		vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
390 		kstat_install(vdp->xdf_xdev_iostat);
391 	} else {
392 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
393 		    ddi_get_name_addr(devi));
394 		goto errout3;
395 	}
396 
397 	/*
398 	 * driver handles kernel-issued IOCTLs
399 	 */
400 	if (ddi_prop_create(DDI_DEV_T_NONE, devi, DDI_PROP_CANSLEEP,
401 	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
402 		cmn_err(CE_WARN, "xdf@%s: cannot create DDI_KERNEL_IOCTL prop",
403 		    ddi_get_name_addr(devi));
404 		goto errout4;
405 	}
406 
407 	/*
408 	 * create default device minor nodes: non-removable disk
409 	 * we will adjust minor nodes after we are connected w/ backend
410 	 */
411 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
412 	if (cmlb_attach(devi, &xdf_lb_ops, DTYPE_DIRECT, 0, 1, DDI_NT_BLOCK,
413 	    CMLB_FAKE_LABEL_ONE_PARTITION, vdp->xdf_vd_lbl, NULL) != 0) {
414 		cmn_err(CE_WARN, "xdf@%s: default cmlb attach failed",
415 		    ddi_get_name_addr(devi));
416 		goto errout5;
417 	}
418 
419 	/*
420 	 * We ship with cache-enabled disks
421 	 */
422 	vdp->xdf_wce = 1;
423 
424 	mutex_enter(&vdp->xdf_cb_lk);
425 
426 	/* Watch backend XenbusState change */
427 	if (xvdi_add_event_handler(devi, XS_OE_STATE,
428 	    xdf_oe_change) != DDI_SUCCESS) {
429 		mutex_exit(&vdp->xdf_cb_lk);
430 		goto errout6;
431 	}
432 
433 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
434 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
435 		    ddi_get_name_addr(devi));
436 		(void) xdf_start_disconnect(vdp);
437 		mutex_exit(&vdp->xdf_cb_lk);
438 		goto errout7;
439 	}
440 
441 	mutex_exit(&vdp->xdf_cb_lk);
442 
443 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
444 	    offsetof(v_req_t, v_link));
445 	list_create(&vdp->xdf_gs_act, sizeof (ge_slot_t),
446 	    offsetof(ge_slot_t, link));
447 
448 	ddi_report_dev(devi);
449 	DPRINTF(DDI_DBG, ("xdf%d: attached\n", instance));
450 
451 	return (DDI_SUCCESS);
452 
453 errout7:
454 	xvdi_remove_event_handler(devi, XS_OE_STATE);
455 errout6:
456 	cmlb_detach(vdp->xdf_vd_lbl, NULL);
457 errout5:
458 	cmlb_free_handle(&vdp->xdf_vd_lbl);
459 	ddi_prop_remove_all(devi);
460 errout4:
461 	kstat_delete(vdp->xdf_xdev_iostat);
462 errout3:
463 	ddi_remove_softintr(vdp->xdf_softintr_id);
464 errout2:
465 	ddi_set_driver_private(devi, NULL);
466 	cv_destroy(&vdp->xdf_dev_cv);
467 	mutex_destroy(&vdp->xdf_cb_lk);
468 	mutex_destroy(&vdp->xdf_dev_lk);
469 errout1:
470 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(devi));
471 	ddi_soft_state_free(vbd_ss, instance);
472 	return (DDI_FAILURE);
473 }
474 
475 static int
476 xdf_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
477 {
478 	xdf_t *vdp;
479 	int instance;
480 
481 	switch (cmd) {
482 
483 	case DDI_PM_SUSPEND:
484 		break;
485 
486 	case DDI_SUSPEND:
487 		return (xdf_suspend(devi));
488 
489 	case DDI_DETACH:
490 		break;
491 
492 	default:
493 		return (DDI_FAILURE);
494 	}
495 
496 	instance = ddi_get_instance(devi);
497 	DPRINTF(DDI_DBG, ("xdf%d: detaching\n", instance));
498 	vdp = ddi_get_soft_state(vbd_ss, instance);
499 
500 	if (vdp == NULL)
501 		return (DDI_FAILURE);
502 
503 	mutex_enter(&vdp->xdf_dev_lk);
504 	if (xdf_isopen(vdp, -1)) {
505 		mutex_exit(&vdp->xdf_dev_lk);
506 		return (DDI_FAILURE);
507 	}
508 
509 	if (vdp->xdf_status != XD_CLOSED) {
510 		mutex_exit(&vdp->xdf_dev_lk);
511 		return (DDI_FAILURE);
512 	}
513 
514 	ASSERT(!ISDMACBON(vdp));
515 	mutex_exit(&vdp->xdf_dev_lk);
516 
517 	if (vdp->xdf_timeout_id != 0)
518 		(void) untimeout(vdp->xdf_timeout_id);
519 
520 	xvdi_remove_event_handler(devi, XS_OE_STATE);
521 
522 	/* we'll support backend running in domU later */
523 #ifdef	DOMU_BACKEND
524 	(void) xvdi_post_event(devi, XEN_HP_REMOVE);
525 #endif
526 
527 	list_destroy(&vdp->xdf_vreq_act);
528 	list_destroy(&vdp->xdf_gs_act);
529 	ddi_prop_remove_all(devi);
530 	kstat_delete(vdp->xdf_xdev_iostat);
531 	ddi_remove_softintr(vdp->xdf_softintr_id);
532 	ddi_set_driver_private(devi, NULL);
533 	cv_destroy(&vdp->xdf_dev_cv);
534 	mutex_destroy(&vdp->xdf_cb_lk);
535 	mutex_destroy(&vdp->xdf_dev_lk);
536 	if (vdp->xdf_cache_flush_block != NULL)
537 		kmem_free(vdp->xdf_flush_mem, 2 * DEV_BSIZE);
538 	ddi_soft_state_free(vbd_ss, instance);
539 	return (DDI_SUCCESS);
540 }
541 
542 static int
543 xdf_suspend(dev_info_t *devi)
544 {
545 	xdf_t *vdp;
546 	int instance;
547 	enum xdf_state st;
548 
549 	instance = ddi_get_instance(devi);
550 
551 	if (xdfdebug & SUSRES_DBG)
552 		xen_printf("xdf_suspend: xdf#%d\n", instance);
553 
554 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
555 		return (DDI_FAILURE);
556 
557 	xvdi_suspend(devi);
558 
559 	mutex_enter(&vdp->xdf_cb_lk);
560 	mutex_enter(&vdp->xdf_dev_lk);
561 	st = vdp->xdf_status;
562 	/* change status to stop further I/O requests */
563 	if (st == XD_READY)
564 		vdp->xdf_status = XD_SUSPEND;
565 	mutex_exit(&vdp->xdf_dev_lk);
566 	mutex_exit(&vdp->xdf_cb_lk);
567 
568 	/* make sure no more I/O responses left in the ring buffer */
569 	if ((st == XD_INIT) || (st == XD_READY)) {
570 #ifdef XPV_HVM_DRIVER
571 		ec_unbind_evtchn(vdp->xdf_evtchn);
572 #else
573 		(void) ddi_remove_intr(devi, 0, NULL);
574 #endif
575 		(void) xdf_drain_io(vdp);
576 		/*
577 		 * no need to teardown the ring buffer here
578 		 * it will be simply re-init'ed during resume when
579 		 * we call xvdi_alloc_ring
580 		 */
581 	}
582 
583 	if (xdfdebug & SUSRES_DBG)
584 		xen_printf("xdf_suspend: SUCCESS\n");
585 
586 	return (DDI_SUCCESS);
587 }
588 
589 /*ARGSUSED*/
590 static int
591 xdf_resume(dev_info_t *devi)
592 {
593 	xdf_t *vdp;
594 	int instance;
595 
596 	instance = ddi_get_instance(devi);
597 	if (xdfdebug & SUSRES_DBG)
598 		xen_printf("xdf_resume: xdf%d\n", instance);
599 
600 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
601 		return (DDI_FAILURE);
602 
603 	mutex_enter(&vdp->xdf_cb_lk);
604 
605 	if (xvdi_resume(devi) != DDI_SUCCESS) {
606 		mutex_exit(&vdp->xdf_cb_lk);
607 		return (DDI_FAILURE);
608 	}
609 
610 	mutex_enter(&vdp->xdf_dev_lk);
611 	ASSERT(vdp->xdf_status != XD_READY);
612 	vdp->xdf_status = XD_UNKNOWN;
613 	mutex_exit(&vdp->xdf_dev_lk);
614 
615 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
616 		mutex_exit(&vdp->xdf_cb_lk);
617 		return (DDI_FAILURE);
618 	}
619 
620 	mutex_exit(&vdp->xdf_cb_lk);
621 
622 	if (xdfdebug & SUSRES_DBG)
623 		xen_printf("xdf_resume: done\n");
624 	return (DDI_SUCCESS);
625 }
626 
627 /*ARGSUSED*/
628 static int
629 xdf_reset(dev_info_t *devi, ddi_reset_cmd_t cmd)
630 {
631 	xdf_t *vdp;
632 	int instance;
633 
634 	instance = ddi_get_instance(devi);
635 	DPRINTF(DDI_DBG, ("xdf%d: resetting\n", instance));
636 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
637 		return (DDI_FAILURE);
638 
639 	/*
640 	 * wait for any outstanding I/O to complete
641 	 */
642 	(void) xdf_drain_io(vdp);
643 
644 	DPRINTF(DDI_DBG, ("xdf%d: reset complete\n", instance));
645 	return (DDI_SUCCESS);
646 }
647 
648 static int
649 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
650 {
651 	minor_t	minor;
652 	xdf_t	*vdp;
653 	int part;
654 	ulong_t parbit;
655 	diskaddr_t p_blkct = 0;
656 	boolean_t firstopen;
657 	boolean_t nodelay;
658 
659 	nodelay = (flag & (FNDELAY | FNONBLOCK));
660 	minor = getminor(*devp);
661 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
662 		return (ENXIO);
663 
664 	DPRINTF(DDI_DBG, ("xdf%d: opening\n", XDF_INST(minor)));
665 
666 	/* do cv_wait until connected or failed */
667 	mutex_enter(&vdp->xdf_dev_lk);
668 	if (!nodelay && (xdf_connect(vdp, B_TRUE) != XD_READY)) {
669 		mutex_exit(&vdp->xdf_dev_lk);
670 		return (ENXIO);
671 	}
672 
673 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
674 		mutex_exit(&vdp->xdf_dev_lk);
675 		return (EROFS);
676 	}
677 
678 	part = XDF_PART(minor);
679 	parbit = 1 << part;
680 	if (vdp->xdf_vd_exclopen & parbit) {
681 		mutex_exit(&vdp->xdf_dev_lk);
682 		return (EBUSY);
683 	}
684 
685 	/* are we the first one to open this node? */
686 	firstopen = !xdf_isopen(vdp, -1);
687 
688 	if ((flag & FEXCL) && !firstopen) {
689 		mutex_exit(&vdp->xdf_dev_lk);
690 		return (EBUSY);
691 	}
692 
693 	if (otyp == OTYP_LYR)
694 		vdp->xdf_vd_lyropen[part]++;
695 
696 	vdp->xdf_vd_open[otyp] |= parbit;
697 
698 	if (flag & FEXCL)
699 		vdp->xdf_vd_exclopen |= parbit;
700 
701 	mutex_exit(&vdp->xdf_dev_lk);
702 
703 	/* force a re-validation */
704 	if (firstopen)
705 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
706 
707 	/*
708 	 * check size
709 	 * ignore CD/DVD which contains a zero-sized s0
710 	 */
711 	if (!nodelay && !XD_IS_CD(vdp) &&
712 	    ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
713 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0))) {
714 		(void) xdf_close(*devp, flag, otyp, credp);
715 		return (ENXIO);
716 	}
717 
718 	return (0);
719 }
720 
721 /*ARGSUSED*/
722 static int
723 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
724 {
725 	minor_t	minor;
726 	xdf_t	*vdp;
727 	int part;
728 	ulong_t parbit;
729 
730 	minor = getminor(dev);
731 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
732 		return (ENXIO);
733 
734 	mutex_enter(&vdp->xdf_dev_lk);
735 	part = XDF_PART(minor);
736 	if (!xdf_isopen(vdp, part)) {
737 		mutex_exit(&vdp->xdf_dev_lk);
738 		return (ENXIO);
739 	}
740 	parbit = 1 << part;
741 
742 	if (otyp == OTYP_LYR) {
743 		if (vdp->xdf_vd_lyropen[part] != 0)
744 			vdp->xdf_vd_lyropen[part]--;
745 		if (vdp->xdf_vd_lyropen[part] == 0)
746 			vdp->xdf_vd_open[OTYP_LYR] &= ~parbit;
747 	} else {
748 		vdp->xdf_vd_open[otyp] &= ~parbit;
749 	}
750 	vdp->xdf_vd_exclopen &= ~parbit;
751 
752 	mutex_exit(&vdp->xdf_dev_lk);
753 	return (0);
754 }
755 
756 static int
757 xdf_strategy(struct buf *bp)
758 {
759 	xdf_t	*vdp;
760 	minor_t minor;
761 	diskaddr_t p_blkct, p_blkst;
762 	ulong_t nblks;
763 	int part;
764 
765 	minor = getminor(bp->b_edev);
766 	part = XDF_PART(minor);
767 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) ||
768 	    !xdf_isopen(vdp, part) ||
769 	    cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
770 	    &p_blkst, NULL, NULL, NULL)) {
771 		bioerror(bp, ENXIO);
772 		bp->b_resid = bp->b_bcount;
773 		biodone(bp);
774 		return (0);
775 	}
776 
777 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
778 		bioerror(bp, EROFS);
779 		bp->b_resid = bp->b_bcount;
780 		biodone(bp);
781 		return (0);
782 	}
783 
784 	/*
785 	 * starting beyond partition
786 	 */
787 	if (bp->b_blkno > p_blkct) {
788 		DPRINTF(IO_DBG, ("xdf: block %lld exceeds VBD size %"PRIu64,
789 		    (longlong_t)bp->b_blkno, (uint64_t)p_blkct));
790 		bioerror(bp, EINVAL);
791 		bp->b_resid = bp->b_bcount;
792 		biodone(bp);
793 		return (0);
794 	}
795 
796 	/* Legacy: don't set error flag at this case */
797 	if (bp->b_blkno == p_blkct) {
798 		bp->b_resid = bp->b_bcount;
799 		biodone(bp);
800 		return (0);
801 	}
802 
803 	/*
804 	 * adjust for partial transfer
805 	 */
806 	nblks = bp->b_bcount >> XB_BSHIFT;
807 	if ((bp->b_blkno + nblks) > p_blkct) {
808 		bp->b_resid = ((bp->b_blkno + nblks) - p_blkct) << XB_BSHIFT;
809 		bp->b_bcount -= bp->b_resid;
810 	}
811 
812 
813 	DPRINTF(IO_DBG, ("xdf: strategy blk %lld len %lu\n",
814 	    (longlong_t)bp->b_blkno, (ulong_t)bp->b_bcount));
815 
816 	mutex_enter(&vdp->xdf_dev_lk);
817 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
818 	if (vdp->xdf_f_act == NULL) {
819 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
820 	} else {
821 		vdp->xdf_l_act->av_forw = bp;
822 		vdp->xdf_l_act = bp;
823 	}
824 	bp->av_forw = NULL;
825 	bp->av_back = NULL; /* not tagged with a v_req */
826 	bp->b_private = (void *)(uintptr_t)p_blkst;
827 	mutex_exit(&vdp->xdf_dev_lk);
828 	xdf_iostart(vdp);
829 	if (do_polled_io)
830 		(void) xdf_drain_io(vdp);
831 	return (0);
832 }
833 
834 /*ARGSUSED*/
835 static int
836 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
837 {
838 
839 	xdf_t	*vdp;
840 	minor_t minor;
841 	diskaddr_t p_blkcnt;
842 	int part;
843 
844 	minor = getminor(dev);
845 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
846 		return (ENXIO);
847 
848 	DPRINTF(IO_DBG, ("xdf: read offset 0x%"PRIx64"\n",
849 	    (int64_t)uiop->uio_offset));
850 
851 	part = XDF_PART(minor);
852 	if (!xdf_isopen(vdp, part))
853 		return (ENXIO);
854 
855 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
856 	    NULL, NULL, NULL, NULL))
857 		return (ENXIO);
858 
859 	if (U_INVAL(uiop))
860 		return (EINVAL);
861 
862 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
863 }
864 
865 /*ARGSUSED*/
866 static int
867 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
868 {
869 	xdf_t *vdp;
870 	minor_t minor;
871 	diskaddr_t p_blkcnt;
872 	int part;
873 
874 	minor = getminor(dev);
875 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
876 		return (ENXIO);
877 
878 	DPRINTF(IO_DBG, ("xdf: write offset 0x%"PRIx64"\n",
879 	    (int64_t)uiop->uio_offset));
880 
881 	part = XDF_PART(minor);
882 	if (!xdf_isopen(vdp, part))
883 		return (ENXIO);
884 
885 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
886 	    NULL, NULL, NULL, NULL))
887 		return (ENXIO);
888 
889 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
890 		return (ENOSPC);
891 
892 	if (U_INVAL(uiop))
893 		return (EINVAL);
894 
895 	return (physio(xdf_strategy, NULL, dev, B_WRITE, minphys, uiop));
896 }
897 
898 /*ARGSUSED*/
899 static int
900 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
901 {
902 	xdf_t	*vdp;
903 	minor_t minor;
904 	struct uio *uiop = aiop->aio_uio;
905 	diskaddr_t p_blkcnt;
906 	int part;
907 
908 	minor = getminor(dev);
909 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
910 		return (ENXIO);
911 
912 	part = XDF_PART(minor);
913 	if (!xdf_isopen(vdp, part))
914 		return (ENXIO);
915 
916 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
917 	    NULL, NULL, NULL, NULL))
918 		return (ENXIO);
919 
920 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
921 		return (ENOSPC);
922 
923 	if (U_INVAL(uiop))
924 		return (EINVAL);
925 
926 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, minphys, aiop));
927 }
928 
929 /*ARGSUSED*/
930 static int
931 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
932 {
933 	xdf_t *vdp;
934 	minor_t minor;
935 	struct uio *uiop = aiop->aio_uio;
936 	diskaddr_t p_blkcnt;
937 	int part;
938 
939 	minor = getminor(dev);
940 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
941 		return (ENXIO);
942 
943 	part = XDF_PART(minor);
944 	if (!xdf_isopen(vdp, part))
945 		return (ENXIO);
946 
947 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
948 	    NULL, NULL, NULL, NULL))
949 		return (ENXIO);
950 
951 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
952 		return (ENOSPC);
953 
954 	if (U_INVAL(uiop))
955 		return (EINVAL);
956 
957 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, minphys, aiop));
958 }
959 
960 static int
961 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
962 {
963 	struct buf dumpbuf, *dbp;
964 	xdf_t	*vdp;
965 	minor_t minor;
966 	int err = 0;
967 	int part;
968 	diskaddr_t p_blkcnt, p_blkst;
969 
970 	minor = getminor(dev);
971 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
972 		return (ENXIO);
973 
974 	DPRINTF(IO_DBG, ("xdf: dump addr (0x%p) blk (%ld) nblks (%d)\n",
975 	    addr, blkno, nblk));
976 
977 	part = XDF_PART(minor);
978 	if (!xdf_isopen(vdp, part))
979 		return (ENXIO);
980 
981 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
982 	    NULL, NULL, NULL))
983 		return (ENXIO);
984 
985 	if ((blkno + nblk) > p_blkcnt) {
986 		cmn_err(CE_WARN, "xdf: block %ld exceeds VBD size %"PRIu64,
987 		    blkno + nblk, (uint64_t)vdp->xdf_xdev_nblocks);
988 		return (EINVAL);
989 	}
990 
991 	dbp = &dumpbuf;
992 	bioinit(dbp);
993 	dbp->b_flags = B_BUSY;
994 	dbp->b_un.b_addr = addr;
995 	dbp->b_bcount	= nblk << DEV_BSHIFT;
996 	dbp->b_resid = 0;
997 	dbp->b_blkno = blkno;
998 	dbp->b_edev = dev;
999 	dbp->b_private = (void *)(uintptr_t)p_blkst;
1000 
1001 	mutex_enter(&vdp->xdf_dev_lk);
1002 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1003 	if (vdp->xdf_f_act == NULL) {
1004 		vdp->xdf_f_act = vdp->xdf_l_act = dbp;
1005 	} else {
1006 		vdp->xdf_l_act->av_forw = dbp;
1007 		vdp->xdf_l_act = dbp;
1008 	}
1009 	dbp->av_forw = NULL;
1010 	dbp->av_back = NULL;
1011 	mutex_exit(&vdp->xdf_dev_lk);
1012 	xdf_iostart(vdp);
1013 	err = xdf_drain_io(vdp);
1014 	biofini(dbp);
1015 	return (err);
1016 }
1017 
1018 /*ARGSUSED*/
1019 static int
1020 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1021     int *rvalp)
1022 {
1023 	int instance;
1024 	xdf_t	*vdp;
1025 	minor_t minor;
1026 	int part;
1027 
1028 	minor = getminor(dev);
1029 	instance = XDF_INST(minor);
1030 
1031 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
1032 		return (ENXIO);
1033 
1034 	DPRINTF(IOCTL_DBG, ("xdf%d:ioctl: cmd %d (0x%x)\n",
1035 	    instance, cmd, cmd));
1036 
1037 	part = XDF_PART(minor);
1038 	if (!xdf_isopen(vdp, part))
1039 		return (ENXIO);
1040 
1041 	switch (cmd) {
1042 	case DKIOCGMEDIAINFO: {
1043 		struct dk_minfo	media_info;
1044 
1045 		media_info.dki_lbsize = DEV_BSIZE;
1046 		media_info.dki_capacity = vdp->xdf_xdev_nblocks;
1047 		media_info.dki_media_type = DK_FIXED_DISK;
1048 
1049 		if (ddi_copyout(&media_info, (void *)arg,
1050 		    sizeof (struct dk_minfo), mode)) {
1051 			return (EFAULT);
1052 		} else {
1053 			return (0);
1054 		}
1055 	}
1056 
1057 	case DKIOCINFO: {
1058 		struct dk_cinfo info;
1059 
1060 		/* controller information */
1061 		if (XD_IS_CD(vdp))
1062 			info.dki_ctype = DKC_CDROM;
1063 		else
1064 			info.dki_ctype = DKC_VBD;
1065 
1066 		info.dki_cnum = 0;
1067 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
1068 
1069 		/* unit information */
1070 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
1071 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
1072 		info.dki_flags = DKI_FMTVOL;
1073 		info.dki_partition = part;
1074 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
1075 		info.dki_addr = 0;
1076 		info.dki_space = 0;
1077 		info.dki_prio = 0;
1078 		info.dki_vec = 0;
1079 
1080 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
1081 			return (EFAULT);
1082 		else
1083 			return (0);
1084 	}
1085 
1086 	case DKIOCSTATE: {
1087 		enum dkio_state	dkstate = DKIO_INSERTED;
1088 		if (ddi_copyout(&dkstate, (void *)arg, sizeof (dkstate),
1089 		    mode) != 0)
1090 			return (EFAULT);
1091 		return (0);
1092 	}
1093 
1094 	/*
1095 	 * is media removable?
1096 	 */
1097 	case DKIOCREMOVABLE: {
1098 		int i = XD_IS_RM(vdp) ? 1 : 0;
1099 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), mode))
1100 			return (EFAULT);
1101 		return (0);
1102 	}
1103 
1104 	case DKIOCG_PHYGEOM:
1105 	case DKIOCG_VIRTGEOM:
1106 	case DKIOCGGEOM:
1107 	case DKIOCSGEOM:
1108 	case DKIOCGAPART:
1109 	case DKIOCGVTOC:
1110 	case DKIOCSVTOC:
1111 	case DKIOCPARTINFO:
1112 	case DKIOCGETEFI:
1113 	case DKIOCSETEFI:
1114 	case DKIOCPARTITION: {
1115 		int rc;
1116 
1117 		rc = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
1118 		    rvalp, NULL);
1119 		return (rc);
1120 	}
1121 
1122 	case DKIOCGETWCE:
1123 		if (ddi_copyout(&vdp->xdf_wce, (void *)arg,
1124 		    sizeof (vdp->xdf_wce), mode))
1125 			return (EFAULT);
1126 		return (0);
1127 	case DKIOCSETWCE:
1128 		if (ddi_copyin((void *)arg, &vdp->xdf_wce,
1129 		    sizeof (vdp->xdf_wce), mode))
1130 			return (EFAULT);
1131 		return (0);
1132 	case DKIOCFLUSHWRITECACHE: {
1133 		int rc;
1134 		struct dk_callback *dkc = (struct dk_callback *)arg;
1135 
1136 		if (vdp->xdf_flush_supported) {
1137 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1138 			    NULL, 0, 0, (void *)dev);
1139 		} else if (vdp->xdf_feature_barrier &&
1140 		    !xdf_barrier_flush_disable) {
1141 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1142 			    vdp->xdf_cache_flush_block, xdf_flush_block,
1143 			    DEV_BSIZE, (void *)dev);
1144 		} else {
1145 			return (ENOTTY);
1146 		}
1147 		if ((mode & FKIOCTL) && (dkc != NULL) &&
1148 		    (dkc->dkc_callback != NULL)) {
1149 			(*dkc->dkc_callback)(dkc->dkc_cookie, rc);
1150 			/* need to return 0 after calling callback */
1151 			rc = 0;
1152 		}
1153 		return (rc);
1154 	}
1155 
1156 	default:
1157 		return (ENOTTY);
1158 	}
1159 }
1160 
1161 /*
1162  * xdf interrupt handler
1163  */
1164 static uint_t
1165 xdf_intr(caddr_t arg)
1166 {
1167 	xdf_t *vdp = (xdf_t *)arg;
1168 	xendev_ring_t *xbr;
1169 	blkif_response_t *resp;
1170 	int bioerr;
1171 	uint64_t id;
1172 	extern int do_polled_io;
1173 	uint8_t op;
1174 	uint16_t status;
1175 	ddi_acc_handle_t acchdl;
1176 
1177 	mutex_enter(&vdp->xdf_dev_lk);
1178 
1179 	if ((xbr = vdp->xdf_xb_ring) == NULL) {
1180 		mutex_exit(&vdp->xdf_dev_lk);
1181 		return (DDI_INTR_UNCLAIMED);
1182 	}
1183 
1184 	acchdl = vdp->xdf_xb_ring_hdl;
1185 
1186 	/*
1187 	 * complete all requests which have a response
1188 	 */
1189 	while (resp = xvdi_ring_get_response(xbr)) {
1190 		id = ddi_get64(acchdl, &resp->id);
1191 		op = ddi_get8(acchdl, &resp->operation);
1192 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
1193 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
1194 		    op, id, status));
1195 
1196 		/*
1197 		 * XXPV - close connection to the backend and restart
1198 		 */
1199 		if (status != BLKIF_RSP_OKAY) {
1200 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
1201 			    ddi_get_name_addr(vdp->xdf_dip),
1202 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
1203 			bioerr = EIO;
1204 		} else {
1205 			bioerr = 0;
1206 		}
1207 
1208 		xdf_iofini(vdp, id, bioerr);
1209 	}
1210 
1211 	mutex_exit(&vdp->xdf_dev_lk);
1212 
1213 	if (!do_polled_io)
1214 		xdf_iostart(vdp);
1215 
1216 	return (DDI_INTR_CLAIMED);
1217 }
1218 
1219 int xdf_fbrewrites;	/* how many times was our flush block rewritten */
1220 
1221 /*
1222  * Snarf new data if our flush block was re-written
1223  */
1224 static void
1225 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
1226 {
1227 	int nblks;
1228 	boolean_t mapin;
1229 
1230 	if (IS_WRITE_BARRIER(vdp, bp))
1231 		return; /* write was a flush write */
1232 
1233 	mapin = B_FALSE;
1234 	nblks = bp->b_bcount >> DEV_BSHIFT;
1235 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
1236 		xdf_fbrewrites++;
1237 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
1238 			mapin = B_TRUE;
1239 			bp_mapin(bp);
1240 		}
1241 		bcopy(bp->b_un.b_addr +
1242 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
1243 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
1244 		if (mapin)
1245 			bp_mapout(bp);
1246 	}
1247 }
1248 
1249 static void
1250 xdf_iofini(xdf_t *vdp, uint64_t id, int bioerr)
1251 {
1252 	ge_slot_t *gs = (ge_slot_t *)(uintptr_t)id;
1253 	v_req_t *vreq = gs->vreq;
1254 	buf_t *bp = vreq->v_buf;
1255 
1256 	gs_free(vdp, gs);
1257 	if (bioerr)
1258 		bioerror(bp, bioerr);
1259 	vreq->v_nslots--;
1260 	if (vreq->v_nslots != 0)
1261 		return;
1262 
1263 	XDF_UPDATE_IO_STAT(vdp, bp);
1264 	kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1265 
1266 	if (IS_ERROR(bp))
1267 		bp->b_resid = bp->b_bcount;
1268 
1269 	vreq_free(vdp, vreq);
1270 	biodone(bp);
1271 }
1272 
1273 /*
1274  * return value of xdf_prepare_rreq()
1275  * used in xdf_iostart()
1276  */
1277 #define	XF_PARTIAL	0 /* rreq is full, not all I/O in buf transferred */
1278 #define	XF_COMP		1 /* no more I/O left in buf */
1279 
1280 static void
1281 xdf_iostart(xdf_t *vdp)
1282 {
1283 	xendev_ring_t *xbr;
1284 	struct buf *bp;
1285 	blkif_request_t *rreq;
1286 	int retval;
1287 	int rreqready = 0;
1288 
1289 	xbr = vdp->xdf_xb_ring;
1290 
1291 	/*
1292 	 * populate the ring request(s)
1293 	 *
1294 	 * loop until there is no buf to transfer or no free slot
1295 	 * available in I/O ring
1296 	 */
1297 	mutex_enter(&vdp->xdf_dev_lk);
1298 
1299 	for (;;) {
1300 		if (vdp->xdf_status != XD_READY)
1301 			break;
1302 
1303 		/* active buf queue empty? */
1304 		if ((bp = vdp->xdf_f_act) == NULL)
1305 			break;
1306 
1307 		/* try to grab a vreq for this bp */
1308 		if ((BP2VREQ(bp) == NULL) && (vreq_get(vdp, bp) == NULL))
1309 				break;
1310 		/* alloc DMA/GTE resources */
1311 		if (vreq_setup(vdp, BP2VREQ(bp)) != DDI_SUCCESS)
1312 			break;
1313 
1314 		/* get next blkif_request in the ring */
1315 		if ((rreq = xvdi_ring_get_request(xbr)) == NULL)
1316 			break;
1317 		bzero(rreq, sizeof (blkif_request_t));
1318 
1319 		/* populate blkif_request with this buf */
1320 		rreqready++;
1321 		retval = xdf_prepare_rreq(vdp, bp, rreq);
1322 		if (retval == XF_COMP) {
1323 			/* finish this bp, switch to next one */
1324 			kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1325 			vdp->xdf_f_act = bp->av_forw;
1326 			bp->av_forw = NULL;
1327 		}
1328 	}
1329 
1330 	/*
1331 	 * Send the request(s) to the backend
1332 	 */
1333 	if (rreqready) {
1334 		if (xvdi_ring_push_request(xbr)) {
1335 			DPRINTF(IO_DBG, ("xdf_iostart: "
1336 			    "sent request(s) to backend\n"));
1337 			xvdi_notify_oe(vdp->xdf_dip);
1338 		}
1339 	}
1340 
1341 	mutex_exit(&vdp->xdf_dev_lk);
1342 }
1343 
1344 /*
1345  * populate a single blkif_request_t w/ a buf
1346  */
1347 static int
1348 xdf_prepare_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1349 {
1350 	int		rval;
1351 	grant_ref_t	gr;
1352 	uint8_t		fsect, lsect;
1353 	size_t		bcnt;
1354 	paddr_t		dma_addr;
1355 	off_t		blk_off;
1356 	dev_info_t	*dip = vdp->xdf_dip;
1357 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1358 	v_req_t		*vreq = BP2VREQ(bp);
1359 	uint64_t	blkno = vreq->v_blkno;
1360 	uint_t		ndmacs = vreq->v_ndmacs;
1361 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1362 	int		seg = 0;
1363 	int		isread = IS_READ(bp);
1364 
1365 	if (isread)
1366 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1367 	else {
1368 		switch (vreq->v_flush_diskcache) {
1369 		case FLUSH_DISKCACHE:
1370 			ddi_put8(acchdl, &rreq->operation,
1371 			    BLKIF_OP_FLUSH_DISKCACHE);
1372 			ddi_put16(acchdl, &rreq->handle, vdev);
1373 			ddi_put64(acchdl, &rreq->id,
1374 			    (uint64_t)(uintptr_t)(vreq->v_gs));
1375 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1376 			return (XF_COMP);
1377 		case WRITE_BARRIER:
1378 			ddi_put8(acchdl, &rreq->operation,
1379 			    BLKIF_OP_WRITE_BARRIER);
1380 			break;
1381 		default:
1382 			if (!vdp->xdf_wce)
1383 				ddi_put8(acchdl, &rreq->operation,
1384 				    BLKIF_OP_WRITE_BARRIER);
1385 			else
1386 				ddi_put8(acchdl, &rreq->operation,
1387 				    BLKIF_OP_WRITE);
1388 			break;
1389 		}
1390 	}
1391 
1392 	ddi_put16(acchdl, &rreq->handle, vdev);
1393 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1394 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(vreq->v_gs));
1395 
1396 	/*
1397 	 * loop until all segments are populated or no more dma cookie in buf
1398 	 */
1399 	for (;;) {
1400 	/*
1401 	 * Each segment of a blkif request can transfer up to
1402 	 * one 4K page of data.
1403 	 */
1404 		bcnt = vreq->v_dmac.dmac_size;
1405 		ASSERT(bcnt <= PAGESIZE);
1406 		ASSERT((bcnt % XB_BSIZE) == 0);
1407 		dma_addr = vreq->v_dmac.dmac_laddress;
1408 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1409 		ASSERT((blk_off & XB_BMASK) == 0);
1410 		fsect = blk_off >> XB_BSHIFT;
1411 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1412 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1413 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1414 		DPRINTF(IO_DBG, ("  ""seg%d: dmacS %lu blk_off %ld\n",
1415 		    seg, vreq->v_dmac.dmac_size, blk_off));
1416 		gr = gs_grant(vreq->v_gs, PATOMA(dma_addr) >> PAGESHIFT);
1417 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1418 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1419 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1420 		DPRINTF(IO_DBG, ("  ""seg%d: fs %d ls %d gr %d dma 0x%"PRIx64
1421 		    "\n", seg, fsect, lsect, gr, dma_addr));
1422 
1423 		blkno += (bcnt >> XB_BSHIFT);
1424 		seg++;
1425 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1426 		if (--ndmacs) {
1427 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1428 			continue;
1429 		}
1430 
1431 		vreq->v_status = VREQ_DMAWIN_DONE;
1432 		vreq->v_blkno = blkno;
1433 		if (vreq->v_dmaw + 1 == vreq->v_ndmaws)
1434 			/* last win */
1435 			rval = XF_COMP;
1436 		else
1437 			rval = XF_PARTIAL;
1438 		break;
1439 	}
1440 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1441 	DPRINTF(IO_DBG, ("xdf_prepare_rreq: request id=%"PRIx64" ready\n",
1442 	    rreq->id));
1443 
1444 	return (rval);
1445 }
1446 
1447 #define	XDF_QSEC	50000	/* .005 second */
1448 #define	XDF_POLLCNT	12	/* loop for 12 times before time out */
1449 
1450 static int
1451 xdf_drain_io(xdf_t *vdp)
1452 {
1453 	int pollc, rval;
1454 	xendev_ring_t *xbr;
1455 
1456 	if (xdfdebug & SUSRES_DBG)
1457 		xen_printf("xdf_drain_io: start\n");
1458 
1459 	mutex_enter(&vdp->xdf_dev_lk);
1460 
1461 	if ((vdp->xdf_status != XD_READY) && (vdp->xdf_status != XD_SUSPEND))
1462 		goto out;
1463 
1464 	rval = 0;
1465 	xbr = vdp->xdf_xb_ring;
1466 	ASSERT(xbr != NULL);
1467 
1468 	for (pollc = 0; pollc < XDF_POLLCNT; pollc++) {
1469 		if (xvdi_ring_has_unconsumed_responses(xbr)) {
1470 			mutex_exit(&vdp->xdf_dev_lk);
1471 			(void) xdf_intr((caddr_t)vdp);
1472 			mutex_enter(&vdp->xdf_dev_lk);
1473 		}
1474 		if (!xvdi_ring_has_incomp_request(xbr))
1475 			goto out;
1476 
1477 #ifndef	XPV_HVM_DRIVER
1478 		(void) HYPERVISOR_yield();
1479 #endif
1480 		/*
1481 		 * file-backed devices can be slow
1482 		 */
1483 		drv_usecwait(XDF_QSEC << pollc);
1484 	}
1485 	cmn_err(CE_WARN, "xdf_polled_io: timeout");
1486 	rval = EIO;
1487 out:
1488 	mutex_exit(&vdp->xdf_dev_lk);
1489 	if (xdfdebug & SUSRES_DBG)
1490 		xen_printf("xdf_drain_io: end, err=%d\n", rval);
1491 	return (rval);
1492 }
1493 
1494 /* ARGSUSED5 */
1495 static int
1496 xdf_lb_rdwr(dev_info_t *devi, uchar_t cmd, void *bufp,
1497     diskaddr_t start, size_t reqlen, void *tg_cookie)
1498 {
1499 	xdf_t *vdp;
1500 	struct buf *bp;
1501 	int err = 0;
1502 
1503 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1504 	if (vdp == NULL)
1505 		return (ENXIO);
1506 
1507 	if ((start + (reqlen >> DEV_BSHIFT)) > vdp->xdf_xdev_nblocks)
1508 		return (EINVAL);
1509 
1510 	bp = getrbuf(KM_SLEEP);
1511 	if (cmd == TG_READ)
1512 		bp->b_flags = B_BUSY | B_READ;
1513 	else
1514 		bp->b_flags = B_BUSY | B_WRITE;
1515 	bp->b_un.b_addr = bufp;
1516 	bp->b_bcount = reqlen;
1517 	bp->b_resid = 0;
1518 	bp->b_blkno = start;
1519 	bp->av_forw = NULL;
1520 	bp->av_back = NULL;
1521 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
1522 
1523 	mutex_enter(&vdp->xdf_dev_lk);
1524 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1525 	if (vdp->xdf_f_act == NULL) {
1526 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
1527 	} else {
1528 		vdp->xdf_l_act->av_forw = bp;
1529 		vdp->xdf_l_act = bp;
1530 	}
1531 	mutex_exit(&vdp->xdf_dev_lk);
1532 	xdf_iostart(vdp);
1533 	err = biowait(bp);
1534 
1535 	ASSERT(bp->b_flags & B_DONE);
1536 
1537 	freerbuf(bp);
1538 	return (err);
1539 }
1540 
1541 /*
1542  * synthetic geometry
1543  */
1544 #define	XDF_NSECTS	256
1545 #define	XDF_NHEADS	16
1546 
1547 static int
1548 xdf_lb_getcap(dev_info_t *devi, diskaddr_t *capp)
1549 {
1550 	xdf_t *vdp;
1551 
1552 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1553 
1554 	if (vdp == NULL)
1555 		return (ENXIO);
1556 
1557 	mutex_enter(&vdp->xdf_dev_lk);
1558 	*capp = vdp->xdf_xdev_nblocks;
1559 	DPRINTF(LBL_DBG, ("capacity %llu\n", *capp));
1560 	mutex_exit(&vdp->xdf_dev_lk);
1561 	return (0);
1562 }
1563 
1564 static int
1565 xdf_lb_getpgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1566 {
1567 	xdf_t *vdp;
1568 	uint_t ncyl;
1569 	uint_t spc = XDF_NHEADS * XDF_NSECTS;
1570 
1571 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1572 
1573 	if (vdp == NULL)
1574 		return (ENXIO);
1575 
1576 	ncyl = vdp->xdf_xdev_nblocks / spc;
1577 
1578 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1579 	geomp->g_acyl = 0;
1580 	geomp->g_nhead = XDF_NHEADS;
1581 	geomp->g_secsize = XB_BSIZE;
1582 	geomp->g_nsect = XDF_NSECTS;
1583 	geomp->g_intrlv = 0;
1584 	geomp->g_rpm = 7200;
1585 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1586 	return (0);
1587 }
1588 
1589 /*
1590  * No real HBA, no geometry available from it
1591  */
1592 /*ARGSUSED*/
1593 static int
1594 xdf_lb_getvgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1595 {
1596 	return (EINVAL);
1597 }
1598 
1599 static int
1600 xdf_lb_getattribute(dev_info_t *devi, tg_attribute_t *tgattributep)
1601 {
1602 	xdf_t *vdp;
1603 
1604 	if (!(vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))))
1605 		return (ENXIO);
1606 
1607 	if (XD_IS_RO(vdp))
1608 		tgattributep->media_is_writable = 0;
1609 	else
1610 		tgattributep->media_is_writable = 1;
1611 	return (0);
1612 }
1613 
1614 /* ARGSUSED3 */
1615 static int
1616 xdf_lb_getinfo(dev_info_t *devi, int cmd, void *arg, void *tg_cookie)
1617 {
1618 	switch (cmd) {
1619 	case TG_GETPHYGEOM:
1620 		return (xdf_lb_getpgeom(devi, (cmlb_geom_t *)arg));
1621 	case TG_GETVIRTGEOM:
1622 		return (xdf_lb_getvgeom(devi, (cmlb_geom_t *)arg));
1623 	case TG_GETCAPACITY:
1624 		return (xdf_lb_getcap(devi, (diskaddr_t *)arg));
1625 	case TG_GETBLOCKSIZE:
1626 		*(uint32_t *)arg = XB_BSIZE;
1627 		return (0);
1628 	case TG_GETATTR:
1629 		return (xdf_lb_getattribute(devi, (tg_attribute_t *)arg));
1630 	default:
1631 		return (ENOTTY);
1632 	}
1633 }
1634 
1635 /*
1636  * Kick-off connect process
1637  * Status should be XD_UNKNOWN or XD_CLOSED
1638  * On success, status will be changed to XD_INIT
1639  * On error, status won't be changed
1640  */
1641 static int
1642 xdf_start_connect(xdf_t *vdp)
1643 {
1644 	char *xsnode;
1645 	grant_ref_t gref;
1646 	xenbus_transaction_t xbt;
1647 	int rv;
1648 	dev_info_t *dip = vdp->xdf_dip;
1649 
1650 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == (domid_t)-1)
1651 		goto errout;
1652 
1653 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS) {
1654 		cmn_err(CE_WARN, "xdf@%s: failed to alloc event channel",
1655 		    ddi_get_name_addr(dip));
1656 		goto errout;
1657 	}
1658 	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1659 #ifdef XPV_HVM_DRIVER
1660 	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1661 #else
1662 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1663 	    DDI_SUCCESS) {
1664 		cmn_err(CE_WARN, "xdf_start_connect: xdf@%s: "
1665 		    "failed to add intr handler", ddi_get_name_addr(dip));
1666 		goto errout1;
1667 	}
1668 #endif
1669 
1670 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1671 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1672 	    DDI_SUCCESS) {
1673 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1674 		    ddi_get_name_addr(dip));
1675 		goto errout2;
1676 	}
1677 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1678 
1679 	/*
1680 	 * Write into xenstore the info needed by backend
1681 	 */
1682 	if ((xsnode = xvdi_get_xsname(dip)) == NULL) {
1683 		cmn_err(CE_WARN, "xdf@%s: "
1684 		    "failed to get xenstore node path",
1685 		    ddi_get_name_addr(dip));
1686 		goto fail_trans;
1687 	}
1688 trans_retry:
1689 	if (xenbus_transaction_start(&xbt)) {
1690 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1691 		    ddi_get_name_addr(dip));
1692 		xvdi_fatal_error(dip, EIO, "transaction start");
1693 		goto fail_trans;
1694 	}
1695 
1696 	if (rv = xenbus_printf(xbt, xsnode, "ring-ref", "%u", gref)) {
1697 		cmn_err(CE_WARN, "xdf@%s: failed to write ring-ref",
1698 		    ddi_get_name_addr(dip));
1699 		xvdi_fatal_error(dip, rv, "writing ring-ref");
1700 		goto abort_trans;
1701 	}
1702 
1703 	if (rv = xenbus_printf(xbt, xsnode, "event-channel", "%u",
1704 	    vdp->xdf_evtchn)) {
1705 		cmn_err(CE_WARN, "xdf@%s: failed to write event-channel",
1706 		    ddi_get_name_addr(dip));
1707 		xvdi_fatal_error(dip, rv, "writing event-channel");
1708 		goto abort_trans;
1709 	}
1710 
1711 	if ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0) {
1712 		cmn_err(CE_WARN, "xdf@%s: "
1713 		    "failed to switch state to XenbusStateInitialised",
1714 		    ddi_get_name_addr(dip));
1715 		xvdi_fatal_error(dip, rv, "writing state");
1716 		goto abort_trans;
1717 	}
1718 
1719 	/* kick-off connect process */
1720 	if (rv = xenbus_transaction_end(xbt, 0)) {
1721 		if (rv == EAGAIN)
1722 			goto trans_retry;
1723 		cmn_err(CE_WARN, "xdf@%s: failed to end transaction",
1724 		    ddi_get_name_addr(dip));
1725 		xvdi_fatal_error(dip, rv, "completing transaction");
1726 		goto fail_trans;
1727 	}
1728 
1729 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1730 	mutex_enter(&vdp->xdf_dev_lk);
1731 	vdp->xdf_status = XD_INIT;
1732 	mutex_exit(&vdp->xdf_dev_lk);
1733 
1734 	return (DDI_SUCCESS);
1735 
1736 abort_trans:
1737 	(void) xenbus_transaction_end(xbt, 1);
1738 fail_trans:
1739 	xvdi_free_ring(vdp->xdf_xb_ring);
1740 errout2:
1741 #ifdef XPV_HVM_DRIVER
1742 	ec_unbind_evtchn(vdp->xdf_evtchn);
1743 #else
1744 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1745 #endif
1746 errout1:
1747 	xvdi_free_evtchn(dip);
1748 errout:
1749 	cmn_err(CE_WARN, "xdf@%s: fail to kick-off connecting",
1750 	    ddi_get_name_addr(dip));
1751 	return (DDI_FAILURE);
1752 }
1753 
1754 /*
1755  * Kick-off disconnect process
1756  * Status won't be changed
1757  */
1758 static int
1759 xdf_start_disconnect(xdf_t *vdp)
1760 {
1761 	if (xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed) > 0) {
1762 		cmn_err(CE_WARN, "xdf@%s: fail to kick-off disconnecting",
1763 		    ddi_get_name_addr(vdp->xdf_dip));
1764 		return (DDI_FAILURE);
1765 	}
1766 
1767 	return (DDI_SUCCESS);
1768 }
1769 
1770 int
1771 xdf_get_flush_block(xdf_t *vdp)
1772 {
1773 	/*
1774 	 * Get a DEV_BSIZE aligned bufer
1775 	 */
1776 	vdp->xdf_flush_mem = kmem_alloc(DEV_BSIZE * 2, KM_SLEEP);
1777 	vdp->xdf_cache_flush_block =
1778 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem), DEV_BSIZE);
1779 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1780 	    xdf_flush_block, DEV_BSIZE, NULL) != 0)
1781 		return (DDI_FAILURE);
1782 	return (DDI_SUCCESS);
1783 }
1784 
1785 /*
1786  * Finish other initialization after we've connected to backend
1787  * Status should be XD_INIT before calling this routine
1788  * On success, status should be changed to XD_READY
1789  * On error, status should stay XD_INIT
1790  */
1791 static int
1792 xdf_post_connect(xdf_t *vdp)
1793 {
1794 	int rv;
1795 	uint_t len;
1796 	char *type;
1797 	char *barrier;
1798 	dev_info_t *devi = vdp->xdf_dip;
1799 
1800 	/*
1801 	 * Determine if feature barrier is supported by backend
1802 	 */
1803 	if (xenbus_read(XBT_NULL, xvdi_get_oename(devi),
1804 	    "feature-barrier", (void **)&barrier, &len) == 0) {
1805 		vdp->xdf_feature_barrier = 1;
1806 		kmem_free(barrier, len);
1807 	} else {
1808 		cmn_err(CE_NOTE, "xdf@%s: failed to read feature-barrier",
1809 		    ddi_get_name_addr(vdp->xdf_dip));
1810 		vdp->xdf_feature_barrier = 0;
1811 	}
1812 
1813 	/* probe backend */
1814 	if (rv = xenbus_gather(XBT_NULL, xvdi_get_oename(devi),
1815 	    "sectors", "%"SCNu64, &vdp->xdf_xdev_nblocks,
1816 	    "info", "%u", &vdp->xdf_xdev_info, NULL)) {
1817 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1818 		    "cannot read backend info", ddi_get_name_addr(devi));
1819 		xvdi_fatal_error(devi, rv, "reading backend info");
1820 		return (DDI_FAILURE);
1821 	}
1822 
1823 	/* fix disk type */
1824 	if (xenbus_read(XBT_NULL, xvdi_get_xsname(devi), "device-type",
1825 	    (void **)&type, &len) != 0) {
1826 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1827 		    "cannot read device-type", ddi_get_name_addr(devi));
1828 		xvdi_fatal_error(devi, rv, "reading device-type");
1829 		return (DDI_FAILURE);
1830 	}
1831 	if (strcmp(type, "cdrom") == 0)
1832 		vdp->xdf_xdev_info |= VDISK_CDROM;
1833 	kmem_free(type, len);
1834 
1835 	/*
1836 	 * We've created all the minor nodes via cmlb_attach() using default
1837 	 * value in xdf_attach() to make it possible to block in xdf_open(),
1838 	 * in case there's anyone (say, booting thread) ever trying to open
1839 	 * it before connected to backend. We will refresh all those minor
1840 	 * nodes w/ latest info we've got now when we are almost connected.
1841 	 *
1842 	 * Don't do this when xdf is already opened by someone (could happen
1843 	 * during resume), for that cmlb_attach() will invalid the label info
1844 	 * and confuse those who has already opened the node, which is bad.
1845 	 */
1846 	if (!xdf_isopen(vdp, -1) && (XD_IS_CD(vdp) || XD_IS_RM(vdp))) {
1847 		/* re-init cmlb w/ latest info we got from backend */
1848 		if (cmlb_attach(devi, &xdf_lb_ops,
1849 		    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
1850 		    XD_IS_RM(vdp), 1, DDI_NT_BLOCK,
1851 		    CMLB_FAKE_LABEL_ONE_PARTITION,
1852 		    vdp->xdf_vd_lbl, NULL) != 0) {
1853 			cmn_err(CE_WARN, "xdf@%s: cmlb attach failed",
1854 			    ddi_get_name_addr(devi));
1855 			return (DDI_FAILURE);
1856 		}
1857 	}
1858 
1859 	/* mark vbd is ready for I/O */
1860 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1861 	mutex_enter(&vdp->xdf_dev_lk);
1862 	vdp->xdf_status = XD_READY;
1863 	mutex_exit(&vdp->xdf_dev_lk);
1864 	/*
1865 	 * If backend has feature-barrier, see if it supports disk
1866 	 * cache flush op.
1867 	 */
1868 	vdp->xdf_flush_supported = 0;
1869 	if (vdp->xdf_feature_barrier) {
1870 		/*
1871 		 * Pretend we already know flush is supported so probe
1872 		 * will attempt the correct op.
1873 		 */
1874 		vdp->xdf_flush_supported = 1;
1875 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1876 			vdp->xdf_flush_supported = 1;
1877 		} else {
1878 			vdp->xdf_flush_supported = 0;
1879 			/*
1880 			 * If the other end does not support the cache flush op
1881 			 * then we must use a barrier-write to force disk
1882 			 * cache flushing.  Barrier writes require that a data
1883 			 * block actually be written.
1884 			 * Cache a block to barrier-write when we are
1885 			 * asked to perform a flush.
1886 			 * XXX - would it be better to just copy 1 block
1887 			 * (512 bytes) from whatever write we did last
1888 			 * and rewrite that block?
1889 			 */
1890 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS)
1891 				return (DDI_FAILURE);
1892 		}
1893 	}
1894 
1895 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", ddi_get_name_addr(devi),
1896 	    (uint64_t)vdp->xdf_xdev_nblocks);
1897 
1898 	return (DDI_SUCCESS);
1899 }
1900 
1901 /*
1902  * Finish other uninitialization after we've disconnected from backend
1903  * when status is XD_CLOSING or XD_INIT. After returns, status is XD_CLOSED
1904  */
1905 static void
1906 xdf_post_disconnect(xdf_t *vdp)
1907 {
1908 #ifdef XPV_HVM_DRIVER
1909 	ec_unbind_evtchn(vdp->xdf_evtchn);
1910 #else
1911 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1912 #endif
1913 	xvdi_free_evtchn(vdp->xdf_dip);
1914 	xvdi_free_ring(vdp->xdf_xb_ring);
1915 	vdp->xdf_xb_ring = NULL;
1916 	vdp->xdf_xb_ring_hdl = NULL;
1917 	vdp->xdf_peer = (domid_t)-1;
1918 
1919 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1920 	mutex_enter(&vdp->xdf_dev_lk);
1921 	vdp->xdf_status = XD_CLOSED;
1922 	mutex_exit(&vdp->xdf_dev_lk);
1923 }
1924 
1925 /*ARGSUSED*/
1926 static void
1927 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
1928 {
1929 	XenbusState new_state = *(XenbusState *)impl_data;
1930 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
1931 	boolean_t unexpect_die = B_FALSE;
1932 	int status;
1933 
1934 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
1935 	    ddi_get_name_addr(dip), new_state));
1936 
1937 	mutex_enter(&vdp->xdf_cb_lk);
1938 
1939 	if (xdf_check_state_transition(vdp, new_state) == DDI_FAILURE) {
1940 		mutex_exit(&vdp->xdf_cb_lk);
1941 		return;
1942 	}
1943 
1944 	switch (new_state) {
1945 	case XenbusStateInitialising:
1946 		ASSERT(vdp->xdf_status == XD_CLOSED);
1947 		/*
1948 		 * backend recovered from a previous failure,
1949 		 * kick-off connect process again
1950 		 */
1951 		if (xdf_start_connect(vdp) != DDI_SUCCESS) {
1952 			cmn_err(CE_WARN, "xdf@%s:"
1953 			    " failed to start reconnecting to backend",
1954 			    ddi_get_name_addr(dip));
1955 		}
1956 		break;
1957 	case XenbusStateConnected:
1958 		ASSERT(vdp->xdf_status == XD_INIT);
1959 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1960 		/* finish final init after connect */
1961 		if (xdf_post_connect(vdp) != DDI_SUCCESS)
1962 			(void) xdf_start_disconnect(vdp);
1963 		break;
1964 	case XenbusStateClosing:
1965 		if (vdp->xdf_status == XD_READY) {
1966 			mutex_enter(&vdp->xdf_dev_lk);
1967 			if (xdf_isopen(vdp, -1)) {
1968 				cmn_err(CE_NOTE, "xdf@%s: hot-unplug failed, "
1969 				    "still in use", ddi_get_name_addr(dip));
1970 				mutex_exit(&vdp->xdf_dev_lk);
1971 				break;
1972 			} else {
1973 				vdp->xdf_status = XD_CLOSING;
1974 			}
1975 			mutex_exit(&vdp->xdf_dev_lk);
1976 		}
1977 		(void) xdf_start_disconnect(vdp);
1978 		break;
1979 	case XenbusStateClosed:
1980 		/* first check if BE closed unexpectedly */
1981 		mutex_enter(&vdp->xdf_dev_lk);
1982 		if (xdf_isopen(vdp, -1)) {
1983 			unexpect_die = B_TRUE;
1984 			unexpectedie(vdp);
1985 			cmn_err(CE_WARN, "xdf@%s: backend closed, "
1986 			    "reconnecting...", ddi_get_name_addr(dip));
1987 		}
1988 		mutex_exit(&vdp->xdf_dev_lk);
1989 
1990 		if (vdp->xdf_status == XD_READY) {
1991 			mutex_enter(&vdp->xdf_dev_lk);
1992 			vdp->xdf_status = XD_CLOSING;
1993 			mutex_exit(&vdp->xdf_dev_lk);
1994 
1995 #ifdef	DOMU_BACKEND
1996 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
1997 #endif
1998 
1999 			xdf_post_disconnect(vdp);
2000 			(void) xvdi_switch_state(dip, XBT_NULL,
2001 			    XenbusStateClosed);
2002 		} else if ((vdp->xdf_status == XD_INIT) ||
2003 		    (vdp->xdf_status == XD_CLOSING)) {
2004 			xdf_post_disconnect(vdp);
2005 		} else {
2006 			mutex_enter(&vdp->xdf_dev_lk);
2007 			vdp->xdf_status = XD_CLOSED;
2008 			mutex_exit(&vdp->xdf_dev_lk);
2009 		}
2010 	}
2011 
2012 	/* notify anybody waiting for oe state change */
2013 	mutex_enter(&vdp->xdf_dev_lk);
2014 	cv_broadcast(&vdp->xdf_dev_cv);
2015 	mutex_exit(&vdp->xdf_dev_lk);
2016 
2017 	status = vdp->xdf_status;
2018 	mutex_exit(&vdp->xdf_cb_lk);
2019 
2020 	if (status == XD_READY) {
2021 		xdf_iostart(vdp);
2022 	} else if ((status == XD_CLOSED) && !unexpect_die) {
2023 		/* interface is closed successfully, remove all minor nodes */
2024 		cmlb_detach(vdp->xdf_vd_lbl, NULL);
2025 		cmlb_free_handle(&vdp->xdf_vd_lbl);
2026 	}
2027 }
2028 
2029 /* check if partition is open, -1 - check all partitions on the disk */
2030 static boolean_t
2031 xdf_isopen(xdf_t *vdp, int partition)
2032 {
2033 	int i;
2034 	ulong_t parbit;
2035 	boolean_t rval = B_FALSE;
2036 
2037 	if (partition == -1)
2038 		parbit = (ulong_t)-1;
2039 	else
2040 		parbit = 1 << partition;
2041 
2042 	for (i = 0; i < OTYPCNT; i++) {
2043 		if (vdp->xdf_vd_open[i] & parbit)
2044 			rval = B_TRUE;
2045 	}
2046 
2047 	return (rval);
2048 }
2049 
2050 /*
2051  * Xdf_check_state_transition will check the XenbusState change to see
2052  * if the change is a valid transition or not.
2053  * The new state is written by backend domain, or by running xenstore-write
2054  * to change it manually in dom0
2055  */
2056 static int
2057 xdf_check_state_transition(xdf_t *vdp, XenbusState oestate)
2058 {
2059 	int status;
2060 	int stcheck;
2061 #define	STOK	0 /* need further process */
2062 #define	STNOP	1 /* no action need taking */
2063 #define	STBUG	2 /* unexpected state change, could be a bug */
2064 
2065 	status = vdp->xdf_status;
2066 	stcheck = STOK;
2067 
2068 	switch (status) {
2069 	case XD_UNKNOWN:
2070 		if ((oestate == XenbusStateUnknown)		||
2071 		    (oestate == XenbusStateConnected))
2072 			stcheck = STBUG;
2073 		else if ((oestate == XenbusStateInitialising)	||
2074 		    (oestate == XenbusStateInitWait)		||
2075 		    (oestate == XenbusStateInitialised))
2076 			stcheck = STNOP;
2077 		break;
2078 	case XD_INIT:
2079 		if (oestate == XenbusStateUnknown)
2080 			stcheck = STBUG;
2081 		else if ((oestate == XenbusStateInitialising)	||
2082 		    (oestate == XenbusStateInitWait)		||
2083 		    (oestate == XenbusStateInitialised))
2084 			stcheck = STNOP;
2085 		break;
2086 	case XD_READY:
2087 		if ((oestate == XenbusStateUnknown)		||
2088 		    (oestate == XenbusStateInitialising)	||
2089 		    (oestate == XenbusStateInitWait)		||
2090 		    (oestate == XenbusStateInitialised))
2091 			stcheck = STBUG;
2092 		else if (oestate == XenbusStateConnected)
2093 			stcheck = STNOP;
2094 		break;
2095 	case XD_CLOSING:
2096 		if ((oestate == XenbusStateUnknown)		||
2097 		    (oestate == XenbusStateInitialising)	||
2098 		    (oestate == XenbusStateInitWait)		||
2099 		    (oestate == XenbusStateInitialised)		||
2100 		    (oestate == XenbusStateConnected))
2101 			stcheck = STBUG;
2102 		else if (oestate == XenbusStateClosing)
2103 			stcheck = STNOP;
2104 		break;
2105 	case XD_CLOSED:
2106 		if ((oestate == XenbusStateUnknown)		||
2107 		    (oestate == XenbusStateConnected))
2108 			stcheck = STBUG;
2109 		else if ((oestate == XenbusStateInitWait)	||
2110 		    (oestate == XenbusStateInitialised)		||
2111 		    (oestate == XenbusStateClosing)		||
2112 		    (oestate == XenbusStateClosed))
2113 			stcheck = STNOP;
2114 		break;
2115 	case XD_SUSPEND:
2116 	default:
2117 			stcheck = STBUG;
2118 	}
2119 
2120 	if (stcheck == STOK)
2121 		return (DDI_SUCCESS);
2122 
2123 	if (stcheck == STBUG)
2124 		cmn_err(CE_NOTE, "xdf@%s: unexpected otherend "
2125 		    "state change to %d!, when status is %d",
2126 		    ddi_get_name_addr(vdp->xdf_dip), oestate, status);
2127 
2128 	return (DDI_FAILURE);
2129 }
2130 
2131 static int
2132 xdf_connect(xdf_t *vdp, boolean_t wait)
2133 {
2134 	ASSERT(mutex_owned(&vdp->xdf_dev_lk));
2135 	while (vdp->xdf_status != XD_READY) {
2136 		if (!wait || (vdp->xdf_status > XD_READY))
2137 			break;
2138 
2139 		if (cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk) == 0)
2140 			break;
2141 	}
2142 
2143 	return (vdp->xdf_status);
2144 }
2145 
2146 /*
2147  * callback func when DMA/GTE resources is available
2148  *
2149  * Note: we only register one callback function to grant table subsystem
2150  * since we only have one 'struct gnttab_free_callback' in xdf_t.
2151  */
2152 static int
2153 xdf_dmacallback(caddr_t arg)
2154 {
2155 	xdf_t *vdp = (xdf_t *)arg;
2156 	ASSERT(vdp != NULL);
2157 
2158 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
2159 	    ddi_get_name_addr(vdp->xdf_dip)));
2160 
2161 	ddi_trigger_softintr(vdp->xdf_softintr_id);
2162 	return (DDI_DMA_CALLBACK_DONE);
2163 }
2164 
2165 static uint_t
2166 xdf_iorestart(caddr_t arg)
2167 {
2168 	xdf_t *vdp = (xdf_t *)arg;
2169 
2170 	ASSERT(vdp != NULL);
2171 
2172 	mutex_enter(&vdp->xdf_dev_lk);
2173 	ASSERT(ISDMACBON(vdp));
2174 	SETDMACBOFF(vdp);
2175 	mutex_exit(&vdp->xdf_dev_lk);
2176 
2177 	xdf_iostart(vdp);
2178 
2179 	return (DDI_INTR_CLAIMED);
2180 }
2181 
2182 static void
2183 xdf_timeout_handler(void *arg)
2184 {
2185 	xdf_t *vdp = arg;
2186 
2187 	mutex_enter(&vdp->xdf_dev_lk);
2188 	vdp->xdf_timeout_id = 0;
2189 	mutex_exit(&vdp->xdf_dev_lk);
2190 
2191 	/* new timeout thread could be re-scheduled */
2192 	xdf_iostart(vdp);
2193 }
2194 
2195 /*
2196  * Alloc a vreq for this bp
2197  * bp->av_back contains the pointer to the vreq upon return
2198  */
2199 static v_req_t *
2200 vreq_get(xdf_t *vdp, buf_t *bp)
2201 {
2202 	v_req_t *vreq = NULL;
2203 
2204 	ASSERT(BP2VREQ(bp) == NULL);
2205 
2206 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
2207 	if (vreq == NULL) {
2208 		if (vdp->xdf_timeout_id == 0)
2209 			/* restart I/O after one second */
2210 			vdp->xdf_timeout_id =
2211 			    timeout(xdf_timeout_handler, vdp, hz);
2212 		return (NULL);
2213 	}
2214 	bzero(vreq, sizeof (v_req_t));
2215 
2216 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
2217 	bp->av_back = (buf_t *)vreq;
2218 	vreq->v_buf = bp;
2219 	vreq->v_status = VREQ_INIT;
2220 	/* init of other fields in vreq is up to the caller */
2221 
2222 	return (vreq);
2223 }
2224 
2225 static void
2226 vreq_free(xdf_t *vdp, v_req_t *vreq)
2227 {
2228 	buf_t *bp = vreq->v_buf;
2229 
2230 	list_remove(&vdp->xdf_vreq_act, (void *)vreq);
2231 
2232 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
2233 		goto done;
2234 
2235 	switch (vreq->v_status) {
2236 	case VREQ_DMAWIN_DONE:
2237 	case VREQ_GS_ALLOCED:
2238 	case VREQ_DMABUF_BOUND:
2239 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
2240 		/*FALLTHRU*/
2241 	case VREQ_DMAMEM_ALLOCED:
2242 		if (!ALIGNED_XFER(bp)) {
2243 			ASSERT(vreq->v_abuf != NULL);
2244 			if (!IS_ERROR(bp) && IS_READ(bp))
2245 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
2246 				    bp->b_bcount);
2247 			ddi_dma_mem_free(&vreq->v_align);
2248 		}
2249 		/*FALLTHRU*/
2250 	case VREQ_MEMDMAHDL_ALLOCED:
2251 		if (!ALIGNED_XFER(bp))
2252 			ddi_dma_free_handle(&vreq->v_memdmahdl);
2253 		/*FALLTHRU*/
2254 	case VREQ_DMAHDL_ALLOCED:
2255 		ddi_dma_free_handle(&vreq->v_dmahdl);
2256 		break;
2257 	default:
2258 		break;
2259 	}
2260 done:
2261 	vreq->v_buf->av_back = NULL;
2262 	kmem_cache_free(xdf_vreq_cache, vreq);
2263 }
2264 
2265 /*
2266  * Initalize the DMA and grant table resources for the buf
2267  */
2268 static int
2269 vreq_setup(xdf_t *vdp, v_req_t *vreq)
2270 {
2271 	int rc;
2272 	ddi_dma_attr_t dmaattr;
2273 	uint_t ndcs, ndws;
2274 	ddi_dma_handle_t dh;
2275 	ddi_dma_handle_t mdh;
2276 	ddi_dma_cookie_t dc;
2277 	ddi_acc_handle_t abh;
2278 	caddr_t	aba;
2279 	ge_slot_t *gs;
2280 	size_t bufsz;
2281 	off_t off;
2282 	size_t sz;
2283 	buf_t *bp = vreq->v_buf;
2284 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
2285 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
2286 
2287 	switch (vreq->v_status) {
2288 	case VREQ_INIT:
2289 		if (IS_FLUSH_DISKCACHE(bp)) {
2290 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2291 				DPRINTF(DMA_DBG, (
2292 				    "xdf@%s: get ge_slotfailed\n",
2293 				    ddi_get_name_addr(vdp->xdf_dip)));
2294 				return (DDI_FAILURE);
2295 			}
2296 			vreq->v_blkno = 0;
2297 			vreq->v_nslots = 1;
2298 			vreq->v_gs = gs;
2299 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
2300 			vreq->v_status = VREQ_GS_ALLOCED;
2301 			gs->vreq = vreq;
2302 			return (DDI_SUCCESS);
2303 		}
2304 
2305 		if (IS_WRITE_BARRIER(vdp, bp))
2306 			vreq->v_flush_diskcache = WRITE_BARRIER;
2307 		vreq->v_blkno = bp->b_blkno +
2308 		    (diskaddr_t)(uintptr_t)bp->b_private;
2309 		bp->b_private = NULL;
2310 		/* See if we wrote new data to our flush block */
2311 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
2312 			check_fbwrite(vdp, bp, vreq->v_blkno);
2313 		vreq->v_status = VREQ_INIT_DONE;
2314 		/*FALLTHRU*/
2315 
2316 	case VREQ_INIT_DONE:
2317 		/*
2318 		 * alloc DMA handle
2319 		 */
2320 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
2321 		    xdf_dmacallback, (caddr_t)vdp, &dh);
2322 		if (rc != DDI_SUCCESS) {
2323 			SETDMACBON(vdp);
2324 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
2325 			    ddi_get_name_addr(vdp->xdf_dip)));
2326 			return (DDI_FAILURE);
2327 		}
2328 
2329 		vreq->v_dmahdl = dh;
2330 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
2331 		/*FALLTHRU*/
2332 
2333 	case VREQ_DMAHDL_ALLOCED:
2334 		/*
2335 		 * alloc dma handle for 512-byte aligned buf
2336 		 */
2337 		if (!ALIGNED_XFER(bp)) {
2338 			/*
2339 			 * XXPV: we need to temporarily enlarge the seg
2340 			 * boundary and s/g length to work round CR6381968
2341 			 */
2342 			dmaattr = xb_dma_attr;
2343 			dmaattr.dma_attr_seg = (uint64_t)-1;
2344 			dmaattr.dma_attr_sgllen = INT_MAX;
2345 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
2346 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
2347 			if (rc != DDI_SUCCESS) {
2348 				SETDMACBON(vdp);
2349 				DPRINTF(DMA_DBG, ("xdf@%s: unaligned buf DMA"
2350 				    "handle alloc failed\n",
2351 				    ddi_get_name_addr(vdp->xdf_dip)));
2352 				return (DDI_FAILURE);
2353 			}
2354 			vreq->v_memdmahdl = mdh;
2355 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
2356 		}
2357 		/*FALLTHRU*/
2358 
2359 	case VREQ_MEMDMAHDL_ALLOCED:
2360 		/*
2361 		 * alloc 512-byte aligned buf
2362 		 */
2363 		if (!ALIGNED_XFER(bp)) {
2364 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
2365 				bp_mapin(bp);
2366 
2367 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
2368 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
2369 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
2370 			    &aba, &bufsz, &abh);
2371 			if (rc != DDI_SUCCESS) {
2372 				SETDMACBON(vdp);
2373 				DPRINTF(DMA_DBG, (
2374 				    "xdf@%s: DMA mem allocation failed\n",
2375 				    ddi_get_name_addr(vdp->xdf_dip)));
2376 				return (DDI_FAILURE);
2377 			}
2378 
2379 			vreq->v_abuf = aba;
2380 			vreq->v_align = abh;
2381 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
2382 
2383 			ASSERT(bufsz >= bp->b_bcount);
2384 			if (!IS_READ(bp))
2385 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
2386 				    bp->b_bcount);
2387 		}
2388 		/*FALLTHRU*/
2389 
2390 	case VREQ_DMAMEM_ALLOCED:
2391 		/*
2392 		 * dma bind
2393 		 */
2394 		if (ALIGNED_XFER(bp)) {
2395 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
2396 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
2397 			    &dc, &ndcs);
2398 		} else {
2399 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
2400 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
2401 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
2402 		}
2403 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
2404 			/* get num of dma windows */
2405 			if (rc == DDI_DMA_PARTIAL_MAP) {
2406 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
2407 				ASSERT(rc == DDI_SUCCESS);
2408 			} else {
2409 				ndws = 1;
2410 			}
2411 		} else {
2412 			SETDMACBON(vdp);
2413 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
2414 			    ddi_get_name_addr(vdp->xdf_dip)));
2415 			return (DDI_FAILURE);
2416 		}
2417 
2418 		vreq->v_dmac = dc;
2419 		vreq->v_dmaw = 0;
2420 		vreq->v_ndmacs = ndcs;
2421 		vreq->v_ndmaws = ndws;
2422 		vreq->v_nslots = ndws;
2423 		vreq->v_status = VREQ_DMABUF_BOUND;
2424 		/*FALLTHRU*/
2425 
2426 	case VREQ_DMABUF_BOUND:
2427 		/*
2428 		 * get ge_slot, callback is set upon failure from gs_get(),
2429 		 * if not set previously
2430 		 */
2431 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2432 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2433 			    ddi_get_name_addr(vdp->xdf_dip)));
2434 			return (DDI_FAILURE);
2435 		}
2436 
2437 		vreq->v_gs = gs;
2438 		gs->vreq = vreq;
2439 		vreq->v_status = VREQ_GS_ALLOCED;
2440 		break;
2441 
2442 	case VREQ_GS_ALLOCED:
2443 		/* nothing need to be done */
2444 		break;
2445 
2446 	case VREQ_DMAWIN_DONE:
2447 		/*
2448 		 * move to the next dma window
2449 		 */
2450 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
2451 
2452 		/* get a ge_slot for this DMA window */
2453 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2454 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2455 			    ddi_get_name_addr(vdp->xdf_dip)));
2456 			return (DDI_FAILURE);
2457 		}
2458 
2459 		vreq->v_gs = gs;
2460 		gs->vreq = vreq;
2461 		vreq->v_dmaw++;
2462 		rc = ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
2463 		    &vreq->v_dmac, &vreq->v_ndmacs);
2464 		ASSERT(rc == DDI_SUCCESS);
2465 		vreq->v_status = VREQ_GS_ALLOCED;
2466 		break;
2467 
2468 	default:
2469 		return (DDI_FAILURE);
2470 	}
2471 
2472 	return (DDI_SUCCESS);
2473 }
2474 
2475 static ge_slot_t *
2476 gs_get(xdf_t *vdp, int isread)
2477 {
2478 	grant_ref_t gh;
2479 	ge_slot_t *gs;
2480 
2481 	/* try to alloc GTEs needed in this slot, first */
2482 	if (gnttab_alloc_grant_references(
2483 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
2484 		if (vdp->xdf_gnt_callback.next == NULL) {
2485 			SETDMACBON(vdp);
2486 			gnttab_request_free_callback(
2487 			    &vdp->xdf_gnt_callback,
2488 			    (void (*)(void *))xdf_dmacallback,
2489 			    (void *)vdp,
2490 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
2491 		}
2492 		return (NULL);
2493 	}
2494 
2495 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
2496 	if (gs == NULL) {
2497 		gnttab_free_grant_references(gh);
2498 		if (vdp->xdf_timeout_id == 0)
2499 			/* restart I/O after one second */
2500 			vdp->xdf_timeout_id =
2501 			    timeout(xdf_timeout_handler, vdp, hz);
2502 		return (NULL);
2503 	}
2504 
2505 	/* init gs_slot */
2506 	list_insert_head(&vdp->xdf_gs_act, (void *)gs);
2507 	gs->oeid = vdp->xdf_peer;
2508 	gs->isread = isread;
2509 	gs->ghead = gh;
2510 	gs->ngrefs = 0;
2511 
2512 	return (gs);
2513 }
2514 
2515 static void
2516 gs_free(xdf_t *vdp, ge_slot_t *gs)
2517 {
2518 	int i;
2519 	grant_ref_t *gp = gs->ge;
2520 	int ngrefs = gs->ngrefs;
2521 	boolean_t isread = gs->isread;
2522 
2523 	list_remove(&vdp->xdf_gs_act, (void *)gs);
2524 
2525 	/* release all grant table entry resources used in this slot */
2526 	for (i = 0; i < ngrefs; i++, gp++)
2527 		gnttab_end_foreign_access(*gp, !isread, 0);
2528 	gnttab_free_grant_references(gs->ghead);
2529 
2530 	kmem_cache_free(xdf_gs_cache, (void *)gs);
2531 }
2532 
2533 static grant_ref_t
2534 gs_grant(ge_slot_t *gs, mfn_t mfn)
2535 {
2536 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->ghead);
2537 
2538 	ASSERT(gr != -1);
2539 	ASSERT(gs->ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
2540 	gs->ge[gs->ngrefs++] = gr;
2541 	gnttab_grant_foreign_access_ref(gr, gs->oeid, mfn, !gs->isread);
2542 
2543 	return (gr);
2544 }
2545 
2546 static void
2547 unexpectedie(xdf_t *vdp)
2548 {
2549 	/* clean up I/Os in ring that have responses */
2550 	if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
2551 		mutex_exit(&vdp->xdf_dev_lk);
2552 		(void) xdf_intr((caddr_t)vdp);
2553 		mutex_enter(&vdp->xdf_dev_lk);
2554 	}
2555 
2556 	/* free up all grant table entries */
2557 	while (!list_is_empty(&vdp->xdf_gs_act))
2558 		gs_free(vdp, list_head(&vdp->xdf_gs_act));
2559 
2560 	/*
2561 	 * move bp back to active list orderly
2562 	 * vreq_busy is updated in vreq_free()
2563 	 */
2564 	while (!list_is_empty(&vdp->xdf_vreq_act)) {
2565 		v_req_t *vreq = list_head(&vdp->xdf_vreq_act);
2566 		buf_t *bp = vreq->v_buf;
2567 
2568 		bp->av_back = NULL;
2569 		bp->b_resid = bp->b_bcount;
2570 		if (vdp->xdf_f_act == NULL) {
2571 			vdp->xdf_f_act = vdp->xdf_l_act = bp;
2572 		} else {
2573 			/* move to the head of list */
2574 			bp->av_forw = vdp->xdf_f_act;
2575 			vdp->xdf_f_act = bp;
2576 		}
2577 		kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
2578 		vreq_free(vdp, vreq);
2579 	}
2580 }
2581 
2582 static void
2583 xdfmin(struct buf *bp)
2584 {
2585 	if (bp->b_bcount > xdf_maxphys)
2586 		bp->b_bcount = xdf_maxphys;
2587 }
2588