xref: /titanic_51/usr/src/uts/common/xen/io/xdf.c (revision ba2be53024c0b999e74ba9adcd7d80fec5df8c57)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * xdf.c - Xen Virtual Block Device Driver
29  * TODO:
30  *	- support alternate block size (currently only DEV_BSIZE supported)
31  *	- revalidate geometry for removable devices
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/dditypes.h>
40 #include <sys/sunddi.h>
41 #include <sys/list.h>
42 #include <sys/cmlb.h>
43 #include <sys/dkio.h>
44 #include <sys/vtoc.h>
45 #include <sys/modctl.h>
46 #include <sys/bootconf.h>
47 #include <sys/promif.h>
48 #include <sys/sysmacros.h>
49 #include <sys/kstat.h>
50 #include <sys/mach_mmu.h>
51 #ifdef XPV_HVM_DRIVER
52 #include <sys/xpv_support.h>
53 #endif
54 #include <public/io/xenbus.h>
55 #include <xen/sys/xenbus_impl.h>
56 #include <xen/sys/xendev.h>
57 #include <sys/gnttab.h>
58 #include <sys/scsi/generic/inquiry.h>
59 #include <xen/io/blkif_impl.h>
60 #include <io/xdf.h>
61 
#define	FLUSH_DISKCACHE	0x1
#define	WRITE_BARRIER	0x2
#define	DEFAULT_FLUSH_BLOCK	156 /* block to write to cause a cache flush */
/*
 * True when the device's barrier feature flag is set but its flush
 * support flag is not.
 */
#define	USE_WRITE_BARRIER(vdp)				\
	((vdp)->xdf_feature_barrier && !(vdp)->xdf_flush_supported)
/*
 * True when both the barrier feature flag and the flush support flag
 * are set.
 */
#define	USE_FLUSH_DISKCACHE(vdp)			\
	((vdp)->xdf_feature_barrier && (vdp)->xdf_flush_supported)
/*
 * A non-read buf whose data address is the device's cache flush block,
 * on a device for which USE_WRITE_BARRIER() holds.
 */
#define	IS_WRITE_BARRIER(vdp, bp)			\
	(!IS_READ(bp) && USE_WRITE_BARRIER(vdp) &&	\
	((bp)->b_un.b_addr == (vdp)->xdf_cache_flush_block))
/*
 * A non-read buf with a zero byte count, on a device for which
 * USE_FLUSH_DISKCACHE() holds.
 *
 * NOTE: this macro takes only bp, yet its expansion names vdp, so a
 * variable called vdp must be in scope wherever it is used.
 */
#define	IS_FLUSH_DISKCACHE(bp)				\
	(!IS_READ(bp) && USE_FLUSH_DISKCACHE(vdp) && ((bp)->b_bcount == 0))

/* soft state anchor; initialised in _init() with one xdf_t per instance */
static void *vbd_ss;
/* kmem caches for v_req_t and ge_slot_t objects, created in _init() */
static kmem_cache_t *xdf_vreq_cache;
static kmem_cache_t *xdf_gs_cache;
/* NOTE(review): presumably the limit applied by xdfmin() - confirm */
static int xdf_maxphys = XB_MAXPHYS;
/* debug mask; reloaded from the "xdfdebug" property by xdf_attach() */
int xdfdebug = 0;
/* defined elsewhere; when set, xdf_strategy() drains the I/O itself */
extern int do_polled_io;
/* block number passed to xdf_lb_rdwr() for a barrier-based cache flush */
diskaddr_t xdf_flush_block = DEFAULT_FLUSH_BLOCK;
/* non-zero disables the barrier-write path of DKIOCFLUSHWRITECACHE */
int	xdf_barrier_flush_disable = 0;
83 
/*
 * dev_ops and cb_ops entrypoints
 *
 * These are the routines referenced from xdf_devops and xdf_cbops, plus
 * the interrupt handler.
 */
static int xdf_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int xdf_attach(dev_info_t *, ddi_attach_cmd_t);
static int xdf_detach(dev_info_t *, ddi_detach_cmd_t);
static int xdf_reset(dev_info_t *, ddi_reset_cmd_t);
static int xdf_open(dev_t *, int, int, cred_t *);
static int xdf_close(dev_t, int, int, struct cred *);
static int xdf_strategy(struct buf *);
static int xdf_read(dev_t, struct uio *, cred_t *);
static int xdf_aread(dev_t, struct aio_req *, cred_t *);
static int xdf_write(dev_t, struct uio *, cred_t *);
static int xdf_awrite(dev_t, struct aio_req *, cred_t *);
static int xdf_dump(dev_t, caddr_t, daddr_t, int);
static int xdf_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
static uint_t xdf_intr(caddr_t);
static int xdf_prop_op(dev_t, dev_info_t *, ddi_prop_op_t, int, char *,
    caddr_t, int *);

/*
 * misc private functions
 *
 * Helpers for suspend/resume, backend connection handling, the I/O
 * path, and the v_req_t / ge_slot_t objects.
 */
static int xdf_suspend(dev_info_t *);
static int xdf_resume(dev_info_t *);
static int xdf_start_connect(xdf_t *);
static int xdf_start_disconnect(xdf_t *);
static int xdf_post_connect(xdf_t *);
static void xdf_post_disconnect(xdf_t *);
static void xdf_oe_change(dev_info_t *, ddi_eventcookie_t, void *, void *);
static void xdf_iostart(xdf_t *);
static void xdf_iofini(xdf_t *, uint64_t, int);
static int xdf_prepare_rreq(xdf_t *, struct buf *, blkif_request_t *);
static int xdf_drain_io(xdf_t *);
static boolean_t xdf_isopen(xdf_t *, int);
static int xdf_check_state_transition(xdf_t *, XenbusState);
static int xdf_connect(xdf_t *, boolean_t);
static int xdf_dmacallback(caddr_t);
static void xdf_timeout_handler(void *);
static uint_t xdf_iorestart(caddr_t);
static v_req_t *vreq_get(xdf_t *, buf_t *);
static void vreq_free(xdf_t *, v_req_t *);
static int vreq_setup(xdf_t *, v_req_t *);
static ge_slot_t *gs_get(xdf_t *, int);
static void gs_free(xdf_t *, ge_slot_t *);
static grant_ref_t gs_grant(ge_slot_t *, mfn_t);
static void unexpectedie(xdf_t *);
static void xdfmin(struct buf *);
132 
/*
 * Character/block entry points.  Slot names follow the cb_ops(9S)
 * layout; slots the driver does not implement are filled with nodev,
 * nochpoll or NULL.
 */
static 	struct cb_ops xdf_cbops = {
	xdf_open,		/* cb_open */
	xdf_close,		/* cb_close */
	xdf_strategy,		/* cb_strategy */
	nodev,			/* cb_print */
	xdf_dump,		/* cb_dump */
	xdf_read,		/* cb_read */
	xdf_write,		/* cb_write */
	xdf_ioctl,		/* cb_ioctl */
	nodev,			/* cb_devmap */
	nodev,			/* cb_mmap */
	nodev,			/* cb_segmap */
	nochpoll,		/* cb_chpoll */
	xdf_prop_op,		/* cb_prop_op */
	NULL,			/* cb_str */
	D_MP | D_NEW | D_64BIT,	/* cb_flag */
	CB_REV,			/* cb_rev */
	xdf_aread,		/* cb_aread */
	xdf_awrite		/* cb_awrite */
};
153 
/*
 * Device operations vector; referenced from modldrv below and pointing
 * at xdf_cbops for the character/block entry points.
 */
struct dev_ops xdf_devops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	xdf_getinfo,		/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	xdf_attach,		/* devo_attach */
	xdf_detach,		/* devo_detach */
	xdf_reset,		/* devo_reset */
	&xdf_cbops,		/* devo_cb_ops */
	(struct bus_ops *)NULL	/* devo_bus_ops */
};
166 
/* Driver linkage record; ties the module to xdf_devops. */
static struct modldrv modldrv = {
	&mod_driverops,		/* Type of module.  This one is a driver */
	"virtual block driver %I%",	/* short description */
	&xdf_devops		/* driver specific ops */
};
172 
/* Module linkage passed to mod_install(), mod_remove() and mod_info(). */
static struct modlinkage xdf_modlinkage = {
	MODREV_1, (void *)&modldrv, NULL
};
176 
/*
 * I/O buffer DMA attributes
 * Make sure: one DMA window contains BLKIF_MAX_SEGMENTS_PER_REQUEST at most
 *
 * Each segment is limited to one page (segment boundary PAGEOFFSET) and
 * alignment, minimum transfer and granularity are all XB_BSIZE.
 */
static ddi_dma_attr_t xb_dma_attr = {
	DMA_ATTR_V0,			/* attribute structure version */
	(uint64_t)0,			/* lowest address */
	(uint64_t)0xffffffffffffffff,	/* highest usable address */
	(uint64_t)0xffffff,		/* DMA counter limit max */
	(uint64_t)XB_BSIZE,		/* alignment in bytes */
	XB_BSIZE - 1,			/* bitmap of burst sizes */
	XB_BSIZE,			/* min transfer */
	(uint64_t)XB_MAX_XFER, 		/* maximum transfer */
	(uint64_t)PAGEOFFSET,		/* 1 page segment length  */
	BLKIF_MAX_SEGMENTS_PER_REQUEST,	/* maximum number of segments */
	XB_BSIZE,			/* granularity */
	0,				/* flags (reserved) */
};
195 
/*
 * Device access attributes: no byte swapping, strictly ordered access.
 * NOTE(review): not referenced in this part of the file; presumably used
 * when the shared ring is set up - confirm.
 */
static ddi_device_acc_attr_t xc_acc_attr = {
	DDI_DEVICE_ATTR_V0,	/* attribute structure version */
	DDI_NEVERSWAP_ACC,	/* never byte-swap */
	DDI_STRICTORDER_ACC	/* strict data ordering */
};
201 
/*
 * Callbacks from the common label (cmlb) module.  xdf_lb_ops is handed
 * to cmlb_attach() in xdf_attach().
 */

static int xdf_lb_rdwr(dev_info_t *, uchar_t, void *, diskaddr_t, size_t,
	void *);
static int xdf_lb_getinfo(dev_info_t *, int, void *, void *);

static cmlb_tg_ops_t xdf_lb_ops = {
	TG_DK_OPS_VERSION_1,	/* ops vector version */
	xdf_lb_rdwr,		/* read/write callback */
	xdf_lb_getinfo		/* get-info callback */
};
213 
214 int
215 _init(void)
216 {
217 	int rc;
218 
219 	if ((rc = ddi_soft_state_init(&vbd_ss, sizeof (xdf_t), 0)) == 0) {
220 		xdf_vreq_cache = kmem_cache_create("xdf_vreq_cache",
221 		    sizeof (v_req_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
222 		ASSERT(xdf_vreq_cache != NULL);
223 		xdf_gs_cache = kmem_cache_create("xdf_gs_cache",
224 		    sizeof (ge_slot_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
225 		ASSERT(xdf_gs_cache != NULL);
226 		if ((rc = mod_install(&xdf_modlinkage)) != 0) {
227 			kmem_cache_destroy(xdf_vreq_cache);
228 			kmem_cache_destroy(xdf_gs_cache);
229 			ddi_soft_state_fini(&vbd_ss);
230 		}
231 	}
232 
233 	return (rc);
234 }
235 
236 int
237 _fini(void)
238 {
239 	int err;
240 
241 	if ((err = mod_remove(&xdf_modlinkage)) != 0)
242 		return (err);
243 
244 	kmem_cache_destroy(xdf_vreq_cache);
245 	kmem_cache_destroy(xdf_gs_cache);
246 	ddi_soft_state_fini(&vbd_ss);
247 
248 	return (0);
249 }
250 
251 int
252 _info(struct modinfo *modinfop)
253 {
254 	return (mod_info(&xdf_modlinkage, modinfop));
255 }
256 
257 /*ARGSUSED*/
258 static int
259 xdf_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **rp)
260 {
261 	int instance;
262 	xdf_t *vbdp;
263 
264 	instance = XDF_INST(getminor((dev_t)arg));
265 
266 	switch (cmd) {
267 	case DDI_INFO_DEVT2DEVINFO:
268 		if ((vbdp = ddi_get_soft_state(vbd_ss, instance)) == NULL) {
269 			*rp = NULL;
270 			return (DDI_FAILURE);
271 		}
272 		*rp = vbdp->xdf_dip;
273 		return (DDI_SUCCESS);
274 
275 	case DDI_INFO_DEVT2INSTANCE:
276 		*rp = (void *)(uintptr_t)instance;
277 		return (DDI_SUCCESS);
278 
279 	default:
280 		return (DDI_FAILURE);
281 	}
282 }
283 
284 static int
285 xdf_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op, int mod_flags,
286 	char *name, caddr_t valuep, int *lengthp)
287 {
288 	int instance = ddi_get_instance(dip);
289 	xdf_t *vdp;
290 	diskaddr_t p_blkcnt;
291 
292 	/*
293 	 * xdf dynamic properties are device specific and size oriented.
294 	 * Requests issued under conditions where size is valid are passed
295 	 * to ddi_prop_op_nblocks with the size information, otherwise the
296 	 * request is passed to ddi_prop_op.
297 	 */
298 	vdp = ddi_get_soft_state(vbd_ss, instance);
299 
300 	if ((dev == DDI_DEV_T_ANY) || (vdp == NULL))
301 		return (ddi_prop_op(dev, dip, prop_op, mod_flags,
302 		    name, valuep, lengthp));
303 
304 	/* do cv_wait until connected or failed */
305 	mutex_enter(&vdp->xdf_dev_lk);
306 	if (xdf_connect(vdp, B_TRUE) != XD_READY) {
307 		mutex_exit(&vdp->xdf_dev_lk);
308 		goto out;
309 	}
310 	mutex_exit(&vdp->xdf_dev_lk);
311 
312 	if (cmlb_partinfo(vdp->xdf_vd_lbl, XDF_PART(getminor(dev)), &p_blkcnt,
313 	    NULL, NULL, NULL, NULL) == 0)
314 		return (ddi_prop_op_nblocks(dev, dip, prop_op, mod_flags,
315 		    name, valuep, lengthp, (uint64_t)p_blkcnt));
316 
317 out:
318 	return (ddi_prop_op(dev, dip, prop_op, mod_flags, name, valuep,
319 	    lengthp));
320 }
321 
322 static int
323 xdf_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
324 {
325 	xdf_t *vdp;
326 	ddi_iblock_cookie_t ibc;
327 	ddi_iblock_cookie_t softibc;
328 	int instance;
329 
330 	xdfdebug = ddi_prop_get_int(DDI_DEV_T_ANY, devi, DDI_PROP_NOTPROM,
331 	    "xdfdebug", 0);
332 
333 	switch (cmd) {
334 		case DDI_ATTACH:
335 			break;
336 
337 		case DDI_RESUME:
338 			return (xdf_resume(devi));
339 
340 		default:
341 			return (DDI_FAILURE);
342 	}
343 
344 	instance = ddi_get_instance(devi);
345 	if (ddi_soft_state_zalloc(vbd_ss, instance) != DDI_SUCCESS)
346 		return (DDI_FAILURE);
347 
348 	DPRINTF(DDI_DBG, ("xdf%d: attaching\n", instance));
349 	vdp = ddi_get_soft_state(vbd_ss, instance);
350 	vdp->xdf_dip = devi;
351 	if (ddi_get_iblock_cookie(devi, 0, &ibc) != DDI_SUCCESS) {
352 		cmn_err(CE_WARN, "xdf@%s: failed to get iblock cookie",
353 		    ddi_get_name_addr(devi));
354 		goto errout1;
355 	}
356 
357 	mutex_init(&vdp->xdf_dev_lk, NULL, MUTEX_DRIVER, (void *)ibc);
358 	mutex_init(&vdp->xdf_cb_lk, NULL, MUTEX_DRIVER, (void *)ibc);
359 	cv_init(&vdp->xdf_dev_cv, NULL, CV_DEFAULT, NULL);
360 	ddi_set_driver_private(devi, vdp);
361 
362 	if (ddi_get_soft_iblock_cookie(devi, DDI_SOFTINT_LOW, &softibc)
363 	    != DDI_SUCCESS) {
364 		cmn_err(CE_WARN, "xdf@%s: failed to get softintr iblock cookie",
365 		    ddi_get_name_addr(devi));
366 		goto errout2;
367 	}
368 	if (ddi_add_softintr(devi, DDI_SOFTINT_LOW, &vdp->xdf_softintr_id,
369 	    &softibc, NULL, xdf_iorestart, (caddr_t)vdp) != DDI_SUCCESS) {
370 		cmn_err(CE_WARN, "xdf@%s: failed to add softintr",
371 		    ddi_get_name_addr(devi));
372 		goto errout2;
373 	}
374 
375 	/*
376 	 * create kstat for iostat(1M)
377 	 */
378 	if ((vdp->xdf_xdev_iostat = kstat_create("xdf", instance, NULL, "disk",
379 	    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
380 		vdp->xdf_xdev_iostat->ks_lock = &vdp->xdf_dev_lk;
381 		kstat_install(vdp->xdf_xdev_iostat);
382 	} else {
383 		cmn_err(CE_WARN, "xdf@%s: failed to create kstat",
384 		    ddi_get_name_addr(devi));
385 		goto errout3;
386 	}
387 
388 	/*
389 	 * driver handles kernel-issued IOCTLs
390 	 */
391 	if (ddi_prop_create(DDI_DEV_T_NONE, devi, DDI_PROP_CANSLEEP,
392 	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
393 		cmn_err(CE_WARN, "xdf@%s: cannot create DDI_KERNEL_IOCTL prop",
394 		    ddi_get_name_addr(devi));
395 		goto errout4;
396 	}
397 
398 	/*
399 	 * create default device minor nodes: non-removable disk
400 	 * we will adjust minor nodes after we are connected w/ backend
401 	 */
402 	cmlb_alloc_handle(&vdp->xdf_vd_lbl);
403 	if (cmlb_attach(devi, &xdf_lb_ops, DTYPE_DIRECT, 0, 1, DDI_NT_BLOCK,
404 	    CMLB_FAKE_LABEL_ONE_PARTITION, vdp->xdf_vd_lbl, NULL) != 0) {
405 		cmn_err(CE_WARN, "xdf@%s: default cmlb attach failed",
406 		    ddi_get_name_addr(devi));
407 		goto errout5;
408 	}
409 
410 	/*
411 	 * We ship with cache-enabled disks
412 	 */
413 	vdp->xdf_wce = 1;
414 
415 	mutex_enter(&vdp->xdf_cb_lk);
416 
417 	/* Watch backend XenbusState change */
418 	if (xvdi_add_event_handler(devi, XS_OE_STATE,
419 	    xdf_oe_change) != DDI_SUCCESS) {
420 		mutex_exit(&vdp->xdf_cb_lk);
421 		goto errout6;
422 	}
423 
424 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
425 		cmn_err(CE_WARN, "xdf@%s: start connection failed",
426 		    ddi_get_name_addr(devi));
427 		(void) xdf_start_disconnect(vdp);
428 		mutex_exit(&vdp->xdf_cb_lk);
429 		goto errout7;
430 	}
431 
432 	mutex_exit(&vdp->xdf_cb_lk);
433 
434 	list_create(&vdp->xdf_vreq_act, sizeof (v_req_t),
435 	    offsetof(v_req_t, v_link));
436 	list_create(&vdp->xdf_gs_act, sizeof (ge_slot_t),
437 	    offsetof(ge_slot_t, link));
438 
439 	ddi_report_dev(devi);
440 	DPRINTF(DDI_DBG, ("xdf%d: attached\n", instance));
441 
442 	return (DDI_SUCCESS);
443 
444 errout7:
445 	xvdi_remove_event_handler(devi, XS_OE_STATE);
446 errout6:
447 	cmlb_detach(vdp->xdf_vd_lbl, NULL);
448 errout5:
449 	cmlb_free_handle(&vdp->xdf_vd_lbl);
450 	ddi_prop_remove_all(devi);
451 errout4:
452 	kstat_delete(vdp->xdf_xdev_iostat);
453 errout3:
454 	ddi_remove_softintr(vdp->xdf_softintr_id);
455 errout2:
456 	ddi_set_driver_private(devi, NULL);
457 	cv_destroy(&vdp->xdf_dev_cv);
458 	mutex_destroy(&vdp->xdf_cb_lk);
459 	mutex_destroy(&vdp->xdf_dev_lk);
460 errout1:
461 	cmn_err(CE_WARN, "xdf@%s: attach failed", ddi_get_name_addr(devi));
462 	ddi_soft_state_free(vbd_ss, instance);
463 	return (DDI_FAILURE);
464 }
465 
466 static int
467 xdf_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
468 {
469 	xdf_t *vdp;
470 	int instance;
471 
472 	switch (cmd) {
473 
474 	case DDI_PM_SUSPEND:
475 		break;
476 
477 	case DDI_SUSPEND:
478 		return (xdf_suspend(devi));
479 
480 	case DDI_DETACH:
481 		break;
482 
483 	default:
484 		return (DDI_FAILURE);
485 	}
486 
487 	instance = ddi_get_instance(devi);
488 	DPRINTF(DDI_DBG, ("xdf%d: detaching\n", instance));
489 	vdp = ddi_get_soft_state(vbd_ss, instance);
490 
491 	if (vdp == NULL)
492 		return (DDI_FAILURE);
493 
494 	mutex_enter(&vdp->xdf_dev_lk);
495 	if (xdf_isopen(vdp, -1)) {
496 		mutex_exit(&vdp->xdf_dev_lk);
497 		return (DDI_FAILURE);
498 	}
499 
500 	if (vdp->xdf_status != XD_CLOSED) {
501 		mutex_exit(&vdp->xdf_dev_lk);
502 		return (DDI_FAILURE);
503 	}
504 
505 	ASSERT(!ISDMACBON(vdp));
506 	mutex_exit(&vdp->xdf_dev_lk);
507 
508 	if (vdp->xdf_timeout_id != 0)
509 		(void) untimeout(vdp->xdf_timeout_id);
510 
511 	xvdi_remove_event_handler(devi, XS_OE_STATE);
512 
513 	/* we'll support backend running in domU later */
514 #ifdef	DOMU_BACKEND
515 	(void) xvdi_post_event(devi, XEN_HP_REMOVE);
516 #endif
517 
518 	list_destroy(&vdp->xdf_vreq_act);
519 	list_destroy(&vdp->xdf_gs_act);
520 	ddi_prop_remove_all(devi);
521 	kstat_delete(vdp->xdf_xdev_iostat);
522 	ddi_remove_softintr(vdp->xdf_softintr_id);
523 	ddi_set_driver_private(devi, NULL);
524 	cv_destroy(&vdp->xdf_dev_cv);
525 	mutex_destroy(&vdp->xdf_cb_lk);
526 	mutex_destroy(&vdp->xdf_dev_lk);
527 	if (vdp->xdf_cache_flush_block != NULL)
528 		kmem_free(vdp->xdf_flush_mem, 2 * DEV_BSIZE);
529 	ddi_soft_state_free(vbd_ss, instance);
530 	return (DDI_SUCCESS);
531 }
532 
533 static int
534 xdf_suspend(dev_info_t *devi)
535 {
536 	xdf_t *vdp;
537 	int instance;
538 	enum xdf_state st;
539 
540 	instance = ddi_get_instance(devi);
541 
542 	if (xdfdebug & SUSRES_DBG)
543 		xen_printf("xdf_suspend: xdf#%d\n", instance);
544 
545 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
546 		return (DDI_FAILURE);
547 
548 	xvdi_suspend(devi);
549 
550 	mutex_enter(&vdp->xdf_cb_lk);
551 	mutex_enter(&vdp->xdf_dev_lk);
552 	st = vdp->xdf_status;
553 	/* change status to stop further I/O requests */
554 	if (st == XD_READY)
555 		vdp->xdf_status = XD_SUSPEND;
556 	mutex_exit(&vdp->xdf_dev_lk);
557 	mutex_exit(&vdp->xdf_cb_lk);
558 
559 	/* make sure no more I/O responses left in the ring buffer */
560 	if ((st == XD_INIT) || (st == XD_READY)) {
561 #ifdef XPV_HVM_DRIVER
562 		ec_unbind_evtchn(vdp->xdf_evtchn);
563 #else
564 		(void) ddi_remove_intr(devi, 0, NULL);
565 #endif
566 		(void) xdf_drain_io(vdp);
567 		/*
568 		 * no need to teardown the ring buffer here
569 		 * it will be simply re-init'ed during resume when
570 		 * we call xvdi_alloc_ring
571 		 */
572 	}
573 
574 	if (xdfdebug & SUSRES_DBG)
575 		xen_printf("xdf_suspend: SUCCESS\n");
576 
577 	return (DDI_SUCCESS);
578 }
579 
580 /*ARGSUSED*/
581 static int
582 xdf_resume(dev_info_t *devi)
583 {
584 	xdf_t *vdp;
585 	int instance;
586 
587 	instance = ddi_get_instance(devi);
588 	if (xdfdebug & SUSRES_DBG)
589 		xen_printf("xdf_resume: xdf%d\n", instance);
590 
591 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
592 		return (DDI_FAILURE);
593 
594 	mutex_enter(&vdp->xdf_cb_lk);
595 
596 	if (xvdi_resume(devi) != DDI_SUCCESS) {
597 		mutex_exit(&vdp->xdf_cb_lk);
598 		return (DDI_FAILURE);
599 	}
600 
601 	mutex_enter(&vdp->xdf_dev_lk);
602 	ASSERT(vdp->xdf_status != XD_READY);
603 	vdp->xdf_status = XD_UNKNOWN;
604 	mutex_exit(&vdp->xdf_dev_lk);
605 
606 	if (xdf_start_connect(vdp) != DDI_SUCCESS) {
607 		mutex_exit(&vdp->xdf_cb_lk);
608 		return (DDI_FAILURE);
609 	}
610 
611 	mutex_exit(&vdp->xdf_cb_lk);
612 
613 	if (xdfdebug & SUSRES_DBG)
614 		xen_printf("xdf_resume: done\n");
615 	return (DDI_SUCCESS);
616 }
617 
618 /*ARGSUSED*/
619 static int
620 xdf_reset(dev_info_t *devi, ddi_reset_cmd_t cmd)
621 {
622 	xdf_t *vdp;
623 	int instance;
624 
625 	instance = ddi_get_instance(devi);
626 	DPRINTF(DDI_DBG, ("xdf%d: resetting\n", instance));
627 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
628 		return (DDI_FAILURE);
629 
630 	/*
631 	 * wait for any outstanding I/O to complete
632 	 */
633 	(void) xdf_drain_io(vdp);
634 
635 	DPRINTF(DDI_DBG, ("xdf%d: reset complete\n", instance));
636 	return (DDI_SUCCESS);
637 }
638 
639 static int
640 xdf_open(dev_t *devp, int flag, int otyp, cred_t *credp)
641 {
642 	minor_t	minor;
643 	xdf_t	*vdp;
644 	int part;
645 	ulong_t parbit;
646 	diskaddr_t p_blkct = 0;
647 	boolean_t firstopen;
648 	boolean_t nodelay;
649 
650 	nodelay = (flag & (FNDELAY | FNONBLOCK));
651 	minor = getminor(*devp);
652 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
653 		return (ENXIO);
654 
655 	DPRINTF(DDI_DBG, ("xdf%d: opening\n", XDF_INST(minor)));
656 
657 	/* do cv_wait until connected or failed */
658 	mutex_enter(&vdp->xdf_dev_lk);
659 	if (!nodelay && (xdf_connect(vdp, B_TRUE) != XD_READY)) {
660 		mutex_exit(&vdp->xdf_dev_lk);
661 		return (ENXIO);
662 	}
663 
664 	if ((flag & FWRITE) && XD_IS_RO(vdp)) {
665 		mutex_exit(&vdp->xdf_dev_lk);
666 		return (EROFS);
667 	}
668 
669 	part = XDF_PART(minor);
670 	parbit = 1 << part;
671 	if (vdp->xdf_vd_exclopen & parbit) {
672 		mutex_exit(&vdp->xdf_dev_lk);
673 		return (EBUSY);
674 	}
675 
676 	/* are we the first one to open this node? */
677 	firstopen = !xdf_isopen(vdp, -1);
678 
679 	if ((flag & FEXCL) && !firstopen) {
680 		mutex_exit(&vdp->xdf_dev_lk);
681 		return (EBUSY);
682 	}
683 
684 	if (otyp == OTYP_LYR)
685 		vdp->xdf_vd_lyropen[part]++;
686 
687 	vdp->xdf_vd_open[otyp] |= parbit;
688 
689 	if (flag & FEXCL)
690 		vdp->xdf_vd_exclopen |= parbit;
691 
692 	mutex_exit(&vdp->xdf_dev_lk);
693 
694 	/* force a re-validation */
695 	if (firstopen)
696 		cmlb_invalidate(vdp->xdf_vd_lbl, NULL);
697 
698 	/*
699 	 * check size
700 	 * ignore CD/DVD which contains a zero-sized s0
701 	 */
702 	if (!nodelay && !XD_IS_CD(vdp) &&
703 	    ((cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
704 	    NULL, NULL, NULL, NULL) != 0) || (p_blkct == 0))) {
705 		(void) xdf_close(*devp, flag, otyp, credp);
706 		return (ENXIO);
707 	}
708 
709 	return (0);
710 }
711 
712 /*ARGSUSED*/
713 static int
714 xdf_close(dev_t dev, int flag, int otyp, struct cred *credp)
715 {
716 	minor_t	minor;
717 	xdf_t	*vdp;
718 	int part;
719 	ulong_t parbit;
720 
721 	minor = getminor(dev);
722 	if ((vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) == NULL)
723 		return (ENXIO);
724 
725 	mutex_enter(&vdp->xdf_dev_lk);
726 	part = XDF_PART(minor);
727 	if (!xdf_isopen(vdp, part)) {
728 		mutex_exit(&vdp->xdf_dev_lk);
729 		return (ENXIO);
730 	}
731 	parbit = 1 << part;
732 
733 	if (otyp == OTYP_LYR) {
734 		if (vdp->xdf_vd_lyropen[part] != 0)
735 			vdp->xdf_vd_lyropen[part]--;
736 		if (vdp->xdf_vd_lyropen[part] == 0)
737 			vdp->xdf_vd_open[OTYP_LYR] &= ~parbit;
738 	} else {
739 		vdp->xdf_vd_open[otyp] &= ~parbit;
740 	}
741 	vdp->xdf_vd_exclopen &= ~parbit;
742 
743 	mutex_exit(&vdp->xdf_dev_lk);
744 	return (0);
745 }
746 
747 static int
748 xdf_strategy(struct buf *bp)
749 {
750 	xdf_t	*vdp;
751 	minor_t minor;
752 	diskaddr_t p_blkct, p_blkst;
753 	ulong_t nblks;
754 	int part;
755 
756 	minor = getminor(bp->b_edev);
757 	part = XDF_PART(minor);
758 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))) ||
759 	    !xdf_isopen(vdp, part) ||
760 	    cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkct,
761 	    &p_blkst, NULL, NULL, NULL)) {
762 		bioerror(bp, ENXIO);
763 		bp->b_resid = bp->b_bcount;
764 		biodone(bp);
765 		return (0);
766 	}
767 
768 	if (!IS_READ(bp) && XD_IS_RO(vdp)) {
769 		bioerror(bp, EROFS);
770 		bp->b_resid = bp->b_bcount;
771 		biodone(bp);
772 		return (0);
773 	}
774 
775 	/*
776 	 * starting beyond partition
777 	 */
778 	if (bp->b_blkno > p_blkct) {
779 		DPRINTF(IO_DBG, ("xdf: block %lld exceeds VBD size %"PRIu64,
780 		    (longlong_t)bp->b_blkno, (uint64_t)p_blkct));
781 		bioerror(bp, EINVAL);
782 		bp->b_resid = bp->b_bcount;
783 		biodone(bp);
784 		return (0);
785 	}
786 
787 	/* Legacy: don't set error flag at this case */
788 	if (bp->b_blkno == p_blkct) {
789 		bp->b_resid = bp->b_bcount;
790 		biodone(bp);
791 		return (0);
792 	}
793 
794 	/*
795 	 * adjust for partial transfer
796 	 */
797 	nblks = bp->b_bcount >> XB_BSHIFT;
798 	if ((bp->b_blkno + nblks) > p_blkct) {
799 		bp->b_resid = ((bp->b_blkno + nblks) - p_blkct) << XB_BSHIFT;
800 		bp->b_bcount -= bp->b_resid;
801 	}
802 
803 
804 	DPRINTF(IO_DBG, ("xdf: strategy blk %lld len %lu\n",
805 	    (longlong_t)bp->b_blkno, (ulong_t)bp->b_bcount));
806 
807 	mutex_enter(&vdp->xdf_dev_lk);
808 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
809 	if (vdp->xdf_f_act == NULL) {
810 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
811 	} else {
812 		vdp->xdf_l_act->av_forw = bp;
813 		vdp->xdf_l_act = bp;
814 	}
815 	bp->av_forw = NULL;
816 	bp->av_back = NULL; /* not tagged with a v_req */
817 	bp->b_private = (void *)(uintptr_t)p_blkst;
818 	mutex_exit(&vdp->xdf_dev_lk);
819 	xdf_iostart(vdp);
820 	if (do_polled_io)
821 		(void) xdf_drain_io(vdp);
822 	return (0);
823 }
824 
825 /*ARGSUSED*/
826 static int
827 xdf_read(dev_t dev, struct uio *uiop, cred_t *credp)
828 {
829 
830 	xdf_t	*vdp;
831 	minor_t minor;
832 	diskaddr_t p_blkcnt;
833 	int part;
834 
835 	minor = getminor(dev);
836 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
837 		return (ENXIO);
838 
839 	DPRINTF(IO_DBG, ("xdf: read offset 0x%"PRIx64"\n",
840 	    (int64_t)uiop->uio_offset));
841 
842 	part = XDF_PART(minor);
843 	if (!xdf_isopen(vdp, part))
844 		return (ENXIO);
845 
846 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
847 	    NULL, NULL, NULL, NULL))
848 		return (ENXIO);
849 
850 	if (U_INVAL(uiop))
851 		return (EINVAL);
852 
853 	return (physio(xdf_strategy, NULL, dev, B_READ, xdfmin, uiop));
854 }
855 
856 /*ARGSUSED*/
857 static int
858 xdf_write(dev_t dev, struct uio *uiop, cred_t *credp)
859 {
860 	xdf_t *vdp;
861 	minor_t minor;
862 	diskaddr_t p_blkcnt;
863 	int part;
864 
865 	minor = getminor(dev);
866 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
867 		return (ENXIO);
868 
869 	DPRINTF(IO_DBG, ("xdf: write offset 0x%"PRIx64"\n",
870 	    (int64_t)uiop->uio_offset));
871 
872 	part = XDF_PART(minor);
873 	if (!xdf_isopen(vdp, part))
874 		return (ENXIO);
875 
876 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
877 	    NULL, NULL, NULL, NULL))
878 		return (ENXIO);
879 
880 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
881 		return (ENOSPC);
882 
883 	if (U_INVAL(uiop))
884 		return (EINVAL);
885 
886 	return (physio(xdf_strategy, NULL, dev, B_WRITE, minphys, uiop));
887 }
888 
889 /*ARGSUSED*/
890 static int
891 xdf_aread(dev_t dev, struct aio_req *aiop, cred_t *credp)
892 {
893 	xdf_t	*vdp;
894 	minor_t minor;
895 	struct uio *uiop = aiop->aio_uio;
896 	diskaddr_t p_blkcnt;
897 	int part;
898 
899 	minor = getminor(dev);
900 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
901 		return (ENXIO);
902 
903 	part = XDF_PART(minor);
904 	if (!xdf_isopen(vdp, part))
905 		return (ENXIO);
906 
907 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
908 	    NULL, NULL, NULL, NULL))
909 		return (ENXIO);
910 
911 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
912 		return (ENOSPC);
913 
914 	if (U_INVAL(uiop))
915 		return (EINVAL);
916 
917 	return (aphysio(xdf_strategy, anocancel, dev, B_READ, minphys, aiop));
918 }
919 
920 /*ARGSUSED*/
921 static int
922 xdf_awrite(dev_t dev, struct aio_req *aiop, cred_t *credp)
923 {
924 	xdf_t *vdp;
925 	minor_t minor;
926 	struct uio *uiop = aiop->aio_uio;
927 	diskaddr_t p_blkcnt;
928 	int part;
929 
930 	minor = getminor(dev);
931 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
932 		return (ENXIO);
933 
934 	part = XDF_PART(minor);
935 	if (!xdf_isopen(vdp, part))
936 		return (ENXIO);
937 
938 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt,
939 	    NULL, NULL, NULL, NULL))
940 		return (ENXIO);
941 
942 	if (uiop->uio_loffset >= XB_DTOB(p_blkcnt))
943 		return (ENOSPC);
944 
945 	if (U_INVAL(uiop))
946 		return (EINVAL);
947 
948 	return (aphysio(xdf_strategy, anocancel, dev, B_WRITE, minphys, aiop));
949 }
950 
951 static int
952 xdf_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
953 {
954 	struct buf dumpbuf, *dbp;
955 	xdf_t	*vdp;
956 	minor_t minor;
957 	int err = 0;
958 	int part;
959 	diskaddr_t p_blkcnt, p_blkst;
960 
961 	minor = getminor(dev);
962 	if (!(vdp = ddi_get_soft_state(vbd_ss, XDF_INST(minor))))
963 		return (ENXIO);
964 
965 	DPRINTF(IO_DBG, ("xdf: dump addr (0x%p) blk (%ld) nblks (%d)\n",
966 	    addr, blkno, nblk));
967 
968 	part = XDF_PART(minor);
969 	if (!xdf_isopen(vdp, part))
970 		return (ENXIO);
971 
972 	if (cmlb_partinfo(vdp->xdf_vd_lbl, part, &p_blkcnt, &p_blkst,
973 	    NULL, NULL, NULL))
974 		return (ENXIO);
975 
976 	if ((blkno + nblk) > p_blkcnt) {
977 		cmn_err(CE_WARN, "xdf: block %ld exceeds VBD size %"PRIu64,
978 		    blkno + nblk, (uint64_t)vdp->xdf_xdev_nblocks);
979 		return (EINVAL);
980 	}
981 
982 	dbp = &dumpbuf;
983 	bioinit(dbp);
984 	dbp->b_flags = B_BUSY;
985 	dbp->b_un.b_addr = addr;
986 	dbp->b_bcount	= nblk << DEV_BSHIFT;
987 	dbp->b_resid = 0;
988 	dbp->b_blkno = blkno;
989 	dbp->b_edev = dev;
990 	dbp->b_private = (void *)(uintptr_t)p_blkst;
991 
992 	mutex_enter(&vdp->xdf_dev_lk);
993 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
994 	if (vdp->xdf_f_act == NULL) {
995 		vdp->xdf_f_act = vdp->xdf_l_act = dbp;
996 	} else {
997 		vdp->xdf_l_act->av_forw = dbp;
998 		vdp->xdf_l_act = dbp;
999 	}
1000 	dbp->av_forw = NULL;
1001 	dbp->av_back = NULL;
1002 	mutex_exit(&vdp->xdf_dev_lk);
1003 	xdf_iostart(vdp);
1004 	err = xdf_drain_io(vdp);
1005 	biofini(dbp);
1006 	return (err);
1007 }
1008 
1009 /*ARGSUSED*/
1010 static int
1011 xdf_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
1012     int *rvalp)
1013 {
1014 	int instance;
1015 	xdf_t	*vdp;
1016 	minor_t minor;
1017 	int part;
1018 
1019 	minor = getminor(dev);
1020 	instance = XDF_INST(minor);
1021 
1022 	if ((vdp = ddi_get_soft_state(vbd_ss, instance)) == NULL)
1023 		return (ENXIO);
1024 
1025 	DPRINTF(IOCTL_DBG, ("xdf%d:ioctl: cmd %d (0x%x)\n",
1026 	    instance, cmd, cmd));
1027 
1028 	part = XDF_PART(minor);
1029 	if (!xdf_isopen(vdp, part))
1030 		return (ENXIO);
1031 
1032 	switch (cmd) {
1033 	case DKIOCGMEDIAINFO: {
1034 		struct dk_minfo	media_info;
1035 
1036 		media_info.dki_lbsize = DEV_BSIZE;
1037 		media_info.dki_capacity = vdp->xdf_xdev_nblocks;
1038 		media_info.dki_media_type = DK_FIXED_DISK;
1039 
1040 		if (ddi_copyout(&media_info, (void *)arg,
1041 		    sizeof (struct dk_minfo), mode)) {
1042 			return (EFAULT);
1043 		} else {
1044 			return (0);
1045 		}
1046 	}
1047 
1048 	case DKIOCINFO: {
1049 		struct dk_cinfo info;
1050 
1051 		/* controller information */
1052 		if (XD_IS_CD(vdp))
1053 			info.dki_ctype = DKC_CDROM;
1054 		else
1055 			info.dki_ctype = DKC_VBD;
1056 
1057 		info.dki_cnum = 0;
1058 		(void) strncpy((char *)(&info.dki_cname), "xdf", 8);
1059 
1060 		/* unit information */
1061 		info.dki_unit = ddi_get_instance(vdp->xdf_dip);
1062 		(void) strncpy((char *)(&info.dki_dname), "xdf", 8);
1063 		info.dki_flags = DKI_FMTVOL;
1064 		info.dki_partition = part;
1065 		info.dki_maxtransfer = maxphys / DEV_BSIZE;
1066 		info.dki_addr = 0;
1067 		info.dki_space = 0;
1068 		info.dki_prio = 0;
1069 		info.dki_vec = 0;
1070 
1071 		if (ddi_copyout(&info, (void *)arg, sizeof (info), mode))
1072 			return (EFAULT);
1073 		else
1074 			return (0);
1075 	}
1076 
1077 	case DKIOCSTATE: {
1078 		enum dkio_state	dkstate = DKIO_INSERTED;
1079 		if (ddi_copyout(&dkstate, (void *)arg, sizeof (dkstate),
1080 		    mode) != 0)
1081 			return (EFAULT);
1082 		return (0);
1083 	}
1084 
1085 	/*
1086 	 * is media removable?
1087 	 */
1088 	case DKIOCREMOVABLE: {
1089 		int i = XD_IS_RM(vdp) ? 1 : 0;
1090 		if (ddi_copyout(&i, (caddr_t)arg, sizeof (int), mode))
1091 			return (EFAULT);
1092 		return (0);
1093 	}
1094 
1095 	case DKIOCG_PHYGEOM:
1096 	case DKIOCG_VIRTGEOM:
1097 	case DKIOCGGEOM:
1098 	case DKIOCSGEOM:
1099 	case DKIOCGAPART:
1100 	case DKIOCGVTOC:
1101 	case DKIOCSVTOC:
1102 	case DKIOCPARTINFO:
1103 	case DKIOCGETEFI:
1104 	case DKIOCSETEFI:
1105 	case DKIOCPARTITION: {
1106 		int rc;
1107 
1108 		rc = cmlb_ioctl(vdp->xdf_vd_lbl, dev, cmd, arg, mode, credp,
1109 		    rvalp, NULL);
1110 		return (rc);
1111 	}
1112 
1113 	case DKIOCGETWCE:
1114 		if (ddi_copyout(&vdp->xdf_wce, (void *)arg,
1115 		    sizeof (vdp->xdf_wce), mode))
1116 			return (EFAULT);
1117 		return (0);
1118 	case DKIOCSETWCE:
1119 		if (ddi_copyin((void *)arg, &vdp->xdf_wce,
1120 		    sizeof (vdp->xdf_wce), mode))
1121 			return (EFAULT);
1122 		return (0);
1123 	case DKIOCFLUSHWRITECACHE: {
1124 		int rc;
1125 		struct dk_callback *dkc = (struct dk_callback *)arg;
1126 
1127 		if (vdp->xdf_flush_supported) {
1128 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1129 			    NULL, 0, 0, (void *)dev);
1130 		} else if (vdp->xdf_feature_barrier &&
1131 		    !xdf_barrier_flush_disable) {
1132 			rc = xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE,
1133 			    vdp->xdf_cache_flush_block, xdf_flush_block,
1134 			    DEV_BSIZE, (void *)dev);
1135 		} else {
1136 			return (ENOTTY);
1137 		}
1138 		if ((mode & FKIOCTL) && (dkc != NULL) &&
1139 		    (dkc->dkc_callback != NULL)) {
1140 			(*dkc->dkc_callback)(dkc->dkc_cookie, rc);
1141 			/* need to return 0 after calling callback */
1142 			rc = 0;
1143 		}
1144 		return (rc);
1145 	}
1146 
1147 	default:
1148 		return (ENOTTY);
1149 	}
1150 }
1151 
1152 /*
1153  * xdf interrupt handler
1154  */
1155 static uint_t
1156 xdf_intr(caddr_t arg)
1157 {
1158 	xdf_t *vdp = (xdf_t *)arg;
1159 	xendev_ring_t *xbr;
1160 	blkif_response_t *resp;
1161 	int bioerr;
1162 	uint64_t id;
1163 	extern int do_polled_io;
1164 	uint8_t op;
1165 	uint16_t status;
1166 	ddi_acc_handle_t acchdl;
1167 
1168 	mutex_enter(&vdp->xdf_dev_lk);
1169 
1170 	if ((xbr = vdp->xdf_xb_ring) == NULL) {
1171 		mutex_exit(&vdp->xdf_dev_lk);
1172 		return (DDI_INTR_UNCLAIMED);
1173 	}
1174 
1175 	acchdl = vdp->xdf_xb_ring_hdl;
1176 
1177 	/*
1178 	 * complete all requests which have a response
1179 	 */
1180 	while (resp = xvdi_ring_get_response(xbr)) {
1181 		id = ddi_get64(acchdl, &resp->id);
1182 		op = ddi_get8(acchdl, &resp->operation);
1183 		status = ddi_get16(acchdl, (uint16_t *)&resp->status);
1184 		DPRINTF(INTR_DBG, ("resp: op %d id %"PRIu64" status %d\n",
1185 		    op, id, status));
1186 
1187 		/*
1188 		 * XXPV - close connection to the backend and restart
1189 		 */
1190 		if (status != BLKIF_RSP_OKAY) {
1191 			DPRINTF(IO_DBG, ("xdf@%s: I/O error while %s",
1192 			    ddi_get_name_addr(vdp->xdf_dip),
1193 			    (op == BLKIF_OP_READ) ? "reading" : "writing"));
1194 			bioerr = EIO;
1195 		} else {
1196 			bioerr = 0;
1197 		}
1198 
1199 		xdf_iofini(vdp, id, bioerr);
1200 	}
1201 
1202 	mutex_exit(&vdp->xdf_dev_lk);
1203 
1204 	if (!do_polled_io)
1205 		xdf_iostart(vdp);
1206 
1207 	return (DDI_INTR_CLAIMED);
1208 }
1209 
1210 int xdf_fbrewrites;	/* how many times was our flush block rewritten */
1211 
1212 /*
1213  * Snarf new data if our flush block was re-written
1214  */
1215 static void
1216 check_fbwrite(xdf_t *vdp, buf_t *bp, daddr_t blkno)
1217 {
1218 	int nblks;
1219 	boolean_t mapin;
1220 
1221 	if (IS_WRITE_BARRIER(vdp, bp))
1222 		return; /* write was a flush write */
1223 
1224 	mapin = B_FALSE;
1225 	nblks = bp->b_bcount >> DEV_BSHIFT;
1226 	if (xdf_flush_block >= blkno && xdf_flush_block < (blkno + nblks)) {
1227 		xdf_fbrewrites++;
1228 		if (bp->b_flags & (B_PAGEIO | B_PHYS)) {
1229 			mapin = B_TRUE;
1230 			bp_mapin(bp);
1231 		}
1232 		bcopy(bp->b_un.b_addr +
1233 		    ((xdf_flush_block - blkno) << DEV_BSHIFT),
1234 		    vdp->xdf_cache_flush_block, DEV_BSIZE);
1235 		if (mapin)
1236 			bp_mapout(bp);
1237 	}
1238 }
1239 
1240 static void
1241 xdf_iofini(xdf_t *vdp, uint64_t id, int bioerr)
1242 {
1243 	ge_slot_t *gs = (ge_slot_t *)(uintptr_t)id;
1244 	v_req_t *vreq = gs->vreq;
1245 	buf_t *bp = vreq->v_buf;
1246 
1247 	gs_free(vdp, gs);
1248 	if (bioerr)
1249 		bioerror(bp, bioerr);
1250 	vreq->v_nslots--;
1251 	if (vreq->v_nslots != 0)
1252 		return;
1253 
1254 	XDF_UPDATE_IO_STAT(vdp, bp);
1255 	kstat_runq_exit(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1256 
1257 	if (IS_ERROR(bp))
1258 		bp->b_resid = bp->b_bcount;
1259 
1260 	vreq_free(vdp, vreq);
1261 	biodone(bp);
1262 }
1263 
1264 /*
1265  * return value of xdf_prepare_rreq()
1266  * used in xdf_iostart()
1267  */
1268 #define	XF_PARTIAL	0 /* rreq is full, not all I/O in buf transferred */
1269 #define	XF_COMP		1 /* no more I/O left in buf */
1270 
1271 static void
1272 xdf_iostart(xdf_t *vdp)
1273 {
1274 	xendev_ring_t *xbr;
1275 	struct buf *bp;
1276 	blkif_request_t *rreq;
1277 	int retval;
1278 	int rreqready = 0;
1279 
1280 	xbr = vdp->xdf_xb_ring;
1281 
1282 	/*
1283 	 * populate the ring request(s)
1284 	 *
1285 	 * loop until there is no buf to transfer or no free slot
1286 	 * available in I/O ring
1287 	 */
1288 	mutex_enter(&vdp->xdf_dev_lk);
1289 
1290 	for (;;) {
1291 		if (vdp->xdf_status != XD_READY)
1292 			break;
1293 
1294 		/* active buf queue empty? */
1295 		if ((bp = vdp->xdf_f_act) == NULL)
1296 			break;
1297 
1298 		/* try to grab a vreq for this bp */
1299 		if ((BP2VREQ(bp) == NULL) && (vreq_get(vdp, bp) == NULL))
1300 				break;
1301 		/* alloc DMA/GTE resources */
1302 		if (vreq_setup(vdp, BP2VREQ(bp)) != DDI_SUCCESS)
1303 			break;
1304 
1305 		/* get next blkif_request in the ring */
1306 		if ((rreq = xvdi_ring_get_request(xbr)) == NULL)
1307 			break;
1308 		bzero(rreq, sizeof (blkif_request_t));
1309 
1310 		/* populate blkif_request with this buf */
1311 		rreqready++;
1312 		retval = xdf_prepare_rreq(vdp, bp, rreq);
1313 		if (retval == XF_COMP) {
1314 			/* finish this bp, switch to next one */
1315 			kstat_waitq_to_runq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1316 			vdp->xdf_f_act = bp->av_forw;
1317 			bp->av_forw = NULL;
1318 		}
1319 	}
1320 
1321 	/*
1322 	 * Send the request(s) to the backend
1323 	 */
1324 	if (rreqready) {
1325 		if (xvdi_ring_push_request(xbr)) {
1326 			DPRINTF(IO_DBG, ("xdf_iostart: "
1327 			    "sent request(s) to backend\n"));
1328 			xvdi_notify_oe(vdp->xdf_dip);
1329 		}
1330 	}
1331 
1332 	mutex_exit(&vdp->xdf_dev_lk);
1333 }
1334 
1335 /*
1336  * populate a single blkif_request_t w/ a buf
1337  */
1338 static int
1339 xdf_prepare_rreq(xdf_t *vdp, struct buf *bp, blkif_request_t *rreq)
1340 {
1341 	int		rval;
1342 	grant_ref_t	gr;
1343 	uint8_t		fsect, lsect;
1344 	size_t		bcnt;
1345 	paddr_t		dma_addr;
1346 	off_t		blk_off;
1347 	dev_info_t	*dip = vdp->xdf_dip;
1348 	blkif_vdev_t	vdev = xvdi_get_vdevnum(dip);
1349 	v_req_t		*vreq = BP2VREQ(bp);
1350 	uint64_t	blkno = vreq->v_blkno;
1351 	uint_t		ndmacs = vreq->v_ndmacs;
1352 	ddi_acc_handle_t acchdl = vdp->xdf_xb_ring_hdl;
1353 	int		seg = 0;
1354 	int		isread = IS_READ(bp);
1355 
1356 	if (isread)
1357 		ddi_put8(acchdl, &rreq->operation, BLKIF_OP_READ);
1358 	else {
1359 		switch (vreq->v_flush_diskcache) {
1360 		case FLUSH_DISKCACHE:
1361 			ddi_put8(acchdl, &rreq->operation,
1362 			    BLKIF_OP_FLUSH_DISKCACHE);
1363 			ddi_put16(acchdl, &rreq->handle, vdev);
1364 			ddi_put64(acchdl, &rreq->id,
1365 			    (uint64_t)(uintptr_t)(vreq->v_gs));
1366 			ddi_put8(acchdl, &rreq->nr_segments, 0);
1367 			return (XF_COMP);
1368 		case WRITE_BARRIER:
1369 			ddi_put8(acchdl, &rreq->operation,
1370 			    BLKIF_OP_WRITE_BARRIER);
1371 			break;
1372 		default:
1373 			if (!vdp->xdf_wce)
1374 				ddi_put8(acchdl, &rreq->operation,
1375 				    BLKIF_OP_WRITE_BARRIER);
1376 			else
1377 				ddi_put8(acchdl, &rreq->operation,
1378 				    BLKIF_OP_WRITE);
1379 			break;
1380 		}
1381 	}
1382 
1383 	ddi_put16(acchdl, &rreq->handle, vdev);
1384 	ddi_put64(acchdl, &rreq->sector_number, blkno);
1385 	ddi_put64(acchdl, &rreq->id, (uint64_t)(uintptr_t)(vreq->v_gs));
1386 
1387 	/*
1388 	 * loop until all segments are populated or no more dma cookie in buf
1389 	 */
1390 	for (;;) {
1391 	/*
1392 	 * Each segment of a blkif request can transfer up to
1393 	 * one 4K page of data.
1394 	 */
1395 		bcnt = vreq->v_dmac.dmac_size;
1396 		ASSERT(bcnt <= PAGESIZE);
1397 		ASSERT((bcnt % XB_BSIZE) == 0);
1398 		dma_addr = vreq->v_dmac.dmac_laddress;
1399 		blk_off = (uint_t)((paddr_t)XB_SEGOFFSET & dma_addr);
1400 		ASSERT((blk_off & XB_BMASK) == 0);
1401 		fsect = blk_off >> XB_BSHIFT;
1402 		lsect = fsect + (bcnt >> XB_BSHIFT) - 1;
1403 		ASSERT(fsect < XB_MAX_SEGLEN / XB_BSIZE &&
1404 		    lsect < XB_MAX_SEGLEN / XB_BSIZE);
1405 		DPRINTF(IO_DBG, ("  ""seg%d: dmacS %lu blk_off %ld\n",
1406 		    seg, vreq->v_dmac.dmac_size, blk_off));
1407 		gr = gs_grant(vreq->v_gs, PATOMA(dma_addr) >> PAGESHIFT);
1408 		ddi_put32(acchdl, &rreq->seg[seg].gref, gr);
1409 		ddi_put8(acchdl, &rreq->seg[seg].first_sect, fsect);
1410 		ddi_put8(acchdl, &rreq->seg[seg].last_sect, lsect);
1411 		DPRINTF(IO_DBG, ("  ""seg%d: fs %d ls %d gr %d dma 0x%"PRIx64
1412 		    "\n", seg, fsect, lsect, gr, dma_addr));
1413 
1414 		blkno += (bcnt >> XB_BSHIFT);
1415 		seg++;
1416 		ASSERT(seg <= BLKIF_MAX_SEGMENTS_PER_REQUEST);
1417 		if (--ndmacs) {
1418 			ddi_dma_nextcookie(vreq->v_dmahdl, &vreq->v_dmac);
1419 			continue;
1420 		}
1421 
1422 		vreq->v_status = VREQ_DMAWIN_DONE;
1423 		vreq->v_blkno = blkno;
1424 		if (vreq->v_dmaw + 1 == vreq->v_ndmaws)
1425 			/* last win */
1426 			rval = XF_COMP;
1427 		else
1428 			rval = XF_PARTIAL;
1429 		break;
1430 	}
1431 	ddi_put8(acchdl,  &rreq->nr_segments, seg);
1432 	DPRINTF(IO_DBG, ("xdf_prepare_rreq: request id=%"PRIx64" ready\n",
1433 	    rreq->id));
1434 
1435 	return (rval);
1436 }
1437 
1438 #define	XDF_QSEC	50000	/* .005 second */
1439 #define	XDF_POLLCNT	12	/* loop for 12 times before time out */
1440 
1441 static int
1442 xdf_drain_io(xdf_t *vdp)
1443 {
1444 	int pollc, rval;
1445 	xendev_ring_t *xbr;
1446 
1447 	if (xdfdebug & SUSRES_DBG)
1448 		xen_printf("xdf_drain_io: start\n");
1449 
1450 	mutex_enter(&vdp->xdf_dev_lk);
1451 
1452 	if ((vdp->xdf_status != XD_READY) && (vdp->xdf_status != XD_SUSPEND))
1453 		goto out;
1454 
1455 	rval = 0;
1456 	xbr = vdp->xdf_xb_ring;
1457 	ASSERT(xbr != NULL);
1458 
1459 	for (pollc = 0; pollc < XDF_POLLCNT; pollc++) {
1460 		if (xvdi_ring_has_unconsumed_responses(xbr)) {
1461 			mutex_exit(&vdp->xdf_dev_lk);
1462 			(void) xdf_intr((caddr_t)vdp);
1463 			mutex_enter(&vdp->xdf_dev_lk);
1464 		}
1465 		if (!xvdi_ring_has_incomp_request(xbr))
1466 			goto out;
1467 
1468 #ifndef	XPV_HVM_DRIVER
1469 		(void) HYPERVISOR_yield();
1470 #endif
1471 		/*
1472 		 * file-backed devices can be slow
1473 		 */
1474 		drv_usecwait(XDF_QSEC << pollc);
1475 	}
1476 	cmn_err(CE_WARN, "xdf_polled_io: timeout");
1477 	rval = EIO;
1478 out:
1479 	mutex_exit(&vdp->xdf_dev_lk);
1480 	if (xdfdebug & SUSRES_DBG)
1481 		xen_printf("xdf_drain_io: end, err=%d\n", rval);
1482 	return (rval);
1483 }
1484 
1485 /* ARGSUSED5 */
1486 static int
1487 xdf_lb_rdwr(dev_info_t *devi, uchar_t cmd, void *bufp,
1488     diskaddr_t start, size_t reqlen, void *tg_cookie)
1489 {
1490 	xdf_t *vdp;
1491 	struct buf *bp;
1492 	int err = 0;
1493 
1494 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1495 	if (vdp == NULL)
1496 		return (ENXIO);
1497 
1498 	if ((start + (reqlen >> DEV_BSHIFT)) > vdp->xdf_xdev_nblocks)
1499 		return (EINVAL);
1500 
1501 	bp = getrbuf(KM_SLEEP);
1502 	if (cmd == TG_READ)
1503 		bp->b_flags = B_BUSY | B_READ;
1504 	else
1505 		bp->b_flags = B_BUSY | B_WRITE;
1506 	bp->b_un.b_addr = bufp;
1507 	bp->b_bcount = reqlen;
1508 	bp->b_resid = 0;
1509 	bp->b_blkno = start;
1510 	bp->av_forw = NULL;
1511 	bp->av_back = NULL;
1512 	bp->b_edev = DDI_DEV_T_NONE; /* don't have dev_t */
1513 
1514 	mutex_enter(&vdp->xdf_dev_lk);
1515 	kstat_waitq_enter(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
1516 	if (vdp->xdf_f_act == NULL) {
1517 		vdp->xdf_f_act = vdp->xdf_l_act = bp;
1518 	} else {
1519 		vdp->xdf_l_act->av_forw = bp;
1520 		vdp->xdf_l_act = bp;
1521 	}
1522 	mutex_exit(&vdp->xdf_dev_lk);
1523 	xdf_iostart(vdp);
1524 	err = biowait(bp);
1525 
1526 	ASSERT(bp->b_flags & B_DONE);
1527 
1528 	freerbuf(bp);
1529 	return (err);
1530 }
1531 
1532 /*
1533  * synthetic geometry
1534  */
1535 #define	XDF_NSECTS	256
1536 #define	XDF_NHEADS	16
1537 
1538 static int
1539 xdf_lb_getcap(dev_info_t *devi, diskaddr_t *capp)
1540 {
1541 	xdf_t *vdp;
1542 
1543 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1544 
1545 	if (vdp == NULL)
1546 		return (ENXIO);
1547 
1548 	mutex_enter(&vdp->xdf_dev_lk);
1549 	*capp = vdp->xdf_xdev_nblocks;
1550 	DPRINTF(LBL_DBG, ("capacity %llu\n", *capp));
1551 	mutex_exit(&vdp->xdf_dev_lk);
1552 	return (0);
1553 }
1554 
1555 static int
1556 xdf_lb_getpgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1557 {
1558 	xdf_t *vdp;
1559 	uint_t ncyl;
1560 	uint_t spc = XDF_NHEADS * XDF_NSECTS;
1561 
1562 	vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi));
1563 
1564 	if (vdp == NULL)
1565 		return (ENXIO);
1566 
1567 	ncyl = vdp->xdf_xdev_nblocks / spc;
1568 
1569 	geomp->g_ncyl = ncyl == 0 ? 1 : ncyl;
1570 	geomp->g_acyl = 0;
1571 	geomp->g_nhead = XDF_NHEADS;
1572 	geomp->g_secsize = XB_BSIZE;
1573 	geomp->g_nsect = XDF_NSECTS;
1574 	geomp->g_intrlv = 0;
1575 	geomp->g_rpm = 7200;
1576 	geomp->g_capacity = vdp->xdf_xdev_nblocks;
1577 	return (0);
1578 }
1579 
1580 /*
1581  * No real HBA, no geometry available from it
1582  */
1583 /*ARGSUSED*/
1584 static int
1585 xdf_lb_getvgeom(dev_info_t *devi, cmlb_geom_t *geomp)
1586 {
1587 	return (EINVAL);
1588 }
1589 
1590 static int
1591 xdf_lb_getattribute(dev_info_t *devi, tg_attribute_t *tgattributep)
1592 {
1593 	xdf_t *vdp;
1594 
1595 	if (!(vdp = ddi_get_soft_state(vbd_ss, ddi_get_instance(devi))))
1596 		return (ENXIO);
1597 
1598 	if (XD_IS_RO(vdp))
1599 		tgattributep->media_is_writable = 0;
1600 	else
1601 		tgattributep->media_is_writable = 1;
1602 	return (0);
1603 }
1604 
1605 /* ARGSUSED3 */
1606 static int
1607 xdf_lb_getinfo(dev_info_t *devi, int cmd, void *arg, void *tg_cookie)
1608 {
1609 	switch (cmd) {
1610 	case TG_GETPHYGEOM:
1611 		return (xdf_lb_getpgeom(devi, (cmlb_geom_t *)arg));
1612 	case TG_GETVIRTGEOM:
1613 		return (xdf_lb_getvgeom(devi, (cmlb_geom_t *)arg));
1614 	case TG_GETCAPACITY:
1615 		return (xdf_lb_getcap(devi, (diskaddr_t *)arg));
1616 	case TG_GETBLOCKSIZE:
1617 		*(uint32_t *)arg = XB_BSIZE;
1618 		return (0);
1619 	case TG_GETATTR:
1620 		return (xdf_lb_getattribute(devi, (tg_attribute_t *)arg));
1621 	default:
1622 		return (ENOTTY);
1623 	}
1624 }
1625 
1626 /*
1627  * Kick-off connect process
1628  * Status should be XD_UNKNOWN or XD_CLOSED
1629  * On success, status will be changed to XD_INIT
1630  * On error, status won't be changed
1631  */
1632 static int
1633 xdf_start_connect(xdf_t *vdp)
1634 {
1635 	char *xsnode;
1636 	grant_ref_t gref;
1637 	xenbus_transaction_t xbt;
1638 	int rv;
1639 	dev_info_t *dip = vdp->xdf_dip;
1640 
1641 	if ((vdp->xdf_peer = xvdi_get_oeid(dip)) == (domid_t)-1)
1642 		goto errout;
1643 
1644 	if (xvdi_alloc_evtchn(dip) != DDI_SUCCESS) {
1645 		cmn_err(CE_WARN, "xdf@%s: failed to alloc event channel",
1646 		    ddi_get_name_addr(dip));
1647 		goto errout;
1648 	}
1649 	vdp->xdf_evtchn = xvdi_get_evtchn(dip);
1650 #ifdef XPV_HVM_DRIVER
1651 	ec_bind_evtchn_to_handler(vdp->xdf_evtchn, IPL_VBD, xdf_intr, vdp);
1652 #else
1653 	if (ddi_add_intr(dip, 0, NULL, NULL, xdf_intr, (caddr_t)vdp) !=
1654 	    DDI_SUCCESS) {
1655 		cmn_err(CE_WARN, "xdf_start_connect: xdf@%s: "
1656 		    "failed to add intr handler", ddi_get_name_addr(dip));
1657 		goto errout1;
1658 	}
1659 #endif
1660 
1661 	if (xvdi_alloc_ring(dip, BLKIF_RING_SIZE,
1662 	    sizeof (union blkif_sring_entry), &gref, &vdp->xdf_xb_ring) !=
1663 	    DDI_SUCCESS) {
1664 		cmn_err(CE_WARN, "xdf@%s: failed to alloc comm ring",
1665 		    ddi_get_name_addr(dip));
1666 		goto errout2;
1667 	}
1668 	vdp->xdf_xb_ring_hdl = vdp->xdf_xb_ring->xr_acc_hdl; /* ugly!! */
1669 
1670 	/*
1671 	 * Write into xenstore the info needed by backend
1672 	 */
1673 	if ((xsnode = xvdi_get_xsname(dip)) == NULL) {
1674 		cmn_err(CE_WARN, "xdf@%s: "
1675 		    "failed to get xenstore node path",
1676 		    ddi_get_name_addr(dip));
1677 		goto fail_trans;
1678 	}
1679 trans_retry:
1680 	if (xenbus_transaction_start(&xbt)) {
1681 		cmn_err(CE_WARN, "xdf@%s: failed to start transaction",
1682 		    ddi_get_name_addr(dip));
1683 		xvdi_fatal_error(dip, EIO, "transaction start");
1684 		goto fail_trans;
1685 	}
1686 
1687 	if (rv = xenbus_printf(xbt, xsnode, "ring-ref", "%u", gref)) {
1688 		cmn_err(CE_WARN, "xdf@%s: failed to write ring-ref",
1689 		    ddi_get_name_addr(dip));
1690 		xvdi_fatal_error(dip, rv, "writing ring-ref");
1691 		goto abort_trans;
1692 	}
1693 
1694 	if (rv = xenbus_printf(xbt, xsnode, "event-channel", "%u",
1695 	    vdp->xdf_evtchn)) {
1696 		cmn_err(CE_WARN, "xdf@%s: failed to write event-channel",
1697 		    ddi_get_name_addr(dip));
1698 		xvdi_fatal_error(dip, rv, "writing event-channel");
1699 		goto abort_trans;
1700 	}
1701 
1702 	/*
1703 	 * "protocol" is written by the domain builder in the case of PV
1704 	 * domains. However, it is not written for HVM domains, so let's
1705 	 * write it here.
1706 	 */
1707 	if (rv = xenbus_printf(xbt, xsnode, "protocol", "%s",
1708 	    XEN_IO_PROTO_ABI_NATIVE)) {
1709 		cmn_err(CE_WARN, "xdf@%s: failed to write protocol",
1710 		    ddi_get_name_addr(dip));
1711 		xvdi_fatal_error(dip, rv, "writing protocol");
1712 		goto abort_trans;
1713 	}
1714 
1715 	if ((rv = xvdi_switch_state(dip, xbt, XenbusStateInitialised)) > 0) {
1716 		cmn_err(CE_WARN, "xdf@%s: "
1717 		    "failed to switch state to XenbusStateInitialised",
1718 		    ddi_get_name_addr(dip));
1719 		xvdi_fatal_error(dip, rv, "writing state");
1720 		goto abort_trans;
1721 	}
1722 
1723 	/* kick-off connect process */
1724 	if (rv = xenbus_transaction_end(xbt, 0)) {
1725 		if (rv == EAGAIN)
1726 			goto trans_retry;
1727 		cmn_err(CE_WARN, "xdf@%s: failed to end transaction",
1728 		    ddi_get_name_addr(dip));
1729 		xvdi_fatal_error(dip, rv, "completing transaction");
1730 		goto fail_trans;
1731 	}
1732 
1733 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1734 	mutex_enter(&vdp->xdf_dev_lk);
1735 	vdp->xdf_status = XD_INIT;
1736 	mutex_exit(&vdp->xdf_dev_lk);
1737 
1738 	return (DDI_SUCCESS);
1739 
1740 abort_trans:
1741 	(void) xenbus_transaction_end(xbt, 1);
1742 fail_trans:
1743 	xvdi_free_ring(vdp->xdf_xb_ring);
1744 errout2:
1745 #ifdef XPV_HVM_DRIVER
1746 	ec_unbind_evtchn(vdp->xdf_evtchn);
1747 #else
1748 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1749 #endif
1750 errout1:
1751 	xvdi_free_evtchn(dip);
1752 errout:
1753 	cmn_err(CE_WARN, "xdf@%s: fail to kick-off connecting",
1754 	    ddi_get_name_addr(dip));
1755 	return (DDI_FAILURE);
1756 }
1757 
1758 /*
1759  * Kick-off disconnect process
1760  * Status won't be changed
1761  */
1762 static int
1763 xdf_start_disconnect(xdf_t *vdp)
1764 {
1765 	if (xvdi_switch_state(vdp->xdf_dip, XBT_NULL, XenbusStateClosed) > 0) {
1766 		cmn_err(CE_WARN, "xdf@%s: fail to kick-off disconnecting",
1767 		    ddi_get_name_addr(vdp->xdf_dip));
1768 		return (DDI_FAILURE);
1769 	}
1770 
1771 	return (DDI_SUCCESS);
1772 }
1773 
1774 int
1775 xdf_get_flush_block(xdf_t *vdp)
1776 {
1777 	/*
1778 	 * Get a DEV_BSIZE aligned bufer
1779 	 */
1780 	vdp->xdf_flush_mem = kmem_alloc(DEV_BSIZE * 2, KM_SLEEP);
1781 	vdp->xdf_cache_flush_block =
1782 	    (char *)P2ROUNDUP((uintptr_t)(vdp->xdf_flush_mem), DEV_BSIZE);
1783 	if (xdf_lb_rdwr(vdp->xdf_dip, TG_READ, vdp->xdf_cache_flush_block,
1784 	    xdf_flush_block, DEV_BSIZE, NULL) != 0)
1785 		return (DDI_FAILURE);
1786 	return (DDI_SUCCESS);
1787 }
1788 
1789 /*
1790  * Finish other initialization after we've connected to backend
1791  * Status should be XD_INIT before calling this routine
1792  * On success, status should be changed to XD_READY
1793  * On error, status should stay XD_INIT
1794  */
1795 static int
1796 xdf_post_connect(xdf_t *vdp)
1797 {
1798 	int rv;
1799 	uint_t len;
1800 	char *type;
1801 	char *barrier;
1802 	dev_info_t *devi = vdp->xdf_dip;
1803 
1804 	/*
1805 	 * Determine if feature barrier is supported by backend
1806 	 */
1807 	if (xenbus_read(XBT_NULL, xvdi_get_oename(devi),
1808 	    "feature-barrier", (void **)&barrier, &len) == 0) {
1809 		vdp->xdf_feature_barrier = 1;
1810 		kmem_free(barrier, len);
1811 	} else {
1812 		cmn_err(CE_NOTE, "xdf@%s: failed to read feature-barrier",
1813 		    ddi_get_name_addr(vdp->xdf_dip));
1814 		vdp->xdf_feature_barrier = 0;
1815 	}
1816 
1817 	/* probe backend */
1818 	if (rv = xenbus_gather(XBT_NULL, xvdi_get_oename(devi),
1819 	    "sectors", "%"SCNu64, &vdp->xdf_xdev_nblocks,
1820 	    "info", "%u", &vdp->xdf_xdev_info, NULL)) {
1821 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1822 		    "cannot read backend info", ddi_get_name_addr(devi));
1823 		xvdi_fatal_error(devi, rv, "reading backend info");
1824 		return (DDI_FAILURE);
1825 	}
1826 
1827 	/* fix disk type */
1828 	if (xenbus_read(XBT_NULL, xvdi_get_xsname(devi), "device-type",
1829 	    (void **)&type, &len) != 0) {
1830 		cmn_err(CE_WARN, "xdf_post_connect: xdf@%s: "
1831 		    "cannot read device-type", ddi_get_name_addr(devi));
1832 		xvdi_fatal_error(devi, rv, "reading device-type");
1833 		return (DDI_FAILURE);
1834 	}
1835 	if (strcmp(type, "cdrom") == 0)
1836 		vdp->xdf_xdev_info |= VDISK_CDROM;
1837 	kmem_free(type, len);
1838 
1839 	/*
1840 	 * We've created all the minor nodes via cmlb_attach() using default
1841 	 * value in xdf_attach() to make it possible to block in xdf_open(),
1842 	 * in case there's anyone (say, booting thread) ever trying to open
1843 	 * it before connected to backend. We will refresh all those minor
1844 	 * nodes w/ latest info we've got now when we are almost connected.
1845 	 *
1846 	 * Don't do this when xdf is already opened by someone (could happen
1847 	 * during resume), for that cmlb_attach() will invalid the label info
1848 	 * and confuse those who has already opened the node, which is bad.
1849 	 */
1850 	if (!xdf_isopen(vdp, -1) && (XD_IS_CD(vdp) || XD_IS_RM(vdp))) {
1851 		/* re-init cmlb w/ latest info we got from backend */
1852 		if (cmlb_attach(devi, &xdf_lb_ops,
1853 		    XD_IS_CD(vdp) ? DTYPE_RODIRECT : DTYPE_DIRECT,
1854 		    XD_IS_RM(vdp), 1, DDI_NT_BLOCK,
1855 		    CMLB_FAKE_LABEL_ONE_PARTITION,
1856 		    vdp->xdf_vd_lbl, NULL) != 0) {
1857 			cmn_err(CE_WARN, "xdf@%s: cmlb attach failed",
1858 			    ddi_get_name_addr(devi));
1859 			return (DDI_FAILURE);
1860 		}
1861 	}
1862 
1863 	/* mark vbd is ready for I/O */
1864 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1865 	mutex_enter(&vdp->xdf_dev_lk);
1866 	vdp->xdf_status = XD_READY;
1867 	mutex_exit(&vdp->xdf_dev_lk);
1868 	/*
1869 	 * If backend has feature-barrier, see if it supports disk
1870 	 * cache flush op.
1871 	 */
1872 	vdp->xdf_flush_supported = 0;
1873 	if (vdp->xdf_feature_barrier) {
1874 		/*
1875 		 * Pretend we already know flush is supported so probe
1876 		 * will attempt the correct op.
1877 		 */
1878 		vdp->xdf_flush_supported = 1;
1879 		if (xdf_lb_rdwr(vdp->xdf_dip, TG_WRITE, NULL, 0, 0, 0) == 0) {
1880 			vdp->xdf_flush_supported = 1;
1881 		} else {
1882 			vdp->xdf_flush_supported = 0;
1883 			/*
1884 			 * If the other end does not support the cache flush op
1885 			 * then we must use a barrier-write to force disk
1886 			 * cache flushing.  Barrier writes require that a data
1887 			 * block actually be written.
1888 			 * Cache a block to barrier-write when we are
1889 			 * asked to perform a flush.
1890 			 * XXX - would it be better to just copy 1 block
1891 			 * (512 bytes) from whatever write we did last
1892 			 * and rewrite that block?
1893 			 */
1894 			if (xdf_get_flush_block(vdp) != DDI_SUCCESS)
1895 				return (DDI_FAILURE);
1896 		}
1897 	}
1898 
1899 	cmn_err(CE_CONT, "?xdf@%s: %"PRIu64" blocks", ddi_get_name_addr(devi),
1900 	    (uint64_t)vdp->xdf_xdev_nblocks);
1901 
1902 	return (DDI_SUCCESS);
1903 }
1904 
1905 /*
1906  * Finish other uninitialization after we've disconnected from backend
1907  * when status is XD_CLOSING or XD_INIT. After returns, status is XD_CLOSED
1908  */
1909 static void
1910 xdf_post_disconnect(xdf_t *vdp)
1911 {
1912 #ifdef XPV_HVM_DRIVER
1913 	ec_unbind_evtchn(vdp->xdf_evtchn);
1914 #else
1915 	(void) ddi_remove_intr(vdp->xdf_dip, 0, NULL);
1916 #endif
1917 	xvdi_free_evtchn(vdp->xdf_dip);
1918 	xvdi_free_ring(vdp->xdf_xb_ring);
1919 	vdp->xdf_xb_ring = NULL;
1920 	vdp->xdf_xb_ring_hdl = NULL;
1921 	vdp->xdf_peer = (domid_t)-1;
1922 
1923 	ASSERT(mutex_owned(&vdp->xdf_cb_lk));
1924 	mutex_enter(&vdp->xdf_dev_lk);
1925 	vdp->xdf_status = XD_CLOSED;
1926 	mutex_exit(&vdp->xdf_dev_lk);
1927 }
1928 
1929 /*ARGSUSED*/
1930 static void
1931 xdf_oe_change(dev_info_t *dip, ddi_eventcookie_t id, void *arg, void *impl_data)
1932 {
1933 	XenbusState new_state = *(XenbusState *)impl_data;
1934 	xdf_t *vdp = (xdf_t *)ddi_get_driver_private(dip);
1935 	boolean_t unexpect_die = B_FALSE;
1936 	int status;
1937 
1938 	DPRINTF(DDI_DBG, ("xdf@%s: otherend state change to %d!\n",
1939 	    ddi_get_name_addr(dip), new_state));
1940 
1941 	mutex_enter(&vdp->xdf_cb_lk);
1942 
1943 	if (xdf_check_state_transition(vdp, new_state) == DDI_FAILURE) {
1944 		mutex_exit(&vdp->xdf_cb_lk);
1945 		return;
1946 	}
1947 
1948 	switch (new_state) {
1949 	case XenbusStateInitialising:
1950 		ASSERT(vdp->xdf_status == XD_CLOSED);
1951 		/*
1952 		 * backend recovered from a previous failure,
1953 		 * kick-off connect process again
1954 		 */
1955 		if (xdf_start_connect(vdp) != DDI_SUCCESS) {
1956 			cmn_err(CE_WARN, "xdf@%s:"
1957 			    " failed to start reconnecting to backend",
1958 			    ddi_get_name_addr(dip));
1959 		}
1960 		break;
1961 	case XenbusStateConnected:
1962 		ASSERT(vdp->xdf_status == XD_INIT);
1963 		(void) xvdi_switch_state(dip, XBT_NULL, XenbusStateConnected);
1964 		/* finish final init after connect */
1965 		if (xdf_post_connect(vdp) != DDI_SUCCESS)
1966 			(void) xdf_start_disconnect(vdp);
1967 		break;
1968 	case XenbusStateClosing:
1969 		if (vdp->xdf_status == XD_READY) {
1970 			mutex_enter(&vdp->xdf_dev_lk);
1971 			if (xdf_isopen(vdp, -1)) {
1972 				cmn_err(CE_NOTE, "xdf@%s: hot-unplug failed, "
1973 				    "still in use", ddi_get_name_addr(dip));
1974 				mutex_exit(&vdp->xdf_dev_lk);
1975 				break;
1976 			} else {
1977 				vdp->xdf_status = XD_CLOSING;
1978 			}
1979 			mutex_exit(&vdp->xdf_dev_lk);
1980 		}
1981 		(void) xdf_start_disconnect(vdp);
1982 		break;
1983 	case XenbusStateClosed:
1984 		/* first check if BE closed unexpectedly */
1985 		mutex_enter(&vdp->xdf_dev_lk);
1986 		if (xdf_isopen(vdp, -1)) {
1987 			unexpect_die = B_TRUE;
1988 			unexpectedie(vdp);
1989 			cmn_err(CE_WARN, "xdf@%s: backend closed, "
1990 			    "reconnecting...", ddi_get_name_addr(dip));
1991 		}
1992 		mutex_exit(&vdp->xdf_dev_lk);
1993 
1994 		if (vdp->xdf_status == XD_READY) {
1995 			mutex_enter(&vdp->xdf_dev_lk);
1996 			vdp->xdf_status = XD_CLOSING;
1997 			mutex_exit(&vdp->xdf_dev_lk);
1998 
1999 #ifdef	DOMU_BACKEND
2000 			(void) xvdi_post_event(dip, XEN_HP_REMOVE);
2001 #endif
2002 
2003 			xdf_post_disconnect(vdp);
2004 			(void) xvdi_switch_state(dip, XBT_NULL,
2005 			    XenbusStateClosed);
2006 		} else if ((vdp->xdf_status == XD_INIT) ||
2007 		    (vdp->xdf_status == XD_CLOSING)) {
2008 			xdf_post_disconnect(vdp);
2009 		} else {
2010 			mutex_enter(&vdp->xdf_dev_lk);
2011 			vdp->xdf_status = XD_CLOSED;
2012 			mutex_exit(&vdp->xdf_dev_lk);
2013 		}
2014 	}
2015 
2016 	/* notify anybody waiting for oe state change */
2017 	mutex_enter(&vdp->xdf_dev_lk);
2018 	cv_broadcast(&vdp->xdf_dev_cv);
2019 	mutex_exit(&vdp->xdf_dev_lk);
2020 
2021 	status = vdp->xdf_status;
2022 	mutex_exit(&vdp->xdf_cb_lk);
2023 
2024 	if (status == XD_READY) {
2025 		xdf_iostart(vdp);
2026 	} else if ((status == XD_CLOSED) && !unexpect_die) {
2027 		/* interface is closed successfully, remove all minor nodes */
2028 		cmlb_detach(vdp->xdf_vd_lbl, NULL);
2029 		cmlb_free_handle(&vdp->xdf_vd_lbl);
2030 	}
2031 }
2032 
2033 /* check if partition is open, -1 - check all partitions on the disk */
2034 static boolean_t
2035 xdf_isopen(xdf_t *vdp, int partition)
2036 {
2037 	int i;
2038 	ulong_t parbit;
2039 	boolean_t rval = B_FALSE;
2040 
2041 	if (partition == -1)
2042 		parbit = (ulong_t)-1;
2043 	else
2044 		parbit = 1 << partition;
2045 
2046 	for (i = 0; i < OTYPCNT; i++) {
2047 		if (vdp->xdf_vd_open[i] & parbit)
2048 			rval = B_TRUE;
2049 	}
2050 
2051 	return (rval);
2052 }
2053 
2054 /*
2055  * Xdf_check_state_transition will check the XenbusState change to see
2056  * if the change is a valid transition or not.
2057  * The new state is written by backend domain, or by running xenstore-write
2058  * to change it manually in dom0
2059  */
2060 static int
2061 xdf_check_state_transition(xdf_t *vdp, XenbusState oestate)
2062 {
2063 	int status;
2064 	int stcheck;
2065 #define	STOK	0 /* need further process */
2066 #define	STNOP	1 /* no action need taking */
2067 #define	STBUG	2 /* unexpected state change, could be a bug */
2068 
2069 	status = vdp->xdf_status;
2070 	stcheck = STOK;
2071 
2072 	switch (status) {
2073 	case XD_UNKNOWN:
2074 		if ((oestate == XenbusStateUnknown)		||
2075 		    (oestate == XenbusStateConnected))
2076 			stcheck = STBUG;
2077 		else if ((oestate == XenbusStateInitialising)	||
2078 		    (oestate == XenbusStateInitWait)		||
2079 		    (oestate == XenbusStateInitialised))
2080 			stcheck = STNOP;
2081 		break;
2082 	case XD_INIT:
2083 		if (oestate == XenbusStateUnknown)
2084 			stcheck = STBUG;
2085 		else if ((oestate == XenbusStateInitialising)	||
2086 		    (oestate == XenbusStateInitWait)		||
2087 		    (oestate == XenbusStateInitialised))
2088 			stcheck = STNOP;
2089 		break;
2090 	case XD_READY:
2091 		if ((oestate == XenbusStateUnknown)		||
2092 		    (oestate == XenbusStateInitialising)	||
2093 		    (oestate == XenbusStateInitWait)		||
2094 		    (oestate == XenbusStateInitialised))
2095 			stcheck = STBUG;
2096 		else if (oestate == XenbusStateConnected)
2097 			stcheck = STNOP;
2098 		break;
2099 	case XD_CLOSING:
2100 		if ((oestate == XenbusStateUnknown)		||
2101 		    (oestate == XenbusStateInitialising)	||
2102 		    (oestate == XenbusStateInitWait)		||
2103 		    (oestate == XenbusStateInitialised)		||
2104 		    (oestate == XenbusStateConnected))
2105 			stcheck = STBUG;
2106 		else if (oestate == XenbusStateClosing)
2107 			stcheck = STNOP;
2108 		break;
2109 	case XD_CLOSED:
2110 		if ((oestate == XenbusStateUnknown)		||
2111 		    (oestate == XenbusStateConnected))
2112 			stcheck = STBUG;
2113 		else if ((oestate == XenbusStateInitWait)	||
2114 		    (oestate == XenbusStateInitialised)		||
2115 		    (oestate == XenbusStateClosing)		||
2116 		    (oestate == XenbusStateClosed))
2117 			stcheck = STNOP;
2118 		break;
2119 	case XD_SUSPEND:
2120 	default:
2121 			stcheck = STBUG;
2122 	}
2123 
2124 	if (stcheck == STOK)
2125 		return (DDI_SUCCESS);
2126 
2127 	if (stcheck == STBUG)
2128 		cmn_err(CE_NOTE, "xdf@%s: unexpected otherend "
2129 		    "state change to %d!, when status is %d",
2130 		    ddi_get_name_addr(vdp->xdf_dip), oestate, status);
2131 
2132 	return (DDI_FAILURE);
2133 }
2134 
2135 static int
2136 xdf_connect(xdf_t *vdp, boolean_t wait)
2137 {
2138 	ASSERT(mutex_owned(&vdp->xdf_dev_lk));
2139 	while (vdp->xdf_status != XD_READY) {
2140 		if (!wait || (vdp->xdf_status > XD_READY))
2141 			break;
2142 
2143 		if (cv_wait_sig(&vdp->xdf_dev_cv, &vdp->xdf_dev_lk) == 0)
2144 			break;
2145 	}
2146 
2147 	return (vdp->xdf_status);
2148 }
2149 
2150 /*
2151  * callback func when DMA/GTE resources is available
2152  *
2153  * Note: we only register one callback function to grant table subsystem
2154  * since we only have one 'struct gnttab_free_callback' in xdf_t.
2155  */
2156 static int
2157 xdf_dmacallback(caddr_t arg)
2158 {
2159 	xdf_t *vdp = (xdf_t *)arg;
2160 	ASSERT(vdp != NULL);
2161 
2162 	DPRINTF(DMA_DBG, ("xdf@%s: DMA callback started\n",
2163 	    ddi_get_name_addr(vdp->xdf_dip)));
2164 
2165 	ddi_trigger_softintr(vdp->xdf_softintr_id);
2166 	return (DDI_DMA_CALLBACK_DONE);
2167 }
2168 
2169 static uint_t
2170 xdf_iorestart(caddr_t arg)
2171 {
2172 	xdf_t *vdp = (xdf_t *)arg;
2173 
2174 	ASSERT(vdp != NULL);
2175 
2176 	mutex_enter(&vdp->xdf_dev_lk);
2177 	ASSERT(ISDMACBON(vdp));
2178 	SETDMACBOFF(vdp);
2179 	mutex_exit(&vdp->xdf_dev_lk);
2180 
2181 	xdf_iostart(vdp);
2182 
2183 	return (DDI_INTR_CLAIMED);
2184 }
2185 
2186 static void
2187 xdf_timeout_handler(void *arg)
2188 {
2189 	xdf_t *vdp = arg;
2190 
2191 	mutex_enter(&vdp->xdf_dev_lk);
2192 	vdp->xdf_timeout_id = 0;
2193 	mutex_exit(&vdp->xdf_dev_lk);
2194 
2195 	/* new timeout thread could be re-scheduled */
2196 	xdf_iostart(vdp);
2197 }
2198 
2199 /*
2200  * Alloc a vreq for this bp
2201  * bp->av_back contains the pointer to the vreq upon return
2202  */
2203 static v_req_t *
2204 vreq_get(xdf_t *vdp, buf_t *bp)
2205 {
2206 	v_req_t *vreq = NULL;
2207 
2208 	ASSERT(BP2VREQ(bp) == NULL);
2209 
2210 	vreq = kmem_cache_alloc(xdf_vreq_cache, KM_NOSLEEP);
2211 	if (vreq == NULL) {
2212 		if (vdp->xdf_timeout_id == 0)
2213 			/* restart I/O after one second */
2214 			vdp->xdf_timeout_id =
2215 			    timeout(xdf_timeout_handler, vdp, hz);
2216 		return (NULL);
2217 	}
2218 	bzero(vreq, sizeof (v_req_t));
2219 
2220 	list_insert_head(&vdp->xdf_vreq_act, (void *)vreq);
2221 	bp->av_back = (buf_t *)vreq;
2222 	vreq->v_buf = bp;
2223 	vreq->v_status = VREQ_INIT;
2224 	/* init of other fields in vreq is up to the caller */
2225 
2226 	return (vreq);
2227 }
2228 
2229 static void
2230 vreq_free(xdf_t *vdp, v_req_t *vreq)
2231 {
2232 	buf_t *bp = vreq->v_buf;
2233 
2234 	list_remove(&vdp->xdf_vreq_act, (void *)vreq);
2235 
2236 	if (vreq->v_flush_diskcache == FLUSH_DISKCACHE)
2237 		goto done;
2238 
2239 	switch (vreq->v_status) {
2240 	case VREQ_DMAWIN_DONE:
2241 	case VREQ_GS_ALLOCED:
2242 	case VREQ_DMABUF_BOUND:
2243 		(void) ddi_dma_unbind_handle(vreq->v_dmahdl);
2244 		/*FALLTHRU*/
2245 	case VREQ_DMAMEM_ALLOCED:
2246 		if (!ALIGNED_XFER(bp)) {
2247 			ASSERT(vreq->v_abuf != NULL);
2248 			if (!IS_ERROR(bp) && IS_READ(bp))
2249 				bcopy(vreq->v_abuf, bp->b_un.b_addr,
2250 				    bp->b_bcount);
2251 			ddi_dma_mem_free(&vreq->v_align);
2252 		}
2253 		/*FALLTHRU*/
2254 	case VREQ_MEMDMAHDL_ALLOCED:
2255 		if (!ALIGNED_XFER(bp))
2256 			ddi_dma_free_handle(&vreq->v_memdmahdl);
2257 		/*FALLTHRU*/
2258 	case VREQ_DMAHDL_ALLOCED:
2259 		ddi_dma_free_handle(&vreq->v_dmahdl);
2260 		break;
2261 	default:
2262 		break;
2263 	}
2264 done:
2265 	vreq->v_buf->av_back = NULL;
2266 	kmem_cache_free(xdf_vreq_cache, vreq);
2267 }
2268 
2269 /*
2270  * Initalize the DMA and grant table resources for the buf
2271  */
2272 static int
2273 vreq_setup(xdf_t *vdp, v_req_t *vreq)
2274 {
2275 	int rc;
2276 	ddi_dma_attr_t dmaattr;
2277 	uint_t ndcs, ndws;
2278 	ddi_dma_handle_t dh;
2279 	ddi_dma_handle_t mdh;
2280 	ddi_dma_cookie_t dc;
2281 	ddi_acc_handle_t abh;
2282 	caddr_t	aba;
2283 	ge_slot_t *gs;
2284 	size_t bufsz;
2285 	off_t off;
2286 	size_t sz;
2287 	buf_t *bp = vreq->v_buf;
2288 	int dma_flags = (IS_READ(bp) ? DDI_DMA_READ : DDI_DMA_WRITE) |
2289 	    DDI_DMA_STREAMING | DDI_DMA_PARTIAL;
2290 
2291 	switch (vreq->v_status) {
2292 	case VREQ_INIT:
2293 		if (IS_FLUSH_DISKCACHE(bp)) {
2294 			if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2295 				DPRINTF(DMA_DBG, (
2296 				    "xdf@%s: get ge_slotfailed\n",
2297 				    ddi_get_name_addr(vdp->xdf_dip)));
2298 				return (DDI_FAILURE);
2299 			}
2300 			vreq->v_blkno = 0;
2301 			vreq->v_nslots = 1;
2302 			vreq->v_gs = gs;
2303 			vreq->v_flush_diskcache = FLUSH_DISKCACHE;
2304 			vreq->v_status = VREQ_GS_ALLOCED;
2305 			gs->vreq = vreq;
2306 			return (DDI_SUCCESS);
2307 		}
2308 
2309 		if (IS_WRITE_BARRIER(vdp, bp))
2310 			vreq->v_flush_diskcache = WRITE_BARRIER;
2311 		vreq->v_blkno = bp->b_blkno +
2312 		    (diskaddr_t)(uintptr_t)bp->b_private;
2313 		bp->b_private = NULL;
2314 		/* See if we wrote new data to our flush block */
2315 		if (!IS_READ(bp) && USE_WRITE_BARRIER(vdp))
2316 			check_fbwrite(vdp, bp, vreq->v_blkno);
2317 		vreq->v_status = VREQ_INIT_DONE;
2318 		/*FALLTHRU*/
2319 
2320 	case VREQ_INIT_DONE:
2321 		/*
2322 		 * alloc DMA handle
2323 		 */
2324 		rc = ddi_dma_alloc_handle(vdp->xdf_dip, &xb_dma_attr,
2325 		    xdf_dmacallback, (caddr_t)vdp, &dh);
2326 		if (rc != DDI_SUCCESS) {
2327 			SETDMACBON(vdp);
2328 			DPRINTF(DMA_DBG, ("xdf@%s: DMA handle alloc failed\n",
2329 			    ddi_get_name_addr(vdp->xdf_dip)));
2330 			return (DDI_FAILURE);
2331 		}
2332 
2333 		vreq->v_dmahdl = dh;
2334 		vreq->v_status = VREQ_DMAHDL_ALLOCED;
2335 		/*FALLTHRU*/
2336 
2337 	case VREQ_DMAHDL_ALLOCED:
2338 		/*
2339 		 * alloc dma handle for 512-byte aligned buf
2340 		 */
2341 		if (!ALIGNED_XFER(bp)) {
2342 			/*
2343 			 * XXPV: we need to temporarily enlarge the seg
2344 			 * boundary and s/g length to work round CR6381968
2345 			 */
2346 			dmaattr = xb_dma_attr;
2347 			dmaattr.dma_attr_seg = (uint64_t)-1;
2348 			dmaattr.dma_attr_sgllen = INT_MAX;
2349 			rc = ddi_dma_alloc_handle(vdp->xdf_dip, &dmaattr,
2350 			    xdf_dmacallback, (caddr_t)vdp, &mdh);
2351 			if (rc != DDI_SUCCESS) {
2352 				SETDMACBON(vdp);
2353 				DPRINTF(DMA_DBG, ("xdf@%s: unaligned buf DMA"
2354 				    "handle alloc failed\n",
2355 				    ddi_get_name_addr(vdp->xdf_dip)));
2356 				return (DDI_FAILURE);
2357 			}
2358 			vreq->v_memdmahdl = mdh;
2359 			vreq->v_status = VREQ_MEMDMAHDL_ALLOCED;
2360 		}
2361 		/*FALLTHRU*/
2362 
2363 	case VREQ_MEMDMAHDL_ALLOCED:
2364 		/*
2365 		 * alloc 512-byte aligned buf
2366 		 */
2367 		if (!ALIGNED_XFER(bp)) {
2368 			if (bp->b_flags & (B_PAGEIO | B_PHYS))
2369 				bp_mapin(bp);
2370 
2371 			rc = ddi_dma_mem_alloc(vreq->v_memdmahdl,
2372 			    roundup(bp->b_bcount, XB_BSIZE), &xc_acc_attr,
2373 			    DDI_DMA_STREAMING, xdf_dmacallback, (caddr_t)vdp,
2374 			    &aba, &bufsz, &abh);
2375 			if (rc != DDI_SUCCESS) {
2376 				SETDMACBON(vdp);
2377 				DPRINTF(DMA_DBG, (
2378 				    "xdf@%s: DMA mem allocation failed\n",
2379 				    ddi_get_name_addr(vdp->xdf_dip)));
2380 				return (DDI_FAILURE);
2381 			}
2382 
2383 			vreq->v_abuf = aba;
2384 			vreq->v_align = abh;
2385 			vreq->v_status = VREQ_DMAMEM_ALLOCED;
2386 
2387 			ASSERT(bufsz >= bp->b_bcount);
2388 			if (!IS_READ(bp))
2389 				bcopy(bp->b_un.b_addr, vreq->v_abuf,
2390 				    bp->b_bcount);
2391 		}
2392 		/*FALLTHRU*/
2393 
2394 	case VREQ_DMAMEM_ALLOCED:
2395 		/*
2396 		 * dma bind
2397 		 */
2398 		if (ALIGNED_XFER(bp)) {
2399 			rc = ddi_dma_buf_bind_handle(vreq->v_dmahdl, bp,
2400 			    dma_flags, xdf_dmacallback, (caddr_t)vdp,
2401 			    &dc, &ndcs);
2402 		} else {
2403 			rc = ddi_dma_addr_bind_handle(vreq->v_dmahdl,
2404 			    NULL, vreq->v_abuf, bp->b_bcount, dma_flags,
2405 			    xdf_dmacallback, (caddr_t)vdp, &dc, &ndcs);
2406 		}
2407 		if (rc == DDI_DMA_MAPPED || rc == DDI_DMA_PARTIAL_MAP) {
2408 			/* get num of dma windows */
2409 			if (rc == DDI_DMA_PARTIAL_MAP) {
2410 				rc = ddi_dma_numwin(vreq->v_dmahdl, &ndws);
2411 				ASSERT(rc == DDI_SUCCESS);
2412 			} else {
2413 				ndws = 1;
2414 			}
2415 		} else {
2416 			SETDMACBON(vdp);
2417 			DPRINTF(DMA_DBG, ("xdf@%s: DMA bind failed\n",
2418 			    ddi_get_name_addr(vdp->xdf_dip)));
2419 			return (DDI_FAILURE);
2420 		}
2421 
2422 		vreq->v_dmac = dc;
2423 		vreq->v_dmaw = 0;
2424 		vreq->v_ndmacs = ndcs;
2425 		vreq->v_ndmaws = ndws;
2426 		vreq->v_nslots = ndws;
2427 		vreq->v_status = VREQ_DMABUF_BOUND;
2428 		/*FALLTHRU*/
2429 
2430 	case VREQ_DMABUF_BOUND:
2431 		/*
2432 		 * get ge_slot, callback is set upon failure from gs_get(),
2433 		 * if not set previously
2434 		 */
2435 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2436 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2437 			    ddi_get_name_addr(vdp->xdf_dip)));
2438 			return (DDI_FAILURE);
2439 		}
2440 
2441 		vreq->v_gs = gs;
2442 		gs->vreq = vreq;
2443 		vreq->v_status = VREQ_GS_ALLOCED;
2444 		break;
2445 
2446 	case VREQ_GS_ALLOCED:
2447 		/* nothing need to be done */
2448 		break;
2449 
2450 	case VREQ_DMAWIN_DONE:
2451 		/*
2452 		 * move to the next dma window
2453 		 */
2454 		ASSERT((vreq->v_dmaw + 1) < vreq->v_ndmaws);
2455 
2456 		/* get a ge_slot for this DMA window */
2457 		if ((gs = gs_get(vdp, IS_READ(bp))) == NULL) {
2458 			DPRINTF(DMA_DBG, ("xdf@%s: get ge_slot failed\n",
2459 			    ddi_get_name_addr(vdp->xdf_dip)));
2460 			return (DDI_FAILURE);
2461 		}
2462 
2463 		vreq->v_gs = gs;
2464 		gs->vreq = vreq;
2465 		vreq->v_dmaw++;
2466 		rc = ddi_dma_getwin(vreq->v_dmahdl, vreq->v_dmaw, &off, &sz,
2467 		    &vreq->v_dmac, &vreq->v_ndmacs);
2468 		ASSERT(rc == DDI_SUCCESS);
2469 		vreq->v_status = VREQ_GS_ALLOCED;
2470 		break;
2471 
2472 	default:
2473 		return (DDI_FAILURE);
2474 	}
2475 
2476 	return (DDI_SUCCESS);
2477 }
2478 
2479 static ge_slot_t *
2480 gs_get(xdf_t *vdp, int isread)
2481 {
2482 	grant_ref_t gh;
2483 	ge_slot_t *gs;
2484 
2485 	/* try to alloc GTEs needed in this slot, first */
2486 	if (gnttab_alloc_grant_references(
2487 	    BLKIF_MAX_SEGMENTS_PER_REQUEST, &gh) == -1) {
2488 		if (vdp->xdf_gnt_callback.next == NULL) {
2489 			SETDMACBON(vdp);
2490 			gnttab_request_free_callback(
2491 			    &vdp->xdf_gnt_callback,
2492 			    (void (*)(void *))xdf_dmacallback,
2493 			    (void *)vdp,
2494 			    BLKIF_MAX_SEGMENTS_PER_REQUEST);
2495 		}
2496 		return (NULL);
2497 	}
2498 
2499 	gs = kmem_cache_alloc(xdf_gs_cache, KM_NOSLEEP);
2500 	if (gs == NULL) {
2501 		gnttab_free_grant_references(gh);
2502 		if (vdp->xdf_timeout_id == 0)
2503 			/* restart I/O after one second */
2504 			vdp->xdf_timeout_id =
2505 			    timeout(xdf_timeout_handler, vdp, hz);
2506 		return (NULL);
2507 	}
2508 
2509 	/* init gs_slot */
2510 	list_insert_head(&vdp->xdf_gs_act, (void *)gs);
2511 	gs->oeid = vdp->xdf_peer;
2512 	gs->isread = isread;
2513 	gs->ghead = gh;
2514 	gs->ngrefs = 0;
2515 
2516 	return (gs);
2517 }
2518 
2519 static void
2520 gs_free(xdf_t *vdp, ge_slot_t *gs)
2521 {
2522 	int i;
2523 	grant_ref_t *gp = gs->ge;
2524 	int ngrefs = gs->ngrefs;
2525 	boolean_t isread = gs->isread;
2526 
2527 	list_remove(&vdp->xdf_gs_act, (void *)gs);
2528 
2529 	/* release all grant table entry resources used in this slot */
2530 	for (i = 0; i < ngrefs; i++, gp++)
2531 		gnttab_end_foreign_access(*gp, !isread, 0);
2532 	gnttab_free_grant_references(gs->ghead);
2533 
2534 	kmem_cache_free(xdf_gs_cache, (void *)gs);
2535 }
2536 
2537 static grant_ref_t
2538 gs_grant(ge_slot_t *gs, mfn_t mfn)
2539 {
2540 	grant_ref_t gr = gnttab_claim_grant_reference(&gs->ghead);
2541 
2542 	ASSERT(gr != -1);
2543 	ASSERT(gs->ngrefs < BLKIF_MAX_SEGMENTS_PER_REQUEST);
2544 	gs->ge[gs->ngrefs++] = gr;
2545 	gnttab_grant_foreign_access_ref(gr, gs->oeid, mfn, !gs->isread);
2546 
2547 	return (gr);
2548 }
2549 
2550 static void
2551 unexpectedie(xdf_t *vdp)
2552 {
2553 	/* clean up I/Os in ring that have responses */
2554 	if (xvdi_ring_has_unconsumed_responses(vdp->xdf_xb_ring)) {
2555 		mutex_exit(&vdp->xdf_dev_lk);
2556 		(void) xdf_intr((caddr_t)vdp);
2557 		mutex_enter(&vdp->xdf_dev_lk);
2558 	}
2559 
2560 	/* free up all grant table entries */
2561 	while (!list_is_empty(&vdp->xdf_gs_act))
2562 		gs_free(vdp, list_head(&vdp->xdf_gs_act));
2563 
2564 	/*
2565 	 * move bp back to active list orderly
2566 	 * vreq_busy is updated in vreq_free()
2567 	 */
2568 	while (!list_is_empty(&vdp->xdf_vreq_act)) {
2569 		v_req_t *vreq = list_head(&vdp->xdf_vreq_act);
2570 		buf_t *bp = vreq->v_buf;
2571 
2572 		bp->av_back = NULL;
2573 		bp->b_resid = bp->b_bcount;
2574 		if (vdp->xdf_f_act == NULL) {
2575 			vdp->xdf_f_act = vdp->xdf_l_act = bp;
2576 		} else {
2577 			/* move to the head of list */
2578 			bp->av_forw = vdp->xdf_f_act;
2579 			vdp->xdf_f_act = bp;
2580 		}
2581 		kstat_runq_back_to_waitq(KSTAT_IO_PTR(vdp->xdf_xdev_iostat));
2582 		vreq_free(vdp, vreq);
2583 	}
2584 }
2585 
2586 static void
2587 xdfmin(struct buf *bp)
2588 {
2589 	if (bp->b_bcount > xdf_maxphys)
2590 		bp->b_bcount = xdf_maxphys;
2591 }
2592